M7350v1_en_gpl

This commit is contained in:
T
2024-09-09 08:52:07 +00:00
commit f9cc65cfda
65988 changed files with 26357421 additions and 0 deletions
+52
View File
@@ -0,0 +1,52 @@
#
# This Kconfig describes xen options
#
config XEN
bool "Xen guest support"
select PARAVIRT
select PARAVIRT_CLOCK
depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS)
depends on X86_CMPXCHG && X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
Xen hypervisor.
config XEN_DOM0
def_bool y
depends on XEN && PCI_XEN && SWIOTLB_XEN
depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI
# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST
# name in tools.
config XEN_PRIVILEGED_GUEST
def_bool XEN_DOM0
config XEN_PVHVM
def_bool y
depends on XEN && PCI && X86_LOCAL_APIC
config XEN_MAX_DOMAIN_MEMORY
int
default 500 if X86_64
default 64 if X86_32
depends on XEN
help
This only affects the sizing of some bss arrays, the unused
portions of which are freed.
config XEN_SAVE_RESTORE
bool
depends on XEN
select HIBERNATE_CALLBACKS
default y
config XEN_DEBUG_FS
bool "Enable Xen debug and tuning parameters in debugfs"
depends on XEN && DEBUG_FS
default n
help
Enable statistics output and various tuning options in debugfs.
Enabling this option may incur a significant performance overhead.
+24
View File
@@ -0,0 +1,24 @@
ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
CFLAGS_REMOVE_spinlock.o = -pg
CFLAGS_REMOVE_time.o = -pg
CFLAGS_REMOVE_irq.o = -pg
endif
# Make sure early boot has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_enlighten.o := $(nostackp)
CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o platform-pci-unplug.o \
p2m.o
obj-$(CONFIG_EVENT_TRACING) += trace.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
obj-$(CONFIG_XEN_DOM0) += vga.o
obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
+125
View File
@@ -0,0 +1,125 @@
#include <linux/init.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/module.h>
#include "debugfs.h"
static struct dentry *d_xen_debug;
struct dentry * __init xen_init_debugfs(void)
{
if (!d_xen_debug) {
d_xen_debug = debugfs_create_dir("xen", NULL);
if (!d_xen_debug)
pr_warning("Could not create 'xen' debugfs directory\n");
}
return d_xen_debug;
}
struct array_data
{
void *array;
unsigned elements;
};
static int u32_array_open(struct inode *inode, struct file *file)
{
file->private_data = NULL;
return nonseekable_open(inode, file);
}
static size_t format_array(char *buf, size_t bufsize, const char *fmt,
u32 *array, unsigned array_size)
{
size_t ret = 0;
unsigned i;
for(i = 0; i < array_size; i++) {
size_t len;
len = snprintf(buf, bufsize, fmt, array[i]);
len++; /* ' ' or '\n' */
ret += len;
if (buf) {
buf += len;
bufsize -= len;
buf[-1] = (i == array_size-1) ? '\n' : ' ';
}
}
ret++; /* \0 */
if (buf)
*buf = '\0';
return ret;
}
static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
{
size_t len = format_array(NULL, 0, fmt, array, array_size);
char *ret;
ret = kmalloc(len, GFP_KERNEL);
if (ret == NULL)
return NULL;
format_array(ret, len, fmt, array, array_size);
return ret;
}
static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
loff_t *ppos)
{
struct inode *inode = file->f_path.dentry->d_inode;
struct array_data *data = inode->i_private;
size_t size;
if (*ppos == 0) {
if (file->private_data) {
kfree(file->private_data);
file->private_data = NULL;
}
file->private_data = format_array_alloc("%u", data->array, data->elements);
}
size = 0;
if (file->private_data)
size = strlen(file->private_data);
return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
}
static int xen_array_release(struct inode *inode, struct file *file)
{
kfree(file->private_data);
return 0;
}
static const struct file_operations u32_array_fops = {
.owner = THIS_MODULE,
.open = u32_array_open,
.release= xen_array_release,
.read = u32_array_read,
.llseek = no_llseek,
};
struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
struct dentry *parent,
u32 *array, unsigned elements)
{
struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
if (data == NULL)
return NULL;
data->array = array;
data->elements = elements;
return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
}
+10
View File
@@ -0,0 +1,10 @@
#ifndef _XEN_DEBUGFS_H
#define _XEN_DEBUGFS_H
struct dentry * __init xen_init_debugfs(void);
struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
struct dentry *parent,
u32 *array, unsigned elements);
#endif /* _XEN_DEBUGFS_H */
File diff suppressed because it is too large Load Diff
+127
View File
@@ -0,0 +1,127 @@
/******************************************************************************
* grant_table.c
* x86 specific part
*
* Granting foreign access to our memory reservation.
*
* Copyright (c) 2005-2006, Christopher Clark
* Copyright (c) 2004-2005, K A Fraser
* Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
* VA Linux Systems Japan. Split out x86 specific part.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <xen/interface/xen.h>
#include <xen/page.h>
#include <xen/grant_table.h>
#include <asm/pgtable.h>
static int map_pte_fn(pte_t *pte, struct page *pmd_page,
unsigned long addr, void *data)
{
unsigned long **frames = (unsigned long **)data;
set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
(*frames)++;
return 0;
}
/*
* This function is used to map shared frames to store grant status. It is
* different from map_pte_fn above, the frames type here is uint64_t.
*/
static int map_pte_fn_status(pte_t *pte, struct page *pmd_page,
unsigned long addr, void *data)
{
uint64_t **frames = (uint64_t **)data;
set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
(*frames)++;
return 0;
}
static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
unsigned long addr, void *data)
{
set_pte_at(&init_mm, addr, pte, __pte(0));
return 0;
}
int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
unsigned long max_nr_gframes,
void **__shared)
{
int rc;
void *shared = *__shared;
if (shared == NULL) {
struct vm_struct *area =
alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
BUG_ON(area == NULL);
shared = area->addr;
*__shared = shared;
}
rc = apply_to_page_range(&init_mm, (unsigned long)shared,
PAGE_SIZE * nr_gframes,
map_pte_fn, &frames);
return rc;
}
int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
unsigned long max_nr_gframes,
grant_status_t **__shared)
{
int rc;
grant_status_t *shared = *__shared;
if (shared == NULL) {
/* No need to pass in PTE as we are going to do it
* in apply_to_page_range anyhow. */
struct vm_struct *area =
alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
BUG_ON(area == NULL);
shared = area->addr;
*__shared = shared;
}
rc = apply_to_page_range(&init_mm, (unsigned long)shared,
PAGE_SIZE * nr_gframes,
map_pte_fn_status, &frames);
return rc;
}
void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
{
apply_to_page_range(&init_mm, (unsigned long)shared,
PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
}
+133
View File
@@ -0,0 +1,133 @@
#include <linux/hardirq.h>
#include <asm/x86_init.h>
#include <xen/interface/xen.h>
#include <xen/interface/sched.h>
#include <xen/interface/vcpu.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include "xen-ops.h"
/*
* Force a proper event-channel callback from Xen after clearing the
* callback mask. We do this in a very simple manner, by making a call
* down into Xen. The pending flag will be checked by Xen on return.
*/
void xen_force_evtchn_callback(void)
{
(void)HYPERVISOR_xen_version(0, NULL);
}
static unsigned long xen_save_fl(void)
{
struct vcpu_info *vcpu;
unsigned long flags;
vcpu = this_cpu_read(xen_vcpu);
/* flag has opposite sense of mask */
flags = !vcpu->evtchn_upcall_mask;
/* convert to IF type flag
-0 -> 0x00000000
-1 -> 0xffffffff
*/
return (-flags) & X86_EFLAGS_IF;
}
PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
static void xen_restore_fl(unsigned long flags)
{
struct vcpu_info *vcpu;
/* convert from IF type flag */
flags = !(flags & X86_EFLAGS_IF);
/* There's a one instruction preempt window here. We need to
make sure we're don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
vcpu = this_cpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = flags;
preempt_enable_no_resched();
/* Doesn't matter if we get preempted here, because any
pending event will get dealt with anyway. */
if (flags == 0) {
preempt_check_resched();
barrier(); /* unmask then check (avoid races) */
if (unlikely(vcpu->evtchn_upcall_pending))
xen_force_evtchn_callback();
}
}
PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
static void xen_irq_disable(void)
{
/* There's a one instruction preempt window here. We need to
make sure we're don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
preempt_enable_no_resched();
}
PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
static void xen_irq_enable(void)
{
struct vcpu_info *vcpu;
/* We don't need to worry about being preempted here, since
either a) interrupts are disabled, so no preemption, or b)
the caller is confused and is trying to re-enable interrupts
on an indeterminate processor. */
vcpu = this_cpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = 0;
/* Doesn't matter if we get preempted here, because any
pending event will get dealt with anyway. */
barrier(); /* unmask then check (avoid races) */
if (unlikely(vcpu->evtchn_upcall_pending))
xen_force_evtchn_callback();
}
PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
static void xen_safe_halt(void)
{
/* Blocking includes an implicit local_irq_enable(). */
if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
BUG();
}
static void xen_halt(void)
{
if (irqs_disabled())
HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
else
xen_safe_halt();
}
static const struct pv_irq_ops xen_irq_ops __initconst = {
.save_fl = PV_CALLEE_SAVE(xen_save_fl),
.restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
.irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
.irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
.safe_halt = xen_safe_halt,
.halt = xen_halt,
#ifdef CONFIG_X86_64
.adjust_exception_frame = xen_adjust_exception_frame,
#endif
};
void __init xen_init_irq_ops(void)
{
pv_irq_ops = xen_irq_ops;
x86_init.irqs.intr_init = xen_init_IRQ;
}
File diff suppressed because it is too large Load Diff
+26
View File
@@ -0,0 +1,26 @@
#ifndef _XEN_MMU_H
#include <linux/linkage.h>
#include <asm/page.h>
enum pt_level {
PT_PGD,
PT_PUD,
PT_PMD,
PT_PTE
};
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte);
unsigned long xen_read_cr2_direct(void);
extern void xen_init_mmu_ops(void);
extern void xen_hvm_init_mmu_ops(void);
#endif /* _XEN_MMU_H */
+208
View File
@@ -0,0 +1,208 @@
/*
* Xen hypercall batching.
*
* Xen allows multiple hypercalls to be issued at once, using the
* multicall interface. This allows the cost of trapping into the
* hypervisor to be amortized over several calls.
*
* This file implements a simple interface for multicalls. There's a
* per-cpu buffer of outstanding multicalls. When you want to queue a
* multicall for issuing, you can allocate a multicall slot for the
* call and its arguments, along with storage for space which is
* pointed to by the arguments (for passing pointers to structures,
* etc). When the multicall is actually issued, all the space for the
* commands and allocated memory is freed for reuse.
*
* Multicalls are flushed whenever any of the buffers get full, or
* when explicitly requested. There's no way to get per-multicall
* return results back. It will BUG if any of the multicalls fail.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/debugfs.h>
#include <asm/xen/hypercall.h>
#include "multicalls.h"
#include "debugfs.h"
#define MC_BATCH 32
#define MC_DEBUG 0
#define MC_ARGS (MC_BATCH * 16)
struct mc_buffer {
unsigned mcidx, argidx, cbidx;
struct multicall_entry entries[MC_BATCH];
#if MC_DEBUG
struct multicall_entry debug[MC_BATCH];
void *caller[MC_BATCH];
#endif
unsigned char args[MC_ARGS];
struct callback {
void (*fn)(void *);
void *data;
} callbacks[MC_BATCH];
};
static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
void xen_mc_flush(void)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
struct multicall_entry *mc;
int ret = 0;
unsigned long flags;
int i;
BUG_ON(preemptible());
/* Disable interrupts in case someone comes in and queues
something in the middle */
local_irq_save(flags);
trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);
switch (b->mcidx) {
case 0:
/* no-op */
BUG_ON(b->argidx != 0);
break;
case 1:
/* Singleton multicall - bypass multicall machinery
and just do the call directly. */
mc = &b->entries[0];
mc->result = privcmd_call(mc->op,
mc->args[0], mc->args[1], mc->args[2],
mc->args[3], mc->args[4]);
ret = mc->result < 0;
break;
default:
#if MC_DEBUG
memcpy(b->debug, b->entries,
b->mcidx * sizeof(struct multicall_entry));
#endif
if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
BUG();
for (i = 0; i < b->mcidx; i++)
if (b->entries[i].result < 0)
ret++;
#if MC_DEBUG
if (ret) {
printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
ret, smp_processor_id());
dump_stack();
for (i = 0; i < b->mcidx; i++) {
printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n",
i+1, b->mcidx,
b->debug[i].op,
b->debug[i].args[0],
b->entries[i].result,
b->caller[i]);
}
}
#endif
}
b->mcidx = 0;
b->argidx = 0;
for (i = 0; i < b->cbidx; i++) {
struct callback *cb = &b->callbacks[i];
(*cb->fn)(cb->data);
}
b->cbidx = 0;
local_irq_restore(flags);
WARN_ON(ret);
}
struct multicall_space __xen_mc_entry(size_t args)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
struct multicall_space ret;
unsigned argidx = roundup(b->argidx, sizeof(u64));
trace_xen_mc_entry_alloc(args);
BUG_ON(preemptible());
BUG_ON(b->argidx >= MC_ARGS);
if (unlikely(b->mcidx == MC_BATCH ||
(argidx + args) >= MC_ARGS)) {
trace_xen_mc_flush_reason((b->mcidx == MC_BATCH) ?
XEN_MC_FL_BATCH : XEN_MC_FL_ARGS);
xen_mc_flush();
argidx = roundup(b->argidx, sizeof(u64));
}
ret.mc = &b->entries[b->mcidx];
#if MC_DEBUG
b->caller[b->mcidx] = __builtin_return_address(0);
#endif
b->mcidx++;
ret.args = &b->args[argidx];
b->argidx = argidx + args;
BUG_ON(b->argidx >= MC_ARGS);
return ret;
}
struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
struct multicall_space ret = { NULL, NULL };
BUG_ON(preemptible());
BUG_ON(b->argidx >= MC_ARGS);
if (unlikely(b->mcidx == 0 ||
b->entries[b->mcidx - 1].op != op)) {
trace_xen_mc_extend_args(op, size, XEN_MC_XE_BAD_OP);
goto out;
}
if (unlikely((b->argidx + size) >= MC_ARGS)) {
trace_xen_mc_extend_args(op, size, XEN_MC_XE_NO_SPACE);
goto out;
}
ret.mc = &b->entries[b->mcidx - 1];
ret.args = &b->args[b->argidx];
b->argidx += size;
BUG_ON(b->argidx >= MC_ARGS);
trace_xen_mc_extend_args(op, size, XEN_MC_XE_OK);
out:
return ret;
}
void xen_mc_callback(void (*fn)(void *), void *data)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
struct callback *cb;
if (b->cbidx == MC_BATCH) {
trace_xen_mc_flush_reason(XEN_MC_FL_CALLBACK);
xen_mc_flush();
}
trace_xen_mc_callback(fn, data);
cb = &b->callbacks[b->cbidx++];
cb->fn = fn;
cb->data = data;
}
+68
View File
@@ -0,0 +1,68 @@
#ifndef _XEN_MULTICALLS_H
#define _XEN_MULTICALLS_H
#include <trace/events/xen.h>
#include "xen-ops.h"
/* Multicalls */
struct multicall_space
{
struct multicall_entry *mc;
void *args;
};
/* Allocate room for a multicall and its args */
struct multicall_space __xen_mc_entry(size_t args);
DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
/* Call to start a batch of multiple __xen_mc_entry()s. Must be
paired with xen_mc_issue() */
static inline void xen_mc_batch(void)
{
unsigned long flags;
/* need to disable interrupts until this entry is complete */
local_irq_save(flags);
trace_xen_mc_batch(paravirt_get_lazy_mode());
__this_cpu_write(xen_mc_irq_flags, flags);
}
static inline struct multicall_space xen_mc_entry(size_t args)
{
xen_mc_batch();
return __xen_mc_entry(args);
}
/* Flush all pending multicalls */
void xen_mc_flush(void);
/* Issue a multicall if we're not in a lazy mode */
static inline void xen_mc_issue(unsigned mode)
{
trace_xen_mc_issue(mode);
if ((paravirt_get_lazy_mode() & mode) == 0)
xen_mc_flush();
/* restore flags saved in xen_mc_batch */
local_irq_restore(this_cpu_read(xen_mc_irq_flags));
}
/* Set up a callback to be called when the current batch is flushed */
void xen_mc_callback(void (*fn)(void *), void *data);
/*
* Try to extend the arguments of the previous multicall command. The
* previous command's op must match. If it does, then it attempts to
* extend the argument space allocated to the multicall entry by
* arg_size bytes.
*
* The returned multicall_space will return with mc pointing to the
* command on success, or NULL on failure, and args pointing to the
* newly allocated space.
*/
struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
#endif /* _XEN_MULTICALLS_H */
+949
View File
@@ -0,0 +1,949 @@
/*
* Xen leaves the responsibility for maintaining p2m mappings to the
* guests themselves, but it must also access and update the p2m array
* during suspend/resume when all the pages are reallocated.
*
* The p2m table is logically a flat array, but we implement it as a
* three-level tree to allow the address space to be sparse.
*
* Xen
* |
* p2m_top p2m_top_mfn
* / \ / \
* p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
* / \ / \ / /
* p2m p2m p2m p2m p2m p2m p2m ...
*
* The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
*
* The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
* maximum representable pseudo-physical address space is:
* P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
*
* P2M_PER_PAGE depends on the architecture, as a mfn is always
* unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
* 512 and 1024 entries respectively.
*
* In short, these structures contain the Machine Frame Number (MFN) of the PFN.
*
* However not all entries are filled with MFNs. Specifically for all other
* leaf entries, or for the top root, or middle one, for which there is a void
* entry, we assume it is "missing". So (for example)
* pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
*
* We also have the possibility of setting 1-1 mappings on certain regions, so
* that:
* pfn_to_mfn(0xc0000)=0xc0000
*
* The benefit of this is, that we can assume for non-RAM regions (think
* PCI BARs, or ACPI spaces), we can create mappings easily b/c we
* get the PFN value to match the MFN.
*
* For this to work efficiently we have one new page p2m_identity and
* allocate (via reserved_brk) any other pages we need to cover the sides
* (1GB or 4MB boundary violations). All entries in p2m_identity are set to
* INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs,
* no other fancy value).
*
* On lookup we spot that the entry points to p2m_identity and return the
* identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
* If the entry points to an allocated page, we just proceed as before and
* return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
* appropriate functions (pfn_to_mfn).
*
* The reason for having the IDENTITY_FRAME_BIT instead of just returning the
* PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
* non-identity pfn. To protect ourselves against we elect to set (and get) the
* IDENTITY_FRAME_BIT on all identity mapped PFNs.
*
* This simplistic diagram is used to explain the more subtle piece of code.
* There is also a digram of the P2M at the end that can help.
* Imagine your E820 looking as so:
*
* 1GB 2GB
* /-------------------+---------\/----\ /----------\ /---+-----\
* | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM |
* \-------------------+---------/\----/ \----------/ \---+-----/
* ^- 1029MB ^- 2001MB
*
* [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100),
* 2048MB = 524288 (0x80000)]
*
* And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB
* is actually not present (would have to kick the balloon driver to put it in).
*
* When we are told to set the PFNs for identity mapping (see patch: "xen/setup:
* Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start
* of the PFN and the end PFN (263424 and 512256 respectively). The first step
* is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
* covers 512^2 of page estate (1GB) and in case the start or end PFN is not
* aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn
* to end pfn. We reserve_brk top leaf pages if they are missing (means they
* point to p2m_mid_missing).
*
* With the E820 example above, 263424 is not 1GB aligned so we allocate a
* reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
* Each entry in the allocate page is "missing" (points to p2m_missing).
*
* Next stage is to determine if we need to do a more granular boundary check
* on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
* We check if the start pfn and end pfn violate that boundary check, and if
* so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer
* granularity of setting which PFNs are missing and which ones are identity.
* In our example 263424 and 512256 both fail the check so we reserve_brk two
* pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
* values) and assign them to p2m[1][2] and p2m[1][488] respectively.
*
* At this point we would at minimum reserve_brk one page, but could be up to
* three. Each call to set_phys_range_identity has at maximum a three page
* cost. If we were to query the P2M at this stage, all those entries from
* start PFN through end PFN (so 1029MB -> 2001MB) would return
* INVALID_P2M_ENTRY ("missing").
*
* The next step is to walk from the start pfn to the end pfn setting
* the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
* If we find that the middle leaf is pointing to p2m_missing we can swap it
* over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this
* point we do not need to worry about boundary aligment (so no need to
* reserve_brk a middle page, figure out which PFNs are "missing" and which
* ones are identity), as that has been done earlier. If we find that the
* middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
* that page (which covers 512 PFNs) and set the appropriate PFN with
* IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we
* set from p2m[1][2][256->511] and p2m[1][488][0->256] with
* IDENTITY_FRAME_BIT set.
*
* All other regions that are void (or not filled) either point to p2m_missing
* (considered missing) or have the default value of INVALID_P2M_ENTRY (also
* considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
* contain the INVALID_P2M_ENTRY value and are considered "missing."
*
* This is what the p2m ends up looking (for the E820 above) with this
* fabulous drawing:
*
* p2m /--------------\
* /-----\ | &mfn_list[0],| /-----------------\
* | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. |
* |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] |
* | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] |
* |-----| \ | [p2m_identity]+\\ | .... |
* | 2 |--\ \-------------------->| ... | \\ \----------------/
* |-----| \ \---------------/ \\
* | 3 |\ \ \\ p2m_identity
* |-----| \ \-------------------->/---------------\ /-----------------\
* | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... |
* \-----/ / | [p2m_identity]+-->| ..., ~0 |
* / /---------------\ | .... | \-----------------/
* / | IDENTITY[@0] | /-+-[x], ~0, ~0.. |
* / | IDENTITY[@256]|<----/ \---------------/
* / | ~0, ~0, .... |
* | \---------------/
* |
* p2m_missing p2m_missing
* /------------------\ /------------\
* | [p2m_mid_missing]+---->| ~0, ~0, ~0 |
* | [p2m_mid_missing]+---->| ..., ~0 |
* \------------------/ \------------/
*
* where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
*/
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <asm/cache.h>
#include <asm/setup.h>
#include <asm/xen/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <xen/grant_table.h>
#include "multicalls.h"
#include "xen-ops.h"
static void __init m2p_override_init(void);
unsigned long xen_max_p2m_pfn __read_mostly;
#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
/* Placeholders for holes in the address space */
static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
/* We might hit two boundary violations at the start and end, at max each
* boundary violation will require three middle nodes. */
RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
static inline unsigned p2m_top_index(unsigned long pfn)
{
BUG_ON(pfn >= MAX_P2M_PFN);
return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
}
static inline unsigned p2m_mid_index(unsigned long pfn)
{
return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
}
static inline unsigned p2m_index(unsigned long pfn)
{
return pfn % P2M_PER_PAGE;
}
static void p2m_top_init(unsigned long ***top)
{
unsigned i;
for (i = 0; i < P2M_TOP_PER_PAGE; i++)
top[i] = p2m_mid_missing;
}
static void p2m_top_mfn_init(unsigned long *top)
{
unsigned i;
for (i = 0; i < P2M_TOP_PER_PAGE; i++)
top[i] = virt_to_mfn(p2m_mid_missing_mfn);
}
static void p2m_top_mfn_p_init(unsigned long **top)
{
unsigned i;
for (i = 0; i < P2M_TOP_PER_PAGE; i++)
top[i] = p2m_mid_missing_mfn;
}
static void p2m_mid_init(unsigned long **mid)
{
unsigned i;
for (i = 0; i < P2M_MID_PER_PAGE; i++)
mid[i] = p2m_missing;
}
static void p2m_mid_mfn_init(unsigned long *mid)
{
unsigned i;
for (i = 0; i < P2M_MID_PER_PAGE; i++)
mid[i] = virt_to_mfn(p2m_missing);
}
static void p2m_init(unsigned long *p2m)
{
unsigned i;
for (i = 0; i < P2M_MID_PER_PAGE; i++)
p2m[i] = INVALID_P2M_ENTRY;
}
/*
* Build the parallel p2m_top_mfn and p2m_mid_mfn structures
*
* This is called both at boot time, and after resuming from suspend:
* - At boot time we're called very early, and must use extend_brk()
* to allocate memory.
*
* - After resume we're called from within stop_machine, but the mfn
* tree should alreay be completely allocated.
*/
void __ref xen_build_mfn_list_list(void)
{
unsigned long pfn;
/* Pre-initialize p2m_top_mfn to be completely missing */
if (p2m_top_mfn == NULL) {
p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_mid_mfn_init(p2m_mid_missing_mfn);
p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_top_mfn_p_init(p2m_top_mfn_p);
p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_top_mfn_init(p2m_top_mfn);
} else {
/* Reinitialise, mfn's all change after migration */
p2m_mid_mfn_init(p2m_mid_missing_mfn);
}
for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
unsigned topidx = p2m_top_index(pfn);
unsigned mididx = p2m_mid_index(pfn);
unsigned long **mid;
unsigned long *mid_mfn_p;
mid = p2m_top[topidx];
mid_mfn_p = p2m_top_mfn_p[topidx];
/* Don't bother allocating any mfn mid levels if
* they're just missing, just update the stored mfn,
* since all could have changed over a migrate.
*/
if (mid == p2m_mid_missing) {
BUG_ON(mididx);
BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
continue;
}
if (mid_mfn_p == p2m_mid_missing_mfn) {
/*
* XXX boot-time only! We should never find
* missing parts of the mfn tree after
* runtime. extend_brk() will BUG if we call
* it too late.
*/
mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_mid_mfn_init(mid_mfn_p);
p2m_top_mfn_p[topidx] = mid_mfn_p;
}
p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
}
}
void xen_setup_mfn_list_list(void)
{
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
virt_to_mfn(p2m_top_mfn);
HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
}
/* Set up p2m_top to point to the domain-builder provided p2m pages */
void __init xen_build_dynamic_phys_to_machine(void)
{
unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
unsigned long pfn;
xen_max_p2m_pfn = max_pfn;
p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_init(p2m_missing);
p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_mid_init(p2m_mid_missing);
p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_top_init(p2m_top);
p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_init(p2m_identity);
/*
* The domain builder gives us a pre-constructed p2m array in
* mfn_list for all the pages initially given to us, so we just
* need to graft that into our tree structure.
*/
for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
unsigned topidx = p2m_top_index(pfn);
unsigned mididx = p2m_mid_index(pfn);
if (p2m_top[topidx] == p2m_mid_missing) {
unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_mid_init(mid);
p2m_top[topidx] = mid;
}
/*
* As long as the mfn_list has enough entries to completely
* fill a p2m page, pointing into the array is ok. But if
* not the entries beyond the last pfn will be undefined.
*/
if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
unsigned long p2midx;
p2midx = max_pfn % P2M_PER_PAGE;
for ( ; p2midx < P2M_PER_PAGE; p2midx++)
mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
}
p2m_top[topidx][mididx] = &mfn_list[pfn];
}
m2p_override_init();
}
unsigned long get_phys_to_machine(unsigned long pfn)
{
unsigned topidx, mididx, idx;
if (unlikely(pfn >= MAX_P2M_PFN))
return INVALID_P2M_ENTRY;
topidx = p2m_top_index(pfn);
mididx = p2m_mid_index(pfn);
idx = p2m_index(pfn);
/*
* The INVALID_P2M_ENTRY is filled in both p2m_*identity
* and in p2m_*missing, so returning the INVALID_P2M_ENTRY
* would be wrong.
*/
if (p2m_top[topidx][mididx] == p2m_identity)
return IDENTITY_FRAME(pfn);
return p2m_top[topidx][mididx][idx];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);
static void *alloc_p2m_page(void)
{
return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
}
static void free_p2m_page(void *p)
{
free_page((unsigned long)p);
}
/*
* Fully allocate the p2m structure for a given pfn. We need to check
* that both the top and mid levels are allocated, and make sure the
* parallel mfn tree is kept in sync. We may race with other cpus, so
* the new pages are installed with cmpxchg; if we lose the race then
* simply free the page we allocated and use the one that's there.
*/
static bool alloc_p2m(unsigned long pfn)
{
unsigned topidx, mididx;
unsigned long ***top_p, **mid;
unsigned long *top_mfn_p, *mid_mfn;
topidx = p2m_top_index(pfn);
mididx = p2m_mid_index(pfn);
top_p = &p2m_top[topidx];
mid = *top_p;
if (mid == p2m_mid_missing) {
/* Mid level is missing, allocate a new one */
mid = alloc_p2m_page();
if (!mid)
return false;
p2m_mid_init(mid);
if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
free_p2m_page(mid);
}
top_mfn_p = &p2m_top_mfn[topidx];
mid_mfn = p2m_top_mfn_p[topidx];
BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
if (mid_mfn == p2m_mid_missing_mfn) {
/* Separately check the mid mfn level */
unsigned long missing_mfn;
unsigned long mid_mfn_mfn;
mid_mfn = alloc_p2m_page();
if (!mid_mfn)
return false;
p2m_mid_mfn_init(mid_mfn);
missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
mid_mfn_mfn = virt_to_mfn(mid_mfn);
if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
free_p2m_page(mid_mfn);
else
p2m_top_mfn_p[topidx] = mid_mfn;
}
if (p2m_top[topidx][mididx] == p2m_identity ||
p2m_top[topidx][mididx] == p2m_missing) {
/* p2m leaf page is missing */
unsigned long *p2m;
unsigned long *p2m_orig = p2m_top[topidx][mididx];
p2m = alloc_p2m_page();
if (!p2m)
return false;
p2m_init(p2m);
if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig)
free_p2m_page(p2m);
else
mid_mfn[mididx] = virt_to_mfn(p2m);
}
return true;
}
static bool __init __early_alloc_p2m(unsigned long pfn)
{
unsigned topidx, mididx, idx;
topidx = p2m_top_index(pfn);
mididx = p2m_mid_index(pfn);
idx = p2m_index(pfn);
/* Pfff.. No boundary cross-over, lets get out. */
if (!idx)
return false;
WARN(p2m_top[topidx][mididx] == p2m_identity,
"P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n",
topidx, mididx);
/*
* Could be done by xen_build_dynamic_phys_to_machine..
*/
if (p2m_top[topidx][mididx] != p2m_missing)
return false;
/* Boundary cross-over for the edges: */
if (idx) {
unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
unsigned long *mid_mfn_p;
p2m_init(p2m);
p2m_top[topidx][mididx] = p2m;
/* For save/restore we need to MFN of the P2M saved */
mid_mfn_p = p2m_top_mfn_p[topidx];
WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing),
"P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
topidx, mididx);
mid_mfn_p[mididx] = virt_to_mfn(p2m);
}
return idx != 0;
}
unsigned long __init set_phys_range_identity(unsigned long pfn_s,
unsigned long pfn_e)
{
unsigned long pfn;
if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN))
return 0;
if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
return pfn_e - pfn_s;
if (pfn_s > pfn_e)
return 0;
for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1));
pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
{
unsigned topidx = p2m_top_index(pfn);
unsigned long *mid_mfn_p;
unsigned long **mid;
mid = p2m_top[topidx];
mid_mfn_p = p2m_top_mfn_p[topidx];
if (mid == p2m_mid_missing) {
mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_mid_init(mid);
p2m_top[topidx] = mid;
BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
}
/* And the save/restore P2M tables.. */
if (mid_mfn_p == p2m_mid_missing_mfn) {
mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_mid_mfn_init(mid_mfn_p);
p2m_top_mfn_p[topidx] = mid_mfn_p;
p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
/* Note: we don't set mid_mfn_p[midix] here,
* look in __early_alloc_p2m */
}
}
__early_alloc_p2m(pfn_s);
__early_alloc_p2m(pfn_e);
for (pfn = pfn_s; pfn < pfn_e; pfn++)
if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
break;
if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
"Identity mapping failed. We are %ld short of 1-1 mappings!\n",
(pfn_e - pfn_s) - (pfn - pfn_s)))
printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn);
return pfn - pfn_s;
}
/* Try to install p2m mapping; fail if intermediate bits missing */
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
unsigned topidx, mididx, idx;
if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
return true;
}
if (unlikely(pfn >= MAX_P2M_PFN)) {
BUG_ON(mfn != INVALID_P2M_ENTRY);
return true;
}
topidx = p2m_top_index(pfn);
mididx = p2m_mid_index(pfn);
idx = p2m_index(pfn);
/* For sparse holes were the p2m leaf has real PFN along with
* PCI holes, stick in the PFN as the MFN value.
*/
if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
if (p2m_top[topidx][mididx] == p2m_identity)
return true;
/* Swap over from MISSING to IDENTITY if needed. */
if (p2m_top[topidx][mididx] == p2m_missing) {
WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
p2m_identity) != p2m_missing);
return true;
}
}
if (p2m_top[topidx][mididx] == p2m_missing)
return mfn == INVALID_P2M_ENTRY;
p2m_top[topidx][mididx][idx] = mfn;
return true;
}
bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
if (!alloc_p2m(pfn))
return false;
if (!__set_phys_to_machine(pfn, mfn))
return false;
}
return true;
}
#define M2P_OVERRIDE_HASH_SHIFT 10
#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
static DEFINE_SPINLOCK(m2p_override_lock);
static void __init m2p_override_init(void)
{
unsigned i;
m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
sizeof(unsigned long));
for (i = 0; i < M2P_OVERRIDE_HASH; i++)
INIT_LIST_HEAD(&m2p_overrides[i]);
}
static unsigned long mfn_hash(unsigned long mfn)
{
return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
}
/* Add an MFN override for a particular page */
int m2p_add_override(unsigned long mfn, struct page *page,
struct gnttab_map_grant_ref *kmap_op)
{
unsigned long flags;
unsigned long pfn;
unsigned long uninitialized_var(address);
unsigned level;
pte_t *ptep = NULL;
pfn = page_to_pfn(page);
if (!PageHighMem(page)) {
address = (unsigned long)__va(pfn << PAGE_SHIFT);
ptep = lookup_address(address, &level);
if (WARN(ptep == NULL || level != PG_LEVEL_4K,
"m2p_add_override: pfn %lx not mapped", pfn))
return -EINVAL;
}
WARN_ON(PagePrivate(page));
SetPagePrivate(page);
set_page_private(page, mfn);
page->index = pfn_to_mfn(pfn);
if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
return -ENOMEM;
if (kmap_op != NULL) {
if (!PageHighMem(page)) {
struct multicall_space mcs =
xen_mc_entry(sizeof(*kmap_op));
MULTI_grant_table_op(mcs.mc,
GNTTABOP_map_grant_ref, kmap_op, 1);
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
/* let's use dev_bus_addr to record the old mfn instead */
kmap_op->dev_bus_addr = page->index;
page->index = (unsigned long) kmap_op;
}
spin_lock_irqsave(&m2p_override_lock, flags);
list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
spin_unlock_irqrestore(&m2p_override_lock, flags);
return 0;
}
EXPORT_SYMBOL_GPL(m2p_add_override);
int m2p_remove_override(struct page *page, bool clear_pte)
{
unsigned long flags;
unsigned long mfn;
unsigned long pfn;
unsigned long uninitialized_var(address);
unsigned level;
pte_t *ptep = NULL;
pfn = page_to_pfn(page);
mfn = get_phys_to_machine(pfn);
if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
return -EINVAL;
if (!PageHighMem(page)) {
address = (unsigned long)__va(pfn << PAGE_SHIFT);
ptep = lookup_address(address, &level);
if (WARN(ptep == NULL || level != PG_LEVEL_4K,
"m2p_remove_override: pfn %lx not mapped", pfn))
return -EINVAL;
}
spin_lock_irqsave(&m2p_override_lock, flags);
list_del(&page->lru);
spin_unlock_irqrestore(&m2p_override_lock, flags);
WARN_ON(!PagePrivate(page));
ClearPagePrivate(page);
if (clear_pte) {
struct gnttab_map_grant_ref *map_op =
(struct gnttab_map_grant_ref *) page->index;
set_phys_to_machine(pfn, map_op->dev_bus_addr);
if (!PageHighMem(page)) {
struct multicall_space mcs;
struct gnttab_unmap_grant_ref *unmap_op;
/*
* It might be that we queued all the m2p grant table
* hypercalls in a multicall, then m2p_remove_override
* get called before the multicall has actually been
* issued. In this case handle is going to -1 because
* it hasn't been modified yet.
*/
if (map_op->handle == -1)
xen_mc_flush();
/*
* Now if map_op->handle is negative it means that the
* hypercall actually returned an error.
*/
if (map_op->handle == GNTST_general_error) {
printk(KERN_WARNING "m2p_remove_override: "
"pfn %lx mfn %lx, failed to modify kernel mappings",
pfn, mfn);
return -1;
}
mcs = xen_mc_entry(
sizeof(struct gnttab_unmap_grant_ref));
unmap_op = mcs.args;
unmap_op->host_addr = map_op->host_addr;
unmap_op->handle = map_op->handle;
unmap_op->dev_bus_addr = 0;
MULTI_grant_table_op(mcs.mc,
GNTTABOP_unmap_grant_ref, unmap_op, 1);
xen_mc_issue(PARAVIRT_LAZY_MMU);
set_pte_at(&init_mm, address, ptep,
pfn_pte(pfn, PAGE_KERNEL));
__flush_tlb_single(address);
map_op->host_addr = 0;
}
} else
set_phys_to_machine(pfn, page->index);
return 0;
}
EXPORT_SYMBOL_GPL(m2p_remove_override);
struct page *m2p_find_override(unsigned long mfn)
{
unsigned long flags;
struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
struct page *p, *ret;
ret = NULL;
spin_lock_irqsave(&m2p_override_lock, flags);
list_for_each_entry(p, bucket, lru) {
if (page_private(p) == mfn) {
ret = p;
break;
}
}
spin_unlock_irqrestore(&m2p_override_lock, flags);
return ret;
}
unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
{
struct page *p = m2p_find_override(mfn);
unsigned long ret = pfn;
if (p)
ret = page_to_pfn(p);
return ret;
}
EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
#ifdef CONFIG_XEN_DEBUG_FS
#include <linux/debugfs.h>
#include "debugfs.h"
static int p2m_dump_show(struct seq_file *m, void *v)
{
static const char * const level_name[] = { "top", "middle",
"entry", "abnormal", "error"};
#define TYPE_IDENTITY 0
#define TYPE_MISSING 1
#define TYPE_PFN 2
#define TYPE_UNKNOWN 3
static const char * const type_name[] = {
[TYPE_IDENTITY] = "identity",
[TYPE_MISSING] = "missing",
[TYPE_PFN] = "pfn",
[TYPE_UNKNOWN] = "abnormal"};
unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0;
unsigned int uninitialized_var(prev_level);
unsigned int uninitialized_var(prev_type);
if (!p2m_top)
return 0;
for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) {
unsigned topidx = p2m_top_index(pfn);
unsigned mididx = p2m_mid_index(pfn);
unsigned idx = p2m_index(pfn);
unsigned lvl, type;
lvl = 4;
type = TYPE_UNKNOWN;
if (p2m_top[topidx] == p2m_mid_missing) {
lvl = 0; type = TYPE_MISSING;
} else if (p2m_top[topidx] == NULL) {
lvl = 0; type = TYPE_UNKNOWN;
} else if (p2m_top[topidx][mididx] == NULL) {
lvl = 1; type = TYPE_UNKNOWN;
} else if (p2m_top[topidx][mididx] == p2m_identity) {
lvl = 1; type = TYPE_IDENTITY;
} else if (p2m_top[topidx][mididx] == p2m_missing) {
lvl = 1; type = TYPE_MISSING;
} else if (p2m_top[topidx][mididx][idx] == 0) {
lvl = 2; type = TYPE_UNKNOWN;
} else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) {
lvl = 2; type = TYPE_IDENTITY;
} else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) {
lvl = 2; type = TYPE_MISSING;
} else if (p2m_top[topidx][mididx][idx] == pfn) {
lvl = 2; type = TYPE_PFN;
} else if (p2m_top[topidx][mididx][idx] != pfn) {
lvl = 2; type = TYPE_PFN;
}
if (pfn == 0) {
prev_level = lvl;
prev_type = type;
}
if (pfn == MAX_DOMAIN_PAGES-1) {
lvl = 3;
type = TYPE_UNKNOWN;
}
if (prev_type != type) {
seq_printf(m, " [0x%lx->0x%lx] %s\n",
prev_pfn_type, pfn, type_name[prev_type]);
prev_pfn_type = pfn;
prev_type = type;
}
if (prev_level != lvl) {
seq_printf(m, " [0x%lx->0x%lx] level %s\n",
prev_pfn_level, pfn, level_name[prev_level]);
prev_pfn_level = pfn;
prev_level = lvl;
}
}
return 0;
#undef TYPE_IDENTITY
#undef TYPE_MISSING
#undef TYPE_PFN
#undef TYPE_UNKNOWN
}
static int p2m_dump_open(struct inode *inode, struct file *filp)
{
return single_open(filp, p2m_dump_show, NULL);
}
static const struct file_operations p2m_dump_fops = {
.open = p2m_dump_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static struct dentry *d_mmu_debug;
static int __init xen_p2m_debugfs(void)
{
struct dentry *d_xen = xen_init_debugfs();
if (d_xen == NULL)
return -ENOMEM;
d_mmu_debug = debugfs_create_dir("mmu", d_xen);
debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
return 0;
}
fs_initcall(xen_p2m_debugfs);
#endif /* CONFIG_XEN_DEBUG_FS */
+67
View File
@@ -0,0 +1,67 @@
/* Glue code to lib/swiotlb-xen.c */
#include <linux/dma-mapping.h>
#include <linux/pci.h>
#include <xen/swiotlb-xen.h>
#include <asm/xen/hypervisor.h>
#include <xen/xen.h>
#include <asm/iommu_table.h>
int xen_swiotlb __read_mostly;
static struct dma_map_ops xen_swiotlb_dma_ops = {
.mapping_error = xen_swiotlb_dma_mapping_error,
.alloc = xen_swiotlb_alloc_coherent,
.free = xen_swiotlb_free_coherent,
.sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
.sync_single_for_device = xen_swiotlb_sync_single_for_device,
.sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
.sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
.map_sg = xen_swiotlb_map_sg_attrs,
.unmap_sg = xen_swiotlb_unmap_sg_attrs,
.map_page = xen_swiotlb_map_page,
.unmap_page = xen_swiotlb_unmap_page,
.dma_supported = xen_swiotlb_dma_supported,
};
/*
* pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary
*
* This returns non-zero if we are forced to use xen_swiotlb (by the boot
* option).
*/
int __init pci_xen_swiotlb_detect(void)
{
/* If running as PV guest, either iommu=soft, or swiotlb=force will
* activate this IOMMU. If running as PV privileged, activate it
* irregardless.
*/
if ((xen_initial_domain() || swiotlb || swiotlb_force) &&
(xen_pv_domain()))
xen_swiotlb = 1;
/* If we are running under Xen, we MUST disable the native SWIOTLB.
* Don't worry about swiotlb_force flag activating the native, as
* the 'swiotlb' flag is the only one turning it on. */
if (xen_pv_domain())
swiotlb = 0;
return xen_swiotlb;
}
void __init pci_xen_swiotlb_init(void)
{
if (xen_swiotlb) {
xen_swiotlb_init(1);
dma_ops = &xen_swiotlb_dma_ops;
/* Make sure ACS will be enabled */
pci_request_acs();
}
}
IOMMU_INIT_FINISH(pci_xen_swiotlb_detect,
0,
pci_xen_swiotlb_init,
0);
+143
View File
@@ -0,0 +1,143 @@
/******************************************************************************
* platform-pci-unplug.c
*
* Xen platform PCI device driver
* Copyright (c) 2010, Citrix
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 59 Temple
* Place - Suite 330, Boston, MA 02111-1307 USA.
*
*/
#include <linux/init.h>
#include <linux/io.h>
#include <linux/module.h>
#include <xen/platform_pci.h>
#define XEN_PLATFORM_ERR_MAGIC -1
#define XEN_PLATFORM_ERR_PROTOCOL -2
#define XEN_PLATFORM_ERR_BLACKLIST -3
/* store the value of xen_emul_unplug after the unplug is done */
int xen_platform_pci_unplug;
EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
#ifdef CONFIG_XEN_PVHVM
static int xen_emul_unplug;
static int check_platform_magic(void)
{
short magic;
char protocol;
magic = inw(XEN_IOPORT_MAGIC);
if (magic != XEN_IOPORT_MAGIC_VAL) {
printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n");
return XEN_PLATFORM_ERR_MAGIC;
}
protocol = inb(XEN_IOPORT_PROTOVER);
printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n",
protocol);
switch (protocol) {
case 1:
outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM);
outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER);
if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) {
printk(KERN_ERR "Xen Platform: blacklisted by host\n");
return XEN_PLATFORM_ERR_BLACKLIST;
}
break;
default:
printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version");
return XEN_PLATFORM_ERR_PROTOCOL;
}
return 0;
}
void xen_unplug_emulated_devices(void)
{
int r;
/* user explicitly requested no unplug */
if (xen_emul_unplug & XEN_UNPLUG_NEVER)
return;
/* check the version of the xen platform PCI device */
r = check_platform_magic();
/* If the version matches enable the Xen platform PCI driver.
* Also enable the Xen platform PCI driver if the host does
* not support the unplug protocol (XEN_PLATFORM_ERR_MAGIC)
* but the user told us that unplugging is unnecessary. */
if (r && !(r == XEN_PLATFORM_ERR_MAGIC &&
(xen_emul_unplug & XEN_UNPLUG_UNNECESSARY)))
return;
/* Set the default value of xen_emul_unplug depending on whether or
* not the Xen PV frontends and the Xen platform PCI driver have
* been compiled for this kernel (modules or built-in are both OK). */
if (!xen_emul_unplug) {
if (xen_must_unplug_nics()) {
printk(KERN_INFO "Netfront and the Xen platform PCI driver have "
"been compiled for this kernel: unplug emulated NICs.\n");
xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
}
if (xen_must_unplug_disks()) {
printk(KERN_INFO "Blkfront and the Xen platform PCI driver have "
"been compiled for this kernel: unplug emulated disks.\n"
"You might have to change the root device\n"
"from /dev/hd[a-d] to /dev/xvd[a-d]\n"
"in your root= kernel command line option\n");
xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
}
}
/* Now unplug the emulated devices */
if (!(xen_emul_unplug & XEN_UNPLUG_UNNECESSARY))
outw(xen_emul_unplug, XEN_IOPORT_UNPLUG);
xen_platform_pci_unplug = xen_emul_unplug;
}
static int __init parse_xen_emul_unplug(char *arg)
{
char *p, *q;
int l;
for (p = arg; p; p = q) {
q = strchr(p, ',');
if (q) {
l = q - p;
q++;
} else {
l = strlen(p);
}
if (!strncmp(p, "all", l))
xen_emul_unplug |= XEN_UNPLUG_ALL;
else if (!strncmp(p, "ide-disks", l))
xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
else if (!strncmp(p, "aux-ide-disks", l))
xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS;
else if (!strncmp(p, "nics", l))
xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
else if (!strncmp(p, "unnecessary", l))
xen_emul_unplug |= XEN_UNPLUG_UNNECESSARY;
else if (!strncmp(p, "never", l))
xen_emul_unplug |= XEN_UNPLUG_NEVER;
else
printk(KERN_WARNING "unrecognised option '%s' "
"in parameter 'xen_emul_unplug'\n", p);
}
return 0;
}
early_param("xen_emul_unplug", parse_xen_emul_unplug);
#endif
+427
View File
@@ -0,0 +1,427 @@
/*
* Machine specific setup for xen
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>
#include <linux/cpufreq.h>
#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include "xen-ops.h"
#include "vdso.h"
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);
/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;
/*
* The maximum amount of extra memory compared to the base size. The
* main scaling factor is the size of struct page. At extreme ratios
* of base:extra, all the base memory can be filled with page
* structures for the extra memory, leaving no space for anything
* else.
*
* 10x seems like a reasonable balance between scaling flexibility and
* leaving a practically usable system.
*/
#define EXTRA_MEM_RATIO (10)
static void __init xen_add_extra_mem(u64 start, u64 size)
{
unsigned long pfn;
int i;
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
/* Add new region. */
if (xen_extra_mem[i].size == 0) {
xen_extra_mem[i].start = start;
xen_extra_mem[i].size = size;
break;
}
/* Append to existing region. */
if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
xen_extra_mem[i].size += size;
break;
}
}
if (i == XEN_EXTRA_MEM_MAX_REGIONS)
printk(KERN_WARNING "Warning: not enough extra memory regions\n");
memblock_reserve(start, size);
xen_max_p2m_pfn = PFN_DOWN(start + size);
for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++)
__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}
static unsigned long __init xen_release_chunk(unsigned long start,
unsigned long end)
{
struct xen_memory_reservation reservation = {
.address_bits = 0,
.extent_order = 0,
.domid = DOMID_SELF
};
unsigned long len = 0;
unsigned long pfn;
int ret;
for(pfn = start; pfn < end; pfn++) {
unsigned long mfn = pfn_to_mfn(pfn);
/* Make sure pfn exists to start with */
if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
continue;
set_xen_guest_handle(reservation.extent_start, &mfn);
reservation.nr_extents = 1;
ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
&reservation);
WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
if (ret == 1) {
__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
len++;
}
}
printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n",
start, end, len);
return len;
}
static unsigned long __init xen_set_identity_and_release(
const struct e820entry *list, size_t map_size, unsigned long nr_pages)
{
phys_addr_t start = 0;
unsigned long released = 0;
unsigned long identity = 0;
const struct e820entry *entry;
int i;
/*
* Combine non-RAM regions and gaps until a RAM region (or the
* end of the map) is reached, then set the 1:1 map and
* release the pages (if available) in those non-RAM regions.
*
* The combined non-RAM regions are rounded to a whole number
* of pages so any partial pages are accessible via the 1:1
* mapping. This is needed for some BIOSes that put (for
* example) the DMI tables in a reserved region that begins on
* a non-page boundary.
*/
for (i = 0, entry = list; i < map_size; i++, entry++) {
phys_addr_t end = entry->addr + entry->size;
if (entry->type == E820_RAM || i == map_size - 1) {
unsigned long start_pfn = PFN_DOWN(start);
unsigned long end_pfn = PFN_UP(end);
if (entry->type == E820_RAM)
end_pfn = PFN_UP(entry->addr);
if (start_pfn < end_pfn) {
if (start_pfn < nr_pages)
released += xen_release_chunk(
start_pfn, min(end_pfn, nr_pages));
identity += set_phys_range_identity(
start_pfn, end_pfn);
}
start = end;
}
}
printk(KERN_INFO "Released %lu pages of unused memory\n", released);
printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
return released;
}
static unsigned long __init xen_get_max_pages(void)
{
unsigned long max_pages = MAX_DOMAIN_PAGES;
domid_t domid = DOMID_SELF;
int ret;
/*
* For the initial domain we use the maximum reservation as
* the maximum page.
*
* For guest domains the current maximum reservation reflects
* the current maximum rather than the static maximum. In this
* case the e820 map provided to us will cover the static
* maximum region.
*/
if (xen_initial_domain()) {
ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
if (ret > 0)
max_pages = ret;
}
return min(max_pages, MAX_DOMAIN_PAGES);
}
static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
{
u64 end = start + size;
/* Align RAM regions to page boundaries. */
if (type == E820_RAM) {
start = PAGE_ALIGN(start);
end &= ~((u64)PAGE_SIZE - 1);
}
e820_add_region(start, end - start, type);
}
/**
* machine_specific_memory_setup - Hook for machine specific memory setup.
**/
char * __init xen_memory_setup(void)
{
static struct e820entry map[E820MAX] __initdata;
unsigned long max_pfn = xen_start_info->nr_pages;
unsigned long long mem_end;
int rc;
struct xen_memory_map memmap;
unsigned long max_pages;
unsigned long extra_pages = 0;
int i;
int op;
max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
mem_end = PFN_PHYS(max_pfn);
memmap.nr_entries = E820MAX;
set_xen_guest_handle(memmap.buffer, map);
op = xen_initial_domain() ?
XENMEM_machine_memory_map :
XENMEM_memory_map;
rc = HYPERVISOR_memory_op(op, &memmap);
if (rc == -ENOSYS) {
BUG_ON(xen_initial_domain());
memmap.nr_entries = 1;
map[0].addr = 0ULL;
map[0].size = mem_end;
/* 8MB slack (to balance backend allocations). */
map[0].size += 8ULL << 20;
map[0].type = E820_RAM;
rc = 0;
}
BUG_ON(rc);
/* Make sure the Xen-supplied memory map is well-ordered. */
sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
max_pages = xen_get_max_pages();
if (max_pages > max_pfn)
extra_pages += max_pages - max_pfn;
/*
* Set P2M for all non-RAM pages and E820 gaps to be identity
* type PFNs. Any RAM pages that would be made inaccesible by
* this are first released.
*/
xen_released_pages = xen_set_identity_and_release(
map, memmap.nr_entries, max_pfn);
extra_pages += xen_released_pages;
/*
* Clamp the amount of extra memory to a EXTRA_MEM_RATIO
* factor the base size. On non-highmem systems, the base
* size is the full initial memory allocation; on highmem it
* is limited to the max size of lowmem, so that it doesn't
* get completely filled.
*
* In principle there could be a problem in lowmem systems if
* the initial memory is also very large with respect to
* lowmem, but we won't try to deal with that here.
*/
extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
extra_pages);
i = 0;
while (i < memmap.nr_entries) {
u64 addr = map[i].addr;
u64 size = map[i].size;
u32 type = map[i].type;
if (type == E820_RAM) {
if (addr < mem_end) {
size = min(size, mem_end - addr);
} else if (extra_pages) {
size = min(size, (u64)extra_pages * PAGE_SIZE);
extra_pages -= size / PAGE_SIZE;
xen_add_extra_mem(addr, size);
} else
type = E820_UNUSABLE;
}
xen_align_and_add_e820_region(addr, size, type);
map[i].addr += size;
map[i].size -= size;
if (map[i].size == 0)
i++;
}
/*
* In domU, the ISA region is normal, usable memory, but we
* reserve ISA memory anyway because too many things poke
* about in there.
*/
e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
E820_RESERVED);
/*
* Reserve Xen bits:
* - mfn_list
* - xen_start_info
* See comment above "struct start_info" in <xen/interface/xen.h>
*/
memblock_reserve(__pa(xen_start_info->mfn_list),
xen_start_info->pt_base - xen_start_info->mfn_list);
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
return "Xen";
}
/*
* Set the bit indicating "nosegneg" library variants should be used.
* We only need to bother in pure 32-bit mode; compat 32-bit processes
* can have un-truncated segments, so wrapping around is allowed.
*/
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
u32 *mask;
mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}
static int __cpuinit register_callback(unsigned type, const void *func)
{
struct callback_register callback = {
.type = type,
.address = XEN_CALLBACK(__KERNEL_CS, func),
.flags = CALLBACKF_mask_events,
};
return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}
void __cpuinit xen_enable_sysenter(void)
{
int ret;
unsigned sysenter_feature;
#ifdef CONFIG_X86_32
sysenter_feature = X86_FEATURE_SEP;
#else
sysenter_feature = X86_FEATURE_SYSENTER32;
#endif
if (!boot_cpu_has(sysenter_feature))
return;
ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
if(ret != 0)
setup_clear_cpu_cap(sysenter_feature);
}
void __cpuinit xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
int ret;
ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
if (ret != 0) {
printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
/* Pretty fatal; 64-bit userspace has no other
mechanism for syscalls. */
}
if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
ret = register_callback(CALLBACKTYPE_syscall32,
xen_syscall32_target);
if (ret != 0)
setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
}
#endif /* CONFIG_X86_64 */
}
void __init xen_arch_setup(void)
{
xen_panic_handler_init();
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
if (!xen_feature(XENFEAT_auto_translated_physmap))
HYPERVISOR_vm_assist(VMASST_CMD_enable,
VMASST_TYPE_pae_extended_cr3);
if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
BUG();
xen_enable_sysenter();
xen_enable_syscall();
#ifdef CONFIG_ACPI
if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
disable_acpi();
}
#endif
memcpy(boot_command_line, xen_start_info->cmd_line,
MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
/* Set up idle, making sure it calls safe_halt() pvop */
#ifdef CONFIG_X86_32
boot_cpu_data.hlt_works_ok = 1;
#endif
disable_cpuidle();
disable_cpufreq();
WARN_ON(set_pm_idle_to_default());
fiddle_vdso();
}
+592
View File
@@ -0,0 +1,592 @@
/*
* Xen SMP support
*
* This file implements the Xen versions of smp_ops. SMP under Xen is
* very straightforward. Bringing a CPU up is simply a matter of
* loading its initial context and setting it running.
*
* IPIs are handled through the Xen event mechanism.
*
* Because virtual CPUs can be scheduled onto any real CPU, there's no
* useful topology information for the kernel to make use of. As a
* result, all CPUs are treated as if they're single-core and
* single-threaded.
*/
#include <linux/sched.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/cpu.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include <asm/xen/interface.h>
#include <asm/xen/hypercall.h>
#include <xen/xen.h>
#include <xen/page.h>
#include <xen/events.h>
#include <xen/hvc-console.h>
#include "xen-ops.h"
#include "mmu.h"
cpumask_var_t xen_cpu_initialized_map;
static DEFINE_PER_CPU(int, xen_resched_irq);
static DEFINE_PER_CPU(int, xen_callfunc_irq);
static DEFINE_PER_CPU(int, xen_callfuncsingle_irq);
static DEFINE_PER_CPU(int, xen_debug_irq) = -1;
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
/*
* Reschedule call back.
*/
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
inc_irq_stat(irq_resched_count);
scheduler_ipi();
return IRQ_HANDLED;
}
static void __cpuinit cpu_bringup(void)
{
int cpu;
cpu_init();
touch_softlockup_watchdog();
preempt_disable();
xen_enable_sysenter();
xen_enable_syscall();
cpu = smp_processor_id();
smp_store_cpu_info(cpu);
cpu_data(cpu).x86_max_cores = 1;
set_cpu_sibling_map(cpu);
xen_setup_cpu_clockevents();
notify_cpu_starting(cpu);
ipi_call_lock();
set_cpu_online(cpu, true);
ipi_call_unlock();
this_cpu_write(cpu_state, CPU_ONLINE);
wmb();
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
wmb(); /* make sure everything is out */
}
static void __cpuinit cpu_bringup_and_idle(void)
{
cpu_bringup();
cpu_idle();
}
static int xen_smp_intr_init(unsigned int cpu)
{
int rc;
const char *resched_name, *callfunc_name, *debug_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
cpu,
xen_reschedule_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
resched_name,
NULL);
if (rc < 0)
goto fail;
per_cpu(xen_resched_irq, cpu) = rc;
callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
cpu,
xen_call_function_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
callfunc_name,
NULL);
if (rc < 0)
goto fail;
per_cpu(xen_callfunc_irq, cpu) = rc;
debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt,
IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING,
debug_name, NULL);
if (rc < 0)
goto fail;
per_cpu(xen_debug_irq, cpu) = rc;
callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
cpu,
xen_call_function_single_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
callfunc_name,
NULL);
if (rc < 0)
goto fail;
per_cpu(xen_callfuncsingle_irq, cpu) = rc;
return 0;
fail:
if (per_cpu(xen_resched_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
if (per_cpu(xen_callfunc_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
if (per_cpu(xen_debug_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu),
NULL);
return rc;
}
static void __init xen_fill_possible_map(void)
{
int i, rc;
if (xen_initial_domain())
return;
for (i = 0; i < nr_cpu_ids; i++) {
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
if (rc >= 0) {
num_processors++;
set_cpu_possible(i, true);
}
}
}
static void __init xen_filter_cpu_maps(void)
{
int i, rc;
unsigned int subtract = 0;
if (!xen_initial_domain())
return;
num_processors = 0;
disabled_cpus = 0;
for (i = 0; i < nr_cpu_ids; i++) {
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
if (rc >= 0) {
num_processors++;
set_cpu_possible(i, true);
} else {
set_cpu_possible(i, false);
set_cpu_present(i, false);
subtract++;
}
}
#ifdef CONFIG_HOTPLUG_CPU
/* This is akin to using 'nr_cpus' on the Linux command line.
* Which is OK as when we use 'dom0_max_vcpus=X' we can only
* have up to X, while nr_cpu_ids is greater than X. This
* normally is not a problem, except when CPU hotplugging
* is involved and then there might be more than X CPUs
* in the guest - which will not work as there is no
* hypercall to expand the max number of VCPUs an already
* running guest has. So cap it up to X. */
if (subtract)
nr_cpu_ids = nr_cpu_ids - subtract;
#endif
}
static void __init xen_smp_prepare_boot_cpu(void)
{
BUG_ON(smp_processor_id() != 0);
native_smp_prepare_boot_cpu();
/* We've switched to the "real" per-cpu gdt, so make sure the
old memory can be recycled */
make_lowmem_page_readwrite(xen_initial_gdt);
xen_filter_cpu_maps();
xen_setup_vcpu_info_placement();
}
static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
unsigned int i;
if (skip_ioapic_setup) {
char *m = (max_cpus == 0) ?
"The nosmp parameter is incompatible with Xen; " \
"use Xen dom0_max_vcpus=1 parameter" :
"The noapic parameter is incompatible with Xen";
xen_raw_printk(m);
panic(m);
}
xen_init_lock_cpu(0);
smp_store_cpu_info(0);
cpu_data(0).x86_max_cores = 1;
for_each_possible_cpu(i) {
zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
}
set_cpu_sibling_map(0);
if (xen_smp_intr_init(0))
BUG();
if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL))
panic("could not allocate xen_cpu_initialized_map\n");
cpumask_copy(xen_cpu_initialized_map, cpumask_of(0));
/* Restrict the possible_map according to max_cpus. */
while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
continue;
set_cpu_possible(cpu, false);
}
for_each_possible_cpu (cpu) {
struct task_struct *idle;
if (cpu == 0)
continue;
idle = fork_idle(cpu);
if (IS_ERR(idle))
panic("failed fork for CPU %d", cpu);
set_cpu_present(cpu, true);
}
}
static int __cpuinit
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
struct vcpu_guest_context *ctxt;
struct desc_struct *gdt;
unsigned long gdt_mfn;
if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
return 0;
ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
if (ctxt == NULL)
return -ENOMEM;
gdt = get_cpu_gdt_table(cpu);
ctxt->flags = VGCF_IN_KERNEL;
ctxt->user_regs.ds = __USER_DS;
ctxt->user_regs.es = __USER_DS;
ctxt->user_regs.ss = __KERNEL_DS;
#ifdef CONFIG_X86_32
ctxt->user_regs.fs = __KERNEL_PERCPU;
ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
#else
ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
xen_copy_trap_info(ctxt->trap_ctxt);
ctxt->ldt_ents = 0;
BUG_ON((unsigned long)gdt & ~PAGE_MASK);
gdt_mfn = arbitrary_virt_to_mfn(gdt);
make_lowmem_page_readonly(gdt);
make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
ctxt->gdt_frames[0] = gdt_mfn;
ctxt->gdt_ents = GDT_ENTRIES;
ctxt->user_regs.cs = __KERNEL_CS;
ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
ctxt->kernel_ss = __KERNEL_DS;
ctxt->kernel_sp = idle->thread.sp0;
#ifdef CONFIG_X86_32
ctxt->event_callback_cs = __KERNEL_CS;
ctxt->failsafe_callback_cs = __KERNEL_CS;
#endif
ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
BUG();
kfree(ctxt);
return 0;
}
static int __cpuinit xen_cpu_up(unsigned int cpu)
{
struct task_struct *idle = idle_task(cpu);
int rc;
per_cpu(current_task, cpu) = idle;
#ifdef CONFIG_X86_32
irq_ctx_init(cpu);
#else
clear_tsk_thread_flag(idle, TIF_FORK);
per_cpu(kernel_stack, cpu) =
(unsigned long)task_stack_page(idle) -
KERNEL_STACK_OFFSET + THREAD_SIZE;
#endif
xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
xen_init_lock_cpu(cpu);
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* make sure interrupts start blocked */
per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
rc = cpu_initialize_context(cpu, idle);
if (rc)
return rc;
if (num_online_cpus() == 1)
alternatives_smp_switch(1);
rc = xen_smp_intr_init(cpu);
if (rc)
return rc;
rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
BUG_ON(rc);
while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
barrier();
}
return 0;
}
static void xen_smp_cpus_done(unsigned int max_cpus)
{
}
#ifdef CONFIG_HOTPLUG_CPU
static int xen_cpu_disable(void)
{
unsigned int cpu = smp_processor_id();
if (cpu == 0)
return -EBUSY;
cpu_disable_common();
load_cr3(swapper_pg_dir);
return 0;
}
static void xen_cpu_die(unsigned int cpu)
{
while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
current->state = TASK_UNINTERRUPTIBLE;
schedule_timeout(HZ/10);
}
unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
xen_uninit_lock_cpu(cpu);
xen_teardown_timer(cpu);
if (num_online_cpus() == 1)
alternatives_smp_switch(0);
}
static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */
{
play_dead_common();
HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
cpu_bringup();
/*
* Balance out the preempt calls - as we are running in cpu_idle
* loop which has been called at bootup from cpu_bringup_and_idle.
* The cpucpu_bringup_and_idle called cpu_bringup which made a
* preempt_disable() So this preempt_enable will balance it out.
*/
preempt_enable();
}
#else /* !CONFIG_HOTPLUG_CPU */
static int xen_cpu_disable(void)
{
return -ENOSYS;
}
static void xen_cpu_die(unsigned int cpu)
{
BUG();
}
static void xen_play_dead(void)
{
BUG();
}
#endif
static void stop_self(void *v)
{
int cpu = smp_processor_id();
/* make sure we're not pinning something down */
load_cr3(swapper_pg_dir);
/* should set up a minimal gdt */
set_cpu_online(cpu, false);
HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
BUG();
}
static void xen_stop_other_cpus(int wait)
{
smp_call_function(stop_self, NULL, wait);
}
static void xen_smp_send_reschedule(int cpu)
{
xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
}
static void xen_send_IPI_mask(const struct cpumask *mask,
enum ipi_vector vector)
{
unsigned cpu;
for_each_cpu_and(cpu, mask, cpu_online_mask)
xen_send_IPI_one(cpu, vector);
}
static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
{
int cpu;
xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
/* Make sure other vcpus get a chance to run if they need to. */
for_each_cpu(cpu, mask) {
if (xen_vcpu_stolen(cpu)) {
HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
break;
}
}
}
static void xen_smp_send_call_function_single_ipi(int cpu)
{
xen_send_IPI_mask(cpumask_of(cpu),
XEN_CALL_FUNCTION_SINGLE_VECTOR);
}
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_interrupt();
inc_irq_stat(irq_call_count);
irq_exit();
return IRQ_HANDLED;
}
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_single_interrupt();
inc_irq_stat(irq_call_count);
irq_exit();
return IRQ_HANDLED;
}
static const struct smp_ops xen_smp_ops __initconst = {
.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
.smp_prepare_cpus = xen_smp_prepare_cpus,
.smp_cpus_done = xen_smp_cpus_done,
.cpu_up = xen_cpu_up,
.cpu_die = xen_cpu_die,
.cpu_disable = xen_cpu_disable,
.play_dead = xen_play_dead,
.stop_other_cpus = xen_stop_other_cpus,
.smp_send_reschedule = xen_smp_send_reschedule,
.send_call_func_ipi = xen_smp_send_call_function_ipi,
.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
};
void __init xen_smp_init(void)
{
smp_ops = xen_smp_ops;
xen_fill_possible_map();
xen_init_spinlocks();
}
static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
{
native_smp_prepare_cpus(max_cpus);
WARN_ON(xen_smp_intr_init(0));
xen_init_lock_cpu(0);
}
static int __cpuinit xen_hvm_cpu_up(unsigned int cpu)
{
int rc;
rc = native_cpu_up(cpu);
WARN_ON (xen_smp_intr_init(cpu));
return rc;
}
static void xen_hvm_cpu_die(unsigned int cpu)
{
unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
native_cpu_die(cpu);
}
void __init xen_hvm_smp_init(void)
{
if (!xen_have_vector_callback)
return;
smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
smp_ops.cpu_up = xen_hvm_cpu_up;
smp_ops.cpu_die = xen_hvm_cpu_die;
smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
}
+454
View File
@@ -0,0 +1,454 @@
/*
* Split spinlock implementation out into its own file, so it can be
* compiled in a FTRACE-compatible way.
*/
#include <linux/kernel_stat.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/log2.h>
#include <linux/gfp.h>
#include <asm/paravirt.h>
#include <xen/interface/xen.h>
#include <xen/events.h>
#include "xen-ops.h"
#include "debugfs.h"
#ifdef CONFIG_XEN_DEBUG_FS
static struct xen_spinlock_stats
{
u64 taken;
u32 taken_slow;
u32 taken_slow_nested;
u32 taken_slow_pickup;
u32 taken_slow_spurious;
u32 taken_slow_irqenable;
u64 released;
u32 released_slow;
u32 released_slow_kicked;
#define HISTO_BUCKETS 30
u32 histo_spin_total[HISTO_BUCKETS+1];
u32 histo_spin_spinning[HISTO_BUCKETS+1];
u32 histo_spin_blocked[HISTO_BUCKETS+1];
u64 time_total;
u64 time_spinning;
u64 time_blocked;
} spinlock_stats;
static u8 zero_stats;
static unsigned lock_timeout = 1 << 10;
#define TIMEOUT lock_timeout
static inline void check_zero(void)
{
if (unlikely(zero_stats)) {
memset(&spinlock_stats, 0, sizeof(spinlock_stats));
zero_stats = 0;
}
}
#define ADD_STATS(elem, val) \
do { check_zero(); spinlock_stats.elem += (val); } while(0)
static inline u64 spin_time_start(void)
{
return xen_clocksource_read();
}
static void __spin_time_accum(u64 delta, u32 *array)
{
unsigned index = ilog2(delta);
check_zero();
if (index < HISTO_BUCKETS)
array[index]++;
else
array[HISTO_BUCKETS]++;
}
static inline void spin_time_accum_spinning(u64 start)
{
u32 delta = xen_clocksource_read() - start;
__spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
spinlock_stats.time_spinning += delta;
}
static inline void spin_time_accum_total(u64 start)
{
u32 delta = xen_clocksource_read() - start;
__spin_time_accum(delta, spinlock_stats.histo_spin_total);
spinlock_stats.time_total += delta;
}
static inline void spin_time_accum_blocked(u64 start)
{
u32 delta = xen_clocksource_read() - start;
__spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
spinlock_stats.time_blocked += delta;
}
#else /* !CONFIG_XEN_DEBUG_FS */
#define TIMEOUT (1 << 10)
#define ADD_STATS(elem, val) do { (void)(val); } while(0)
static inline u64 spin_time_start(void)
{
return 0;
}
static inline void spin_time_accum_total(u64 start)
{
}
static inline void spin_time_accum_spinning(u64 start)
{
}
static inline void spin_time_accum_blocked(u64 start)
{
}
#endif /* CONFIG_XEN_DEBUG_FS */
/*
* Size struct xen_spinlock so it's the same as arch_spinlock_t.
*/
#if NR_CPUS < 256
typedef u8 xen_spinners_t;
# define inc_spinners(xl) \
asm(LOCK_PREFIX " incb %0" : "+m" ((xl)->spinners) : : "memory");
# define dec_spinners(xl) \
asm(LOCK_PREFIX " decb %0" : "+m" ((xl)->spinners) : : "memory");
#else
typedef u16 xen_spinners_t;
# define inc_spinners(xl) \
asm(LOCK_PREFIX " incw %0" : "+m" ((xl)->spinners) : : "memory");
# define dec_spinners(xl) \
asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory");
#endif
struct xen_spinlock {
unsigned char lock; /* 0 -> free; 1 -> locked */
xen_spinners_t spinners; /* count of waiting cpus */
};
static int xen_spin_is_locked(struct arch_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
return xl->lock != 0;
}
static int xen_spin_is_contended(struct arch_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
/* Not strictly true; this is only the count of contended
lock-takers entering the slow path. */
return xl->spinners != 0;
}
static int xen_spin_trylock(struct arch_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
u8 old = 1;
asm("xchgb %b0,%1"
: "+q" (old), "+m" (xl->lock) : : "memory");
return old == 0;
}
static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
/*
* Mark a cpu as interested in a lock. Returns the CPU's previous
* lock of interest, in case we got preempted by an interrupt.
*/
static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
{
struct xen_spinlock *prev;
prev = __this_cpu_read(lock_spinners);
__this_cpu_write(lock_spinners, xl);
wmb(); /* set lock of interest before count */
inc_spinners(xl);
return prev;
}
/*
* Mark a cpu as no longer interested in a lock. Restores previous
* lock of interest (NULL for none).
*/
static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
{
dec_spinners(xl);
wmb(); /* decrement count before restoring lock */
__this_cpu_write(lock_spinners, prev);
}
static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
struct xen_spinlock *prev;
int irq = __this_cpu_read(lock_kicker_irq);
int ret;
u64 start;
/* If kicker interrupts not initialized yet, just spin */
if (irq == -1)
return 0;
start = spin_time_start();
/* announce we're spinning */
prev = spinning_lock(xl);
ADD_STATS(taken_slow, 1);
ADD_STATS(taken_slow_nested, prev != NULL);
do {
unsigned long flags;
/* clear pending */
xen_clear_irq_pending(irq);
/* check again make sure it didn't become free while
we weren't looking */
ret = xen_spin_trylock(lock);
if (ret) {
ADD_STATS(taken_slow_pickup, 1);
/*
* If we interrupted another spinlock while it
* was blocking, make sure it doesn't block
* without rechecking the lock.
*/
if (prev != NULL)
xen_set_irq_pending(irq);
goto out;
}
flags = arch_local_save_flags();
if (irq_enable) {
ADD_STATS(taken_slow_irqenable, 1);
raw_local_irq_enable();
}
/*
* Block until irq becomes pending. If we're
* interrupted at this point (after the trylock but
* before entering the block), then the nested lock
* handler guarantees that the irq will be left
* pending if there's any chance the lock became free;
* xen_poll_irq() returns immediately if the irq is
* pending.
*/
xen_poll_irq(irq);
raw_local_irq_restore(flags);
ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
} while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
out:
unspinning_lock(xl, prev);
spin_time_accum_blocked(start);
return ret;
}
static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
unsigned timeout;
u8 oldval;
u64 start_spin;
ADD_STATS(taken, 1);
start_spin = spin_time_start();
do {
u64 start_spin_fast = spin_time_start();
timeout = TIMEOUT;
asm("1: xchgb %1,%0\n"
" testb %1,%1\n"
" jz 3f\n"
"2: rep;nop\n"
" cmpb $0,%0\n"
" je 1b\n"
" dec %2\n"
" jnz 2b\n"
"3:\n"
: "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
: "1" (1)
: "memory");
spin_time_accum_spinning(start_spin_fast);
} while (unlikely(oldval != 0 &&
(TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
spin_time_accum_total(start_spin);
}
static void xen_spin_lock(struct arch_spinlock *lock)
{
__xen_spin_lock(lock, false);
}
static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags)
{
__xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
}
static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
{
int cpu;
ADD_STATS(released_slow, 1);
for_each_online_cpu(cpu) {
/* XXX should mix up next cpu selection */
if (per_cpu(lock_spinners, cpu) == xl) {
ADD_STATS(released_slow_kicked, 1);
xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
break;
}
}
}
static void xen_spin_unlock(struct arch_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
ADD_STATS(released, 1);
smp_wmb(); /* make sure no writes get moved after unlock */
xl->lock = 0; /* release lock */
/*
* Make sure unlock happens before checking for waiting
* spinners. We need a strong barrier to enforce the
* write-read ordering to different memory locations, as the
* CPU makes no implied guarantees about their ordering.
*/
mb();
if (unlikely(xl->spinners))
xen_spin_unlock_slow(xl);
}
static irqreturn_t dummy_handler(int irq, void *dev_id)
{
BUG();
return IRQ_HANDLED;
}
void __cpuinit xen_init_lock_cpu(int cpu)
{
int irq;
const char *name;
name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
cpu,
dummy_handler,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
name,
NULL);
if (irq >= 0) {
disable_irq(irq); /* make sure it's never delivered */
per_cpu(lock_kicker_irq, cpu) = irq;
}
printk("cpu %d spinlock event irq %d\n", cpu, irq);
}
void xen_uninit_lock_cpu(int cpu)
{
unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
}
void __init xen_init_spinlocks(void)
{
BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t));
pv_lock_ops.spin_is_locked = xen_spin_is_locked;
pv_lock_ops.spin_is_contended = xen_spin_is_contended;
pv_lock_ops.spin_lock = xen_spin_lock;
pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
pv_lock_ops.spin_trylock = xen_spin_trylock;
pv_lock_ops.spin_unlock = xen_spin_unlock;
}
#ifdef CONFIG_XEN_DEBUG_FS
static struct dentry *d_spin_debug;
static int __init xen_spinlock_debugfs(void)
{
struct dentry *d_xen = xen_init_debugfs();
if (d_xen == NULL)
return -ENOMEM;
d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
debugfs_create_u32("taken_slow", 0444, d_spin_debug,
&spinlock_stats.taken_slow);
debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
&spinlock_stats.taken_slow_nested);
debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
&spinlock_stats.taken_slow_pickup);
debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
&spinlock_stats.taken_slow_spurious);
debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
&spinlock_stats.taken_slow_irqenable);
debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
debugfs_create_u32("released_slow", 0444, d_spin_debug,
&spinlock_stats.released_slow);
debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
&spinlock_stats.released_slow_kicked);
debugfs_create_u64("time_spinning", 0444, d_spin_debug,
&spinlock_stats.time_spinning);
debugfs_create_u64("time_blocked", 0444, d_spin_debug,
&spinlock_stats.time_blocked);
debugfs_create_u64("time_total", 0444, d_spin_debug,
&spinlock_stats.time_total);
xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
return 0;
}
fs_initcall(xen_spinlock_debugfs);
#endif /* CONFIG_XEN_DEBUG_FS */
+80
View File
@@ -0,0 +1,80 @@
#include <linux/types.h>
#include <linux/clockchips.h>
#include <xen/interface/xen.h>
#include <xen/grant_table.h>
#include <xen/events.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/page.h>
#include <asm/fixmap.h>
#include "xen-ops.h"
#include "mmu.h"
void xen_arch_pre_suspend(void)
{
xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
xen_start_info->console.domU.mfn =
mfn_to_pfn(xen_start_info->console.domU.mfn);
BUG_ON(!irqs_disabled());
HYPERVISOR_shared_info = &xen_dummy_shared_info;
if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
__pte_ma(0), 0))
BUG();
}
void xen_arch_hvm_post_suspend(int suspend_cancelled)
{
#ifdef CONFIG_XEN_PVHVM
int cpu;
xen_hvm_init_shared_info();
xen_callback_vector();
xen_unplug_emulated_devices();
if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
for_each_online_cpu(cpu) {
xen_setup_runstate_info(cpu);
}
}
#endif
}
void xen_arch_post_suspend(int suspend_cancelled)
{
xen_build_mfn_list_list();
xen_setup_shared_info();
if (suspend_cancelled) {
xen_start_info->store_mfn =
pfn_to_mfn(xen_start_info->store_mfn);
xen_start_info->console.domU.mfn =
pfn_to_mfn(xen_start_info->console.domU.mfn);
} else {
#ifdef CONFIG_SMP
BUG_ON(xen_cpu_initialized_map == NULL);
cpumask_copy(xen_cpu_initialized_map, cpu_online_mask);
#endif
xen_vcpu_restore();
}
}
static void xen_vcpu_notify_restore(void *data)
{
unsigned long reason = (unsigned long)data;
/* Boot processor notified via generic timekeeping_resume() */
if ( smp_processor_id() == 0)
return;
clockevents_notify(reason, NULL);
}
void xen_arch_resume(void)
{
on_each_cpu(xen_vcpu_notify_restore,
(void *)CLOCK_EVT_NOTIFY_RESUME, 1);
}
+525
View File
@@ -0,0 +1,525 @@
/*
* Xen time implementation.
*
* This is implemented in terms of a clocksource driver which uses
* the hypervisor clock as a nanosecond timebase, and a clockevent
* driver which uses the hypervisor's timer mechanism.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
#include <linux/math64.h>
#include <linux/gfp.h>
#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/events.h>
#include <xen/features.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include "xen-ops.h"
/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP 100000
#define NS_PER_TICK (1000000000LL / HZ)
/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, xen_residual_stolen);
static DEFINE_PER_CPU(u64, xen_residual_blocked);
/* return an consistent snapshot of 64-bit time/counter value */
static u64 get64(const u64 *p)
{
u64 ret;
if (BITS_PER_LONG < 64) {
u32 *p32 = (u32 *)p;
u32 h, l;
/*
* Read high then low, and then make sure high is
* still the same; this will only loop if low wraps
* and carries into high.
* XXX some clean way to make this endian-proof?
*/
do {
h = p32[1];
barrier();
l = p32[0];
barrier();
} while (p32[1] != h);
ret = (((u64)h) << 32) | l;
} else
ret = *p;
return ret;
}
/*
* Runstate accounting
*/
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
u64 state_time;
struct vcpu_runstate_info *state;
BUG_ON(preemptible());
state = &__get_cpu_var(xen_runstate);
/*
* The runstate info is always updated by the hypervisor on
* the current CPU, so there's no need to use anything
* stronger than a compiler barrier when fetching it.
*/
do {
state_time = get64(&state->state_entry_time);
barrier();
*res = *state;
barrier();
} while (get64(&state->state_entry_time) != state_time);
}
/* return true when a vcpu could run but has no real cpu to run on */
bool xen_vcpu_stolen(int vcpu)
{
return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
}
void xen_setup_runstate_info(int cpu)
{
struct vcpu_register_runstate_memory_area area;
area.addr.v = &per_cpu(xen_runstate, cpu);
if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
cpu, &area))
BUG();
}
static void do_stolen_accounting(void)
{
struct vcpu_runstate_info state;
struct vcpu_runstate_info *snap;
s64 blocked, runnable, offline, stolen;
cputime_t ticks;
get_runstate_snapshot(&state);
WARN_ON(state.state != RUNSTATE_running);
snap = &__get_cpu_var(xen_runstate_snapshot);
/* work out how much time the VCPU has not been runn*ing* */
blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
*snap = state;
/* Add the appropriate number of ticks of stolen time,
including any left-overs from last time. */
stolen = runnable + offline + __this_cpu_read(xen_residual_stolen);
if (stolen < 0)
stolen = 0;
ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
__this_cpu_write(xen_residual_stolen, stolen);
account_steal_ticks(ticks);
/* Add the appropriate number of ticks of blocked time,
including any left-overs from last time. */
blocked += __this_cpu_read(xen_residual_blocked);
if (blocked < 0)
blocked = 0;
ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
__this_cpu_write(xen_residual_blocked, blocked);
account_idle_ticks(ticks);
}
/* Get the TSC speed from Xen */
static unsigned long xen_tsc_khz(void)
{
struct pvclock_vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
return pvclock_tsc_khz(info);
}
cycle_t xen_clocksource_read(void)
{
struct pvclock_vcpu_time_info *src;
cycle_t ret;
preempt_disable_notrace();
src = &__get_cpu_var(xen_vcpu)->time;
ret = pvclock_clocksource_read(src);
preempt_enable_notrace();
return ret;
}
static cycle_t xen_clocksource_get_cycles(struct clocksource *cs)
{
return xen_clocksource_read();
}
static void xen_read_wallclock(struct timespec *ts)
{
struct shared_info *s = HYPERVISOR_shared_info;
struct pvclock_wall_clock *wall_clock = &(s->wc);
struct pvclock_vcpu_time_info *vcpu_time;
vcpu_time = &get_cpu_var(xen_vcpu)->time;
pvclock_read_wallclock(wall_clock, vcpu_time, ts);
put_cpu_var(xen_vcpu);
}
static unsigned long xen_get_wallclock(void)
{
struct timespec ts;
xen_read_wallclock(&ts);
return ts.tv_sec;
}
static int xen_set_wallclock(unsigned long now)
{
struct xen_platform_op op;
int rc;
/* do nothing for domU */
if (!xen_initial_domain())
return -1;
op.cmd = XENPF_settime;
op.u.settime.secs = now;
op.u.settime.nsecs = 0;
op.u.settime.system_time = xen_clocksource_read();
rc = HYPERVISOR_dom0_op(&op);
WARN(rc != 0, "XENPF_settime failed: now=%ld\n", now);
return rc;
}
static struct clocksource xen_clocksource __read_mostly = {
.name = "xen",
.rating = 400,
.read = xen_clocksource_get_cycles,
.mask = ~0,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
/*
Xen clockevent implementation
Xen has two clockevent implementations:
The old timer_op one works with all released versions of Xen prior
to version 3.0.4. This version of the hypervisor provides a
single-shot timer with nanosecond resolution. However, sharing the
same event channel is a 100Hz tick which is delivered while the
vcpu is running. We don't care about or use this tick, but it will
cause the core time code to think the timer fired too soon, and
will end up resetting it each time. It could be filtered, but
doing so has complications when the ktime clocksource is not yet
the xen clocksource (ie, at boot time).
The new vcpu_op-based timer interface allows the tick timer period
to be changed or turned off. The tick timer is not useful as a
periodic timer because events are only delivered to running vcpus.
The one-shot timer can report when a timeout is in the past, so
set_next_event is capable of returning -ETIME when appropriate.
This interface is used when available.
*/
/*
Get a hypervisor absolute time. In theory we could maintain an
offset between the kernel's time and the hypervisor's time, and
apply that to a kernel's absolute timeout. Unfortunately the
hypervisor and kernel times can drift even if the kernel is using
the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
return xen_clocksource_read() + delta;
}
static void xen_timerop_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt)
{
switch (mode) {
case CLOCK_EVT_MODE_PERIODIC:
/* unsupported */
WARN_ON(1);
break;
case CLOCK_EVT_MODE_ONESHOT:
case CLOCK_EVT_MODE_RESUME:
break;
case CLOCK_EVT_MODE_UNUSED:
case CLOCK_EVT_MODE_SHUTDOWN:
HYPERVISOR_set_timer_op(0); /* cancel timeout */
break;
}
}
static int xen_timerop_set_next_event(unsigned long delta,
struct clock_event_device *evt)
{
WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
BUG();
/* We may have missed the deadline, but there's no real way of
knowing for sure. If the event was in the past, then we'll
get an immediate interrupt. */
return 0;
}
static const struct clock_event_device xen_timerop_clockevent = {
.name = "xen",
.features = CLOCK_EVT_FEAT_ONESHOT,
.max_delta_ns = 0xffffffff,
.min_delta_ns = TIMER_SLOP,
.mult = 1,
.shift = 0,
.rating = 500,
.set_mode = xen_timerop_set_mode,
.set_next_event = xen_timerop_set_next_event,
};
static void xen_vcpuop_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt)
{
int cpu = smp_processor_id();
switch (mode) {
case CLOCK_EVT_MODE_PERIODIC:
WARN_ON(1); /* unsupported */
break;
case CLOCK_EVT_MODE_ONESHOT:
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
BUG();
break;
case CLOCK_EVT_MODE_UNUSED:
case CLOCK_EVT_MODE_SHUTDOWN:
if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
BUG();
break;
case CLOCK_EVT_MODE_RESUME:
break;
}
}
static int xen_vcpuop_set_next_event(unsigned long delta,
struct clock_event_device *evt)
{
int cpu = smp_processor_id();
struct vcpu_set_singleshot_timer single;
int ret;
WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
single.timeout_abs_ns = get_abs_timeout(delta);
single.flags = VCPU_SSHOTTMR_future;
ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
BUG_ON(ret != 0 && ret != -ETIME);
return ret;
}
static const struct clock_event_device xen_vcpuop_clockevent = {
.name = "xen",
.features = CLOCK_EVT_FEAT_ONESHOT,
.max_delta_ns = 0xffffffff,
.min_delta_ns = TIMER_SLOP,
.mult = 1,
.shift = 0,
.rating = 500,
.set_mode = xen_vcpuop_set_mode,
.set_next_event = xen_vcpuop_set_next_event,
};
static const struct clock_event_device *xen_clockevent =
&xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
irqreturn_t ret;
ret = IRQ_NONE;
if (evt->event_handler) {
evt->event_handler(evt);
ret = IRQ_HANDLED;
}
do_stolen_accounting();
return ret;
}
void xen_setup_timer(int cpu)
{
const char *name;
struct clock_event_device *evt;
int irq;
printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
name = kasprintf(GFP_KERNEL, "timer%d", cpu);
if (!name)
name = "<timer kasprintf failed>";
irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
IRQF_DISABLED|IRQF_PERCPU|
IRQF_NOBALANCING|IRQF_TIMER|
IRQF_FORCE_RESUME,
name, NULL);
evt = &per_cpu(xen_clock_events, cpu);
memcpy(evt, xen_clockevent, sizeof(*evt));
evt->cpumask = cpumask_of(cpu);
evt->irq = irq;
}
void xen_teardown_timer(int cpu)
{
struct clock_event_device *evt;
BUG_ON(cpu == 0);
evt = &per_cpu(xen_clock_events, cpu);
unbind_from_irqhandler(evt->irq, NULL);
}
void xen_setup_cpu_clockevents(void)
{
BUG_ON(preemptible());
clockevents_register_device(&__get_cpu_var(xen_clock_events));
}
void xen_timer_resume(void)
{
int cpu;
pvclock_resume();
if (xen_clockevent != &xen_vcpuop_clockevent)
return;
for_each_online_cpu(cpu) {
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
BUG();
}
}
static const struct pv_time_ops xen_time_ops __initconst = {
.sched_clock = xen_clocksource_read,
};
static void __init xen_time_init(void)
{
int cpu = smp_processor_id();
struct timespec tp;
clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
/* Successfully turned off 100Hz tick, so we have the
vcpuop-based timer interface */
printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
xen_clockevent = &xen_vcpuop_clockevent;
}
/* Set initial system time with full resolution */
xen_read_wallclock(&tp);
do_settimeofday(&tp);
setup_force_cpu_cap(X86_FEATURE_TSC);
xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
xen_setup_cpu_clockevents();
}
void __init xen_init_time_ops(void)
{
pv_time_ops = xen_time_ops;
x86_init.timers.timer_init = xen_time_init;
x86_init.timers.setup_percpu_clockev = x86_init_noop;
x86_cpuinit.setup_percpu_clockev = x86_init_noop;
x86_platform.calibrate_tsc = xen_tsc_khz;
x86_platform.get_wallclock = xen_get_wallclock;
x86_platform.set_wallclock = xen_set_wallclock;
}
#ifdef CONFIG_XEN_PVHVM
static void xen_hvm_setup_cpu_clockevents(void)
{
int cpu = smp_processor_id();
xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
xen_setup_cpu_clockevents();
}
void __init xen_hvm_init_time_ops(void)
{
/* vector callback is needed otherwise we cannot receive interrupts
* on cpu > 0 and at this point we don't know how many cpus are
* available */
if (!xen_have_vector_callback)
return;
if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
printk(KERN_INFO "Xen doesn't support pvclock on HVM,"
"disable pv timer\n");
return;
}
pv_time_ops = xen_time_ops;
x86_init.timers.setup_percpu_clockev = xen_time_init;
x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
x86_platform.calibrate_tsc = xen_tsc_khz;
x86_platform.get_wallclock = xen_get_wallclock;
x86_platform.set_wallclock = xen_set_wallclock;
}
#endif
+62
View File
@@ -0,0 +1,62 @@
#include <linux/ftrace.h>
#include <xen/interface/xen.h>
#define N(x) [__HYPERVISOR_##x] = "("#x")"
static const char *xen_hypercall_names[] = {
N(set_trap_table),
N(mmu_update),
N(set_gdt),
N(stack_switch),
N(set_callbacks),
N(fpu_taskswitch),
N(sched_op_compat),
N(dom0_op),
N(set_debugreg),
N(get_debugreg),
N(update_descriptor),
N(memory_op),
N(multicall),
N(update_va_mapping),
N(set_timer_op),
N(event_channel_op_compat),
N(xen_version),
N(console_io),
N(physdev_op_compat),
N(grant_table_op),
N(vm_assist),
N(update_va_mapping_otherdomain),
N(iret),
N(vcpu_op),
N(set_segment_base),
N(mmuext_op),
N(acm_op),
N(nmi_op),
N(sched_op),
N(callback_op),
N(xenoprof_op),
N(event_channel_op),
N(physdev_op),
N(hvm_op),
/* Architecture-specific hypercall definitions. */
N(arch_0),
N(arch_1),
N(arch_2),
N(arch_3),
N(arch_4),
N(arch_5),
N(arch_6),
N(arch_7),
};
#undef N
static const char *xen_hypercall_name(unsigned op)
{
if (op < ARRAY_SIZE(xen_hypercall_names) && xen_hypercall_names[op] != NULL)
return xen_hypercall_names[op];
return "";
}
#define CREATE_TRACE_POINTS
#include <trace/events/xen.h>
+4
View File
@@ -0,0 +1,4 @@
/* Bit used for the pseudo-hwcap for non-negative segments. We use
bit 1 to avoid bugs in some versions of glibc when bit 0 is
used; the choice is otherwise arbitrary. */
#define VDSO_NOTE_NONEGSEG_BIT 1
+67
View File
@@ -0,0 +1,67 @@
#include <linux/screen_info.h>
#include <linux/init.h>
#include <asm/bootparam.h>
#include <asm/setup.h>
#include <xen/interface/xen.h>
#include "xen-ops.h"
void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
{
struct screen_info *screen_info = &boot_params.screen_info;
/* This is drawn from a dump from vgacon:startup in
* standard Linux. */
screen_info->orig_video_mode = 3;
screen_info->orig_video_isVGA = 1;
screen_info->orig_video_lines = 25;
screen_info->orig_video_cols = 80;
screen_info->orig_video_ega_bx = 3;
screen_info->orig_video_points = 16;
screen_info->orig_y = screen_info->orig_video_lines - 1;
switch (info->video_type) {
case XEN_VGATYPE_TEXT_MODE_3:
if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
+ sizeof(info->u.text_mode_3))
break;
screen_info->orig_video_lines = info->u.text_mode_3.rows;
screen_info->orig_video_cols = info->u.text_mode_3.columns;
screen_info->orig_x = info->u.text_mode_3.cursor_x;
screen_info->orig_y = info->u.text_mode_3.cursor_y;
screen_info->orig_video_points =
info->u.text_mode_3.font_height;
break;
case XEN_VGATYPE_VESA_LFB:
if (size < offsetof(struct dom0_vga_console_info,
u.vesa_lfb.gbl_caps))
break;
screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB;
screen_info->lfb_width = info->u.vesa_lfb.width;
screen_info->lfb_height = info->u.vesa_lfb.height;
screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel;
screen_info->lfb_base = info->u.vesa_lfb.lfb_base;
screen_info->lfb_size = info->u.vesa_lfb.lfb_size;
screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line;
screen_info->red_size = info->u.vesa_lfb.red_size;
screen_info->red_pos = info->u.vesa_lfb.red_pos;
screen_info->green_size = info->u.vesa_lfb.green_size;
screen_info->green_pos = info->u.vesa_lfb.green_pos;
screen_info->blue_size = info->u.vesa_lfb.blue_size;
screen_info->blue_pos = info->u.vesa_lfb.blue_pos;
screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size;
screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos;
if (size >= offsetof(struct dom0_vga_console_info,
u.vesa_lfb.gbl_caps)
+ sizeof(info->u.vesa_lfb.gbl_caps))
screen_info->capabilities = info->u.vesa_lfb.gbl_caps;
if (size >= offsetof(struct dom0_vga_console_info,
u.vesa_lfb.mode_attrs)
+ sizeof(info->u.vesa_lfb.mode_attrs))
screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs;
break;
}
}
+142
View File
@@ -0,0 +1,142 @@
/*
* Asm versions of Xen pv-ops, suitable for either direct use or
* inlining. The inline versions are the same as the direct-use
* versions, with the pre- and post-amble chopped off.
*
* This code is encoded for size rather than absolute efficiency, with
* a view to being able to inline as much as possible.
*
* We only bother with direct forms (ie, vcpu in percpu data) of the
* operations here; the indirect forms are better handled in C, since
* they're generally too large to inline anyway.
*/
#include <asm/asm-offsets.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include "xen-asm.h"
/*
* Enable events. This clears the event mask and tests the pending
* event status with one and operation. If there are pending events,
* then enter the hypervisor to get them handled.
*/
ENTRY(xen_irq_enable_direct)
/* Unmask events */
movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
/*
* Preempt here doesn't matter because that will deal with any
* pending interrupts. The pending check may end up being run
* on the wrong CPU, but that doesn't hurt.
*/
/* Test for pending */
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
jz 1f
2: call check_events
1:
ENDPATCH(xen_irq_enable_direct)
ret
ENDPROC(xen_irq_enable_direct)
RELOC(xen_irq_enable_direct, 2b+1)
/*
* Disabling events is simply a matter of making the event mask
* non-zero.
*/
ENTRY(xen_irq_disable_direct)
movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
ENDPATCH(xen_irq_disable_direct)
ret
ENDPROC(xen_irq_disable_direct)
RELOC(xen_irq_disable_direct, 0)
/*
* (xen_)save_fl is used to get the current interrupt enable status.
* Callers expect the status to be in X86_EFLAGS_IF, and other bits
* may be set in the return value. We take advantage of this by
* making sure that X86_EFLAGS_IF has the right value (and other bits
* in that byte are 0), but other bits in the return value are
* undefined. We need to toggle the state of the bit, because Xen and
* x86 use opposite senses (mask vs enable).
*/
ENTRY(xen_save_fl_direct)
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
setz %ah
addb %ah, %ah
ENDPATCH(xen_save_fl_direct)
ret
ENDPROC(xen_save_fl_direct)
RELOC(xen_save_fl_direct, 0)
/*
* In principle the caller should be passing us a value return from
* xen_save_fl_direct, but for robustness sake we test only the
* X86_EFLAGS_IF flag rather than the whole byte. After setting the
* interrupt mask state, it checks for unmasked pending events and
* enters the hypervisor to get them delivered if so.
*/
ENTRY(xen_restore_fl_direct)
#ifdef CONFIG_X86_64
testw $X86_EFLAGS_IF, %di
#else
testb $X86_EFLAGS_IF>>8, %ah
#endif
setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
/*
* Preempt here doesn't matter because that will deal with any
* pending interrupts. The pending check may end up being run
* on the wrong CPU, but that doesn't hurt.
*/
/* check for unmasked and pending */
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
jnz 1f
2: call check_events
1:
ENDPATCH(xen_restore_fl_direct)
ret
ENDPROC(xen_restore_fl_direct)
RELOC(xen_restore_fl_direct, 2b+1)
/*
* Force an event check by making a hypercall, but preserve regs
* before making the call.
*/
check_events:
#ifdef CONFIG_X86_32
push %eax
push %ecx
push %edx
call xen_force_evtchn_callback
pop %edx
pop %ecx
pop %eax
#else
push %rax
push %rcx
push %rdx
push %rsi
push %rdi
push %r8
push %r9
push %r10
push %r11
call xen_force_evtchn_callback
pop %r11
pop %r10
pop %r9
pop %r8
pop %rdi
pop %rsi
pop %rdx
pop %rcx
pop %rax
#endif
ret
+12
View File
@@ -0,0 +1,12 @@
#ifndef _XEN_XEN_ASM_H
#define _XEN_XEN_ASM_H
#include <linux/linkage.h>
#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
#define ENDPATCH(x) .globl x##_end; x##_end=.
/* Pseudo-flag used for virtual NMI, which we don't implement yet */
#define XEN_EFLAGS_NMI 0x80000000
#endif
+230
View File
@@ -0,0 +1,230 @@
/*
* Asm versions of Xen pv-ops, suitable for either direct use or
* inlining. The inline versions are the same as the direct-use
* versions, with the pre- and post-amble chopped off.
*
* This code is encoded for size rather than absolute efficiency, with
* a view to being able to inline as much as possible.
*
* We only bother with direct forms (ie, vcpu in pda) of the
* operations here; the indirect forms are better handled in C, since
* they're generally too large to inline anyway.
*/
#include <asm/thread_info.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
#include <xen/interface/xen.h>
#include "xen-asm.h"
/*
* Force an event check by making a hypercall, but preserve regs
* before making the call.
*/
check_events:
push %eax
push %ecx
push %edx
call xen_force_evtchn_callback
pop %edx
pop %ecx
pop %eax
ret
/*
* We can't use sysexit directly, because we're not running in ring0.
* But we can easily fake it up using iret. Assuming xen_sysexit is
* jumped to with a standard stack frame, we can just strip it back to
* a standard iret frame and use iret.
*/
ENTRY(xen_sysexit)
movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
lea PT_EIP(%esp), %esp
jmp xen_iret
ENDPROC(xen_sysexit)
/*
* This is run where a normal iret would be run, with the same stack setup:
* 8: eflags
* 4: cs
* esp-> 0: eip
*
* This attempts to make sure that any pending events are dealt with
* on return to usermode, but there is a small window in which an
* event can happen just before entering usermode. If the nested
* interrupt ends up setting one of the TIF_WORK_MASK pending work
* flags, they will not be tested again before returning to
* usermode. This means that a process can end up with pending work,
* which will be unprocessed until the process enters and leaves the
* kernel again, which could be an unbounded amount of time. This
* means that a pending signal or reschedule event could be
* indefinitely delayed.
*
* The fix is to notice a nested interrupt in the critical window, and
* if one occurs, then fold the nested interrupt into the current
* interrupt stack frame, and re-process it iteratively rather than
* recursively. This means that it will exit via the normal path, and
* all pending work will be dealt with appropriately.
*
* Because the nested interrupt handler needs to deal with the current
* stack state in whatever form its in, we keep things simple by only
* using a single register which is pushed/popped on the stack.
*/
ENTRY(xen_iret)
/* test eflags for special cases */
testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
jnz hyper_iret
push %eax
ESP_OFFSET=4 # bytes pushed onto stack
/*
* Store vcpu_info pointer for easy access. Do it this way to
* avoid having to reload %fs
*/
#ifdef CONFIG_SMP
GET_THREAD_INFO(%eax)
movl TI_cpu(%eax), %eax
movl __per_cpu_offset(,%eax,4), %eax
mov xen_vcpu(%eax), %eax
#else
movl xen_vcpu, %eax
#endif
/* check IF state we're restoring */
testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
/*
* Maybe enable events. Once this happens we could get a
* recursive event, so the critical region starts immediately
* afterwards. However, if that happens we don't end up
* resuming the code, so we don't have to be worried about
* being preempted to another CPU.
*/
setz XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:
/* check for unmasked and pending */
cmpw $0x0001, XEN_vcpu_info_pending(%eax)
/*
* If there's something pending, mask events again so we can
* jump back into xen_hypervisor_callback. Otherwise do not
* touch XEN_vcpu_info_mask.
*/
jne 1f
movb $1, XEN_vcpu_info_mask(%eax)
1: popl %eax
/*
* From this point on the registers are restored and the stack
* updated, so we don't need to worry about it if we're
* preempted
*/
iret_restore_end:
/*
* Jump to hypervisor_callback after fixing up the stack.
* Events are masked, so jumping out of the critical region is
* OK.
*/
je xen_hypervisor_callback
1: iret
xen_iret_end_crit:
.section __ex_table, "a"
.align 4
.long 1b, iret_exc
.previous
hyper_iret:
/* put this out of line since its very rarely used */
jmp hypercall_page + __HYPERVISOR_iret * 32
.globl xen_iret_start_crit, xen_iret_end_crit
/*
* This is called by xen_hypervisor_callback in entry.S when it sees
* that the EIP at the time of interrupt was between
* xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in
* %eax so we can do a more refined determination of what to do.
*
* The stack format at this point is:
* ----------------
* ss : (ss/esp may be present if we came from usermode)
* esp :
* eflags } outer exception info
* cs }
* eip }
* ---------------- <- edi (copy dest)
* eax : outer eax if it hasn't been restored
* ----------------
* eflags } nested exception info
* cs } (no ss/esp because we're nested
* eip } from the same ring)
* orig_eax }<- esi (copy src)
* - - - - - - - -
* fs }
* es }
* ds } SAVE_ALL state
* eax }
* : :
* ebx }<- esp
* ----------------
*
* In order to deliver the nested exception properly, we need to shift
* everything from the return addr up to the error code so it sits
* just under the outer exception info. This means that when we
* handle the exception, we do it in the context of the outer
* exception rather than starting a new one.
*
* The only caveat is that if the outer eax hasn't been restored yet
* (ie, it's still on stack), we need to insert its value into the
* SAVE_ALL state before going on, since it's usermode state which we
* eventually need to restore.
*/
ENTRY(xen_iret_crit_fixup)
/*
* Paranoia: Make sure we're really coming from kernel space.
* One could imagine a case where userspace jumps into the
* critical range address, but just before the CPU delivers a
* GP, it decides to deliver an interrupt instead. Unlikely?
* Definitely. Easy to avoid? Yes. The Intel documents
* explicitly say that the reported EIP for a bad jump is the
* jump instruction itself, not the destination, but some
* virtual environments get this wrong.
*/
movl PT_CS(%esp), %ecx
andl $SEGMENT_RPL_MASK, %ecx
cmpl $USER_RPL, %ecx
je 2f
lea PT_ORIG_EAX(%esp), %esi
lea PT_EFLAGS(%esp), %edi
/*
* If eip is before iret_restore_end then stack
* hasn't been restored yet.
*/
cmp $iret_restore_end, %eax
jae 1f
movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */
movl %eax, PT_EAX(%esp)
lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */
/* set up the copy */
1: std
mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */
rep movsl
cld
lea 4(%edi), %esp /* point esp to new frame */
2: jmp xen_do_upcall
+159
View File
@@ -0,0 +1,159 @@
/*
* Asm versions of Xen pv-ops, suitable for either direct use or
* inlining. The inline versions are the same as the direct-use
* versions, with the pre- and post-amble chopped off.
*
* This code is encoded for size rather than absolute efficiency, with
* a view to being able to inline as much as possible.
*
* We only bother with direct forms (ie, vcpu in pda) of the
* operations here; the indirect forms are better handled in C, since
* they're generally too large to inline anyway.
*/
#include <asm/errno.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
#include <xen/interface/xen.h>
#include "xen-asm.h"
ENTRY(xen_adjust_exception_frame)
mov 8+0(%rsp), %rcx
mov 8+8(%rsp), %r11
ret $16
hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
/*
* Xen64 iret frame:
*
* ss
* rsp
* rflags
* cs
* rip <-- standard iret frame
*
* flags
*
* rcx }
* r11 }<-- pushed by hypercall page
* rsp->rax }
*/
ENTRY(xen_iret)
pushq $0
1: jmp hypercall_iret
ENDPATCH(xen_iret)
RELOC(xen_iret, 1b+1)
/*
* sysexit is not used for 64-bit processes, so it's only ever used to
* return to 32-bit compat userspace.
*/
ENTRY(xen_sysexit)
pushq $__USER32_DS
pushq %rcx
pushq $X86_EFLAGS_IF
pushq $__USER32_CS
pushq %rdx
pushq $0
1: jmp hypercall_iret
ENDPATCH(xen_sysexit)
RELOC(xen_sysexit, 1b+1)
ENTRY(xen_sysret64)
/*
* We're already on the usermode stack at this point, but
* still with the kernel gs, so we can easily switch back
*/
movq %rsp, PER_CPU_VAR(old_rsp)
movq PER_CPU_VAR(kernel_stack), %rsp
pushq $__USER_DS
pushq PER_CPU_VAR(old_rsp)
pushq %r11
pushq $__USER_CS
pushq %rcx
pushq $VGCF_in_syscall
1: jmp hypercall_iret
ENDPATCH(xen_sysret64)
RELOC(xen_sysret64, 1b+1)
ENTRY(xen_sysret32)
/*
* We're already on the usermode stack at this point, but
* still with the kernel gs, so we can easily switch back
*/
movq %rsp, PER_CPU_VAR(old_rsp)
movq PER_CPU_VAR(kernel_stack), %rsp
pushq $__USER32_DS
pushq PER_CPU_VAR(old_rsp)
pushq %r11
pushq $__USER32_CS
pushq %rcx
pushq $0
1: jmp hypercall_iret
ENDPATCH(xen_sysret32)
RELOC(xen_sysret32, 1b+1)
/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
* - kernel gs
* - kernel rsp
* - an iret-like stack frame on the stack (including rcx and r11):
* ss
* rsp
* rflags
* cs
* rip
* r11
* rsp->rcx
*
* In all the entrypoints, we undo all that to make it look like a
* CPU-generated syscall/sysenter and jump to the normal entrypoint.
*/
.macro undo_xen_syscall
mov 0*8(%rsp), %rcx
mov 1*8(%rsp), %r11
mov 5*8(%rsp), %rsp
.endm
/* Normal 64-bit system call target */
ENTRY(xen_syscall_target)
undo_xen_syscall
jmp system_call_after_swapgs
ENDPROC(xen_syscall_target)
#ifdef CONFIG_IA32_EMULATION
/* 32-bit compat syscall target */
ENTRY(xen_syscall32_target)
undo_xen_syscall
jmp ia32_cstar_target
ENDPROC(xen_syscall32_target)
/* 32-bit compat sysenter target */
ENTRY(xen_sysenter_target)
undo_xen_syscall
jmp ia32_sysenter_target
ENDPROC(xen_sysenter_target)
#else /* !CONFIG_IA32_EMULATION */
ENTRY(xen_syscall32_target)
ENTRY(xen_sysenter_target)
lea 16(%rsp), %rsp /* strip %rcx, %r11 */
mov $-ENOSYS, %rax
pushq $0
jmp hypercall_iret
ENDPROC(xen_syscall32_target)
ENDPROC(xen_sysenter_target)
#endif /* CONFIG_IA32_EMULATION */
+55
View File
@@ -0,0 +1,55 @@
/* Xen-specific pieces of head.S, intended to be included in the right
place in head.S */
#ifdef CONFIG_XEN
#include <linux/elfnote.h>
#include <linux/init.h>
#include <asm/boot.h>
#include <asm/asm.h>
#include <asm/page_types.h>
#include <xen/interface/elfnote.h>
#include <asm/xen/interface.h>
__INIT
ENTRY(startup_xen)
cld
#ifdef CONFIG_X86_32
mov %esi,xen_start_info
mov $init_thread_union+THREAD_SIZE,%esp
#else
mov %rsi,xen_start_info
mov $init_thread_union+THREAD_SIZE,%rsp
#endif
jmp xen_start_kernel
__FINIT
.pushsection .text
.align PAGE_SIZE
ENTRY(hypercall_page)
.skip PAGE_SIZE
.popsection
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
#ifdef CONFIG_X86_32
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
#else
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
#endif
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
#endif /*CONFIG_XEN */
+123
View File
@@ -0,0 +1,123 @@
#ifndef XEN_OPS_H
#define XEN_OPS_H
#include <linux/init.h>
#include <linux/clocksource.h>
#include <linux/irqreturn.h>
#include <xen/xen-ops.h>
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
extern void *xen_initial_gdt;
struct trap_info;
void xen_copy_trap_info(struct trap_info *traps);
DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
DECLARE_PER_CPU(unsigned long, xen_cr3);
DECLARE_PER_CPU(unsigned long, xen_current_cr3);
extern struct start_info *xen_start_info;
extern struct shared_info xen_dummy_shared_info;
extern struct shared_info *HYPERVISOR_shared_info;
void xen_setup_mfn_list_list(void);
void xen_setup_shared_info(void);
void xen_build_mfn_list_list(void);
void xen_setup_machphys_mapping(void);
pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_ident_map_ISA(void);
void xen_reserve_top(void);
extern unsigned long xen_max_p2m_pfn;
void xen_set_pat(u64);
char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
void __init xen_init_IRQ(void);
void xen_enable_sysenter(void);
void xen_enable_syscall(void);
void xen_vcpu_restore(void);
void xen_callback_vector(void);
void xen_hvm_init_shared_info(void);
void xen_unplug_emulated_devices(void);
void __init xen_build_dynamic_phys_to_machine(void);
void xen_init_irq_ops(void);
void xen_setup_timer(int cpu);
void xen_setup_runstate_info(int cpu);
void xen_teardown_timer(int cpu);
cycle_t xen_clocksource_read(void);
void xen_setup_cpu_clockevents(void);
void __init xen_init_time_ops(void);
void __init xen_hvm_init_time_ops(void);
irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
bool xen_vcpu_stolen(int vcpu);
void xen_setup_vcpu_info_placement(void);
#ifdef CONFIG_SMP
void xen_smp_init(void);
void __init xen_hvm_smp_init(void);
extern cpumask_var_t xen_cpu_initialized_map;
#else
static inline void xen_smp_init(void) {}
static inline void xen_hvm_smp_init(void) {}
#endif
#ifdef CONFIG_PARAVIRT_SPINLOCKS
void __init xen_init_spinlocks(void);
void __cpuinit xen_init_lock_cpu(int cpu);
void xen_uninit_lock_cpu(int cpu);
#else
static inline void xen_init_spinlocks(void)
{
}
static inline void xen_init_lock_cpu(int cpu)
{
}
static inline void xen_uninit_lock_cpu(int cpu)
{
}
#endif
struct dom0_vga_console_info;
#ifdef CONFIG_XEN_DOM0
void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
#else
static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
size_t size)
{
}
#endif
/* Declare an asm function, along with symbols needed to make it
inlineable */
#define DECL_ASM(ret, name, ...) \
ret name(__VA_ARGS__); \
extern char name##_end[]; \
extern char name##_reloc[] \
DECL_ASM(void, xen_irq_enable_direct, void);
DECL_ASM(void, xen_irq_disable_direct, void);
DECL_ASM(unsigned long, xen_save_fl_direct, void);
DECL_ASM(void, xen_restore_fl_direct, unsigned long);
/* These are not functions, and cannot be called normally */
void xen_iret(void);
void xen_sysexit(void);
void xen_sysret32(void);
void xen_sysret64(void);
void xen_adjust_exception_frame(void);
extern int xen_panic_handler_init(void);
#endif /* XEN_OPS_H */