Merge branch 'xen-upstream' of ssh://master.kernel.org/pub/scm/linux/kernel/git/jeremy/xen

* 'xen-upstream' of ssh://master.kernel.org/pub/scm/linux/kernel/git/jeremy/xen: (44 commits)
  xen: disable all non-virtual drivers
  xen: use iret directly when possible
  xen: suppress abs symbol warnings for unused reloc pointers
  xen: Attempt to patch inline versions of common operations
  xen: Place vcpu_info structure into per-cpu memory
  xen: handle external requests for shutdown, reboot and sysrq
  xen: machine operations
  xen: add virtual network device driver
  xen: add virtual block device driver.
  xen: add the Xenbus sysfs and virtual device hotplug driver
  xen: Add grant table support
  xen: use the hvc console infrastructure for Xen console
  xen: hack to prevent bad segment register reload
  xen: lazy-mmu operations
  xen: Add support for preemption
  xen: SMP guest support
  xen: Implement sched_clock
  xen: Account for stolen time
  xen: ignore RW mapping of RO pages in pagetable_init
  xen: Complete pagetable pinning
  ...
This commit is contained in:
Linus Torvalds 2007-07-18 10:18:39 -07:00
commit 5cc97bf2d8
116 changed files with 15035 additions and 214 deletions

View File

@ -222,6 +222,8 @@ config PARAVIRT
However, when run without a hypervisor the kernel is
theoretically slower. If in doubt, say N.
source "arch/i386/xen/Kconfig"
config VMI
bool "VMI Paravirt-ops support"
depends on PARAVIRT

View File

@ -93,6 +93,9 @@ mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-i386/mach-es7000
mcore-$(CONFIG_X86_ES7000) := mach-default
core-$(CONFIG_X86_ES7000) := arch/i386/mach-es7000/
# Xen paravirtualization support
core-$(CONFIG_XEN) += arch/i386/xen/
# default subarch .h files
mflags-y += -Iinclude/asm-i386/mach-default

View File

@ -31,6 +31,8 @@ static const char* safe_abs_relocs[] = {
"__kernel_rt_sigreturn",
"__kernel_sigreturn",
"SYSENTER_RETURN",
"xen_irq_disable_direct_reloc",
"xen_save_fl_direct_reloc",
};
static int is_safe_abs_reloc(const char* sym_name)

View File

@ -17,6 +17,8 @@
#include <asm/thread_info.h>
#include <asm/elf.h>
#include <xen/interface/xen.h>
#define DEFINE(sym, val) \
asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@ -59,6 +61,7 @@ void foo(void)
OFFSET(TI_addr_limit, thread_info, addr_limit);
OFFSET(TI_restart_block, thread_info, restart_block);
OFFSET(TI_sysenter_return, thread_info, sysenter_return);
OFFSET(TI_cpu, thread_info, cpu);
BLANK();
OFFSET(GDS_size, Xgt_desc_struct, size);
@ -115,4 +118,10 @@ void foo(void)
OFFSET(PARAVIRT_iret, paravirt_ops, iret);
OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
#endif
#ifdef CONFIG_XEN
BLANK();
OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
#endif
}

View File

@ -1023,6 +1023,91 @@ ENTRY(kernel_thread_helper)
CFI_ENDPROC
ENDPROC(kernel_thread_helper)
#ifdef CONFIG_XEN
/*
 * Event upcall entry point used under Xen.  Builds a saved-register
 * frame, applies the xen_iret critical-region fixup when the saved EIP
 * lies inside [xen_iret_start_crit, xen_iret_end_crit), hands the frame
 * to xen_evtchn_do_upcall and leaves through ret_from_intr.
 */
ENTRY(xen_hypervisor_callback)
CFI_STARTPROC
/* Push a zero word ahead of SAVE_ALL (presumably the orig_eax slot -- TODO confirm). */
pushl $0
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
TRACE_IRQS_OFF
/* Check to see if we got the event in the critical
region in xen_iret_direct, after we've reenabled
events and checked for pending events. This simulates
iret instruction's behaviour where it delivers a
pending interrupt when enabling interrupts. */
movl PT_EIP(%esp),%eax
/* Unsigned range test: skip the fixup when EIP is below start or at/after end. */
cmpl $xen_iret_start_crit,%eax
jb 1f
cmpl $xen_iret_end_crit,%eax
jae 1f
call xen_iret_crit_fixup
/* Pass the address of the saved register frame in EAX to the handler. */
1: mov %esp, %eax
call xen_evtchn_do_upcall
jmp ret_from_intr
CFI_ENDPROC
ENDPROC(xen_hypervisor_callback)
# Hypervisor uses this for application faults while it executes.
# We get here for two reasons:
# 1. Fault while reloading DS, ES, FS or GS
# 2. Fault while executing IRET
# Category 1 we fix up by reattempting the load, and zeroing the segment
# register if the load fails.
# Category 2 we fix up by jumping to do_iret_error. We cannot use the
# normal Linux return path in this case because if we use the IRET hypercall
# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
# We distinguish between categories by maintaining a status value in EAX.
ENTRY(xen_failsafe_callback)
CFI_STARTPROC
/* Save EAX, then use it as the status value: 1 unless a fixup below clears it. */
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
movl $1,%eax
/*
 * Reload the four segment registers from the words sitting above the
 * saved EAX.  Each load is listed in __ex_table below, paired with the
 * fixup that handles a fault on that load.
 */
1: mov 4(%esp),%ds
2: mov 8(%esp),%es
3: mov 12(%esp),%fs
4: mov 16(%esp),%gs
/*
 * Set ZF from the status value, then restore EAX and drop the four
 * selector words.  NOTE(review): popl and lea appear to be used here
 * because they leave EFLAGS untouched, so the jz below still sees the
 * result of testl -- confirm against the ISA reference.
 */
testl %eax,%eax
popl %eax
CFI_ADJUST_CFA_OFFSET -4
lea 16(%esp),%esp
CFI_ADJUST_CFA_OFFSET -16
jz 5f
/* Status was non-zero: discard a further 16 bytes of stack and take the iret error path. */
addl $16,%esp
jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
/* Status was zero: build a normal exception frame and return through the common path. */
5: pushl $0 # EAX == 0 => Category 1 (Bad segment)
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
jmp ret_from_exception
CFI_ENDPROC
/*
 * Fixup handlers, one per segment load above: clear EAX (which also
 * zeroes the status value), store that zero over the saved selector
 * that faulted, and retry the same load.
 */
.section .fixup,"ax"
6: xorl %eax,%eax
movl %eax,4(%esp)
jmp 1b
7: xorl %eax,%eax
movl %eax,8(%esp)
jmp 2b
8: xorl %eax,%eax
movl %eax,12(%esp)
jmp 3b
9: xorl %eax,%eax
movl %eax,16(%esp)
jmp 4b
.previous
/* Exception table: (faulting load, fixup handler) pairs. */
.section __ex_table,"a"
.align 4
.long 1b,6b
.long 2b,7b
.long 3b,8b
.long 4b,9b
.previous
ENDPROC(xen_failsafe_callback)
#endif /* CONFIG_XEN */
.section .rodata,"a"
#include "syscall_table.S"

View File

@ -510,7 +510,8 @@ ENTRY(_stext)
/*
* BSS section
*/
.section ".bss.page_aligned","w"
.section ".bss.page_aligned","wa"
.align PAGE_SIZE_asm
ENTRY(swapper_pg_dir)
.fill 1024,4,0
ENTRY(swapper_pg_pmd)
@ -538,6 +539,8 @@ fault_msg:
.ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n"
.asciz "Stack: %p %p %p %p %p %p %p %p\n"
#include "../xen/xen-head.S"
/*
* The IDT and GDT 'descriptors' are a strange 48-bit object
* only used by the lidt and lgdt instructions. They are not

View File

@ -228,6 +228,41 @@ static int __init print_banner(void)
}
core_initcall(print_banner);
/*
 * Placeholder resource spanning I/O ports 0..IO_SPACE_LIMIT, marked busy.
 * Claimed by paravirt_disable_iospace().
 */
static struct resource reserve_ioports = {
.start = 0,
.end = IO_SPACE_LIMIT,
.name = "paravirt-ioport",
.flags = IORESOURCE_IO | IORESOURCE_BUSY,
};
/*
 * Placeholder resource for I/O memory, marked busy.  The end of -1 is
 * presumably meant to convert to the largest representable address so the
 * whole range is covered -- TODO confirm against the type of .end.
 */
static struct resource reserve_iomem = {
.start = 0,
.end = -1,
.name = "paravirt-iomem",
.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
};
/*
 * Claim the whole I/O-port range and the whole I/O-memory range as busy
 * so that legacy drivers do not waste time probing for hardware.  This
 * is a brute-force way of disabling all non-virtual drivers.
 *
 * Must be called very early to have any effect.
 *
 * Returns 0 on success, or the error from request_resource().  If the
 * I/O-memory claim fails, the I/O-port claim is released again so the
 * call is all-or-nothing.
 */
int paravirt_disable_iospace(void)
{
	int err = request_resource(&ioport_resource, &reserve_ioports);

	if (err)
		return err;

	err = request_resource(&iomem_resource, &reserve_iomem);
	if (err)
		release_resource(&reserve_ioports);

	return err;
}
struct paravirt_ops paravirt_ops = {
.name = "bare hardware",
.paravirt_enabled = 0,
@ -267,7 +302,7 @@ struct paravirt_ops paravirt_ops = {
.write_msr = native_write_msr_safe,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
.get_scheduled_cycles = native_read_tsc,
.sched_clock = native_sched_clock,
.get_cpu_khz = native_calculate_cpu_khz,
.load_tr_desc = native_load_tr_desc,
.set_ldt = native_set_ldt,

View File

@ -601,6 +601,8 @@ void __init setup_arch(char **cmdline_p)
* NOTE: at this point the bootmem allocator is fully available.
*/
paravirt_post_allocator_init();
dmi_scan_machine();
#ifdef CONFIG_X86_GENERICARCH

View File

@ -22,6 +22,7 @@
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <mach_apic.h>
/*
@ -249,13 +250,13 @@ static unsigned long flush_va;
static DEFINE_SPINLOCK(tlbstate_lock);
/*
* We cannot call mmdrop() because we are in interrupt context,
* We cannot call mmdrop() because we are in interrupt context,
* instead update mm->cpu_vm_mask.
*
* We need to reload %cr3 since the page tables may be going
* away from under us..
*/
static inline void leave_mm (unsigned long cpu)
void leave_mm(unsigned long cpu)
{
if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
BUG();

View File

@ -148,7 +148,7 @@ void __init smp_alloc_memory(void)
* a given CPU
*/
static void __cpuinit smp_store_cpu_info(int id)
void __cpuinit smp_store_cpu_info(int id)
{
struct cpuinfo_x86 *c = cpu_data + id;
@ -308,8 +308,7 @@ cpumask_t cpu_coregroup_map(int cpu)
/* representing cpus for which sibling maps can be computed */
static cpumask_t cpu_sibling_setup_map;
static inline void
set_cpu_sibling_map(int cpu)
void set_cpu_sibling_map(int cpu)
{
int i;
struct cpuinfo_x86 *c = cpu_data;
@ -1144,8 +1143,7 @@ void __init native_smp_prepare_boot_cpu(void)
}
#ifdef CONFIG_HOTPLUG_CPU
static void
remove_siblinginfo(int cpu)
void remove_siblinginfo(int cpu)
{
int sibling;
struct cpuinfo_x86 *c = cpu_data;

View File

@ -84,7 +84,7 @@ static inline int check_tsc_unstable(void)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/
static unsigned long cyc2ns_scale __read_mostly;
unsigned long cyc2ns_scale __read_mostly;
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
@ -93,15 +93,10 @@ static inline void set_cyc2ns_scale(unsigned long cpu_khz)
cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
}
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
}
/*
* Scheduler clock - returns current time in nanosec units.
*/
unsigned long long sched_clock(void)
unsigned long long native_sched_clock(void)
{
unsigned long long this_offset;
@ -118,12 +113,24 @@ unsigned long long sched_clock(void)
return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
/* read the Time Stamp Counter: */
get_scheduled_cycles(this_offset);
rdtscll(this_offset);
/* return the value in ns */
return cycles_2_ns(this_offset);
}
/* We need to define a real function for sched_clock, to override the
weak default version */
#ifdef CONFIG_PARAVIRT
unsigned long long sched_clock(void)
{
return paravirt_sched_clock();
}
#else
unsigned long long sched_clock(void)
__attribute__((alias("native_sched_clock")));
#endif
unsigned long native_calculate_cpu_khz(void)
{
unsigned long long start, end;

View File

@ -362,7 +362,7 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
}
#endif
static void vmi_allocate_pt(u32 pfn)
static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
{
vmi_set_page_type(pfn, VMI_PAGE_L1);
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
@ -891,7 +891,7 @@ static inline int __init activate_vmi(void)
paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
#endif
paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles;
paravirt_ops.sched_clock = vmi_sched_clock;
paravirt_ops.get_cpu_khz = vmi_cpu_khz;
/* We have true wallclock functions; disable CMOS clock sync */

View File

@ -64,10 +64,10 @@ int vmi_set_wallclock(unsigned long now)
return 0;
}
/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */
unsigned long long vmi_get_sched_cycles(void)
/* paravirt_ops.sched_clock = vmi_sched_clock */
unsigned long long vmi_sched_clock(void)
{
return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
}
/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */

View File

@ -88,6 +88,7 @@ SECTIONS
. = ALIGN(4096);
.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
*(.data.page_aligned)
*(.data.idt)
}

View File

@ -3,23 +3,40 @@
* Here we can supply some information useful to userland.
*/
#include <linux/uts.h>
#include <linux/version.h>
#include <linux/elfnote.h>
#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type) \
.section name, flags; \
.balign 4; \
.long 1f - 0f; /* name length */ \
.long 3f - 2f; /* data length */ \
.long type; /* note type */ \
0: .asciz vendor; /* vendor name */ \
1: .balign 4; \
2:
#define ASM_ELF_NOTE_END \
3: .balign 4; /* pad out section */ \
.previous
ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0)
/* Ideally this would use UTS_NAME, but using a quoted string here
doesn't work. Remember to change this when changing the
kernel's name. */
ELFNOTE_START(Linux, 0, "a")
.long LINUX_VERSION_CODE
ASM_ELF_NOTE_END
ELFNOTE_END
#ifdef CONFIG_XEN
/*
* Add a special note telling glibc's dynamic linker a fake hardware
* flavor that it will use to choose the search path for libraries in the
* same way it uses real hardware capabilities like "mmx".
* We supply "nosegneg" as the fake capability, to indicate that we
* do not like negative offsets in instructions using segment overrides,
* since we implement those inefficiently. This makes it possible to
* install libraries optimized to avoid those access patterns in someplace
* like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file
* corresponding to the bits here is needed to make ldconfig work right.
* It should contain:
* hwcap 1 nosegneg
* to match the mapping of bit to name that we give here.
*/
/* Bit used for the pseudo-hwcap for non-negative segments. We use
bit 1 to avoid bugs in some versions of glibc when bit 0 is
used; the choice is otherwise arbitrary. */
#define VDSO_NOTE_NONEGSEG_BIT 1
ELFNOTE_START(GNU, 2, "a")
.long 1, 1<<VDSO_NOTE_NONEGSEG_BIT /* ncaps, mask */
.byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
ELFNOTE_END
#endif

View File

@ -52,7 +52,7 @@ execute(const char *string)
NULL,
};
if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) {
if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
printk(KERN_ERR "Voyager failed to run \"%s\": %i\n",
string, ret);
}

View File

@ -87,7 +87,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
BUG_ON(page_table != pte_offset_kernel(pmd, 0));
}
@ -473,6 +473,7 @@ void zap_low_mappings (void)
static int disable_nx __initdata = 0;
u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
/*
* noexec = on|off

View File

@ -60,7 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
address = __pa(address);
addr = address & LARGE_PAGE_MASK;
pbase = (pte_t *)page_address(base);
paravirt_alloc_pt(page_to_pfn(base));
paravirt_alloc_pt(&init_mm, page_to_pfn(base));
for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
addr == address ? prot : ref_prot));

11
arch/i386/xen/Kconfig Normal file
View File

@ -0,0 +1,11 @@
#
# This Kconfig describes xen options
#
config XEN
bool "Enable support for Xen hypervisor"
depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
Xen hypervisor.

4
arch/i386/xen/Makefile Normal file
View File

@ -0,0 +1,4 @@
obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \
events.o time.o manage.o xen-asm.o
obj-$(CONFIG_SMP) += smp.o

1144
arch/i386/xen/enlighten.c Normal file

File diff suppressed because it is too large Load Diff

590
arch/i386/xen/events.c Normal file
View File

@ -0,0 +1,590 @@
/*
* Xen event channels
*
* Xen models interrupts with abstract event channels. Because each
* domain gets 1024 event channels, but NR_IRQS is not that large, we
* must dynamically map irqs<->event channels. The event channels
* interface with the rest of the kernel by defining a xen interrupt
* chip. When an event is received, it is mapped to an irq and sent
* through the normal interrupt processing path.
*
* There are four kinds of events which can be mapped to an event
* channel:
*
* 1. Inter-domain notifications. This includes all the virtual
* device events, since they're driven by front-ends in another domain
* (typically dom0).
* 2. VIRQs, typically used for timers. These are per-cpu events.
* 3. IPIs.
* 4. Hardware interrupts. Not supported at present.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/linkage.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/string.h>
#include <asm/ptrace.h>
#include <asm/irq.h>
#include <asm/sync_bitops.h>
#include <asm/xen/hypercall.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/event_channel.h>
#include "xen-ops.h"
/*
* This lock protects updates to the following mapping and reference-count
* arrays. The lock does not need to be acquired to read the mapping tables.
*/
static DEFINE_SPINLOCK(irq_mapping_update_lock);
/* IRQ <-> VIRQ mapping. */
/* Per-cpu table indexed by VIRQ number; every slot starts at -1 (no irq bound). */
static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
/* IRQ <-> IPI mapping */
/* Per-cpu table indexed by IPI vector; every slot starts at -1 (no irq bound). */
static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
/* Packed IRQ information: binding type, sub-type index, and event channel. */
/*
 * One entry per irq.  Note the field widths: values passed to
 * mk_irq_info() as u32 are narrowed to 16 bits (evtchn) and 8 bits
 * (index, type) when stored here.
 */
struct packed_irq
{
/* Event channel (port) bound to this irq; 0 means none (see VALID_EVTCHN). */
unsigned short evtchn;
/* Sub-type index: the VIRQ number or IPI vector for those binding types. */
unsigned char index;
/* One of the IRQT_* binding types. */
unsigned char type;
};
/* Indexed by irq number. */
static struct packed_irq irq_info[NR_IRQS];
/* Binding types. */
/* Values stored in packed_irq.type. */
enum {
/* Not bound to anything; used by IRQ_UNBOUND. */
IRQT_UNBOUND,
/* NOTE(review): presumably physical/hardware interrupts; no code in this file binds one. */
IRQT_PIRQ,
/* Bound by bind_virq_to_irq(). */
IRQT_VIRQ,
/* Bound by bind_ipi_to_irq(). */
IRQT_IPI,
/* Bound by bind_evtchn_to_irq(). */
IRQT_EVTCHN
};
/* Convenient shorthand for packed representation of an unbound IRQ. */
#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
/* Reverse map from event channel (port) to irq; -1 means no irq is bound. */
static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
[0 ... NR_EVENT_CHANNELS-1] = -1
};
/* Per-cpu bitmap of the event channels routed to that cpu (one bit per port). */
static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
/* For each event channel, the cpu it is currently routed to. */
static u8 cpu_evtchn[NR_EVENT_CHANNELS];
/* Reference counts for bindings to IRQs. */
/* A count of zero marks the irq as free for find_unbound_irq(). */
static int irq_bindcount[NR_IRQS];
/* Xen will never allocate port zero for any purpose. */
#define VALID_EVTCHN(chn) ((chn) != 0)
/*
* Force a proper event-channel callback from Xen after clearing the
* callback mask. We do this in a very simple manner, by making a call
* down into Xen. The pending flag will be checked by Xen on return.
*/
void force_evtchn_callback(void)
{
/*
 * The result of the call is deliberately discarded: only the trip into
 * the hypervisor and back matters here.
 */
(void)HYPERVISOR_xen_version(0, NULL);
}
EXPORT_SYMBOL_GPL(force_evtchn_callback);
static struct irq_chip xen_dynamic_chip;
/*
 * Build a packed_irq value from a binding type, a sub-type index and an
 * event channel.  The arguments are narrowed to the widths of the
 * corresponding struct fields.
 */
static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
{
	struct packed_irq info = {
		.evtchn	= evtchn,
		.index	= index,
		.type	= type,
	};

	return info;
}
/*
* Accessors for packed IRQ information.
*/
static inline unsigned int evtchn_from_irq(int irq)
{
return irq_info[irq].evtchn;
}
static inline unsigned int index_from_irq(int irq)
{
return irq_info[irq].index;
}
static inline unsigned int type_from_irq(int irq)
{
return irq_info[irq].type;
}
/*
 * Return the bits of word @idx that should be serviced on @cpu: events
 * that are pending, routed to @cpu, and not masked.
 */
static inline unsigned long active_evtchns(unsigned int cpu,
					   struct shared_info *sh,
					   unsigned int idx)
{
	unsigned long pending = sh->evtchn_pending[idx];
	unsigned long routed_here = cpu_evtchn_mask[cpu][idx];
	unsigned long unmasked = ~sh->evtchn_mask[idx];

	return pending & routed_here & unmasked;
}
/*
 * Record that event channel @chn is now routed to @cpu.  This only
 * updates the kernel-side bookkeeping; no hypercall is made here.
 * The channel must already have an irq (BUG otherwise).
 */
static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
{
int irq = evtchn_to_irq[chn];
BUG_ON(irq == -1);
#ifdef CONFIG_SMP
irq_desc[irq].affinity = cpumask_of_cpu(cpu);
#endif
/* Move the channel's bit from the previous owner's mask to the new one. */
__clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
__set_bit(chn, cpu_evtchn_mask[cpu]);
cpu_evtchn[chn] = cpu;
}
/*
 * Reset the routing bookkeeping so that every event channel is owned
 * by CPU 0: all irq affinities point at CPU 0 (SMP only), cpu_evtchn[]
 * is zeroed and CPU 0's channel mask has every bit set.
 */
static void init_evtchn_cpu_bindings(void)
{
#ifdef CONFIG_SMP
	int irq;

	/* By default all event channels notify CPU#0. */
	for (irq = 0; irq < NR_IRQS; irq++)
		irq_desc[irq].affinity = cpumask_of_cpu(0);
#endif

	memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
	memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
}
/* CPU that @evtchn is currently routed to, per cpu_evtchn[]. */
static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
{
	unsigned int cpu = cpu_evtchn[evtchn];

	return cpu;
}

/* Clear the pending bit for @port in the shared-info page. */
static inline void clear_evtchn(int port)
{
	sync_clear_bit(port, &HYPERVISOR_shared_info->evtchn_pending[0]);
}

/* Set the pending bit for @port in the shared-info page. */
static inline void set_evtchn(int port)
{
	sync_set_bit(port, &HYPERVISOR_shared_info->evtchn_pending[0]);
}
/**
 * notify_remote_via_irq - signal the remote end of the channel bound to an irq
 * @irq: irq whose event channel should be notified
 *
 * The event channel is looked up from @irq on every call, which is what
 * makes this safe across save/restore, unlike notify_remote_via_evtchn().
 * If @irq has no valid event channel the notification is silently dropped.
 */
void notify_remote_via_irq(int irq)
{
	int port = evtchn_from_irq(irq);

	if (!VALID_EVTCHN(port))
		return;

	notify_remote_via_evtchn(port);
}
EXPORT_SYMBOL_GPL(notify_remote_via_irq);
/* Set the mask bit for @port in the shared-info page. */
static void mask_evtchn(int port)
{
	sync_set_bit(port, &HYPERVISOR_shared_info->evtchn_mask[0]);
}
/*
 * Unmask event channel @port.  Must be called with interrupts disabled
 * (BUG otherwise).  A port routed to another cpu is unmasked through a
 * hypercall; a port routed to this cpu is unmasked directly in the
 * shared-info page, and an event that is already pending is re-flagged
 * for delivery.
 */
static void unmask_evtchn(int port)
{
struct shared_info *s = HYPERVISOR_shared_info;
unsigned int cpu = get_cpu();
BUG_ON(!irqs_disabled());
/* Slow path (hypercall) if this is a non-local port. */
if (unlikely(cpu != cpu_from_evtchn(port))) {
struct evtchn_unmask unmask = { .port = port };
/* The hypercall result is deliberately ignored. */
(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
} else {
struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
sync_clear_bit(port, &s->evtchn_mask[0]);
/*
* The following is basically the equivalent of
* 'hw_resend_irq'. Just like a real IO-APIC we 'lose
* the interrupt edge' if the channel is masked.
*/
/*
 * If the port is pending and its selector word was not already
 * flagged, raise the upcall-pending flag so the event is noticed.
 */
if (sync_test_bit(port, &s->evtchn_pending[0]) &&
!sync_test_and_set_bit(port / BITS_PER_LONG,
&vcpu_info->evtchn_pending_sel))
vcpu_info->evtchn_upcall_pending = 1;
}
put_cpu();
}
/*
 * Return the lowest irq number whose bind count is zero.  The scan
 * covers the whole range [0, NR_IRQS).  Panics if every irq is in use,
 * so this never returns an error value.
 */
static int find_unbound_irq(void)
{
	int irq = 0;

	while (irq < NR_IRQS && irq_bindcount[irq] != 0)
		irq++;

	if (irq == NR_IRQS)
		panic("No available IRQ to bind to: increase NR_IRQS!\n");

	return irq;
}
/*
 * Return the irq bound to event channel @evtchn, allocating and setting
 * one up if the channel has none yet.  Each call takes one reference on
 * the irq (irq_bindcount), dropped again by unbind_from_irq().
 */
int bind_evtchn_to_irq(unsigned int evtchn)
{
int irq;
spin_lock(&irq_mapping_update_lock);
irq = evtchn_to_irq[evtchn];
/* First binding for this channel: claim a free irq and wire it up. */
if (irq == -1) {
irq = find_unbound_irq();
dynamic_irq_init(irq);
set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
handle_level_irq, "event");
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
}
irq_bindcount[irq]++;
spin_unlock(&irq_mapping_update_lock);
return irq;
}
EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
/*
 * Return the irq bound to IPI vector @ipi on @cpu.  On first use this
 * asks Xen for an IPI event channel for that vcpu and binds it to a
 * fresh irq.  Each successful call takes one reference on the irq.
 */
static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
{
struct evtchn_bind_ipi bind_ipi;
int evtchn, irq;
spin_lock(&irq_mapping_update_lock);
irq = per_cpu(ipi_to_irq, cpu)[ipi];
if (irq == -1) {
irq = find_unbound_irq();
/*
 * NOTE(review): find_unbound_irq() in this file panics instead of
 * returning a negative value, so this branch looks unreachable.
 */
if (irq < 0)
goto out;
dynamic_irq_init(irq);
set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
handle_level_irq, "ipi");
bind_ipi.vcpu = cpu;
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
&bind_ipi) != 0)
BUG();
/* The port number is read back from the hypercall argument. */
evtchn = bind_ipi.port;
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
per_cpu(ipi_to_irq, cpu)[ipi] = irq;
bind_evtchn_to_cpu(evtchn, cpu);
}
irq_bindcount[irq]++;
out:
spin_unlock(&irq_mapping_update_lock);
return irq;
}
/*
 * Return the irq bound to virtual interrupt @virq on @cpu.  On first
 * use this asks Xen to bind the VIRQ to an event channel for that vcpu
 * (BUG on failure) and attaches the channel to a fresh irq.  Each call
 * takes one reference on the irq.
 */
static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
{
struct evtchn_bind_virq bind_virq;
int evtchn, irq;
spin_lock(&irq_mapping_update_lock);
irq = per_cpu(virq_to_irq, cpu)[virq];
if (irq == -1) {
bind_virq.virq = virq;
bind_virq.vcpu = cpu;
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
&bind_virq) != 0)
BUG();
/* The port number is read back from the hypercall argument. */
evtchn = bind_virq.port;
irq = find_unbound_irq();
dynamic_irq_init(irq);
set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
handle_level_irq, "virq");
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
per_cpu(virq_to_irq, cpu)[virq] = irq;
bind_evtchn_to_cpu(evtchn, cpu);
}
irq_bindcount[irq]++;
spin_unlock(&irq_mapping_update_lock);
return irq;
}
static void unbind_from_irq(unsigned int irq)
{
struct evtchn_close close;
int evtchn = evtchn_from_irq(irq);
spin_lock(&irq_mapping_update_lock);
if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
close.port = evtchn;
if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
BUG();
switch (type_from_irq(irq)) {
case IRQT_VIRQ:
per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
[index_from_irq(irq)] = -1;
break;
default:
break;
}
/* Closed ports are implicitly re-bound to VCPU0. */
bind_evtchn_to_cpu(evtchn, 0);
evtchn_to_irq[evtchn] = -1;
irq_info[irq] = IRQ_UNBOUND;
dynamic_irq_init(irq);
}
spin_unlock(&irq_mapping_update_lock);
}
/*
 * Bind @evtchn to an irq and install @handler on it via request_irq().
 *
 * Returns the irq number on success.  If request_irq() fails, the
 * binding is undone and its error code is returned instead.
 */
int bind_evtchn_to_irqhandler(unsigned int evtchn,
			      irqreturn_t (*handler)(int, void *),
			      unsigned long irqflags,
			      const char *devname, void *dev_id)
{
	unsigned int irq = bind_evtchn_to_irq(evtchn);
	int err = request_irq(irq, handler, irqflags, devname, dev_id);

	if (err == 0)
		return irq;

	unbind_from_irq(irq);
	return err;
}
EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
/*
 * Bind virtual interrupt @virq on @cpu to an irq and install @handler
 * on it via request_irq().
 *
 * Returns the irq number on success.  If request_irq() fails, the
 * binding is undone and its error code is returned instead.
 */
int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
			    irqreturn_t (*handler)(int, void *),
			    unsigned long irqflags, const char *devname,
			    void *dev_id)
{
	unsigned int irq = bind_virq_to_irq(virq, cpu);
	int err = request_irq(irq, handler, irqflags, devname, dev_id);

	if (err == 0)
		return irq;

	unbind_from_irq(irq);
	return err;
}
EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
/*
 * Bind IPI vector @ipi on @cpu to an irq and install @handler on it
 * via request_irq().
 *
 * Returns the irq number on success.  A negative value from the bind
 * step is passed straight through; if request_irq() fails, the binding
 * is undone and its error code is returned.
 */
int bind_ipi_to_irqhandler(enum ipi_vector ipi,
			   unsigned int cpu,
			   irq_handler_t handler,
			   unsigned long irqflags,
			   const char *devname,
			   void *dev_id)
{
	int irq = bind_ipi_to_irq(ipi, cpu);
	int err;

	if (irq < 0)
		return irq;

	err = request_irq(irq, handler, irqflags, devname, dev_id);
	if (err == 0)
		return irq;

	unbind_from_irq(irq);
	return err;
}
/*
 * Undo a bind_*_to_irqhandler() call: remove the handler registered
 * with @dev_id, then drop the binding reference on @irq.
 */
void unbind_from_irqhandler(unsigned int irq, void *dev_id)
{
/* Remove the handler first, then release the binding. */
free_irq(irq, dev_id);
unbind_from_irq(irq);
}
EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
{
int irq = per_cpu(ipi_to_irq, cpu)[vector];
BUG_ON(irq < 0);
notify_remote_via_irq(irq);
}
/*
* Search the CPU's pending event bitmasks. For each one found, map
* the event number to an irq, and feed it into do_IRQ() for
* handling.
*
* Xen uses a two-level bitmap to speed searching. The first level is
* a bitset of words which contain pending event bits. The second
* level is a bitset of pending events themselves.
*/
/*
 * Dispatch every event currently pending for this cpu.
 * @regs: saved register frame; orig_eax is overwritten with ~irq before
 *        each call to do_IRQ().
 */
fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
{
int cpu = get_cpu();
struct shared_info *s = HYPERVISOR_shared_info;
struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
unsigned long pending_words;
/* Clear the upcall flag before taking the snapshot of the selector word. */
vcpu_info->evtchn_upcall_pending = 0;
/* NB. No need for a barrier here -- XCHG is a barrier on x86. */
pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
/* First level: one bit per word of the pending bitmap. */
while (pending_words != 0) {
unsigned long pending_bits;
int word_idx = __ffs(pending_words);
pending_words &= ~(1UL << word_idx);
/*
 * Second level: re-read the active bits of this word on every pass.
 * NOTE(review): the loop only ends once the bits stop being active, so
 * it appears to rely on the irq handling path clearing or masking the
 * event; a pending, unmasked port with no irq (irq == -1) would keep
 * this loop spinning -- confirm such a port cannot occur.
 */
while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
int bit_idx = __ffs(pending_bits);
int port = (word_idx * BITS_PER_LONG) + bit_idx;
int irq = evtchn_to_irq[port];
if (irq != -1) {
regs->orig_eax = ~irq;
do_IRQ(regs);
}
}
}
put_cpu();
}
/* Rebind an evtchn so that it gets delivered to a specific cpu */
/*
 * @irq:  irq whose event channel should be re-routed
 * @tcpu: target cpu
 *
 * Does nothing if @irq has no valid event channel.  The kernel-side
 * routing tables are only updated when the hypercall succeeds.
 */
static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
{
struct evtchn_bind_vcpu bind_vcpu;
int evtchn = evtchn_from_irq(irq);
if (!VALID_EVTCHN(evtchn))
return;
/* Send future instances of this interrupt to other vcpu. */
bind_vcpu.port = evtchn;
bind_vcpu.vcpu = tcpu;
/*
* If this fails, it usually just indicates that we're dealing with a
* virq or IPI channel, which don't actually need to be rebound. Ignore
* it, but don't do the xenlinux-level rebind in that case.
*/
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
bind_evtchn_to_cpu(evtchn, tcpu);
}
/*
 * irq_chip .set_affinity hook: route @irq to the first cpu present in
 * @dest.  Any other cpus in the mask are not considered.
 */
static void set_affinity_irq(unsigned irq, cpumask_t dest)
{
	rebind_irq_to_cpu(irq, first_cpu(dest));
}
/* irq_chip .unmask hook: unmask the event channel behind @irq, if any. */
static void enable_dynirq(unsigned int irq)
{
	int port = evtchn_from_irq(irq);

	if (!VALID_EVTCHN(port))
		return;

	unmask_evtchn(port);
}

/* irq_chip .mask hook: mask the event channel behind @irq, if any. */
static void disable_dynirq(unsigned int irq)
{
	int port = evtchn_from_irq(irq);

	if (!VALID_EVTCHN(port))
		return;

	mask_evtchn(port);
}
/*
 * irq_chip .ack hook: call move_native_irq() for @irq, then clear the
 * pending bit of the event channel behind it, if there is one.
 */
static void ack_dynirq(unsigned int irq)
{
int evtchn = evtchn_from_irq(irq);
/* Called unconditionally, even when no event channel is bound. */
move_native_irq(irq);
if (VALID_EVTCHN(evtchn))
clear_evtchn(evtchn);
}
/*
 * irq_chip .retrigger hook: mark the event channel behind @irq as
 * pending again.
 *
 * Returns 1 if a channel was re-flagged, 0 if @irq has no valid channel.
 */
static int retrigger_dynirq(unsigned int irq)
{
	int port = evtchn_from_irq(irq);

	if (!VALID_EVTCHN(port))
		return 0;

	set_evtchn(port);
	return 1;
}
/*
 * irq_chip installed on every irq bound by the bind_*_to_irq()
 * functions in this file; each hook acts on the event channel behind
 * the irq.
 */
static struct irq_chip xen_dynamic_chip __read_mostly = {
.name = "xen-dyn",
.mask = disable_dynirq,
.unmask = enable_dynirq,
.ack = ack_dynirq,
.set_affinity = set_affinity_irq,
.retrigger = retrigger_dynirq,
};
/*
 * Boot-time setup of the event-channel irq layer: reset the cpu routing
 * tables, mask every event channel, clear all irq bind counts, and
 * initialise the irq context for the current processor.
 */
void __init xen_init_IRQ(void)
{
	int port, irq;

	init_evtchn_cpu_bindings();

	/* No event channels are 'live' right now. */
	for (port = 0; port < NR_EVENT_CHANNELS; port++)
		mask_evtchn(port);

	/* Dynamic IRQ space is currently unbound. Zero the refcnts. */
	for (irq = 0; irq < NR_IRQS; irq++)
		irq_bindcount[irq] = 0;

	irq_ctx_init(smp_processor_id());
}

29
arch/i386/xen/features.c Normal file
View File

@ -0,0 +1,29 @@
/******************************************************************************
* features.c
*
* Xen feature flags.
*
* Copyright (c) 2006, Ian Campbell, XenSource Inc.
*/
#include <linux/types.h>
#include <linux/cache.h>
#include <linux/module.h>
#include <asm/xen/hypervisor.h>
#include <xen/features.h>
u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
EXPORT_SYMBOL_GPL(xen_features);
/*
 * Query the hypervisor for its feature submaps and expand each 32-bit
 * submap into one byte per feature in xen_features[] (1 = present,
 * 0 = absent).
 *
 * The scan stops at the first submap the hypervisor refuses to report;
 * entries for that submap and any after it are left untouched.
 */
void xen_setup_features(void)
{
	struct xen_feature_info fi;
	int i, j;

	for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
		fi.submap_idx = i;
		if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
			break;

		/*
		 * Use an unsigned constant: shifting a signed 1 into bit 31
		 * overflows int, which is undefined behaviour.
		 */
		for (j = 0; j < 32; j++)
			xen_features[i * 32 + j] = !!(fi.submap & (1U << j));
	}
}

143
arch/i386/xen/manage.c Normal file
View File

@ -0,0 +1,143 @@
/*
* Handle external requests for shutdown, reboot and sysrq
*/
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/reboot.h>
#include <linux/sysrq.h>
#include <xen/xenbus.h>
#define SHUTDOWN_INVALID -1
#define SHUTDOWN_POWEROFF 0
#define SHUTDOWN_SUSPEND 2
/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
* report a crash, not be instructed to crash!
* HALT is the same as POWEROFF, as far as we're concerned. The tools use
* the distinction when we return the reason code to them.
*/
#define SHUTDOWN_HALT 4
/* Ignore multiple shutdown requests. */
static int shutting_down = SHUTDOWN_INVALID;
static void shutdown_handler(struct xenbus_watch *watch,
const char **vec, unsigned int len)
{
char *str;
struct xenbus_transaction xbt;
int err;
if (shutting_down != SHUTDOWN_INVALID)
return;
again:
err = xenbus_transaction_start(&xbt);
if (err)
return;
str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
/* Ignore read errors and empty reads. */
if (XENBUS_IS_ERR_READ(str)) {
xenbus_transaction_end(xbt, 1);
return;
}
xenbus_write(xbt, "control", "shutdown", "");
err = xenbus_transaction_end(xbt, 0);
if (err == -EAGAIN) {
kfree(str);
goto again;
}
if (strcmp(str, "poweroff") == 0 ||
strcmp(str, "halt") == 0)
orderly_poweroff(false);
else if (strcmp(str, "reboot") == 0)
ctrl_alt_del();
else {
printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
shutting_down = SHUTDOWN_INVALID;
}
kfree(str);
}
static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
unsigned int len)
{
char sysrq_key = '\0';
struct xenbus_transaction xbt;
int err;
again:
err = xenbus_transaction_start(&xbt);
if (err)
return;
if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
printk(KERN_ERR "Unable to read sysrq code in "
"control/sysrq\n");
xenbus_transaction_end(xbt, 1);
return;
}
if (sysrq_key != '\0')
xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
err = xenbus_transaction_end(xbt, 0);
if (err == -EAGAIN)
goto again;
if (sysrq_key != '\0')
handle_sysrq(sysrq_key, NULL);
}
/* Watch on "control/shutdown"; changes are delivered to shutdown_handler(). */
static struct xenbus_watch shutdown_watch = {
.node = "control/shutdown",
.callback = shutdown_handler
};
/* Watch on "control/sysrq"; changes are delivered to sysrq_handler(). */
static struct xenbus_watch sysrq_watch = {
.node = "control/sysrq",
.callback = sysrq_handler
};
/*
 * Register the shutdown and sysrq xenbus watches, in that order.
 *
 * Returns 0 when both are registered, otherwise the error from the
 * first registration that failed.  If the sysrq watch fails, the
 * shutdown watch stays registered.
 */
static int setup_shutdown_watcher(void)
{
	int rc = register_xenbus_watch(&shutdown_watch);

	if (rc) {
		printk(KERN_ERR "Failed to set shutdown watcher\n");
		return rc;
	}

	rc = register_xenbus_watch(&sysrq_watch);
	if (rc)
		printk(KERN_ERR "Failed to set sysrq watcher\n");

	return rc;
}
/*
 * Xenstore notifier callback: registers the shutdown and sysrq watches.
 * All three arguments are unused, and the result of the registration is
 * discarded; NOTIFY_DONE is returned unconditionally.
 */
static int shutdown_event(struct notifier_block *notifier,
unsigned long event,
void *data)
{
setup_shutdown_watcher();
return NOTIFY_DONE;
}
/*
 * Initcall: hook shutdown_event() into the xenstore notifier chain so
 * that the watches get registered from that callback.  Always returns 0.
 */
static int __init setup_shutdown_event(void)
{
/* Static storage: the notifier block has to outlive this function. */
static struct notifier_block xenstore_notifier = {
.notifier_call = shutdown_event
};
register_xenstore_notifier(&xenstore_notifier);
return 0;
}
subsys_initcall(setup_shutdown_event);

564
arch/i386/xen/mmu.c Normal file
View File

@ -0,0 +1,564 @@
/*
* Xen mmu operations
*
* This file contains the various mmu fetch and update operations.
* The most important job they must perform is the mapping between the
* domain's pfn and the overall machine mfns.
*
* Xen allows guests to directly update the pagetable, in a controlled
* fashion. In other words, the guest modifies the same pagetable
* that the CPU actually uses, which eliminates the overhead of having
* a separate shadow pagetable.
*
* In order to allow this, it falls on the guest domain to map its
* notion of a "physical" pfn - which is just a domain-local linear
* address - into a real "machine address" which the CPU's MMU can
* use.
*
* A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
* inserted directly into the pagetable. When creating a new
* pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
* when reading the content back with __(pgd|pmd|pte)_val, it converts
* the mfn back into a pfn.
*
* The other constraint is that all pages which make up a pagetable
* must be mapped read-only in the guest. This prevents uncontrolled
* guest updates to the pagetable. Xen strictly enforces this, and
* will disallow any pagetable update which will end up mapping a
* pagetable page RW, and will disallow using any writable page as a
* pagetable.
*
* Naively, when loading %cr3 with the base of a new pagetable, Xen
* would need to validate the whole pagetable before going on.
* Naturally, this is quite slow. The solution is to "pin" a
* pagetable, which enforces all the constraints on the pagetable even
* when it is not actively in use. This means that Xen can be assured
* that it is still valid when you do load it into %cr3, and doesn't
* need to revalidate it.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/bug.h>
#include <linux/sched.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <xen/page.h>
#include <xen/interface/xen.h>
#include "multicalls.h"
#include "mmu.h"
/*
 * Translate a kernel virtual address into a machine address.
 *
 * The address is resolved through lookup_address(); the function BUGs
 * if no pte is found.  The result is the machine frame taken from that
 * pte combined with the byte offset of the address inside its page.
 */
xmaddr_t arbitrary_virt_to_machine(unsigned long address)
{
	pte_t *pte = lookup_address(address);
	/*
	 * Offset within the page, i.e. the bits PAGE_MASK clears.
	 * (Masking with PAGE_MASK itself would keep the page base and
	 * drop the offset.)
	 */
	unsigned offset = address & ~PAGE_MASK;

	BUG_ON(pte == NULL);

	/*
	 * Widen before shifting so a frame number at or above 4GB is
	 * not truncated when machine addresses are wider than long.
	 */
	return XMADDR(((unsigned long long)pte_mfn(*pte) << PAGE_SHIFT)
		      + offset);
}
/*
 * Rewrite the mapping of vaddr with the write-protected form of its
 * current pte.  BUGs if the address has no pte or the hypercall fails.
 */
void make_lowmem_page_readonly(void *vaddr)
{
	unsigned long va = (unsigned long)vaddr;
	pte_t *ptep = lookup_address(va);
	pte_t ro;
	int rc;

	BUG_ON(ptep == NULL);

	ro = pte_wrprotect(*ptep);

	rc = HYPERVISOR_update_va_mapping(va, ro, 0);
	if (rc != 0)
		BUG();
}
/*
 * Rewrite the mapping of vaddr with the writable form of its current
 * pte.  BUGs if the address has no pte or the hypercall fails.
 */
void make_lowmem_page_readwrite(void *vaddr)
{
	unsigned long va = (unsigned long)vaddr;
	pte_t *ptep = lookup_address(va);
	pte_t rw;
	int rc;

	BUG_ON(ptep == NULL);

	rw = pte_mkwrite(*ptep);

	rc = HYPERVISOR_update_va_mapping(va, rw, 0);
	if (rc != 0)
		BUG();
}
/*
 * Set a pmd entry by queueing a one-element mmu_update multicall.
 * Preemption is disabled while the entry is queued and issued.
 */
void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
	struct multicall_space slot;
	struct mmu_update *req;

	preempt_disable();

	slot = xen_mc_entry(sizeof(*req));
	req = slot.args;

	/* target is the machine address of the entry being written */
	req->ptr = virt_to_machine(ptr).maddr;
	req->val = pmd_val_ma(val);

	MULTI_mmu_update(slot.mc, req, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}
/*
 * Map the virtual address vaddr to machine frame mfn with protection
 * flags, in the swapper_pg_dir pagetable.
 *
 * Every intermediate level (pgd, pud, pmd) must already be populated;
 * an empty level is a BUG.  The <mfn,flags> pair is stored as-is, which
 * also permits clearing entries.
 */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
	pgd_t *pgd = swapper_pg_dir + pgd_index(vaddr);
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (pgd_none(*pgd))
		goto missing;

	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud))
		goto missing;

	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd))
		goto missing;

	pte = pte_offset_kernel(pmd, vaddr);
	xen_set_pte(pte, mfn_pte(mfn, flags));

	/*
	 * Flushing this single mapping is sufficient
	 * (PGE mappings get flushed as well).
	 */
	__flush_tlb_one(vaddr);
	return;

missing:
	BUG();
}
/*
 * Install pteval at ptep, which maps addr in mm.
 *
 * For the current mm or init_mm the update goes through
 * update_va_mapping: queued as a multicall in lazy-MMU mode, or as a
 * direct hypercall otherwise.  For any other mm, or when the direct
 * hypercall fails, the pte is written with xen_set_pte().
 */
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, pte_t pteval)
{
	int live_mm = (mm == current->mm) || (mm == &init_mm);

	if (live_mm && xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
		struct multicall_space mcs = xen_mc_entry(0);

		MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
		xen_mc_issue(PARAVIRT_LAZY_MMU);
		return;
	}

	if (live_mm && HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
		return;

	xen_set_pte(ptep, pteval);
}
#ifdef CONFIG_X86_PAE
/*
 * Set a pud entry by queueing a one-element mmu_update multicall.
 * Preemption is disabled while the entry is queued and issued.
 */
void xen_set_pud(pud_t *ptr, pud_t val)
{
	struct multicall_space slot;
	struct mmu_update *req;

	preempt_disable();

	slot = xen_mc_entry(sizeof(*req));
	req = slot.args;

	/* target is the machine address of the entry being written */
	req->ptr = virt_to_machine(ptr).maddr;
	req->val = pud_val_ma(val);

	MULTI_mmu_update(slot.mc, req, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}
/*
 * PAE: store a pte as two separate words.  The high word is written
 * first, followed by a write barrier, and only then the low word.
 */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
ptep->pte_high = pte.pte_high;
smp_wmb();
ptep->pte_low = pte.pte_low;
}
/* PAE: store the whole 64-bit machine-format pte with one set_64bit(). */
void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	u64 *slot = (u64 *)ptep;

	set_64bit(slot, pte_val_ma(pte));
}
/*
 * PAE: clear a pte in two steps - low word, write barrier, high word.
 * The mm and addr arguments are not used.
 */
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
ptep->pte_low = 0;
smp_wmb(); /* make sure low gets written first */
ptep->pte_high = 0;
}
/* Clear a pmd entry by setting it to an all-zero pmd via xen_set_pmd(). */
void xen_pmd_clear(pmd_t *pmdp)
{
	pmd_t empty = __pmd(0);

	xen_set_pmd(pmdp, empty);
}
/*
 * PAE: return the value of a pte in pseudo-physical form.
 *
 * This is the inverse of xen_make_pte(), which converts to a machine
 * address only when the present bit is set.  So the machine-to-physical
 * translation is applied only to present ptes; a non-present pte holds
 * no machine frame and is returned unchanged (both words).
 */
unsigned long long xen_pte_val(pte_t pte)
{
	unsigned long long ret;

	ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;

	if (ret & _PAGE_PRESENT)
		ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;

	return ret;
}
/*
 * PAE: return a pmd value in pseudo-physical form.  Zero stays zero;
 * anything else is translated machine-to-physical with bit 0 set.
 */
unsigned long long xen_pmd_val(pmd_t pmd)
{
	unsigned long long raw = pmd.pmd;

	if (raw == 0)
		return raw;

	return machine_to_phys(XMADDR(raw)).paddr | 1;
}
/*
 * PAE: return a pgd value in pseudo-physical form.  Zero stays zero;
 * anything else is translated machine-to-physical with bit 0 set.
 */
unsigned long long xen_pgd_val(pgd_t pgd)
{
	unsigned long long raw = pgd.pgd;

	if (raw == 0)
		return raw;

	return machine_to_phys(XMADDR(raw)).paddr | 1;
}
/*
 * PAE: build a pte from a 64-bit value.  When bit 0 is set the value is
 * translated physical-to-machine first; the result is split into its
 * low and high words.
 */
pte_t xen_make_pte(unsigned long long pte)
{
	unsigned long long raw = pte;

	if ((raw & 1) != 0)
		raw = phys_to_machine(XPADDR(raw)).maddr;

	return (pte_t){ raw, raw >> 32 };
}
/*
 * PAE: build a pmd from a 64-bit value, translating it
 * physical-to-machine when bit 0 is set.
 */
pmd_t xen_make_pmd(unsigned long long pmd)
{
	unsigned long long raw = pmd;

	if ((raw & 1) != 0)
		raw = phys_to_machine(XPADDR(raw)).maddr;

	return (pmd_t){ raw };
}
/*
 * PAE: build a pgd from a 64-bit value, translating it
 * physical-to-machine when _PAGE_PRESENT is set.
 */
pgd_t xen_make_pgd(unsigned long long pgd)
{
	unsigned long long raw = pgd;

	if ((raw & _PAGE_PRESENT) != 0)
		raw = phys_to_machine(XPADDR(raw)).maddr;

	return (pgd_t){ raw };
}
#else /* !PAE */
/* Non-PAE: store the pte with a single structure assignment. */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
*ptep = pte;
}
/*
 * Non-PAE: return a pte value in pseudo-physical form.  Only present
 * ptes are translated machine-to-physical; others are returned as-is.
 */
unsigned long xen_pte_val(pte_t pte)
{
	unsigned long raw = pte.pte_low;

	if (!(raw & _PAGE_PRESENT))
		return raw;

	return machine_to_phys(XMADDR(raw)).paddr;
}
/*
 * Non-PAE: return a pgd value in pseudo-physical form.  Zero stays
 * zero; anything else is translated machine-to-physical with bit 0 set.
 */
unsigned long xen_pgd_val(pgd_t pgd)
{
	unsigned long raw = pgd.pgd;

	if (raw == 0)
		return raw;

	return machine_to_phys(XMADDR(raw)).paddr | 1;
}
/*
 * Non-PAE: build a pte from a raw value, translating it
 * physical-to-machine when _PAGE_PRESENT is set.
 */
pte_t xen_make_pte(unsigned long pte)
{
	unsigned long raw = pte;

	if ((raw & _PAGE_PRESENT) != 0)
		raw = phys_to_machine(XPADDR(raw)).maddr;

	return (pte_t){ raw };
}
/*
 * Non-PAE: build a pgd from a raw value, translating it
 * physical-to-machine when _PAGE_PRESENT is set.
 */
pgd_t xen_make_pgd(unsigned long pgd)
{
	unsigned long raw = pgd;

	if ((raw & _PAGE_PRESENT) != 0)
		raw = phys_to_machine(XPADDR(raw)).maddr;

	return (pgd_t){ raw };
}
#endif /* CONFIG_X86_PAE */
/*
(Yet another) pagetable walker. This one is intended for pinning a
pagetable. This means that it walks a pagetable and calls the
callback function on each page it finds making up the page table,
at every level. It walks the entire pagetable, but it only bothers
pinning pte pages which are below limit. In the normal case
this will be TASK_SIZE, but at boot we need to pin up to
FIXADDR_TOP. But the important bit is that we don't pin beyond
there, because then we start getting into Xen's ptes.

pgd_base: top level of the pagetable to walk
func:     callback, called as func(page, flags); flags is 0 for every
          page except the pgd page itself, which gets UVMF_TLB_FLUSH
limit:    pte pages are only visited below this address; must not
          exceed FIXADDR_TOP

Returns the OR of all callback results, or 0 without walking when the
XENFEAT_auto_translated_physmap feature is set.
*/
static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
unsigned long limit)
{
pgd_t *pgd = pgd_base;
int flush = 0;
unsigned long addr = 0;
unsigned long pgd_next;
BUG_ON(limit > FIXADDR_TOP);
if (xen_feature(XENFEAT_auto_translated_physmap))
return 0;
/* one iteration per pgd entry, covering [0, FIXADDR_TOP) */
for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
pud_t *pud;
unsigned long pud_limit, pud_next;
pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
if (!pgd_val(*pgd))
continue;
pud = pud_offset(pgd, 0);
if (PTRS_PER_PUD > 1) /* not folded */
flush |= (*func)(virt_to_page(pud), 0);
for (; addr != pud_limit; pud++, addr = pud_next) {
pmd_t *pmd;
unsigned long pmd_limit;
pud_next = pud_addr_end(addr, pud_limit);
/* pte pages are only visited up to min(pud_next, limit) */
if (pud_next < limit)
pmd_limit = pud_next;
else
pmd_limit = limit;
if (pud_none(*pud))
continue;
pmd = pmd_offset(pud, 0);
if (PTRS_PER_PMD > 1) /* not folded */
flush |= (*func)(virt_to_page(pmd), 0);
for (; addr != pmd_limit; pmd++) {
/* advance by the span of one pte page before testing the bound */
addr += (PAGE_SIZE * PTRS_PER_PTE);
/* stepped past pmd_limit: clamp and stop this pmd scan */
if ((pmd_limit-1) < (addr-1)) {
addr = pmd_limit;
break;
}
if (pmd_none(*pmd))
continue;
flush |= (*func)(pmd_page(*pmd), 0);
}
}
}
/* the pgd page itself comes last, with a TLB flush requested */
flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
return flush;
}
/*
 * pgd_walk() callback used when pinning a pagetable.
 *
 * Marks the page PG_pinned.  A lowmem page that was not yet pinned has
 * a read-only remap of its kernel mapping queued as a multicall, using
 * the given update_va_mapping flags.
 *
 * Returns 1 only for a newly pinned highmem page (kmaps need flushing
 * in that case), otherwise 0.
 */
static int pin_page(struct page *page, unsigned flags)
{
	struct multicall_space mcs;
	unsigned long pfn;
	void *pt;

	if (test_and_set_bit(PG_pinned, &page->flags))
		return 0;	/* already pinned */

	if (PageHighMem(page))
		return 1;	/* found an unpinned highpage */

	pt = lowmem_page_address(page);
	pfn = page_to_pfn(page);
	mcs = __xen_mc_entry(0);

	MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
				pfn_pte(pfn, PAGE_KERNEL_RO),
				flags);

	return 0;
}
/* This is called just after a mm has been created, but it has not
been used yet. We need to make sure that its pagetable is all
read-only, and can be pinned.

Inside a multicall batch this walks the pagetable below TASK_SIZE
with pin_page(), then queues an MMUEXT pin operation for the pgd
(L3 table on PAE, L2 otherwise) and issues the batch. */
void xen_pgd_pin(pgd_t *pgd)
{
struct multicall_space mcs;
struct mmuext_op *op;
xen_mc_batch();
/* a non-zero walk result means pin_page() met an unpinned highmem page */
if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
/* re-enable interrupts for kmap_flush_unused */
xen_mc_issue(0);
kmap_flush_unused();
xen_mc_batch();
}
mcs = __xen_mc_entry(sizeof(*op));
op = mcs.args;
#ifdef CONFIG_X86_PAE
op->cmd = MMUEXT_PIN_L3_TABLE;
#else
op->cmd = MMUEXT_PIN_L2_TABLE;
#endif
/* the operation names the pgd by its machine frame number */
op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
xen_mc_issue(0);
}
/*
 * The init_mm pagetable is pinned as soon as it is created, which is
 * before page structures exist to record that.  This pgd_walk()
 * callback does the book-keeping afterwards: it just sets the pinned
 * flag on the page.  The flags argument is unused; never asks for a
 * flush.
 */
static __init int mark_pinned(struct page *page, unsigned flags)
{
	SetPagePinned(page);

	return 0;
}
/* Flag every page of the init_mm pagetable, up to FIXADDR_TOP, as pinned. */
void __init xen_mark_init_mm_pinned(void)
{
	pgd_t *init_pgd = init_mm.pgd;

	pgd_walk(init_pgd, mark_pinned, FIXADDR_TOP);
}
/*
 * pgd_walk() callback used when unpinning a pagetable.
 *
 * Clears PG_pinned.  A lowmem page that was pinned has a PAGE_KERNEL
 * remap of its kernel mapping queued as a multicall, using the given
 * update_va_mapping flags.  Always returns 0: unpinning never needs a
 * flush.
 */
static int unpin_page(struct page *page, unsigned flags)
{
	struct multicall_space mcs;
	unsigned long pfn;
	void *pt;

	if (!test_and_clear_bit(PG_pinned, &page->flags))
		return 0;

	if (PageHighMem(page))
		return 0;

	pt = lowmem_page_address(page);
	pfn = page_to_pfn(page);
	mcs = __xen_mc_entry(0);

	MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
				pfn_pte(pfn, PAGE_KERNEL),
				flags);

	return 0;
}
/*
 * Release a pagetable's pages back to normal read-write use.
 *
 * In one multicall batch: queue the MMUEXT unpin of the pgd first,
 * then walk the pagetable below TASK_SIZE with unpin_page().
 */
static void xen_pgd_unpin(pgd_t *pgd)
{
	struct multicall_space mcs;
	struct mmuext_op *unpin;

	xen_mc_batch();

	mcs = __xen_mc_entry(sizeof(*unpin));
	unpin = mcs.args;
	unpin->cmd = MMUEXT_UNPIN_TABLE;
	unpin->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));

	MULTI_mmuext_op(mcs.mc, unpin, 1, NULL, DOMID_SELF);

	pgd_walk(pgd, unpin_page, TASK_SIZE);

	xen_mc_issue(0);
}
void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
spin_lock(&next->page_table_lock);
xen_pgd_pin(next->pgd);
spin_unlock(&next->page_table_lock);
}
void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
spin_lock(&mm->page_table_lock);
xen_pgd_pin(mm->pgd);
spin_unlock(&mm->page_table_lock);
}
#ifdef CONFIG_SMP
/*
 * Another cpu may still have its %cr3 pointing at the pagetable, so it
 * has to be pointed somewhere else before the pagetable can be
 * unpinned.  Run on each such cpu (info is the mm being torn down):
 * if this cpu's tlbstate still names that mm as active, leave it.
 */
static void drop_other_mm_ref(void *info)
{
	struct mm_struct *dying = info;

	if (__get_cpu_var(cpu_tlbstate).active_mm != dying)
		return;

	leave_mm(smp_processor_id());
}
/*
 * SMP: stop every cpu from using mm's pagetable.
 *
 * The local cpu either reloads %cr3 with swapper_pg_dir (mm is the
 * current task's own mm) or leaves the mm (it is only the active_mm).
 * Then drop_other_mm_ref() is run, waiting for completion, on the cpus
 * in mm->cpu_vm_mask.
 */
static void drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm == mm) {
		if (current->mm != mm)
			leave_mm(smp_processor_id());
		else
			load_cr3(swapper_pg_dir);
	}

	if (cpus_empty(mm->cpu_vm_mask))
		return;

	xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
				   mm, 1);
}
#else
/*
 * UP: if mm is the active mm, reload %cr3 with swapper_pg_dir so the
 * cpu no longer uses mm's pagetable.
 */
static void drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm != mm)
		return;

	load_cr3(swapper_pg_dir);
}
#endif
/*
 * While a process runs, Xen pins its pagetables: the hypervisor forces
 * them to be read-only and controls every update to them.  All
 * pagetable updates therefore go via the hypervisor, which is
 * moderately expensive.
 *
 * Since the pagetable is being pulled down, switch to init_mm, unpin
 * the old process pagetable and mark it all read-write, so that
 * further operations on it are simple memory accesses.
 *
 * The only subtle point is that another CPU may still be using the
 * pagetable because of lazy tlb flushing, so all CPUs have to be
 * switched off this pagetable before it can be unpinned.
 */
void xen_exit_mmap(struct mm_struct *mm)
{
	/* stay on this cpu while references to the mm are dropped */
	get_cpu();
	drop_mm_ref(mm);
	put_cpu();

	spin_lock(&mm->page_table_lock);
	xen_pgd_unpin(mm->pgd);
	spin_unlock(&mm->page_table_lock);
}

60
arch/i386/xen/mmu.h Normal file
View File

@ -0,0 +1,60 @@
/* Include guard: the #define is what makes a second inclusion a no-op. */
#ifndef _XEN_MMU_H
#define _XEN_MMU_H
#include <linux/linkage.h>
#include <asm/page.h>
/*
 * Page-directory addresses above 4GB do not fit into architectural %cr3.
 * When accessing %cr3, or the equivalent field in vcpu_guest_context,
 * guests must pack/unpack valid MFNs with the accessor macros below.
 *
 * Xen relies on the pagetable base always being page-aligned and puts
 * the 12 MSB of the address into the 12 LSB of cr3.  In terms of the
 * 32-bit value this is a rotate: left by 12 to pack, right by 12 to
 * unpack.
 */
#define xen_pfn_to_cr3(pfn) \
	(((unsigned)(pfn) >> 20) | ((unsigned)(pfn) << 12))
#define xen_cr3_to_pfn(cr3) \
	(((unsigned)(cr3) << 20) | ((unsigned)(cr3) >> 12))
/* Map vaddr to machine frame mfn with the given protection flags. */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags);

/* Pagetable entry setters. */
void xen_set_pte(pte_t *ptep, pte_t pteval);
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, pte_t pteval);
void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);

/* mm lifecycle hooks: pin on activate/dup, unpin on exit. */
void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
void xen_exit_mmap(struct mm_struct *mm);

void xen_pgd_pin(pgd_t *pgd);

#ifdef CONFIG_X86_PAE
/* PAE: entry values are 64 bits wide. */
unsigned long long xen_pte_val(pte_t);
unsigned long long xen_pmd_val(pmd_t);
unsigned long long xen_pgd_val(pgd_t);

pte_t xen_make_pte(unsigned long long);
pmd_t xen_make_pmd(unsigned long long);
pgd_t xen_make_pgd(unsigned long long);

void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
void xen_set_pud(pud_t *ptr, pud_t val);
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_pmd_clear(pmd_t *pmdp);
#else
/* non-PAE: entry values fit in an unsigned long. */
unsigned long xen_pte_val(pte_t);
unsigned long xen_pmd_val(pmd_t);
unsigned long xen_pgd_val(pgd_t);

pte_t xen_make_pte(unsigned long);
pmd_t xen_make_pmd(unsigned long);
pgd_t xen_make_pgd(unsigned long);
#endif
#endif /* _XEN_MMU_H */

View File

@ -0,0 +1,90 @@
/*
* Xen hypercall batching.
*
* Xen allows multiple hypercalls to be issued at once, using the
* multicall interface. This allows the cost of trapping into the
* hypervisor to be amortized over several calls.
*
* This file implements a simple interface for multicalls. There's a
* per-cpu buffer of outstanding multicalls. When you want to queue a
* multicall for issuing, you can allocate a multicall slot for the
* call and its arguments, along with storage for space which is
* pointed to by the arguments (for passing pointers to structures,
* etc). When the multicall is actually issued, all the space for the
* commands and allocated memory is freed for reuse.
*
* Multicalls are flushed whenever any of the buffers get full, or
* when explicitly requested. There's no way to get per-multicall
* return results back. It will BUG if any of the multicalls fail.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <asm/xen/hypercall.h>
#include "multicalls.h"
/* Maximum number of multicall entries queued per cpu before a flush. */
#define MC_BATCH 32
/* Argument space per cpu, in u64 units (16 bytes per batch entry). */
#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
/*
 * Queue of outstanding multicalls:
 * entries - the queued multicall entries
 * args    - storage handed out for data the entries' arguments point to
 * mcidx   - number of entries currently in use
 * argidx  - number of u64 slots of args currently in use
 */
struct mc_buffer {
struct multicall_entry entries[MC_BATCH];
u64 args[MC_ARGS];
unsigned mcidx, argidx;
};
static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
/* irq flags saved by xen_mc_batch() and restored by xen_mc_issue() */
DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
/*
 * Issue every multicall queued on this cpu and reset the buffer.
 *
 * Must be called with preemption disabled.  BUGs if the multicall
 * hypercall itself fails, if any individual entry reports a negative
 * result, or if argument space is in use while no entry is queued.
 */
void xen_mc_flush(void)
{
	struct mc_buffer *buf = &__get_cpu_var(mc_buffer);
	unsigned long flags;
	int failed = 0;
	int i;

	BUG_ON(preemptible());

	/* Interrupts stay off so nobody queues something in the middle. */
	local_irq_save(flags);

	if (buf->mcidx == 0) {
		BUG_ON(buf->argidx != 0);
	} else {
		if (HYPERVISOR_multicall(buf->entries, buf->mcidx) != 0)
			BUG();

		for (i = 0; i < buf->mcidx; i++) {
			if (buf->entries[i].result < 0)
				failed++;
		}

		buf->mcidx = 0;
		buf->argidx = 0;
	}

	local_irq_restore(flags);

	BUG_ON(failed);
}
/*
 * Reserve one multicall entry plus args bytes of argument space
 * (rounded up to whole u64 slots) in this cpu's buffer.
 *
 * The buffer is flushed first if either the entries or the argument
 * space would overflow.  Must be called with preemption disabled; a
 * request larger than the whole argument area is a BUG.
 */
struct multicall_space __xen_mc_entry(size_t args)
{
	struct mc_buffer *buf = &__get_cpu_var(mc_buffer);
	unsigned nslots = (args + sizeof(u64) - 1) / sizeof(u64);
	struct multicall_space space;

	BUG_ON(preemptible());
	BUG_ON(nslots > MC_ARGS);

	if (buf->mcidx == MC_BATCH || buf->argidx + nslots > MC_ARGS)
		xen_mc_flush();

	space.mc = &buf->entries[buf->mcidx++];

	space.args = &buf->args[buf->argidx];
	buf->argidx += nslots;

	return space;
}

View File

@ -0,0 +1,45 @@
#ifndef _XEN_MULTICALLS_H
#define _XEN_MULTICALLS_H
#include "xen-ops.h"
/* Multicalls */
/*
 * A reserved multicall slot:
 * mc   - the multicall entry to fill in
 * args - storage reserved alongside it for the call's argument data
 */
struct multicall_space
{
struct multicall_entry *mc;
void *args;
};
/* Allocate room for a multicall and its args */
struct multicall_space __xen_mc_entry(size_t args);
/* irq flags saved by xen_mc_batch() and restored by xen_mc_issue() */
DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
/* Call to start a batch of multiple __xen_mc_entry()s.  Must be
   paired with xen_mc_issue(), which restores the saved irq flags. */
static inline void xen_mc_batch(void)
{
	unsigned long flags;

	/*
	 * Need to disable interrupts until this entry is complete.
	 *
	 * Save into a local first and only then store to the per-cpu
	 * slot: that way the slot is located and written with
	 * interrupts already disabled, rather than in the window
	 * before local_irq_save() has taken effect.
	 */
	local_irq_save(flags);
	__get_cpu_var(xen_mc_irq_flags) = flags;
}
/*
 * Start a batch and reserve a single multicall slot with args bytes of
 * argument space.  The caller completes it with xen_mc_issue().
 */
static inline struct multicall_space xen_mc_entry(size_t args)
{
	struct multicall_space space;

	xen_mc_batch();
	space = __xen_mc_entry(args);

	return space;
}
/* Flush all pending multicalls */
void xen_mc_flush(void);

/*
 * Finish a batch started with xen_mc_batch().  The queued multicalls
 * are flushed unless the current lazy mode has one of the bits in mode
 * set; either way the irq flags saved by xen_mc_batch() are restored.
 */
static inline void xen_mc_issue(unsigned mode)
{
	if (!(xen_get_lazy_mode() & mode))
		xen_mc_flush();

	local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
}
#endif /* _XEN_MULTICALLS_H */

96
arch/i386/xen/setup.c Normal file
View File

@ -0,0 +1,96 @@
/*
* Machine specific setup for xen
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <asm/elf.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include "xen-ops.h"
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
/* Exported to modules; this file only defines the pointer. */
unsigned long *phys_to_machine_mapping;
EXPORT_SYMBOL(phys_to_machine_mapping);
/**
 * xen_memory_setup - Xen's hook for machine specific memory setup.
 *
 * Resets the e820 map and adds one RAM region starting at 0 and
 * covering the number of pages reported in xen_start_info.
 * Returns the string "Xen".
 **/
char * __init xen_memory_setup(void)
{
	unsigned long nr_pages = xen_start_info->nr_pages;

	e820.nr_map = 0;
	add_memory_region(0, PFN_PHYS(nr_pages), E820_RAM);

	return "Xen";
}
/*
 * Idle routine installed as pm_idle by xen_arch_setup().
 *
 * With interrupts disabled, check whether a reschedule is pending.  If
 * so, just re-enable interrupts and return; otherwise clear TS_POLLING,
 * halt via safe_halt(), and set TS_POLLING again afterwards.
 */
static void xen_idle(void)
{
local_irq_disable();
if (need_resched())
local_irq_enable();
else {
current_thread_info()->status &= ~TS_POLLING;
smp_mb__after_clear_bit();
safe_halt();
current_thread_info()->status |= TS_POLLING;
}
}
/*
 * Early architecture setup for a Xen guest: enable VM assists, register
 * the event and failsafe callbacks, request IOPL 1, disable ACPI in
 * unprivileged domains, take over the command line from Xen, install
 * the Xen idle routine and, on SMP, fill in the possible-cpu map.
 */
void __init xen_arch_setup(void)
{
struct physdev_set_iopl set_iopl;
int rc;
/* results of the vm_assist calls are not checked */
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
if (!xen_feature(XENFEAT_auto_translated_physmap))
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
__KERNEL_CS, (unsigned long)xen_failsafe_callback);
set_iopl.iopl = 1;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
/* a failure here is only logged */
if (rc != 0)
printk(KERN_INFO "physdev_op failed %d\n", rc);
#ifdef CONFIG_ACPI
if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
disable_acpi();
}
#endif
/*
 * Copy the smaller of the two buffer sizes.
 * NOTE(review): memcpy does not add a terminator; whether
 * boot_command_line ends up NUL-terminated when the source fills the
 * copied length is not visible here - confirm.
 */
memcpy(boot_command_line, xen_start_info->cmd_line,
MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
pm_idle = xen_idle;
#ifdef CONFIG_SMP
/* fill cpus_possible with all available cpus */
xen_fill_possible_map();
#endif
paravirt_disable_iospace();
}

404
arch/i386/xen/smp.c Normal file
View File

@ -0,0 +1,404 @@
/*
* Xen SMP support
*
* This file implements the Xen versions of smp_ops. SMP under Xen is
* very straightforward. Bringing a CPU up is simply a matter of
* loading its initial context and setting it running.
*
* IPIs are handled through the Xen event mechanism.
*
* Because virtual CPUs can be scheduled onto any real CPU, there's no
* useful topology information for the kernel to make use of. As a
* result, all CPUs are treated as if they're single-core and
* single-threaded.
*
* This does not handle HOTPLUG_CPU yet.
*/
#include <linux/sched.h>
#include <linux/err.h>
#include <linux/smp.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/cpu.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include <asm/xen/interface.h>
#include <asm/xen/hypercall.h>
#include <xen/page.h>
#include <xen/events.h>
#include "xen-ops.h"
#include "mmu.h"
/* cpus whose vcpu context has been set up by cpu_initialize_context() */
static cpumask_t cpu_initialized_map;
/* per-cpu irq numbers bound to the reschedule and call-function IPIs */
static DEFINE_PER_CPU(int, resched_irq);
static DEFINE_PER_CPU(int, callfunc_irq);
/*
 * Structure and data for smp_call_function(). This is designed to minimise
 * static memory requirements. It also looks cleaner.
 */
/* held by xen_smp_call_function_mask() for the duration of a cross-call */
static DEFINE_SPINLOCK(call_lock);
/*
 * One cross-call request:
 * func/info - function to run and its argument
 * started   - incremented by each target once it has read the request
 * finished  - incremented by each target after func returns (wait only)
 * wait      - non-zero if the initiator waits for completion
 */
struct call_data_struct {
void (*func) (void *info);
void *info;
atomic_t started;
atomic_t finished;
int wait;
};
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
/* the request currently being serviced; set by the initiating cpu */
static struct call_data_struct *call_data;
/*
 * Handler for the reschedule IPI.  There is nothing to do here: all
 * the work happens automatically on return from the interrupt.
 */
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
	(void)irq;
	(void)dev_id;

	return IRQ_HANDLED;
}
/*
 * Entry point of a newly started vcpu (installed as the initial eip by
 * cpu_initialize_context()).  Initialises the cpu, marks it CPU_ONLINE,
 * sets up its clockevents, enables interrupts and enters the idle loop.
 */
static __cpuinit void cpu_bringup_and_idle(void)
{
int cpu = smp_processor_id();
cpu_init();
preempt_disable();
per_cpu(cpu_state, cpu) = CPU_ONLINE;
xen_setup_cpu_clockevents();
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
wmb(); /* make sure everything is out */
cpu_idle();
}
static int xen_smp_intr_init(unsigned int cpu)
{
int rc;
const char *resched_name, *callfunc_name;
per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
cpu,
xen_reschedule_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
resched_name,
NULL);
if (rc < 0)
goto fail;
per_cpu(resched_irq, cpu) = rc;
callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
cpu,
xen_call_function_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
callfunc_name,
NULL);
if (rc < 0)
goto fail;
per_cpu(callfunc_irq, cpu) = rc;
return 0;
fail:
if (per_cpu(resched_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
if (per_cpu(callfunc_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
return rc;
}
/*
 * Build cpu_possible_map: every cpu number below NR_CPUS for which the
 * VCPUOP_is_up query does not return an error is marked possible.
 */
void __init xen_fill_possible_map(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL) < 0)
			continue;

		cpu_set(cpu, cpu_possible_map);
	}
}
/*
 * Boot-cpu preparation.  Must run on cpu 0.  After the native
 * preparation it makes the old gdt page writable again, clears the
 * sibling and core maps of every cpu, and sets up vcpu_info placement.
 */
void __init xen_smp_prepare_boot_cpu(void)
{
int cpu;
BUG_ON(smp_processor_id() != 0);
native_smp_prepare_boot_cpu();
/* We've switched to the "real" per-cpu gdt, so make sure the
old memory can be recycled */
make_lowmem_page_readwrite(&per_cpu__gdt_page);
for (cpu = 0; cpu < NR_CPUS; cpu++) {
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
}
xen_setup_vcpu_info_placement();
}
/*
 * Prepare for bringing up secondary cpus: clear the topology maps, set
 * up cpu 0's info and IPIs, trim cpu_possible_map down to max_cpus, and
 * fork an idle task for every other possible cpu, marking it present.
 */
void __init xen_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
for (cpu = 0; cpu < NR_CPUS; cpu++) {
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
}
smp_store_cpu_info(0);
set_cpu_sibling_map(0);
/* failing to bind the boot cpu's IPIs is fatal */
if (xen_smp_intr_init(0))
BUG();
cpu_initialized_map = cpumask_of_cpu(0);
/* Restrict the possible_map according to max_cpus. */
while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
/* find the highest-numbered possible cpu and drop it */
for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
continue;
cpu_clear(cpu, cpu_possible_map);
}
for_each_possible_cpu (cpu) {
struct task_struct *idle;
if (cpu == 0)
continue;
idle = fork_idle(cpu);
if (IS_ERR(idle))
panic("failed fork for CPU %d", cpu);
cpu_set(cpu, cpu_present_map);
}
//init_xenbus_allowed_cpumask();
}
/*
 * Build the initial vcpu_guest_context for a cpu and hand it to Xen
 * with VCPUOP_initialise.
 *
 * Does nothing (returns 0) if the cpu is already in
 * cpu_initialized_map.  Returns -ENOMEM if the context cannot be
 * allocated; a failing hypercall is a BUG.  The context is freed again
 * before returning.
 */
static __cpuinit int
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
struct vcpu_guest_context *ctxt;
struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
if (cpu_test_and_set(cpu, cpu_initialized_map))
return 0;
ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
if (ctxt == NULL)
return -ENOMEM;
ctxt->flags = VGCF_IN_KERNEL;
ctxt->user_regs.ds = __USER_DS;
ctxt->user_regs.es = __USER_DS;
ctxt->user_regs.fs = __KERNEL_PERCPU;
ctxt->user_regs.gs = 0;
ctxt->user_regs.ss = __KERNEL_DS;
/* the new vcpu starts executing in cpu_bringup_and_idle() */
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
xen_copy_trap_info(ctxt->trap_ctxt);
ctxt->ldt_ents = 0;
/* the gdt must start on a page boundary; it is made read-only before use */
BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
make_lowmem_page_readonly(gdt->gdt);
ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
ctxt->user_regs.cs = __KERNEL_CS;
/* initial stack: top of the idle task's stack, less one pt_regs frame */
ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
ctxt->kernel_ss = __KERNEL_DS;
ctxt->kernel_sp = idle->thread.esp0;
ctxt->event_callback_cs = __KERNEL_CS;
ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
ctxt->failsafe_callback_cs = __KERNEL_CS;
ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
/* the vcpu starts on swapper_pg_dir, passed to Xen in packed cr3 form */
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
BUG();
kfree(ctxt);
return 0;
}
/*
 * Bring a secondary cpu online.
 *
 * Sets up the cpu's gdt, current task, irq context and timer, creates
 * its initial vcpu context, binds its IPIs, marks it online and finally
 * asks Xen to start it.  Returns 0 on success or the error from context
 * or IPI setup; a failing VCPUOP_up is a BUG.
 */
int __cpuinit xen_cpu_up(unsigned int cpu)
{
struct task_struct *idle = idle_task(cpu);
int rc;
#if 0
rc = cpu_up_check(cpu);
if (rc)
return rc;
#endif
init_gdt(cpu);
per_cpu(current_task, cpu) = idle;
irq_ctx_init(cpu);
xen_setup_timer(cpu);
/* make sure interrupts start blocked */
per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
rc = cpu_initialize_context(cpu, idle);
if (rc)
return rc;
/* going from one online cpu to two: switch alternatives to SMP */
if (num_online_cpus() == 1)
alternatives_smp_switch(1);
rc = xen_smp_intr_init(cpu);
if (rc)
return rc;
smp_store_cpu_info(cpu);
set_cpu_sibling_map(cpu);
/* This must be done before setting cpu_online_map */
wmb();
cpu_set(cpu, cpu_online_map);
rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
BUG_ON(rc);
return 0;
}
/* Intentionally empty: nothing is done here; max_cpus is unused. */
void xen_smp_cpus_done(unsigned int max_cpus)
{
}
/*
 * Cross-call target used by xen_smp_send_stop(): take this vcpu down.
 * Reaching the end of the function is a BUG.  The argument is unused.
 */
static void stop_self(void *v)
{
	int this_cpu = smp_processor_id();

	/* switch to swapper_pg_dir so we're not pinning something down */
	load_cr3(swapper_pg_dir);

	/* should set up a minimal gdt */

	HYPERVISOR_vcpu_op(VCPUOP_down, this_cpu, NULL);

	BUG();
}
/* Run stop_self() on the other cpus via smp_call_function(), without waiting. */
void xen_smp_send_stop(void)
{
	void *no_arg = NULL;

	smp_call_function(stop_self, no_arg, 0, 0);
}
/* Send the reschedule IPI to a single cpu. */
void xen_smp_send_reschedule(int cpu)
{
	int target = cpu;

	xen_send_IPI_one(target, XEN_RESCHEDULE_VECTOR);
}
/*
 * Send the given IPI to every cpu in mask that is also online.
 * mask is passed by value, so the caller's copy is untouched.
 */
static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
{
	unsigned target;

	/* offline cpus are silently dropped from the set */
	cpus_and(mask, mask, cpu_online_map);

	for_each_cpu_mask(target, mask) {
		xen_send_IPI_one(target, vector);
	}
}
/*
 * Handler for the call-function IPI: run the request published in
 * call_data.  The request is copied to locals before "started" is
 * incremented; "finished" is incremented afterwards only if the
 * initiator asked to wait.
 */
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
void (*func) (void *info) = call_data->func;
void *info = call_data->info;
int wait = call_data->wait;
/*
 * Notify initiating CPU that I've grabbed the data and am
 * about to execute the function
 */
mb();
atomic_inc(&call_data->started);
/*
 * At this point the info structure may be out of scope unless wait==1
 */
irq_enter();
(*func)(info);
irq_exit();
if (wait) {
mb(); /* commit everything before setting finished */
atomic_inc(&call_data->finished);
}
return IRQ_HANDLED;
}
/*
 * Run func(info) on every online cpu in mask other than the caller.
 *
 * The request is published through call_data and signalled with the
 * call-function IPI.  The caller always waits until every target has
 * picked the request up; with wait set it also waits until every
 * target has finished running func.  Always returns 0.
 *
 * Should not be called with interrupts disabled (warns if it is).
 */
int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
			       void *info, int wait)
{
	struct call_data_struct data;
	int cpus;

	/* Holding any lock stops cpus from going down. */
	spin_lock(&call_lock);

	/*
	 * Count exactly the cpus that will be signalled:
	 * xen_send_IPI_mask() only sends to online cpus, so an offline
	 * cpu left in the mask would never respond and the wait loop
	 * below would never finish.
	 */
	cpus_and(mask, mask, cpu_online_map);
	cpu_clear(smp_processor_id(), mask);

	cpus = cpus_weight(mask);
	if (!cpus) {
		spin_unlock(&call_lock);
		return 0;
	}

	/* Can deadlock when called with interrupts disabled */
	WARN_ON(irqs_disabled());

	data.func = func;
	data.info = info;
	atomic_set(&data.started, 0);
	data.wait = wait;
	if (wait)
		atomic_set(&data.finished, 0);

	call_data = &data;
	mb();			/* write everything before IPI */

	/* Send a message to other CPUs and wait for them to respond */
	xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);

	/* Make sure other vcpus get a chance to run.
	   XXX too severe?  Maybe we should check the other CPU's states? */
	HYPERVISOR_sched_op(SCHEDOP_yield, 0);

	/* Wait for response */
	while (atomic_read(&data.started) != cpus ||
	       (wait && atomic_read(&data.finished) != cpus))
		cpu_relax();

	spin_unlock(&call_lock);

	return 0;
}

590
arch/i386/xen/time.c Normal file
View File

@ -0,0 +1,590 @@
/*
* Xen time implementation.
*
* This is implemented in terms of a clocksource driver which uses
* the hypervisor clock as a nanosecond timebase, and a clockevent
* driver which uses the hypervisor's timer mechanism.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include "xen-ops.h"
#define XEN_SHIFT 22
/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP 100000
/* length of one scheduler tick in nanoseconds */
#define NS_PER_TICK (1000000000LL / HZ)
static cycle_t xen_clocksource_read(void);
/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
u64 tsc_timestamp; /* TSC at last update of time vals. */
u64 system_timestamp; /* Time, in nanosecs, since boot. */
u32 tsc_to_nsec_mul;
int tsc_shift;
u32 version;
};
static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);
/* return an consistent snapshot of 64-bit time/counter value */
static u64 get64(const u64 *p)
{
u64 ret;
if (BITS_PER_LONG < 64) {
u32 *p32 = (u32 *)p;
u32 h, l;
/*
* Read high then low, and then make sure high is
* still the same; this will only loop if low wraps
* and carries into high.
* XXX some clean way to make this endian-proof?
*/
do {
h = p32[1];
barrier();
l = p32[0];
barrier();
} while (p32[1] != h);
ret = (((u64)h) << 32) | l;
} else
ret = *p;
return ret;
}
/*
 * Runstate accounting
 */
/*
 * Copy this cpu's runstate info into *res.  The copy is repeated until
 * state_entry_time reads the same before and after it.  Must be called
 * with preemption disabled.
 */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
u64 state_time;
struct vcpu_runstate_info *state;
BUG_ON(preemptible());
state = &__get_cpu_var(runstate);
/*
 * The runstate info is always updated by the hypervisor on
 * the current CPU, so there's no need to use anything
 * stronger than a compiler barrier when fetching it.
 */
do {
state_time = get64(&state->state_entry_time);
barrier();
*res = *state;
barrier();
} while (get64(&state->state_entry_time) != state_time);
}
/*
 * Register the given cpu's per-cpu runstate structure with Xen as its
 * runstate memory area.  A failing hypercall is a BUG.
 */
static void setup_runstate_info(int cpu)
{
	struct vcpu_register_runstate_memory_area area;
	int rc;

	area.addr.v = &per_cpu(runstate, cpu);

	rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
				cpu, &area);
	if (rc != 0)
		BUG();
}
/*
 * Account the time this vcpu spent not running since the previous
 * call.  Runnable and offline time is charged as stolen; blocked time
 * is charged against the idle task.  Only whole ticks are charged; the
 * remaining nanoseconds are carried over in the per-cpu residuals.
 */
static void do_stolen_accounting(void)
{
struct vcpu_runstate_info state;
struct vcpu_runstate_info *snap;
s64 blocked, runnable, offline, stolen;
cputime_t ticks;
get_runstate_snapshot(&state);
WARN_ON(state.state != RUNSTATE_running);
snap = &__get_cpu_var(runstate_snapshot);
/* work out how much time the VCPU has not been runn*ing* */
blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
/* the new snapshot becomes the baseline for the next call */
*snap = state;
/* Add the appropriate number of ticks of stolen time,
including any left-overs from last time. Passing NULL to
account_steal_time accounts the time as stolen. */
stolen = runnable + offline + __get_cpu_var(residual_stolen);
if (stolen < 0)
stolen = 0;
ticks = 0;
/* convert to whole ticks by repeated subtraction */
while (stolen >= NS_PER_TICK) {
ticks++;
stolen -= NS_PER_TICK;
}
__get_cpu_var(residual_stolen) = stolen;
account_steal_time(NULL, ticks);
/* Add the appropriate number of ticks of blocked time,
including any left-overs from last time. Passing idle to
account_steal_time accounts the time as idle/wait. */
blocked += __get_cpu_var(residual_blocked);
if (blocked < 0)
blocked = 0;
ticks = 0;
/* convert to whole ticks by repeated subtraction */
while (blocked >= NS_PER_TICK) {
ticks++;
blocked -= NS_PER_TICK;
}
__get_cpu_var(residual_blocked) = blocked;
account_steal_time(idle_task(smp_processor_id()), ticks);
}
/*
* Xen sched_clock implementation. Returns the number of unstolen
* nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
* states.
*/
unsigned long long xen_sched_clock(void)
{
struct vcpu_runstate_info state;
cycle_t now;
u64 ret;
s64 offset;
/*
* Ideally sched_clock should be called on a per-cpu basis
* anyway, so preempt should already be disabled, but that's
* not current practice at the moment.
*/
preempt_disable();
now = xen_clocksource_read();
get_runstate_snapshot(&state);
WARN_ON(state.state != RUNSTATE_running);
offset = now - state.state_entry_time;
if (offset < 0)
offset = 0;
ret = state.time[RUNSTATE_blocked] +
state.time[RUNSTATE_running] +
offset;
preempt_enable();
return ret;
}
/*
 * Derive the CPU frequency in kHz from the TSC scaling parameters
 * (tsc_to_system_mul, tsc_shift) that Xen publishes for vcpu 0.
 */
unsigned long xen_cpu_khz(void)
{
	const struct vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;
	u64 khz = 1000000ULL << 32;

	do_div(khz, info->tsc_to_system_mul);

	/* undo the pre-shift that is applied to TSC deltas */
	if (info->tsc_shift >= 0)
		khz >>= info->tsc_shift;
	else
		khz <<= -info->tsc_shift;

	return khz;
}
/*
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.
 *
 * The source is this CPU's xen_vcpu->time and the destination is the
 * per-cpu shadow_time.  Returns the version the copy was taken under.
 */
static unsigned get_time_values_from_xen(void)
{
struct vcpu_time_info *src;
struct shadow_time_info *dst;
/* src is shared memory with the hypervisor, so we need to
make sure we get a consistent snapshot, even in the face of
being preempted. */
src = &__get_cpu_var(xen_vcpu)->time;
dst = &__get_cpu_var(shadow_time);
do {
dst->version = src->version;
rmb(); /* fetch version before data */
dst->tsc_timestamp = src->tsc_timestamp;
dst->system_timestamp = src->system_time;
dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
dst->tsc_shift = src->tsc_shift;
rmb(); /* test version after fetching data */
/* Retry when the version is odd or differs from the one recorded
   before the copy.  NOTE(review): an odd version presumably marks an
   update in progress -- confirm against the Xen interface docs. */
} while ((src->version & 1) | (dst->version ^ src->version));
return dst->version;
}
/*
 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.
 *
 * delta is first shifted by 'shift' (left when non-negative, right by
 * -shift when negative), then multiplied by mul_frac with the product
 * shifted right by 32 bits.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
u64 product;
#ifdef __i386__
/* scratch registers for the two-step multiply below */
u32 tmp1, tmp2;
#endif
if (shift < 0)
delta >>= -shift;
else
delta <<= shift;
#ifdef __i386__
/* Two 32x32 multiplies: low word of delta first (keeping only the
   upper half of that product), then the high word, summed with carry
   into edx:eax. */
__asm__ (
"mul %5 ; "
"mov %4,%%eax ; "
"mov %%edx,%4 ; "
"mul %5 ; "
"xor %5,%5 ; "
"add %4,%%eax ; "
"adc %5,%%edx ; "
: "=A" (product), "=r" (tmp1), "=r" (tmp2)
: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif __x86_64__
/* One 64x64 multiply; shrd moves bits 32..95 of the product into rax. */
__asm__ (
"mul %%rdx ; shrd $32,%%rdx,%%rax"
: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif
return product;
}
/*
 * Nanoseconds elapsed since the shadow's tsc_timestamp, obtained by
 * scaling the TSC delta with the shadow's multiplier and shift.
 */
static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
	u64 tsc_now = native_read_tsc();
	u64 tsc_delta = tsc_now - shadow->tsc_timestamp;

	return scale_delta(tsc_delta, shadow->tsc_to_nsec_mul,
			   shadow->tsc_shift);
}
/*
 * Read the Xen clock: the shadowed system timestamp plus the scaled
 * TSC delta since that timestamp.  The calculation is repeated until
 * the version seen by get_time_values_from_xen() still matches the
 * live one afterwards.
 */
static cycle_t xen_clocksource_read(void)
{
	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
	unsigned seen_version;
	cycle_t ns;

	for (;;) {
		seen_version = get_time_values_from_xen();
		barrier();
		ns = shadow->system_timestamp + get_nsec_offset(shadow);
		barrier();
		if (seen_version == __get_cpu_var(xen_vcpu)->time.version)
			break;
	}

	put_cpu_var(shadow_time);

	return ns;
}
/*
 * Compute the current wall-clock time into *ts: the boot-time wallclock
 * (wc_sec/wc_nsec from shared_info) plus the time since boot as
 * reported by xen_clocksource_read().
 */
static void xen_read_wallclock(struct timespec *ts)
{
const struct shared_info *s = HYPERVISOR_shared_info;
u32 version;
u64 delta;
struct timespec now;
/* get wallclock at system boot */
do {
version = s->wc_version;
rmb(); /* fetch version before time */
now.tv_sec = s->wc_sec;
now.tv_nsec = s->wc_nsec;
rmb(); /* fetch time before checking version */
/* retry if wc_version is odd or changed while the fields were read */
} while ((s->wc_version & 1) | (version ^ s->wc_version));
delta = xen_clocksource_read(); /* time since system boot */
delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
/* split total nanoseconds: do_div() returns the remainder (ns) and
   leaves the quotient (seconds) in delta */
now.tv_nsec = do_div(delta, NSEC_PER_SEC);
now.tv_sec = delta;
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
unsigned long xen_get_wallclock(void)
{
struct timespec ts;
xen_read_wallclock(&ts);
return ts.tv_sec;
}
/*
 * Setting the wallclock is not implemented: the requested time is
 * ignored and -1 is always returned.
 */
int xen_set_wallclock(unsigned long now)
{
	int rc = -1;	/* do nothing for domU */

	return rc;
}
/*
 * Clocksource backed by xen_clocksource_read().  mult is 1 << shift
 * because the read function already returns nanoseconds.
 */
static struct clocksource xen_clocksource __read_mostly = {
.name = "xen",
.rating = 400,
.read = xen_clocksource_read,
.mask = ~0,
.mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
.shift = XEN_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
/*
Xen clockevent implementation
Xen has two clockevent implementations:
The old timer_op one works with all released versions of Xen prior
to version 3.0.4. This version of the hypervisor provides a
single-shot timer with nanosecond resolution. However, sharing the
same event channel is a 100Hz tick which is delivered while the
vcpu is running. We don't care about or use this tick, but it will
cause the core time code to think the timer fired too soon, and
will end up resetting it each time. It could be filtered, but
doing so has complications when the ktime clocksource is not yet
the xen clocksource (ie, at boot time).
The new vcpu_op-based timer interface allows the tick timer period
to be changed or turned off. The tick timer is not useful as a
periodic timer because events are only delivered to running vcpus.
The one-shot timer can report when a timeout is in the past, so
set_next_event is capable of returning -ETIME when appropriate.
This interface is used when available.
*/
/*
Get a hypervisor absolute time. In theory we could maintain an
offset between the kernel's time and the hypervisor's time, and
apply that to a kernel's absolute timeout. Unfortunately the
hypervisor and kernel times can drift even if the kernel is using
the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
return xen_clocksource_read() + delta;
}
/*
 * Mode switch for the timer_op based clockevent.  Periodic mode is
 * unsupported and only warns; one-shot needs no action; unused and
 * shutdown cancel any outstanding timeout.
 */
static void xen_timerop_set_mode(enum clock_event_mode mode,
				 struct clock_event_device *evt)
{
	if (mode == CLOCK_EVT_MODE_PERIODIC) {
		/* unsupported */
		WARN_ON(1);
		return;
	}

	if (mode == CLOCK_EVT_MODE_UNUSED ||
	    mode == CLOCK_EVT_MODE_SHUTDOWN)
		HYPERVISOR_set_timer_op(0);  /* cancel timeout */

	/* CLOCK_EVT_MODE_ONESHOT: nothing to do */
}
static int xen_timerop_set_next_event(unsigned long delta,
struct clock_event_device *evt)
{
WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
BUG();
/* We may have missed the deadline, but there's no real way of
knowing for sure. If the event was in the past, then we'll
get an immediate interrupt. */
return 0;
}
/*
 * Clockevent template for the timer_op interface; this is the default
 * that xen_clockevent points at.  One-shot only.
 */
static const struct clock_event_device xen_timerop_clockevent = {
.name = "xen",
.features = CLOCK_EVT_FEAT_ONESHOT,
.max_delta_ns = 0xffffffff,
.min_delta_ns = TIMER_SLOP,
/* mult 1 / shift 0: NOTE(review) presumably so that set_next_event
   receives its delta already in nanoseconds -- confirm */
.mult = 1,
.shift = 0,
.rating = 500,
.set_mode = xen_timerop_set_mode,
.set_next_event = xen_timerop_set_next_event,
};
/*
 * Mode switch for the vcpu_op based clockevent.  Periodic mode is
 * unsupported and only warns; one-shot stops the periodic timer;
 * unused and shutdown stop both the single-shot and periodic timers.
 * A failing hypercall is fatal.
 */
static void xen_vcpuop_set_mode(enum clock_event_mode mode,
				struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	if (mode == CLOCK_EVT_MODE_PERIODIC) {
		WARN_ON(1);	/* unsupported */
	} else if (mode == CLOCK_EVT_MODE_ONESHOT) {
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
	} else if (mode == CLOCK_EVT_MODE_UNUSED ||
		   mode == CLOCK_EVT_MODE_SHUTDOWN) {
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
	}
}
static int xen_vcpuop_set_next_event(unsigned long delta,
struct clock_event_device *evt)
{
int cpu = smp_processor_id();
struct vcpu_set_singleshot_timer single;
int ret;
WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
single.timeout_abs_ns = get_abs_timeout(delta);
single.flags = VCPU_SSHOTTMR_future;
ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
BUG_ON(ret != 0 && ret != -ETIME);
return ret;
}
/*
 * Clockevent template for the vcpu_op interface; xen_time_init()
 * switches xen_clockevent to it when VCPUOP_stop_periodic_timer
 * succeeds.  One-shot only.
 */
static const struct clock_event_device xen_vcpuop_clockevent = {
.name = "xen",
.features = CLOCK_EVT_FEAT_ONESHOT,
.max_delta_ns = 0xffffffff,
.min_delta_ns = TIMER_SLOP,
/* mult 1 / shift 0: NOTE(review) presumably so that set_next_event
   receives its delta already in nanoseconds -- confirm */
.mult = 1,
.shift = 0,
.rating = 500,
.set_mode = xen_vcpuop_set_mode,
.set_next_event = xen_vcpuop_set_next_event,
};
/* Template copied into each CPU's device by xen_setup_timer().  Starts as
   the timer_op flavour; xen_time_init() may switch it to the vcpu_op one. */
static const struct clock_event_device *xen_clockevent =
&xen_timerop_clockevent;
/* Per-cpu clock event device, initialised from *xen_clockevent. */
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
/*
 * Timer interrupt handler: invoke this CPU's clockevent handler if one
 * is installed, then update the stolen/blocked time accounting.
 * Returns IRQ_HANDLED only when an event handler was called.
 */
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
	irqreturn_t handled = IRQ_NONE;

	if (evt->event_handler) {
		evt->event_handler(evt);
		handled = IRQ_HANDLED;
	}

	do_stolen_accounting();

	return handled;
}
/*
 * Set up the Xen timer for one CPU: bind VIRQ_TIMER to
 * xen_timer_interrupt, initialise the CPU's clock event device from
 * the current xen_clockevent template, and register its runstate area.
 */
void xen_setup_timer(int cpu)
{
	struct clock_event_device *evt = &per_cpu(xen_clock_events, cpu);
	const char *irq_name;
	int irq;

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	/* fall back to a fixed string if the name cannot be allocated */
	irq_name = kasprintf(GFP_KERNEL, "timer%d", cpu);
	if (irq_name == NULL)
		irq_name = "<timer kasprintf failed>";

	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
				      irq_name, NULL);

	memcpy(evt, xen_clockevent, sizeof(*evt));
	evt->cpumask = cpumask_of_cpu(cpu);
	evt->irq = irq;

	setup_runstate_info(cpu);
}
/*
 * Register the current CPU's clock event device.  Must be called with
 * preemption disabled.
 */
void xen_setup_cpu_clockevents(void)
{
	struct clock_event_device *evt;

	BUG_ON(preemptible());

	evt = &__get_cpu_var(xen_clock_events);
	clockevents_register_device(evt);
}
/*
 * Boot-time initialisation of Xen timekeeping: register the
 * clocksource, pick the clockevent flavour, seed xtime and
 * wall_to_monotonic from the Xen wallclock, and set up the timer and
 * clockevents on the current CPU.
 */
__init void xen_time_init(void)
{
	int cpu = smp_processor_id();
	int rc;

	get_time_values_from_xen();

	clocksource_register(&xen_clocksource);

	rc = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL);
	if (rc == 0) {
		/* The 100Hz tick could be turned off, so the vcpuop-based
		   timer interface is available */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&xtime);
	set_normalized_timespec(&wall_to_monotonic,
				-xtime.tv_sec, -xtime.tv_nsec);

	tsc_disable = 0;

	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();
}

291
arch/i386/xen/xen-asm.S Normal file
View File

@ -0,0 +1,291 @@
/*
Asm versions of Xen pv-ops, suitable for either direct use or inlining.
The inline versions are the same as the direct-use versions, with the
pre- and post-amble chopped off.
This code is encoded for size rather than absolute efficiency,
with a view to being able to inline as much as possible.
We only bother with direct forms (ie, vcpu in pda) of the operations
here; the indirect forms are better handled in C, since they're
generally too large to inline anyway.
*/
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
#include <xen/interface/xen.h>
/* Define a global symbol <x>_reloc with the value v. */
#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
/* Define a global symbol <x>_end at the current location. */
#define ENDPATCH(x) .globl x##_end; x##_end=.
/* Pseudo-flag used for virtual NMI, which we don't implement yet */
#define XEN_EFLAGS_NMI 0x80000000
/*
Enable events. This clears the event mask and tests the pending
event status with a single `and` operation. If there are pending
events, then enter the hypervisor to get them handled.
*/
ENTRY(xen_irq_enable_direct)
/* Clear mask and test pending */
andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
/* ZF set by the andw means nothing is pending: skip the call */
jz 1f
2: call check_events
1:
/* xen_irq_enable_direct_end lands here, before the ret */
ENDPATCH(xen_irq_enable_direct)
ret
ENDPROC(xen_irq_enable_direct)
/* xen_irq_enable_direct_reloc = one byte past label 2 (the call above) */
RELOC(xen_irq_enable_direct, 2b+1)
/*
Disabling events is simply a matter of making the event mask
non-zero.
*/
ENTRY(xen_irq_disable_direct)
/* store 1 into the per-cpu vcpu_info event mask */
movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
/* xen_irq_disable_direct_end lands here, before the ret */
ENDPATCH(xen_irq_disable_direct)
ret
ENDPROC(xen_irq_disable_direct)
/* reloc symbol is 0: this body contains no call */
RELOC(xen_irq_disable_direct, 0)
/*
(xen_)save_fl is used to get the current interrupt enable status.
Callers expect the status to be in X86_EFLAGS_IF, and other bits
may be set in the return value. We take advantage of this by
making sure that X86_EFLAGS_IF has the right value (and other bits
in that byte are 0), but other bits in the return value are
undefined. We need to toggle the state of the bit, because
Xen and x86 use opposite senses (mask vs enable).
*/
ENTRY(xen_save_fl_direct)
/* ZF is set when the event mask byte is zero (events not masked) */
testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
/* %ah = 1 when unmasked, 0 when masked */
setz %ah
/* double it: %ah becomes 2 or 0, i.e. bit 9 of %eax, which is the
   bit xen_restore_fl_direct tests as X86_EFLAGS_IF>>8 in %ah */
addb %ah,%ah
/* xen_save_fl_direct_end lands here, before the ret */
ENDPATCH(xen_save_fl_direct)
ret
ENDPROC(xen_save_fl_direct)
/* reloc symbol is 0: this body contains no call */
RELOC(xen_save_fl_direct, 0)
/*
In principle the caller should be passing us a value return
from xen_save_fl_direct, but for robustness sake we test only
the X86_EFLAGS_IF flag rather than the whole byte. After
setting the interrupt mask state, it checks for unmasked
pending events and enters the hypervisor to get them delivered
if so.
*/
ENTRY(xen_restore_fl_direct)
/* flags value arrives in %eax; test its IF bit via %ah */
testb $X86_EFLAGS_IF>>8, %ah
/* mask = 1 when IF is clear, 0 when IF is set */
setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
/* check for unmasked and pending */
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
/* NOTE(review): as written, jz skips the call exactly when the word
   equals 0x0001, which the comment above describes as "unmasked and
   pending"; check_events is called in every other case, including the
   common "nothing pending" one.  This looks inverted (jnz expected);
   confirm before relying on it. */
jz 1f
2: call check_events
1:
/* xen_restore_fl_direct_end lands here, before the ret */
ENDPATCH(xen_restore_fl_direct)
ret
ENDPROC(xen_restore_fl_direct)
/* xen_restore_fl_direct_reloc = one byte past label 2 (the call above) */
RELOC(xen_restore_fl_direct, 2b+1)
/*
This is run where a normal iret would be run, with the same stack setup:
8: eflags
4: cs
esp-> 0: eip
This attempts to make sure that any pending events are dealt
with on return to usermode, but there is a small window in
which an event can happen just before entering usermode. If
the nested interrupt ends up setting one of the TIF_WORK_MASK
pending work flags, they will not be tested again before
returning to usermode. This means that a process can end up
with pending work, which will be unprocessed until the process
enters and leaves the kernel again, which could be an
unbounded amount of time. This means that a pending signal or
reschedule event could be indefinitely delayed.
The fix is to notice a nested interrupt in the critical
window, and if one occurs, then fold the nested interrupt into
the current interrupt stack frame, and re-process it
iteratively rather than recursively. This means that it will
exit via the normal path, and all pending work will be dealt
with appropriately.
Because the nested interrupt handler needs to deal with the
current stack state in whatever form it's in, we keep things
simple by only using a single register which is pushed/popped
on the stack.
Non-direct iret could be done in the same way, but it would
require an annoying amount of code duplication. We'll assume
that direct mode will be the common case once the hypervisor
support becomes commonplace.
*/
ENTRY(xen_iret_direct)
/* test eflags for special cases */
testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
/* VM86 or virtual-NMI frames are handed to the iret hypercall */
jnz hyper_iret
push %eax
ESP_OFFSET=4 # bytes pushed onto stack
/* Store vcpu_info pointer for easy access. Do it this
way to avoid having to reload %fs */
#ifdef CONFIG_SMP
GET_THREAD_INFO(%eax)
movl TI_cpu(%eax),%eax
movl __per_cpu_offset(,%eax,4),%eax
lea per_cpu__xen_vcpu_info(%eax),%eax
#else
movl $per_cpu__xen_vcpu_info, %eax
#endif
/* check IF state we're restoring */
testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
/* Maybe enable events. Once this happens we could get a
recursive event, so the critical region starts immediately
afterwards. However, if that happens we don't end up
resuming the code, so we don't have to be worried about
being preempted to another CPU. */
setz XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:
/* check for unmasked and pending */
cmpw $0x0001, XEN_vcpu_info_pending(%eax)
/* If there's something pending, mask events again so we
can jump back into xen_hypervisor_callback */
sete XEN_vcpu_info_mask(%eax)
/* neither sete nor popl alters the flags, so the je below still
   acts on the result of the cmpw */
popl %eax
/* From this point on the registers are restored and the stack
updated, so we don't need to worry about it if we're preempted */
iret_restore_end:
/* Jump to hypervisor_callback after fixing up the stack.
Events are masked, so jumping out of the critical
region is OK. */
je xen_hypervisor_callback
iret
xen_iret_end_crit:
hyper_iret:
/* put this out of line since it's very rarely used */
/* each hypercall gets a 32-byte slot in hypercall_page */
jmp hypercall_page + __HYPERVISOR_iret * 32
.globl xen_iret_start_crit, xen_iret_end_crit
/*
This is called by xen_hypervisor_callback in entry.S when it sees
that the EIP at the time of interrupt was between xen_iret_start_crit
and xen_iret_end_crit. We're passed the EIP in %eax so we can do
a more refined determination of what to do.
The stack format at this point is:
----------------
ss : (ss/esp may be present if we came from usermode)
esp :
eflags } outer exception info
cs }
eip }
---------------- <- edi (copy dest)
eax : outer eax if it hasn't been restored
----------------
eflags } nested exception info
cs } (no ss/esp because we're nested
eip } from the same ring)
orig_eax }<- esi (copy src)
- - - - - - - -
fs }
es }
ds } SAVE_ALL state
eax }
: :
ebx }
----------------
return addr <- esp
----------------
In order to deliver the nested exception properly, we need to shift
everything from the return addr up to the error code so it
sits just under the outer exception info. This means that when we
handle the exception, we do it in the context of the outer exception
rather than starting a new one.
The only caveat is that if the outer eax hasn't been
restored yet (ie, it's still on stack), we need to insert
its value into the SAVE_ALL state before going on, since
it's usermode state which we eventually need to restore.
*/
ENTRY(xen_iret_crit_fixup)
/* offsets +4 for return address */
/*
Paranoia: Make sure we're really coming from kernel space.
One could imagine a case where userspace jumps into the
critical range address, but just before the CPU delivers a GP,
it decides to deliver an interrupt instead. Unlikely?
Definitely. Easy to avoid? Yes. The Intel documents
explicitly say that the reported EIP for a bad jump is the
jump instruction itself, not the destination, but some virtual
environments get this wrong.
*/
movl PT_CS+4(%esp), %ecx
andl $SEGMENT_RPL_MASK, %ecx
cmpl $USER_RPL, %ecx
/* saved CS has user RPL: not our critical region, return untouched */
je 2f
lea PT_ORIG_EAX+4(%esp), %esi
lea PT_EFLAGS+4(%esp), %edi
/* If eip is before iret_restore_end then stack
hasn't been restored yet. */
cmp $iret_restore_end, %eax
jae 1f
movl 0+4(%edi),%eax /* copy EAX */
movl %eax, PT_EAX+4(%esp)
lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
/* set up the copy */
/* std makes movsl walk downwards, so the copy runs from the top word
   (orig_eax) down to the return address */
1: std
mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
rep movsl
/* restore the direction flag before returning */
cld
lea 4(%edi),%esp /* point esp to new frame */
2: ret
/*
Force an event check by making a hypercall,
but preserve regs before making the call.
*/
check_events:
/* save %eax, %ecx and %edx around the call so callers see them unchanged */
push %eax
push %ecx
push %edx
call force_evtchn_callback
/* restore in reverse order of the pushes */
pop %edx
pop %ecx
pop %eax
ret

36
arch/i386/xen/xen-head.S Normal file
View File

@ -0,0 +1,36 @@
/* Xen-specific pieces of head.S, intended to be included in the right
place in head.S */
#ifdef CONFIG_XEN
#include <linux/elfnote.h>
#include <asm/boot.h>
#include <xen/interface/elfnote.h>
/* Kernel entry point under Xen; advertised via XEN_ELFNOTE_ENTRY. */
ENTRY(startup_xen)
/* save %esi in xen_start_info; NOTE(review): presumably the start_info
   pointer supplied by Xen at boot -- confirm against the Xen ABI */
movl %esi,xen_start_info
cld
/* run on the top of init_thread_union */
movl $(init_thread_union+THREAD_SIZE),%esp
jmp xen_start_kernel
/* Reserve 0x1000 bytes, PAGE_SIZE-aligned, in .bss.page_aligned for the
   hypercall page; its address is advertised via
   XEN_ELFNOTE_HYPERCALL_PAGE. */
.pushsection ".bss.page_aligned"
.align PAGE_SIZE_asm
ENTRY(hypercall_page)
.skip 0x1000
.popsection
/* ELF notes in the "Xen" namespace describing this kernel image. */
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
/* entry point and hypercall page are the symbols defined in this file */
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
/* PAE mode note follows the kernel's CONFIG_X86_PAE setting */
#ifdef CONFIG_X86_PAE
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
#else
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
#endif
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
#endif /*CONFIG_XEN */

71
arch/i386/xen/xen-ops.h Normal file
View File

@ -0,0 +1,71 @@
#ifndef XEN_OPS_H
#define XEN_OPS_H
#include <linux/init.h>
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
void xen_copy_trap_info(struct trap_info *traps);
/* per-cpu pointer to the vcpu_info used by this CPU */
DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
DECLARE_PER_CPU(unsigned long, xen_cr3);
extern struct start_info *xen_start_info;
extern struct shared_info *HYPERVISOR_shared_info;
/* boot-time setup hooks */
char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
void __init xen_init_IRQ(void);
/* timer, clock and wallclock hooks */
void xen_setup_timer(int cpu);
void xen_setup_cpu_clockevents(void);
unsigned long xen_cpu_khz(void);
void __init xen_time_init(void);
unsigned long xen_get_wallclock(void);
int xen_set_wallclock(unsigned long time);
unsigned long long xen_sched_clock(void);
void xen_mark_init_mm_pinned(void);
/* per-cpu lazy mode, read through xen_get_lazy_mode() */
DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
/* Return the current CPU's xen_lazy_mode value. */
static inline unsigned xen_get_lazy_mode(void)
{
	unsigned mode = x86_read_percpu(xen_lazy_mode);

	return mode;
}
void __init xen_fill_possible_map(void);
void __init xen_setup_vcpu_info_placement(void);
/* SMP bring-up and cross-CPU call hooks */
void xen_smp_prepare_boot_cpu(void);
void xen_smp_prepare_cpus(unsigned int max_cpus);
int xen_cpu_up(unsigned int cpu);
void xen_smp_cpus_done(unsigned int max_cpus);
void xen_smp_send_stop(void);
void xen_smp_send_reschedule(int cpu);
int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
int wait);
int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
int nonatomic, int wait);
int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
void *info, int wait);
/*
 * Declare an asm function, along with the symbols needed to make it
 * inlineable:
 *   name        - the function itself
 *   name_end    - symbol defined by ENDPATCH() at the end of the
 *                 patchable body
 *   name_reloc  - symbol defined by RELOC()
 *
 * The final line of the macro deliberately has no trailing backslash;
 * with one, the first use below would be spliced into the macro body.
 * The caller supplies the terminating semicolon.
 */
#define DECL_ASM(ret, name, ...)		\
	ret name(__VA_ARGS__);			\
	extern char name##_end[];		\
	extern char name##_reloc[]

DECL_ASM(void, xen_irq_enable_direct, void);
DECL_ASM(void, xen_irq_disable_direct, void);
DECL_ASM(unsigned long, xen_save_fl_direct, void);
DECL_ASM(void, xen_restore_fl_direct, unsigned long);

/* Xen replacement for iret; declared as a function so its address can
   be taken */
void xen_iret_direct(void);
#endif /* XEN_OPS_H */

View File

@ -6,6 +6,7 @@
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/fcntl.h>
#include <xen/hvc-console.h>
/* Simple VGA output */
@ -242,6 +243,10 @@ static int __init setup_early_printk(char *buf)
simnow_init(buf + 6);
early_console = &simnow_console;
keep_early = 1;
#ifdef CONFIG_HVC_XEN
} else if (!strncmp(buf, "xen", 3)) {
early_console = &xenboot_console;
#endif
}
if (keep_early)

View File

@ -174,7 +174,7 @@ static void do_mce_trigger(void)
if (events != atomic_read(&mce_logged) && trigger[0]) {
/* Small race window, but should be harmless. */
atomic_set(&mce_logged, events);
call_usermodehelper(trigger, trigger_argv, NULL, -1);
call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}
}

View File

@ -15,6 +15,8 @@ obj-$(CONFIG_ACPI) += acpi/
obj-$(CONFIG_PNP) += pnp/
obj-$(CONFIG_ARM_AMBA) += amba/
obj-$(CONFIG_XEN) += xen/
# char/ comes before serial/ etc so that the VT console is the boot-time
# default.
obj-y += char/

View File

@ -40,6 +40,7 @@
#include <linux/jiffies.h>
#include <linux/kmod.h>
#include <linux/seq_file.h>
#include <linux/reboot.h>
#include <asm/uaccess.h>
#include <acpi/acpi_bus.h>
@ -59,7 +60,6 @@
#define ACPI_THERMAL_NOTIFY_CRITICAL 0xF0
#define ACPI_THERMAL_NOTIFY_HOT 0xF1
#define ACPI_THERMAL_MODE_ACTIVE 0x00
#define ACPI_THERMAL_PATH_POWEROFF "/sbin/poweroff"
#define ACPI_THERMAL_MAX_ACTIVE 10
#define ACPI_THERMAL_MAX_LIMIT_STR_LEN 65
@ -419,26 +419,6 @@ static int acpi_thermal_get_devices(struct acpi_thermal *tz)
return 0;
}
/*
 * Run the user-mode program at 'path' with no arguments and a minimal
 * HOME/PATH environment.
 *
 * Returns -EINVAL when path is NULL, otherwise 0; the result of
 * call_usermodehelper() is not checked.
 */
static int acpi_thermal_call_usermode(char *path)
{
char *argv[2] = { NULL, NULL };
char *envp[3] = { NULL, NULL, NULL };
if (!path)
return -EINVAL;
argv[0] = path;
/* minimal command environment */
envp[0] = "HOME=/";
envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
call_usermodehelper(argv[0], argv, envp, 0);
return 0;
}
static int acpi_thermal_critical(struct acpi_thermal *tz)
{
if (!tz || !tz->trips.critical.flags.valid)
@ -456,7 +436,7 @@ static int acpi_thermal_critical(struct acpi_thermal *tz)
acpi_bus_generate_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL,
tz->trips.critical.flags.enabled);
acpi_thermal_call_usermode(ACPI_THERMAL_PATH_POWEROFF);
orderly_poweroff(true);
return 0;
}

View File

@ -427,4 +427,13 @@ config XILINX_SYSACE
help
Include support for the Xilinx SystemACE CompactFlash interface
config XEN_BLKDEV_FRONTEND
tristate "Xen virtual block device support"
depends on XEN
default y
help
This driver implements the front-end of the Xen virtual
block device driver. It communicates with a back-end driver
in another domain which drives the actual block device.
endif # BLK_DEV

View File

@ -29,3 +29,4 @@ obj-$(CONFIG_VIODASD) += viodasd.o
obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
obj-$(CONFIG_BLK_DEV_UB) += ub.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o

View File

@ -0,0 +1,988 @@
/*
* blkfront.c
*
* XenLinux virtual block device driver.
*
* Copyright (c) 2003-2004, Keir Fraser & Steve Hand
* Modifications by Mark A. Williamson are (c) Intel Research Cambridge
* Copyright (c) 2004, Christian Limpach
* Copyright (c) 2004, Andrew Warfield
* Copyright (c) 2005, Christopher Clark
* Copyright (c) 2005, XenSource Ltd
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <linux/interrupt.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <xen/xenbus.h>
#include <xen/grant_table.h>
#include <xen/events.h>
#include <xen/page.h>
#include <xen/interface/grant_table.h>
#include <xen/interface/io/blkif.h>
#include <asm/xen/hypervisor.h>
/* Connection state of a blkfront device (blkfront_info.connected). */
enum blkif_state {
BLKIF_STATE_DISCONNECTED,
BLKIF_STATE_CONNECTED, /* only state in which requests are queued */
BLKIF_STATE_SUSPENDED,
};
/*
 * Per-ring-slot bookkeeping.  While a slot is free, req.id links it to
 * the next free slot (see get_id_from_freelist/add_id_to_freelist).
 */
struct blk_shadow {
struct blkif_request req; /* private copy of the request put on the ring */
unsigned long request; /* the struct request pointer, 0 when free */
unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; /* pfn of each segment */
};
/* forward declaration of the block device operations table */
static struct block_device_operations xlvbd_block_fops;
/* number of request slots in a one-page blkif shared ring */
#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
/*
* We have one of these per vbd, whether ide, scsi or 'other'. They
* hang in private_data off the gendisk structure. We may end up
* putting all kinds of interesting stuff here :-)
*/
struct blkfront_info
{
struct xenbus_device *xbdev; /* otherend_id is the domain granted access */
dev_t dev;
struct gendisk *gd;
int vdevice;
blkif_vdev_t handle; /* copied into every ring request */
enum blkif_state connected;
int ring_ref;
struct blkif_front_ring ring; /* front end of the shared request ring */
unsigned int evtchn, irq; /* irq is used to notify the other end */
struct request_queue *rq;
struct work_struct work; /* scheduled by blkif_restart_queue_callback() */
struct gnttab_free_callback callback; /* armed when grant refs run out */
struct blk_shadow shadow[BLK_RING_SIZE];
unsigned long shadow_free; /* index of the first free shadow entry */
int feature_barrier; /* non-zero selects QUEUE_ORDERED_DRAIN */
/**
* The number of people holding this device open. We won't allow a
* hot-unplug unless this is 0.
*/
int users;
};
/* lock handed to blk_init_queue() for every blkfront request queue */
static DEFINE_SPINLOCK(blkif_io_lock);
#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
(BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
#define GRANT_INVALID_REF 0
#define PARTS_PER_DISK 16
/* split a device number: everything above the low 8 bits / the low 8 bits */
#define BLKIF_MAJOR(dev) ((dev)>>8)
#define BLKIF_MINOR(dev) ((dev) & 0xff)
#define DEV_NAME "xvd" /* name in /dev */
/* Information about our VBDs. */
#define MAX_VBDS 64
static LIST_HEAD(vbds_list);
/*
 * Pop the first entry off the shadow free list and return its index.
 *
 * Free entries are chained through shadow[].req.id, with
 * info->shadow_free holding the head of the chain.  The popped entry's
 * id is overwritten with a recognisable debug value.
 */
static int get_id_from_freelist(struct blkfront_info *info)
{
	unsigned long free = info->shadow_free;

	/*
	 * shadow[] has exactly BLK_RING_SIZE entries, so valid indices
	 * are 0 .. BLK_RING_SIZE - 1; BLK_RING_SIZE itself is already
	 * out of bounds.
	 */
	BUG_ON(free >= BLK_RING_SIZE);
	info->shadow_free = info->shadow[free].req.id;
	info->shadow[free].req.id = 0x0fffffee; /* debug */
	return free;
}
static void add_id_to_freelist(struct blkfront_info *info,
unsigned long id)
{
info->shadow[id].req.id = info->shadow_free;
info->shadow[id].request = 0;
info->shadow_free = id;
}
/*
 * Grant-table free callback: 'arg' is the blkfront_info registered in
 * blkif_queue_request(); schedule its work item.
 */
static void blkif_restart_queue_callback(void *arg)
{
	struct blkfront_info *info = arg;

	schedule_work(&info->work);
}
/*
 * blkif_queue_request
 *
 * Translate one block-layer request into a blkif ring request and place
 * it in the next private producer slot of the device's ring.
 *
 * req: the request to queue; the device is found through
 *      req->rq_disk->private_data.
 *
 * Returns 0 on success.  Returns 1 when the device is not connected, or
 * when no grant references could be allocated (in which case a free
 * callback is registered first).
 */
static int blkif_queue_request(struct request *req)
{
struct blkfront_info *info = req->rq_disk->private_data;
unsigned long buffer_mfn;
struct blkif_request *ring_req;
struct bio *bio;
struct bio_vec *bvec;
int idx;
unsigned long id;
unsigned int fsect, lsect;
int ref;
grant_ref_t gref_head;
if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
return 1;
/* reserve enough grant references for a maximally-segmented request */
if (gnttab_alloc_grant_references(
BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
/* none available: ask to be called back once that many are free */
gnttab_request_free_callback(
&info->callback,
blkif_restart_queue_callback,
info,
BLKIF_MAX_SEGMENTS_PER_REQUEST);
return 1;
}
/* Fill out a communications ring structure. */
ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
id = get_id_from_freelist(info);
/* remember which request this slot belongs to */
info->shadow[id].request = (unsigned long)req;
ring_req->id = id;
ring_req->sector_number = (blkif_sector_t)req->sector;
ring_req->handle = info->handle;
ring_req->operation = rq_data_dir(req) ?
BLKIF_OP_WRITE : BLKIF_OP_READ;
/* a barrier request overrides the plain read/write opcode */
if (blk_barrier_rq(req))
ring_req->operation = BLKIF_OP_WRITE_BARRIER;
ring_req->nr_segments = 0;
rq_for_each_bio (bio, req) {
bio_for_each_segment (bvec, bio, idx) {
BUG_ON(ring_req->nr_segments
== BLKIF_MAX_SEGMENTS_PER_REQUEST);
buffer_mfn = pfn_to_mfn(page_to_pfn(bvec->bv_page));
/* first and last 512-byte sector of this segment within its page */
fsect = bvec->bv_offset >> 9;
lsect = fsect + (bvec->bv_len >> 9) - 1;
/* install a grant reference. */
ref = gnttab_claim_grant_reference(&gref_head);
BUG_ON(ref == -ENOSPC);
gnttab_grant_foreign_access_ref(
ref,
info->xbdev->otherend_id,
buffer_mfn,
rq_data_dir(req) );
info->shadow[id].frame[ring_req->nr_segments] =
mfn_to_pfn(buffer_mfn);
ring_req->seg[ring_req->nr_segments] =
(struct blkif_request_segment) {
.gref = ref,
.first_sect = fsect,
.last_sect = lsect };
ring_req->nr_segments++;
}
}
/* advance the private producer index past the slot just filled */
info->ring.req_prod_pvt++;
/* Keep a private copy so we can reissue requests when recovering. */
info->shadow[id].req = *ring_req;
/* hand back the grant references that were reserved but not claimed */
gnttab_free_grant_references(gref_head);
return 0;
}
/*
 * Push the privately queued requests onto the shared ring and, if the
 * ring macro reports that a notification is needed, notify the other
 * end through the device's irq.
 */
static inline void flush_requests(struct blkfront_info *info)
{
	int need_notify;

	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, need_notify);

	if (!need_notify)
		return;

	notify_remote_via_irq(info->irq);
}
/*
* do_blkif_request
* read a block; request is in a request queue
*/
/*
 * Request function for blkfront queues (installed by blk_init_queue in
 * xlvbd_init_blk_queue).  Moves requests from the queue onto the ring
 * until the queue is empty, the ring is full, or a request cannot be
 * queued; in the latter two cases the queue is stopped.
 */
static void do_blkif_request(request_queue_t *rq)
{
struct blkfront_info *info = NULL;
struct request *req;
int queued;
pr_debug("Entered do_blkif_request\n");
queued = 0;
while ((req = elv_next_request(rq)) != NULL) {
info = req->rq_disk->private_data;
/* anything that is not a filesystem request is failed immediately */
if (!blk_fs_request(req)) {
end_request(req, 0);
continue;
}
/* ring full: leave the request on the queue and stop */
if (RING_FULL(&info->ring))
goto wait;
pr_debug("do_blk_req %p: cmd %p, sec %lx, "
"(%u/%li) buffer:%p [%s]\n",
req, req->cmd, (unsigned long)req->sector,
req->current_nr_sectors,
req->nr_sectors, req->buffer,
rq_data_dir(req) ? "write" : "read");
blkdev_dequeue_request(req);
if (blkif_queue_request(req)) {
/* could not be queued: put it back before stopping */
blk_requeue_request(rq, req);
wait:
/* Avoid pointless unplugs. */
blk_stop_queue(rq);
break;
}
queued++;
}
/* info is non-NULL here: queued is only incremented after info was set */
if (queued != 0)
flush_requests(info);
}
/*
 * Allocate a request queue served by do_blkif_request(), apply the limits
 * this driver needs, and attach it to @gd.
 *
 * Returns 0 on success, -1 if the queue could not be allocated.
 */
static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
{
	request_queue_t *queue;

	queue = blk_init_queue(do_blkif_request, &blkif_io_lock);
	if (!queue)
		return -1;

	elevator_init(queue, "noop");

	/* Hard sector size and max sectors impersonate the equiv. hardware. */
	blk_queue_hardsect_size(queue, sector_size);
	blk_queue_max_sectors(queue, 512);

	/* A segment never crosses a page boundary and is at most one page. */
	blk_queue_segment_boundary(queue, PAGE_SIZE - 1);
	blk_queue_max_segment_size(queue, PAGE_SIZE);

	/* A merged request must still fit in a single I/O ring slot. */
	blk_queue_max_phys_segments(queue, BLKIF_MAX_SEGMENTS_PER_REQUEST);
	blk_queue_max_hw_segments(queue, BLKIF_MAX_SEGMENTS_PER_REQUEST);

	/* Buffer addresses must be sector-aligned. */
	blk_queue_dma_alignment(queue, 511);

	gd->queue = queue;
	return 0;
}
/*
 * Select the queue's ordered mode from info->feature_barrier and log the
 * outcome.  Returns 0 on success, or the error from blk_queue_ordered().
 */
static int xlvbd_barrier(struct blkfront_info *info)
{
	const char *state;
	int rc;

	if (info->feature_barrier)
		rc = blk_queue_ordered(info->rq, QUEUE_ORDERED_DRAIN, NULL);
	else
		rc = blk_queue_ordered(info->rq, QUEUE_ORDERED_NONE, NULL);
	if (rc != 0)
		return rc;

	state = info->feature_barrier ? "enabled" : "disabled";
	printk(KERN_INFO "blkfront: %s: barriers %s\n",
	       info->gd->disk_name, state);
	return 0;
}
/*
 * Allocate and initialise the gendisk and request queue for one virtual
 * block device and record them in @info.  The disk is NOT registered here;
 * the caller is expected to add_disk() it.
 *
 * @minor:       first minor number; a multiple of PARTS_PER_DISK denotes a
 *               whole disk (gets PARTS_PER_DISK minors), anything else a
 *               single partition-like device (one minor).
 * @capacity:    value passed to set_capacity().
 * @vdevice:     virtual device number (unused in this function).
 * @vdisk_info:  VDISK_* flags (read-only, removable, CD-ROM).
 * @sector_size: hard sector size for the queue.
 * @info:        per-device state; info->gd and info->rq must be NULL.
 *
 * Returns 0 on success, -ENODEV on any failure.
 */
static int xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity,
			       int vdevice, u16 vdisk_info, u16 sector_size,
			       struct blkfront_info *info)
{
	struct gendisk *gd;
	int nr_minors = 1;
	int err = -ENODEV;

	BUG_ON(info->gd != NULL);
	BUG_ON(info->rq != NULL);

	if ((minor % PARTS_PER_DISK) == 0)
		nr_minors = PARTS_PER_DISK;

	gd = alloc_disk(nr_minors);
	if (gd == NULL)
		goto out;

	if (nr_minors > 1)
		sprintf(gd->disk_name, "%s%c", DEV_NAME,
			'a' + minor / PARTS_PER_DISK);
	else
		sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
			'a' + minor / PARTS_PER_DISK,
			minor % PARTS_PER_DISK);

	gd->major = XENVBD_MAJOR;
	gd->first_minor = minor;
	gd->fops = &xlvbd_block_fops;
	gd->private_data = info;
	gd->driverfs_dev = &(info->xbdev->dev);
	set_capacity(gd, capacity);

	if (xlvbd_init_blk_queue(gd, sector_size)) {
		/*
		 * The disk was allocated but never add_disk()ed, so there is
		 * nothing for del_gendisk() to unregister; just drop the
		 * reference obtained from alloc_disk() so it is freed.
		 */
		put_disk(gd);
		goto out;
	}

	info->rq = gd->queue;
	info->gd = gd;

	if (info->feature_barrier)
		xlvbd_barrier(info);

	if (vdisk_info & VDISK_READONLY)
		set_disk_ro(gd, 1);

	if (vdisk_info & VDISK_REMOVABLE)
		gd->flags |= GENHD_FL_REMOVABLE;

	if (vdisk_info & VDISK_CDROM)
		gd->flags |= GENHD_FL_CD;

	return 0;

 out:
	return err;
}
/*
 * If the ring has room, restart the request queue and immediately run the
 * request handler on it.  Does nothing while the ring is full.
 */
static void kick_pending_request_queues(struct blkfront_info *info)
{
	if (RING_FULL(&info->ring))
		return;

	/* Re-enable calldowns. */
	blk_start_queue(info->rq);

	/* Kick things off immediately. */
	do_blkif_request(info->rq);
}
/*
 * Work item (info->work): restart request processing for the owning device,
 * but only while it is in the connected state.  Takes blkif_io_lock.
 */
static void blkif_restart_queue(struct work_struct *work)
{
	struct blkfront_info *owner;

	owner = container_of(work, struct blkfront_info, work);

	spin_lock_irq(&blkif_io_lock);
	if (owner->connected == BLKIF_STATE_CONNECTED)
		kick_pending_request_queues(owner);
	spin_unlock_irq(&blkif_io_lock);
}
/*
 * Tear down the channel to the backend: mark the device suspended or
 * disconnected, stop the queue, cancel and flush grant-callback work,
 * release the shared ring grant and unbind the IRQ.
 *
 * @suspend: non-zero selects BLKIF_STATE_SUSPENDED, zero selects
 *           BLKIF_STATE_DISCONNECTED.
 */
static void blkif_free(struct blkfront_info *info, int suspend)
{
	/* Prevent new requests being issued until we fix things up. */
	spin_lock_irq(&blkif_io_lock);
	if (suspend)
		info->connected = BLKIF_STATE_SUSPENDED;
	else
		info->connected = BLKIF_STATE_DISCONNECTED;

	/* No more blkif_request(). */
	if (info->rq != NULL)
		blk_stop_queue(info->rq);

	/* No more gnttab callback work. */
	gnttab_cancel_free_callback(&info->callback);
	spin_unlock_irq(&blkif_io_lock);

	/* Flush gnttab callback work. Must be done with no locks held. */
	flush_scheduled_work();

	/* Free resources associated with old device channel. */
	if (info->ring_ref != GRANT_INVALID_REF) {
		unsigned long ring_page = (unsigned long)info->ring.sring;

		gnttab_end_foreign_access(info->ring_ref, 0, ring_page);
		info->ring_ref = GRANT_INVALID_REF;
		info->ring.sring = NULL;
	}

	if (info->irq != 0)
		unbind_from_irqhandler(info->irq, info);
	info->irq = 0;
	info->evtchn = 0;
}
/*
 * End foreign access on every grant reference recorded in the shadow copy
 * of a request.
 */
static void blkif_completion(struct blk_shadow *s)
{
	int seg = 0;

	while (seg < s->req.nr_segments) {
		gnttab_end_foreign_access(s->req.seg[seg].gref, 0, 0UL);
		seg++;
	}
}
/*
 * Interrupt handler bound in setup_blkring(): consumes responses from the
 * shared ring and completes the matching block-layer requests.
 *
 * @irq:    interrupt number (unused in the body).
 * @dev_id: the struct blkfront_info registered with the handler.
 *
 * Runs under blkif_io_lock and always returns IRQ_HANDLED.
 */
static irqreturn_t blkif_interrupt(int irq, void *dev_id)
{
struct request *req;
struct blkif_response *bret;
RING_IDX i, rp;
unsigned long flags;
struct blkfront_info *info = (struct blkfront_info *)dev_id;
int uptodate;
spin_lock_irqsave(&blkif_io_lock, flags);
/* Ignore the event unless the device is in the connected state. */
if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
spin_unlock_irqrestore(&blkif_io_lock, flags);
return IRQ_HANDLED;
}
again:
rp = info->ring.sring->rsp_prod;
rmb(); /* Ensure we see queued responses up to 'rp'. */
for (i = info->ring.rsp_cons; i != rp; i++) {
unsigned long id;
int ret;
bret = RING_GET_RESPONSE(&info->ring, i);
id = bret->id;
/* The response id indexes the shadow entry holding the original request. */
req = (struct request *)info->shadow[id].request;
/* Release the request's grants and return its id to the free list. */
blkif_completion(&info->shadow[id]);
add_id_to_freelist(info, id);
uptodate = (bret->status == BLKIF_RSP_OKAY);
switch (bret->operation) {
case BLKIF_OP_WRITE_BARRIER:
/* Backend rejected the barrier: report it and turn barriers off. */
if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
info->gd->disk_name);
uptodate = -EOPNOTSUPP;
info->feature_barrier = 0;
xlvbd_barrier(info);
}
/* fall through */
case BLKIF_OP_READ:
case BLKIF_OP_WRITE:
if (unlikely(bret->status != BLKIF_RSP_OKAY))
dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
"request: %x\n", bret->status);
/* Complete the whole request in one go; a non-zero return is a bug here. */
ret = end_that_request_first(req, uptodate,
req->hard_nr_sectors);
BUG_ON(ret);
end_that_request_last(req, uptodate);
break;
default:
BUG();
}
}
info->ring.rsp_cons = i;
/*
 * If requests are still outstanding, let the ring macro re-check for
 * responses that arrived meanwhile; otherwise set rsp_event to i + 1.
 */
if (i != info->ring.req_prod_pvt) {
int more_to_do;
RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
if (more_to_do)
goto again;
} else
info->ring.sring->rsp_event = i + 1;
/* Completions may have freed ring slots: restart the queue if possible. */
kick_pending_request_queues(info);
spin_unlock_irqrestore(&blkif_io_lock, flags);
return IRQ_HANDLED;
}
/*
 * Allocate the shared ring page, grant it via xenbus_grant_ring(),
 * allocate an event channel and bind it to blkif_interrupt().
 *
 * On success info->ring, info->ring_ref, info->evtchn and info->irq are set
 * and 0 is returned.  On failure everything is released through
 * blkif_free() and the failing call's result is returned.
 */
static int setup_blkring(struct xenbus_device *dev,
struct blkfront_info *info)
{
struct blkif_sring *sring;
int err;
info->ring_ref = GRANT_INVALID_REF;
sring = (struct blkif_sring *)__get_free_page(GFP_KERNEL);
if (!sring) {
xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
return -ENOMEM;
}
SHARED_RING_INIT(sring);
FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
if (err < 0) {
/*
 * Not granted yet, so blkif_free() (which keys off ring_ref) will not
 * release the page; free it here.
 */
free_page((unsigned long)sring);
info->ring.sring = NULL;
goto fail;
}
/* A non-negative result is the grant reference of the ring page. */
info->ring_ref = err;
err = xenbus_alloc_evtchn(dev, &info->evtchn);
if (err)
goto fail;
err = bind_evtchn_to_irqhandler(info->evtchn,
blkif_interrupt,
IRQF_SAMPLE_RANDOM, "blkif", info);
/*
 * NOTE(review): a return of exactly 0 is treated as failure here, but
 * the function then returns 0, which callers read as success - confirm
 * whether bind_evtchn_to_irqhandler() can ever return 0.
 */
if (err <= 0) {
xenbus_dev_fatal(dev, err,
"bind_evtchn_to_irqhandler failed");
goto fail;
}
/* A positive result is the IRQ number. */
info->irq = err;
return 0;
fail:
blkif_free(info, 0);
return err;
}
/*
 * Common code used when first setting up, and when resuming.
 *
 * Creates the ring and event channel, then writes "ring-ref" and
 * "event-channel" under the device's xenstore node inside a transaction
 * and switches the device to XenbusStateInitialised.
 *
 * Returns 0 on success.  On failure the ring is torn down with
 * blkif_free() (except when setup_blkring() itself failed, which has
 * already cleaned up) and the error is returned.
 */
static int talk_to_backend(struct xenbus_device *dev,
struct blkfront_info *info)
{
const char *message = NULL;
struct xenbus_transaction xbt;
int err;
/* Create shared ring, alloc event channel. */
err = setup_blkring(dev, info);
if (err)
goto out;
/* Re-entered when the transaction commit reports -EAGAIN. */
again:
err = xenbus_transaction_start(&xbt);
if (err) {
xenbus_dev_fatal(dev, err, "starting transaction");
goto destroy_blkring;
}
err = xenbus_printf(xbt, dev->nodename,
"ring-ref", "%u", info->ring_ref);
if (err) {
message = "writing ring-ref";
goto abort_transaction;
}
err = xenbus_printf(xbt, dev->nodename,
"event-channel", "%u", info->evtchn);
if (err) {
message = "writing event-channel";
goto abort_transaction;
}
/* Second argument 0: commit (the abort path below passes 1). */
err = xenbus_transaction_end(xbt, 0);
if (err) {
if (err == -EAGAIN)
goto again;
xenbus_dev_fatal(dev, err, "completing transaction");
goto destroy_blkring;
}
xenbus_switch_state(dev, XenbusStateInitialised);
return 0;
abort_transaction:
xenbus_transaction_end(xbt, 1);
if (message)
xenbus_dev_fatal(dev, err, "%s", message);
destroy_blkring:
blkif_free(info, 0);
out:
return err;
}
/**
 * Probe callback, run when a new vbd device appears.  Reads the
 * "virtual-device" number, allocates and initialises the per-device state
 * (including the shadow free list), derives the handle from the last path
 * component of the node name, and hands over to talk_to_backend(), which
 * sets up the ring and moves the device to the Initialised state.
 *
 * Returns 0 on success or the error reported by the failing step.
 */
static int blkfront_probe(struct xenbus_device *dev,
			  const struct xenbus_device_id *id)
{
	struct blkfront_info *info;
	const char *last_slash;
	int vdevice;
	int slot;
	int err;

	/* FIXME: Use dynamic device id if this is not set. */
	err = xenbus_scanf(XBT_NIL, dev->nodename,
			   "virtual-device", "%i", &vdevice);
	if (err != 1) {
		xenbus_dev_fatal(dev, err, "reading virtual-device");
		return err;
	}

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (info == NULL) {
		xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
		return -ENOMEM;
	}

	info->xbdev = dev;
	info->vdevice = vdevice;
	info->connected = BLKIF_STATE_DISCONNECTED;
	INIT_WORK(&info->work, blkif_restart_queue);

	/* Chain every shadow slot to its successor; the last gets a sentinel. */
	slot = 0;
	while (slot < BLK_RING_SIZE) {
		info->shadow[slot].req.id = slot + 1;
		slot++;
	}
	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;

	/* Front end dir is a number, which is used as the id. */
	last_slash = strrchr(dev->nodename, '/');
	info->handle = simple_strtoul(last_slash + 1, NULL, 0);
	dev->dev.driver_data = info;

	err = talk_to_backend(dev, info);
	if (!err)
		return 0;

	kfree(info);
	dev->dev.driver_data = NULL;
	return err;
}
/*
 * Re-issue every request that was in flight before the ring was torn down.
 *
 * Works from a heap copy of the shadow array: the live shadow array is
 * reset to an empty free list, then each in-use entry of the copy is placed
 * on the new ring under a freshly allocated id with its grants re-issued.
 * Finally the device is marked connected and the requests are pushed.
 *
 * Returns 0 on success, -ENOMEM if the copy cannot be allocated.
 */
static int blkif_recover(struct blkfront_info *info)
{
int i;
struct blkif_request *req;
struct blk_shadow *copy;
int j;
/* Stage 1: Make a safe copy of the shadow state. */
copy = kmalloc(sizeof(info->shadow), GFP_KERNEL);
if (!copy)
return -ENOMEM;
memcpy(copy, info->shadow, sizeof(info->shadow));
/* Stage 2: Set up free list. */
memset(&info->shadow, 0, sizeof(info->shadow));
for (i = 0; i < BLK_RING_SIZE; i++)
info->shadow[i].req.id = i+1;
info->shadow_free = info->ring.req_prod_pvt;
info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
/* Stage 3: Find pending requests and requeue them. */
for (i = 0; i < BLK_RING_SIZE; i++) {
/* Not in use? */
if (copy[i].request == 0)
continue;
/* Grab a request slot and copy shadow state into it. */
req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
*req = copy[i].req;
/* We get a new request id, and must reset the shadow state. */
req->id = get_id_from_freelist(info);
memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
/* Rewrite any grant references invalidated by susp/resume. */
for (j = 0; j < req->nr_segments; j++)
gnttab_grant_foreign_access_ref(
req->seg[j].gref,
info->xbdev->otherend_id,
pfn_to_mfn(info->shadow[req->id].frame[j]),
rq_data_dir(
(struct request *)
info->shadow[req->id].request));
/* Store the request again so the shadow carries the new id. */
info->shadow[req->id].req = *req;
info->ring.req_prod_pvt++;
}
kfree(copy);
xenbus_switch_state(info->xbdev, XenbusStateConnected);
spin_lock_irq(&blkif_io_lock);
/* Now safe for us to use the shared ring */
info->connected = BLKIF_STATE_CONNECTED;
/* Send off requeued requests */
flush_requests(info);
/* Kick any other new requests queued since we resumed */
kick_pending_request_queues(info);
spin_unlock_irq(&blkif_io_lock);
return 0;
}
/**
 * Resume callback: reconnect to the backend after a suspend/resume or a
 * backend driver restart.  The channel (ring, grant, IRQ) is torn down and
 * rebuilt while the device-layer structures are left alone, so the rest of
 * the kernel does not notice.  If the device ends up in the suspended
 * state, in-flight requests are re-issued via blkif_recover().
 */
static int blkfront_resume(struct xenbus_device *dev)
{
	struct blkfront_info *info = dev->dev.driver_data;
	int was_connected;
	int err;

	dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);

	was_connected = (info->connected == BLKIF_STATE_CONNECTED);
	blkif_free(info, was_connected);

	err = talk_to_backend(dev, info);
	if (err)
		return err;

	if (info->connected != BLKIF_STATE_SUSPENDED)
		return 0;

	return blkif_recover(info);
}
/*
 * Invoked when the backend is finally 'ready' (and has produced the
 * details about the physical device - #sectors, size, etc).
 *
 * Reads "sectors", "info" and "sector-size" (and optionally
 * "feature-barrier") from the backend's xenstore directory, allocates the
 * gendisk, switches to XenbusStateConnected, marks the device connected
 * and registers the disk.  Does nothing if the device is already connected
 * or suspended.  Errors are reported with xenbus_dev_fatal().
 */
static void blkfront_connect(struct blkfront_info *info)
{
unsigned long long sectors;
unsigned long sector_size;
unsigned int binfo;
int err;
if ((info->connected == BLKIF_STATE_CONNECTED) ||
(info->connected == BLKIF_STATE_SUSPENDED) )
return;
dev_dbg(&info->xbdev->dev, "%s:%s.\n",
__func__, info->xbdev->otherend);
err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
"sectors", "%llu", &sectors,
"info", "%u", &binfo,
"sector-size", "%lu", &sector_size,
NULL);
if (err) {
xenbus_dev_fatal(info->xbdev, err,
"reading backend fields at %s",
info->xbdev->otherend);
return;
}
/* A failed read of feature-barrier is not fatal: barriers are simply disabled. */
err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
"feature-barrier", "%lu", &info->feature_barrier,
NULL);
if (err)
info->feature_barrier = 0;
err = xlvbd_alloc_gendisk(BLKIF_MINOR(info->vdevice),
sectors, info->vdevice,
binfo, sector_size, info);
if (err) {
xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
info->xbdev->otherend);
return;
}
xenbus_switch_state(info->xbdev, XenbusStateConnected);
/* Kick pending requests. */
spin_lock_irq(&blkif_io_lock);
info->connected = BLKIF_STATE_CONNECTED;
kick_pending_request_queues(info);
spin_unlock_irq(&blkif_io_lock);
add_disk(info->gd);
}
/**
 * Handle the change of state of the backend to Closing.  We must delete our
 * device-layer structures now, to ensure that writes are flushed through to
 * the backend.  Once this is done, we can switch to Closed in
 * acknowledgement.
 *
 * If no request queue was ever set up, only the state switch is performed.
 */
static void blkfront_closing(struct xenbus_device *dev)
{
	struct blkfront_info *info = dev->dev.driver_data;
	unsigned long flags;

	dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename);

	if (info->rq == NULL)
		goto out;

	/*
	 * del_gendisk() may sleep and may wait for outstanding I/O, whose
	 * submission and completion paths take blkif_io_lock.  It therefore
	 * must run before we take that lock with interrupts disabled, and
	 * before the queue is stopped so pending writes can still drain.
	 */
	del_gendisk(info->gd);

	spin_lock_irqsave(&blkif_io_lock, flags);

	/* No more blkif_request(). */
	blk_stop_queue(info->rq);

	/* No more gnttab callback work. */
	gnttab_cancel_free_callback(&info->callback);
	spin_unlock_irqrestore(&blkif_io_lock, flags);

	/* Flush gnttab callback work. Must be done with no locks held. */
	flush_scheduled_work();

	blk_cleanup_queue(info->rq);
	info->rq = NULL;

 out:
	xenbus_frontend_closed(dev);
}
/**
 * Callback received when the backend's state changes.
 *
 * Connected triggers blkfront_connect().  Closing tears the device down via
 * blkfront_closing() unless it is still open, in which case the close is
 * refused with -EBUSY (blkif_release() retries on the last close).  All
 * other states are ignored.
 */
static void backend_changed(struct xenbus_device *dev,
			    enum xenbus_state backend_state)
{
	struct blkfront_info *info = dev->dev.driver_data;
	struct block_device *bd;

	dev_dbg(&dev->dev, "blkfront:backend_changed.\n");

	switch (backend_state) {
	case XenbusStateInitialising:
	case XenbusStateInitWait:
	case XenbusStateInitialised:
	case XenbusStateUnknown:
	case XenbusStateClosed:
		break;

	case XenbusStateConnected:
		blkfront_connect(info);
		break;

	case XenbusStateClosing:
		bd = bdget(info->dev);
		if (bd == NULL) {
			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
			/* No block_device to lock or release: bail out. */
			break;
		}

		/* bd_mutex serialises the users check against open/release. */
		mutex_lock(&bd->bd_mutex);
		if (info->users > 0)
			xenbus_dev_error(dev, -EBUSY,
					 "Device in use; refusing to close");
		else
			blkfront_closing(dev);
		mutex_unlock(&bd->bd_mutex);
		bdput(bd);
		break;
	}
}
/*
 * Remove callback: tear down the channel to the backend (marking the
 * device disconnected) and free the per-device state.  Always returns 0.
 */
static int blkfront_remove(struct xenbus_device *dev)
{
struct blkfront_info *info = dev->dev.driver_data;
dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename);
blkif_free(info, 0);
kfree(info);
return 0;
}
/*
 * Block-device open: count one more user of the device.  The count is what
 * backend_changed() checks before agreeing to close.  Always returns 0.
 */
static int blkif_open(struct inode *inode, struct file *filep)
{
struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
info->users++;
return 0;
}
/*
 * Block-device release: drop one user.  When the last user goes away and
 * the backend is (still) in the Closing state, perform the close that was
 * refused earlier because the device was in use.  Always returns 0.
 */
static int blkif_release(struct inode *inode, struct file *filep)
{
	struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
	struct xenbus_device *xbdev;

	info->users--;
	if (info->users != 0)
		return 0;

	/* Check whether we have been instructed to close.  We will
	   have ignored this request initially, as the device was
	   still mounted. */
	xbdev = info->xbdev;
	if (xenbus_read_driver_state(xbdev->otherend) == XenbusStateClosing)
		blkfront_closing(xbdev);

	return 0;
}
/* Block-device operations installed on each gendisk; only open and release are set. */
static struct block_device_operations xlvbd_block_fops =
{
.owner = THIS_MODULE,
.open = blkif_open,
.release = blkif_release,
};
/*
 * Xenbus device types this driver binds to ("vbd").  The trailing empty
 * entry presumably terminates the list - confirm against the xenbus core.
 */
static struct xenbus_device_id blkfront_ids[] = {
{ "vbd" },
{ "" }
};
/* Xenbus frontend driver description: wires the callbacks defined above. */
static struct xenbus_driver blkfront = {
.name = "vbd",
.owner = THIS_MODULE,
.ids = blkfront_ids,
.probe = blkfront_probe,
.remove = blkfront_remove,
.resume = blkfront_resume,
.otherend_changed = backend_changed,
};
/*
 * Module init: claim the block major and register the xenbus frontend.
 *
 * Returns -ENODEV when not running on Xen or when the major cannot be
 * obtained; otherwise the result of xenbus_register_frontend().  The major
 * is released again if frontend registration fails.
 */
static int __init xlblk_init(void)
{
	int err;

	if (!is_running_on_xen())
		return -ENODEV;

	if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
		printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
		       XENVBD_MAJOR, DEV_NAME);
		return -ENODEV;
	}

	err = xenbus_register_frontend(&blkfront);
	if (err)
		/* Do not leave the major claimed by a driver that failed to load. */
		unregister_blkdev(XENVBD_MAJOR, DEV_NAME);

	return err;
}
module_init(xlblk_init);
static void xlblk_exit(void)
{
return xenbus_unregister_driver(&blkfront);
}
module_exit(xlblk_exit);
MODULE_DESCRIPTION("Xen virtual block device frontend");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);

View File

@ -604,6 +604,14 @@ config HVC_BEAT
help
Toshiba's Cell Reference Set Beat Console device driver
config HVC_XEN
bool "Xen Hypervisor Console support"
depends on XEN
select HVC_DRIVER
default y
help
Xen virtual console device driver
config HVCS
tristate "IBM Hypervisor Virtual Console Server support"
depends on PPC_PSERIES

View File

@ -48,6 +48,7 @@ obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o
obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o
obj-$(CONFIG_HVC_BEAT) += hvc_beat.o
obj-$(CONFIG_HVC_DRIVER) += hvc_console.o
obj-$(CONFIG_HVC_XEN) += hvc_xen.o
obj-$(CONFIG_RAW_DRIVER) += raw.o
obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o
obj-$(CONFIG_MSPEC) += mspec.o

159
drivers/char/hvc_xen.c Normal file
View File

@ -0,0 +1,159 @@
/*
* xen console driver interface to hvc_console.c
*
* (c) 2007 Gerd Hoffmann <kraxel@suse.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/console.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/types.h>
#include <asm/xen/hypervisor.h>
#include <xen/page.h>
#include <xen/events.h>
#include <xen/interface/io/console.h>
#include <xen/hvc-console.h>
#include "hvc_console.h"
#define HVC_COOKIE 0x58656e /* "Xen" in hex */
static struct hvc_struct *hvc;
static int xencons_irq;
/* ------------------------------------------------------------------ */
/*
 * Return the console ring page, located through the machine frame number
 * published in xen_start_info.
 */
static inline struct xencons_interface *xencons_interface(void)
{
	struct xencons_interface *intf;

	intf = mfn_to_virt(xen_start_info->console.domU.mfn);
	return intf;
}
/* Signal the other end of the console ring over the console event channel. */
static inline void notify_daemon(void)
{
/* Use evtchn: this is called early, before irq is set up. */
notify_remote_via_evtchn(xen_start_info->console.domU.evtchn);
}
/*
 * Copy up to @len bytes from @data into the console output ring, stopping
 * early if the ring fills up, then publish the new producer index and
 * notify the other end (the notification is sent even if nothing fit).
 *
 * @vtermno: unused in the body.
 * Returns the number of bytes actually placed on the ring.
 */
static int write_console(uint32_t vtermno, const char *data, int len)
{
struct xencons_interface *intf = xencons_interface();
XENCONS_RING_IDX cons, prod;
int sent = 0;
cons = intf->out_cons;
prod = intf->out_prod;
mb(); /* update queue values before going on */
/* More bytes outstanding than the ring can hold means corrupt indices. */
BUG_ON((prod - cons) > sizeof(intf->out));
while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
wmb(); /* write ring before updating pointer */
intf->out_prod = prod;
notify_daemon();
return sent;
}
/*
 * Copy up to @len pending bytes from the console input ring into @buf,
 * then publish the new consumer index and notify the other end (the
 * notification is sent even if nothing was read).
 *
 * @vtermno: unused in the body.
 * Returns the number of bytes copied, which may be 0.
 */
static int read_console(uint32_t vtermno, char *buf, int len)
{
struct xencons_interface *intf = xencons_interface();
XENCONS_RING_IDX cons, prod;
int recv = 0;
cons = intf->in_cons;
prod = intf->in_prod;
mb(); /* get pointers before reading ring */
/* More bytes pending than the ring can hold means corrupt indices. */
BUG_ON((prod - cons) > sizeof(intf->in));
while (cons != prod && recv < len)
buf[recv++] = intf->in[MASK_XENCONS_IDX(cons++, intf->in)];
mb(); /* read ring before consuming */
intf->in_cons = cons;
notify_daemon();
return recv;
}
/* Character I/O callbacks handed to hvc_alloc() and hvc_instantiate(). */
static struct hv_ops hvc_ops = {
.get_chars = read_console,
.put_chars = write_console,
};
/*
 * Module init: bind the console event channel to an IRQ (falling back to
 * IRQ 0 if that fails) and allocate the hvc instance.
 *
 * Returns 0 when not on Xen or on success, otherwise the error encoded in
 * the pointer returned by hvc_alloc().
 */
static int __init xen_init(void)
{
	struct hvc_struct *console;

	if (!is_running_on_xen())
		return 0;

	xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
	if (xencons_irq < 0)
		xencons_irq = 0 /* NO_IRQ */;

	console = hvc_alloc(HVC_COOKIE, xencons_irq, &hvc_ops, 256);
	if (!IS_ERR(console)) {
		hvc = console;
		return 0;
	}

	return PTR_ERR(console);
}
/* Module exit: remove the hvc instance, if xen_init() created one. */
static void __exit xen_fini(void)
{
	if (!hvc)
		return;

	hvc_remove(hvc);
}
/*
 * Console initcall: when running on Xen, instantiate the hvc console at
 * index 0 with our I/O callbacks.  Always returns 0.
 */
static int xen_cons_init(void)
{
	if (is_running_on_xen())
		hvc_instantiate(HVC_COOKIE, 0, &hvc_ops);

	return 0;
}

module_init(xen_init);
module_exit(xen_fini);
console_initcall(xen_cons_init);
/*
 * Boot-console write hook: emit @len bytes of @string on the Xen console,
 * sending "\r\n" in place of every '\n'.
 *
 * @console: unused in the body.
 * @string:  text to write; only the first @len bytes are examined, so no
 *           terminating NUL is required.
 * @len:     number of bytes to write.
 */
static void xenboot_write_console(struct console *console, const char *string,
				  unsigned len)
{
	unsigned int linelen, off = 0;
	const char *pos;

	/* Search only within the remaining len - off bytes of the buffer. */
	while (off < len &&
	       NULL != (pos = memchr(string + off, '\n', len - off))) {
		/* Length of the current line, measured from its own start. */
		linelen = pos - (string + off);
		write_console(0, string + off, linelen);
		write_console(0, "\r\n", 2);
		/* Skip the line and the '\n' that ended it. */
		off += linelen + 1;
	}

	/* Trailing text without a final newline. */
	if (off < len)
		write_console(0, string + off, len - off);
}
/* Console named "xenboot" whose output goes through xenboot_write_console(). */
struct console xenboot_console = {
.name = "xenboot",
.write = xenboot_write_console,
.flags = CON_PRINTBUFFER | CON_BOOT,
};

View File

@ -1770,7 +1770,8 @@ static int call_critical_overtemp(void)
"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
NULL };
return call_usermodehelper(critical_overtemp_path, argv, envp, 0);
return call_usermodehelper(critical_overtemp_path,
argv, envp, UMH_WAIT_EXEC);
}

View File

@ -80,7 +80,8 @@ int wf_critical_overtemp(void)
"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
NULL };
return call_usermodehelper(critical_overtemp_path, argv, envp, 0);
return call_usermodehelper(critical_overtemp_path,
argv, envp, UMH_WAIT_EXEC);
}
EXPORT_SYMBOL_GPL(wf_critical_overtemp);

View File

@ -2486,6 +2486,18 @@ source "drivers/atm/Kconfig"
source "drivers/s390/net/Kconfig"
config XEN_NETDEV_FRONTEND
tristate "Xen network device frontend driver"
depends on XEN
default y
help
The network device frontend driver allows the kernel to
access network devices exported by a virtual
machine containing a physical network device driver. The
frontend driver is intended for unprivileged guest domains;
if you are compiling a kernel for a Xen guest, you almost
certainly want to enable this.
config ISERIES_VETH
tristate "iSeries Virtual Ethernet driver support"
depends on PPC_ISERIES

View File

@ -127,6 +127,8 @@ obj-$(CONFIG_PPPOL2TP) += pppox.o pppol2tp.o
obj-$(CONFIG_SLIP) += slip.o
obj-$(CONFIG_SLHC) += slhc.o
obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
obj-$(CONFIG_DUMMY) += dummy.o
obj-$(CONFIG_IFB) += ifb.o
obj-$(CONFIG_MACVLAN) += macvlan.o

View File

@ -320,7 +320,7 @@ static int eppconfig(struct baycom_state *bc)
sprintf(portarg, "%ld", bc->pdev->port->base);
printk(KERN_DEBUG "%s: %s -s -p %s -m %s\n", bc_drvname, eppconfig_path, portarg, modearg);
return call_usermodehelper(eppconfig_path, argv, envp, 1);
return call_usermodehelper(eppconfig_path, argv, envp, UMH_WAIT_PROC);
}
/* ---------------------------------------------------------------------- */

1863
drivers/net/xen-netfront.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -147,7 +147,7 @@ static int pnp_dock_event(int dock, struct pnp_docking_station_info *info)
info->location_id, info->serial, info->capabilities);
envp[i] = NULL;
value = call_usermodehelper (argv [0], argv, envp, 0);
value = call_usermodehelper (argv [0], argv, envp, UMH_WAIT_EXEC);
kfree (buf);
kfree (envp);
return 0;

View File

@ -7,6 +7,7 @@
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/kmod.h>
#include <linux/reboot.h>
#include <asm/oplib.h>
#include <asm/ebus.h>
@ -170,8 +171,6 @@ static void get_current_temps(struct bbc_cpu_temperature *tp)
static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp)
{
static int shutting_down = 0;
static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
char *argv[] = { "/sbin/shutdown", "-h", "now", NULL };
char *type = "???";
s8 val = -1;
@ -195,7 +194,7 @@ static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp)
printk(KERN_CRIT "kenvctrld: Shutting down the system now.\n");
shutting_down = 1;
if (call_usermodehelper("/sbin/shutdown", argv, envp, 0) < 0)
if (orderly_poweroff(true) < 0)
printk(KERN_CRIT "envctrl: shutdown execution failed\n");
}

View File

@ -26,6 +26,7 @@
#include <linux/ioport.h>
#include <linux/miscdevice.h>
#include <linux/kmod.h>
#include <linux/reboot.h>
#include <asm/ebus.h>
#include <asm/uaccess.h>
@ -966,10 +967,6 @@ static struct i2c_child_t *envctrl_get_i2c_child(unsigned char mon_type)
static void envctrl_do_shutdown(void)
{
static int inprog = 0;
static char *envp[] = {
"HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
char *argv[] = {
"/sbin/shutdown", "-h", "now", NULL };
int ret;
if (inprog != 0)
@ -977,7 +974,7 @@ static void envctrl_do_shutdown(void)
inprog = 1;
printk(KERN_CRIT "kenvctrld: WARNING: Shutting down the system now.\n");
ret = call_usermodehelper("/sbin/shutdown", argv, envp, 0);
ret = orderly_poweroff(true);
if (ret < 0) {
printk(KERN_CRIT "kenvctrld: WARNING: system shutdown failed!\n");
inprog = 0; /* unlikely to succeed, but we could try again */

2
drivers/xen/Makefile Normal file
View File

@ -0,0 +1,2 @@
obj-y += grant-table.o
obj-y += xenbus/

582
drivers/xen/grant-table.c Normal file
View File

@ -0,0 +1,582 @@
/******************************************************************************
* grant_table.c
*
* Granting foreign access to our memory reservation.
*
* Copyright (c) 2005-2006, Christopher Clark
* Copyright (c) 2004-2005, K A Fraser
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
#include <xen/interface/xen.h>
#include <xen/page.h>
#include <xen/grant_table.h>
#include <asm/pgtable.h>
#include <asm/sync_bitops.h>
/* External tools reserve first few grant table entries. */
#define NR_RESERVED_ENTRIES 8
/* Terminator value for chains of grant references. */
#define GNTTAB_LIST_END 0xffffffff
/* Number of struct grant_entry slots that fit in one page. */
#define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(struct grant_entry))
/* Per-reference "next" links, stored as an array of RPP-sized chunks (see __gnttab_entry()). */
static grant_ref_t **gnttab_list;
static unsigned int nr_grant_frames;
static unsigned int boot_max_nr_grant_frames;
/* Number of references currently on the free list. */
static int gnttab_free_count;
/* First reference on the free list. */
static grant_ref_t gnttab_free_head;
/* Taken around every free-list update in this file. */
static DEFINE_SPINLOCK(gnttab_list_lock);
/* The grant entries themselves, indexed by grant reference. */
static struct grant_entry *shared;
/* Callbacks waiting for enough free references; run from do_free_callbacks(). */
static struct gnttab_free_callback *gnttab_free_callback_list;
static int gnttab_expand(unsigned int req_entries);
/* Number of grant_ref_t links per page-sized chunk of gnttab_list. */
#define RPP (PAGE_SIZE / sizeof(grant_ref_t))
/*
 * Locate the link slot for @entry: gnttab_list is split into chunks of RPP
 * slots, so pick the chunk first and then the slot within it.
 */
static inline grant_ref_t *__gnttab_entry(grant_ref_t entry)
{
	grant_ref_t *chunk = gnttab_list[entry / RPP];

	return chunk + (entry % RPP);
}
/* This can be used as an l-value */
#define gnttab_entry(entry) (*__gnttab_entry(entry))
/*
 * Detach a chain of @count references from the head of the free list,
 * growing the table first if too few are free.
 *
 * Returns the first reference of the detached chain (its last link is set
 * to GNTTAB_LIST_END), or the negative error from gnttab_expand().
 */
static int get_free_entries(unsigned count)
{
	unsigned long irq_flags;
	grant_ref_t first, last;
	int rc;

	spin_lock_irqsave(&gnttab_list_lock, irq_flags);

	if (gnttab_free_count < count) {
		rc = gnttab_expand(count - gnttab_free_count);
		if (rc < 0) {
			spin_unlock_irqrestore(&gnttab_list_lock, irq_flags);
			return rc;
		}
	}

	first = last = gnttab_free_head;
	gnttab_free_count -= count;

	/* Walk count - 1 links to reach the last reference we are taking. */
	for (; count > 1; count--)
		last = gnttab_entry(last);

	/* Cut the chain after 'last'; the remainder stays on the free list. */
	gnttab_free_head = gnttab_entry(last);
	gnttab_entry(last) = GNTTAB_LIST_END;

	spin_unlock_irqrestore(&gnttab_list_lock, irq_flags);

	return first;
}
/*
 * Run every pending free-callback whose requested count is now available.
 * The pending list is emptied first; callbacks that still cannot be
 * satisfied are pushed back onto it.
 */
static void do_free_callbacks(void)
{
	struct gnttab_free_callback *cb, *rest;

	cb = gnttab_free_callback_list;
	gnttab_free_callback_list = NULL;

	for (; cb != NULL; cb = rest) {
		rest = cb->next;

		if (gnttab_free_count < cb->count) {
			/* Still not enough free references: keep it pending. */
			cb->next = gnttab_free_callback_list;
			gnttab_free_callback_list = cb;
			continue;
		}

		cb->next = NULL;
		cb->fn(cb->arg);
	}
}
/* Run pending free-callbacks, if there are any (not expected to be common). */
static inline void check_free_callbacks(void)
{
if (unlikely(gnttab_free_callback_list))
do_free_callbacks();
}
/*
 * Return a single grant reference to the head of the free list and give
 * waiting callbacks a chance to run.  Takes gnttab_list_lock.
 */
static void put_free_entry(grant_ref_t ref)
{
unsigned long flags;
spin_lock_irqsave(&gnttab_list_lock, flags);
/* Link ref in front of the current head. */
gnttab_entry(ref) = gnttab_free_head;
gnttab_free_head = ref;
gnttab_free_count++;
check_free_callbacks();
spin_unlock_irqrestore(&gnttab_list_lock, flags);
}
/*
 * Fill in grant entry @ref in the shared table: frame and domid first,
 * then - after a write barrier - the flags.
 */
static void update_grant_entry(grant_ref_t ref, domid_t domid,
unsigned long frame, unsigned flags)
{
/*
* Introducing a valid entry into the grant table:
* 1. Write ent->domid.
* 2. Write ent->frame:
* GTF_permit_access: Frame to which access is permitted.
* GTF_accept_transfer: Pseudo-phys frame slot being filled by new
* frame, or zero if none.
* 3. Write memory barrier (WMB).
* 4. Write ent->flags, inc. valid type.
*/
/*
 * Note: the code writes frame before domid, the reverse of steps 1 and 2
 * above; both stores still precede the barrier.
 */
shared[ref].frame = frame;
shared[ref].domid = domid;
wmb();
shared[ref].flags = flags;
}
/*
 * Public grant-issuing interface functions
 */

/*
 * Set up the already-allocated reference @ref so that @domid may access
 * @frame; a non-zero @readonly adds GTF_readonly to the entry flags.
 */
void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
				     unsigned long frame, int readonly)
{
	unsigned gtf = GTF_permit_access;

	if (readonly)
		gtf |= GTF_readonly;

	update_grant_entry(ref, domid, frame, gtf);
}
EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
/*
 * Allocate a grant reference and use it to grant @domid access to @frame.
 * Returns the reference, or -ENOSPC if none could be allocated.
 */
int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
				int readonly)
{
	int ref = get_free_entries(1);

	if (unlikely(ref < 0))
		return -ENOSPC;

	gnttab_grant_foreign_access_ref(ref, domid, frame, readonly);
	return ref;
}
EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
/*
 * Report whether grant @ref currently has GTF_reading or GTF_writing set.
 * Returns those flag bits (non-zero if either is set), or 0.
 */
int gnttab_query_foreign_access(grant_ref_t ref)
{
	const u16 snapshot = shared[ref].flags;
	const int in_use_mask = GTF_reading | GTF_writing;

	return snapshot & in_use_mask;
}
EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
/*
 * Try to revoke grant @ref by atomically clearing its flags.
 *
 * Returns 1 if the flags were cleared, 0 (after logging a warning) if the
 * entry has GTF_reading or GTF_writing set.  @readonly is not used by the
 * body.
 */
int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
{
u16 flags, nflags;
nflags = shared[ref].flags;
/* Retry the compare-and-exchange until it sees the value we tested. */
do {
flags = nflags;
if (flags & (GTF_reading|GTF_writing)) {
printk(KERN_ALERT "WARNING: g.e. still in use!\n");
return 0;
}
} while ((nflags = sync_cmpxchg(&shared[ref].flags, flags, 0)) != flags);
return 1;
}
EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
/*
 * Revoke grant @ref and, on success, return the reference to the free list
 * and free @page (when non-zero).  If the grant is still in use, neither
 * is released and a warning is logged.
 */
void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
			       unsigned long page)
{
	if (!gnttab_end_foreign_access_ref(ref, readonly)) {
		/* XXX This needs to be fixed so that the ref and page are
		   placed on a list to be freed up later. */
		printk(KERN_WARNING
		       "WARNING: leaking g.e. and page still in use!\n");
		return;
	}

	put_free_entry(ref);
	if (page != 0)
		free_page(page);
}
EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
/*
 * Allocate a grant reference and set it up to accept a page transfer from
 * @domid into @pfn.  Returns the reference, or -ENOSPC if none could be
 * allocated.
 */
int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
{
	int ref = get_free_entries(1);

	if (unlikely(ref < 0))
		return -ENOSPC;

	gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
	return ref;
}
EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
/*
 * Set up the already-allocated reference @ref with GTF_accept_transfer for
 * @domid, recording @pfn in the entry's frame field.
 */
void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
unsigned long pfn)
{
update_grant_entry(ref, domid, pfn, GTF_accept_transfer);
}
EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
/*
 * Finish (or cancel) a transfer grant on @ref.
 *
 * Returns 0 if the transfer had not been committed and the entry's flags
 * were successfully cleared; otherwise busy-waits for completion and
 * returns the frame number stored in the entry.  The reference itself is
 * not returned to the free list here.
 */
unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
{
unsigned long frame;
u16 flags;
/*
* If a transfer is not even yet started, try to reclaim the grant
* reference and return failure (== 0).
*/
while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
if (sync_cmpxchg(&shared[ref].flags, flags, 0) == flags)
return 0;
cpu_relax();
}
/* If a transfer is in progress then wait until it is completed. */
while (!(flags & GTF_transfer_completed)) {
flags = shared[ref].flags;
cpu_relax();
}
rmb(); /* Read the frame number /after/ reading completion status. */
frame = shared[ref].frame;
BUG_ON(frame == 0);
return frame;
}
EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
/*
 * Finish a transfer on @ref via gnttab_end_foreign_transfer_ref() and then
 * return @ref to the free list.  The reference is released whether or not
 * a frame was actually transferred.
 *
 * Returns whatever gnttab_end_foreign_transfer_ref() returned (0 means no
 * transfer took place).
 */
unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
{
unsigned long frame = gnttab_end_foreign_transfer_ref(ref);
put_free_entry(ref);
return frame;
}
EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer);
/*
 * Return the single grant reference @ref to the free list; exported
 * wrapper around put_free_entry().
 */
void gnttab_free_grant_reference(grant_ref_t ref)
{
put_free_entry(ref);
}
EXPORT_SYMBOL_GPL(gnttab_free_grant_reference);
/*
 * Give a whole chain of grant references, starting at @head and
 * terminated by GNTTAB_LIST_END, back to the global free list.
 *
 * The chain is spliced in front of the current free list under
 * gnttab_list_lock, the free count is raised by the chain length and
 * pending free callbacks are re-checked.  An empty chain
 * (@head == GNTTAB_LIST_END) is a no-op.
 */
void gnttab_free_grant_references(grant_ref_t head)
{
	grant_ref_t tail;
	unsigned long irq_flags;
	int nr_refs = 1;

	if (head == GNTTAB_LIST_END)
		return;

	spin_lock_irqsave(&gnttab_list_lock, irq_flags);

	/* Find the last element of the chain, counting links on the way. */
	for (tail = head; gnttab_entry(tail) != GNTTAB_LIST_END;
	     tail = gnttab_entry(tail))
		nr_refs++;

	gnttab_entry(tail) = gnttab_free_head;
	gnttab_free_head = head;
	gnttab_free_count += nr_refs;
	check_free_callbacks();

	spin_unlock_irqrestore(&gnttab_list_lock, irq_flags);
}
EXPORT_SYMBOL_GPL(gnttab_free_grant_references);
/*
 * Reserve @count grant references in one go and store the first one in
 * *@head.
 *
 * Returns 0 on success, or -ENOSPC (leaving *@head untouched) when
 * get_free_entries() fails.
 */
int gnttab_alloc_grant_references(u16 count, grant_ref_t *head)
{
	int first = get_free_entries(count);

	if (first >= 0) {
		*head = first;
		return 0;
	}

	return -ENOSPC;
}
EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references);
/*
 * Return non-zero when the caller's private list headed by *@private_head
 * is empty, i.e. the head equals GNTTAB_LIST_END.
 */
int gnttab_empty_grant_references(const grant_ref_t *private_head)
{
return (*private_head == GNTTAB_LIST_END);
}
EXPORT_SYMBOL_GPL(gnttab_empty_grant_references);
/*
 * Pop the first grant reference off the caller's private list.
 *
 * Returns the claimed reference and advances *@private_head to its
 * successor, or returns -ENOSPC when the list is empty.
 */
int gnttab_claim_grant_reference(grant_ref_t *private_head)
{
	grant_ref_t claimed = *private_head;

	if (unlikely(claimed == GNTTAB_LIST_END))
		return -ENOSPC;

	/* The head now points at whatever followed the claimed entry. */
	*private_head = gnttab_entry(claimed);

	return claimed;
}
EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference);
/*
 * Push @release onto the front of the caller's private list: its list
 * entry is pointed at the old head and *@private_head becomes @release.
 * No locking is done here.
 */
void gnttab_release_grant_reference(grant_ref_t *private_head,
grant_ref_t release)
{
gnttab_entry(release) = *private_head;
*private_head = release;
}
EXPORT_SYMBOL_GPL(gnttab_release_grant_reference);
/*
 * Queue @callback so that @fn(@arg) can be invoked by
 * check_free_callbacks() with respect to a request for @count entries.
 *
 * A callback whose ->next is already non-NULL is left untouched.
 * Otherwise it is filled in, pushed on the front of
 * gnttab_free_callback_list and the list is checked immediately.  All of
 * this happens under gnttab_list_lock.
 */
void gnttab_request_free_callback(struct gnttab_free_callback *callback,
				  void (*fn)(void *), void *arg, u16 count)
{
	unsigned long irq_flags;

	spin_lock_irqsave(&gnttab_list_lock, irq_flags);

	if (!callback->next) {
		callback->fn = fn;
		callback->arg = arg;
		callback->count = count;
		callback->next = gnttab_free_callback_list;
		gnttab_free_callback_list = callback;
		check_free_callbacks();
	}

	spin_unlock_irqrestore(&gnttab_list_lock, irq_flags);
}
EXPORT_SYMBOL_GPL(gnttab_request_free_callback);
/*
 * Unlink @callback from gnttab_free_callback_list, if it is on it.
 * The list is walked under gnttab_list_lock; callback->next itself is
 * not modified.
 */
void gnttab_cancel_free_callback(struct gnttab_free_callback *callback)
{
	struct gnttab_free_callback **link;
	unsigned long irq_flags;

	spin_lock_irqsave(&gnttab_list_lock, irq_flags);

	link = &gnttab_free_callback_list;
	while (*link) {
		if (*link == callback) {
			/* Bypass the cancelled element. */
			*link = callback->next;
			break;
		}
		link = &(*link)->next;
	}

	spin_unlock_irqrestore(&gnttab_list_lock, irq_flags);
}
EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback);
/*
 * Extend the free-list bookkeeping to cover @more_frames additional grant
 * frames.
 *
 * One page is allocated (GFP_ATOMIC) for every new gnttab_list[] slot, the
 * new grant references are chained together and spliced in front of the
 * current free list, and the global counters are updated.  Pending free
 * callbacks are then re-checked.
 *
 * Returns 0 on success, or -ENOMEM after releasing every page allocated by
 * this call.
 */
static int grow_gnttab_list(unsigned int more_frames)
{
	unsigned int new_nr_grant_frames, extra_entries, i;

	new_nr_grant_frames = nr_grant_frames + more_frames;
	extra_entries = more_frames * GREFS_PER_GRANT_FRAME;

	for (i = nr_grant_frames; i < new_nr_grant_frames; i++) {
		gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC);
		if (!gnttab_list[i])
			goto grow_nomem;
	}

	/* Chain the new references: each one points at its successor. */
	for (i = GREFS_PER_GRANT_FRAME * nr_grant_frames;
	     i < GREFS_PER_GRANT_FRAME * new_nr_grant_frames - 1; i++)
		gnttab_entry(i) = i + 1;

	/* The last new reference links to the old free list. */
	gnttab_entry(i) = gnttab_free_head;
	gnttab_free_head = GREFS_PER_GRANT_FRAME * nr_grant_frames;
	gnttab_free_count += extra_entries;

	nr_grant_frames = new_nr_grant_frames;

	check_free_callbacks();

	return 0;

grow_nomem:
	/*
	 * Slot i is the one whose allocation failed, so only the slots below
	 * it need freeing.  Decrementing before the comparison keeps the
	 * unsigned index from wrapping when nr_grant_frames is 0, which would
	 * otherwise make the loop run forever.
	 */
	while (i-- > nr_grant_frames)
		free_page((unsigned long) gnttab_list[i]);
	return -ENOMEM;
}
/*
 * Ask the hypervisor (GNTTABOP_query_size, DOMID_SELF) for the maximum
 * number of grant frames.  Falls back to 4 when the hypercall fails or
 * reports a status other than GNTST_okay.
 */
static unsigned int __max_nr_grant_frames(void)
{
	struct gnttab_query_size query;
	int rc;

	query.dom = DOMID_SELF;
	rc = HYPERVISOR_grant_table_op(GNTTABOP_query_size, &query, 1);

	if (rc >= 0 && query.status == GNTST_okay)
		return query.max_nr_frames;

	return 4; /* Legacy max supported number of frames */
}
/*
 * Current limit on grant frames: whatever the hypervisor reports now,
 * capped at boot_max_nr_grant_frames.
 */
static inline unsigned int max_nr_grant_frames(void)
{
	unsigned int hyp_limit = __max_nr_grant_frames();

	return (hyp_limit > boot_max_nr_grant_frames) ?
		boot_max_nr_grant_frames : hyp_limit;
}
/*
 * Per-PTE callback passed to apply_to_page_range() by gnttab_map().
 *
 * @data points at a cursor (unsigned long *) into an array of machine
 * frame numbers.  The PTE for @addr in init_mm is set to map the frame
 * under the cursor with PAGE_KERNEL, and the cursor is advanced by one.
 * @pmd_page is unused.  Always returns 0.
 */
static int map_pte_fn(pte_t *pte, struct page *pmd_page,
unsigned long addr, void *data)
{
unsigned long **frames = (unsigned long **)data;
set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
/* Advances the caller's pointer, not a local copy. */
(*frames)++;
return 0;
}
/*
 * Per-PTE callback passed to apply_to_page_range() by gnttab_suspend():
 * writes an empty PTE (__pte(0)) for @addr in init_mm.  @pmd_page and
 * @data are unused.  Always returns 0.
 */
static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
unsigned long addr, void *data)
{
set_pte_at(&init_mm, addr, pte, __pte(0));
return 0;
}
/*
 * Obtain the grant-table frames from the hypervisor and map them at
 * 'shared'.
 *
 * Frames 0..@end_idx are requested with GNTTABOP_setup_table and mapped
 * from the start of the shared area; @start_idx is not used by this
 * implementation.  On the first call (shared == NULL) a VM area large
 * enough for max_nr_grant_frames() pages is reserved.
 *
 * Returns 0 on success, -ENOMEM if the temporary frame list cannot be
 * allocated, or -ENOSYS if the hypercall returns -ENOSYS.  Any other
 * hypercall, area-allocation or mapping failure triggers BUG_ON().
 */
static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
{
struct gnttab_setup_table setup;
unsigned long *frames;
unsigned int nr_gframes = end_idx + 1;
int rc;
/* Scratch array the hypervisor fills with one frame number per frame. */
frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
if (!frames)
return -ENOMEM;
setup.dom = DOMID_SELF;
setup.nr_frames = nr_gframes;
setup.frame_list = frames;
rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
if (rc == -ENOSYS) {
kfree(frames);
return -ENOSYS;
}
BUG_ON(rc || setup.status);
if (shared == NULL) {
struct vm_struct *area;
area = alloc_vm_area(PAGE_SIZE * max_nr_grant_frames());
BUG_ON(area == NULL);
shared = area->addr;
}
/* &frames is handed over as a cursor: map_pte_fn() increments it once
 * per mapped page. */
rc = apply_to_page_range(&init_mm, (unsigned long)shared,
PAGE_SIZE * nr_gframes,
map_pte_fn, &frames);
BUG_ON(rc);
frames -= nr_gframes; /* adjust after map_pte_fn() */
kfree(frames);
return 0;
}
/*
 * (Re)map the grant frames currently in use (0..nr_grant_frames-1).
 *
 * Returns -ENOSYS when the present frame limit is smaller than the number
 * of frames already in use, otherwise the result of gnttab_map().
 */
static int gnttab_resume(void)
{
	unsigned int frame_limit = max_nr_grant_frames();

	if (frame_limit < nr_grant_frames)
		return -ENOSYS;

	return gnttab_map(0, nr_grant_frames - 1);
}
/*
 * Clear the PTEs covering the first nr_grant_frames pages of the shared
 * area (via unmap_pte_fn).  The result of apply_to_page_range() is
 * ignored; the function always returns 0.
 */
static int gnttab_suspend(void)
{
apply_to_page_range(&init_mm, (unsigned long)shared,
PAGE_SIZE * nr_grant_frames,
unmap_pte_fn, NULL);
return 0;
}
/*
 * Grow the grant table by enough whole frames to hold @req_entries more
 * entries.
 *
 * Returns -ENOSPC if that would exceed max_nr_grant_frames(), the error
 * from gnttab_map() if mapping the enlarged table fails, and otherwise the
 * result of grow_gnttab_list().
 */
static int gnttab_expand(unsigned int req_entries)
{
	unsigned int cur = nr_grant_frames;
	/* Round the request up to a whole number of frames. */
	unsigned int extra = (req_entries + (GREFS_PER_GRANT_FRAME-1)) /
			     GREFS_PER_GRANT_FRAME;
	int rc;

	if (cur + extra > max_nr_grant_frames())
		return -ENOSPC;

	rc = gnttab_map(cur, cur + extra - 1);
	if (rc != 0)
		return rc;

	return grow_gnttab_list(extra);
}
/*
 * Boot-time set-up of the grant table (registered as a core_initcall).
 *
 * Starts with a single grant frame: allocates the gnttab_list[] pointer
 * array and its first page(s), maps the hypervisor's grant frames through
 * gnttab_resume(), and builds the initial free list out of every
 * reference at or above NR_RESERVED_ENTRIES.
 *
 * Returns 0 on success, -ENODEV when not running on Xen or when the grant
 * frames cannot be mapped, and -ENOMEM on allocation failure.  Every
 * failure after the first allocation releases what was allocated here.
 */
static int __devinit gnttab_init(void)
{
	int i;
	unsigned int max_nr_glist_frames;
	unsigned int nr_init_grefs;
	int ret;

	if (!is_running_on_xen())
		return -ENODEV;

	nr_grant_frames = 1;
	boot_max_nr_grant_frames = __max_nr_grant_frames();

	/* Determine the maximum number of frames required for the
	 * grant reference free list on the current hypervisor.
	 */
	max_nr_glist_frames = (boot_max_nr_grant_frames *
			       GREFS_PER_GRANT_FRAME /
			       (PAGE_SIZE / sizeof(grant_ref_t)));

	gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *),
			      GFP_KERNEL);
	if (gnttab_list == NULL)
		return -ENOMEM;

	for (i = 0; i < nr_grant_frames; i++) {
		gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL);
		if (gnttab_list[i] == NULL) {
			ret = -ENOMEM;
			goto ini_nomem;
		}
	}

	/*
	 * i == nr_grant_frames here, so the unwind path below frees every
	 * page allocated above instead of leaking them together with
	 * gnttab_list.
	 */
	if (gnttab_resume() < 0) {
		ret = -ENODEV;
		goto ini_nomem;
	}

	nr_init_grefs = nr_grant_frames * GREFS_PER_GRANT_FRAME;

	/* Chain all non-reserved references into the initial free list. */
	for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++)
		gnttab_entry(i) = i + 1;

	gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END;
	gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
	gnttab_free_head = NR_RESERVED_ENTRIES;

	printk("Grant table initialized\n");
	return 0;

ini_nomem:
	/* Free list pages [0, i) and then the pointer array itself. */
	for (i--; i >= 0; i--)
		free_page((unsigned long)gnttab_list[i]);
	kfree(gnttab_list);
	return ret;
}
core_initcall(gnttab_init);

View File

@ -0,0 +1,7 @@
# Always build xenbus.o (obj-y) in this directory.
obj-y += xenbus.o
# xenbus.o is a composite object made of the parts listed below.
xenbus-objs =
xenbus-objs += xenbus_client.o
xenbus-objs += xenbus_comms.o
xenbus-objs += xenbus_xs.o
xenbus-objs += xenbus_probe.o

View File

@ -0,0 +1,569 @@
/******************************************************************************
* Client-facing interface for the Xenbus driver. In other words, the
* interface between the Xenbus and the device-specific code, be it the
* frontend or the backend of that driver.
*
* Copyright (C) 2005 XenSource Ltd
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <linux/types.h>
#include <linux/vmalloc.h>
#include <asm/xen/hypervisor.h>
#include <xen/interface/xen.h>
#include <xen/interface/event_channel.h>
#include <xen/events.h>
#include <xen/grant_table.h>
#include <xen/xenbus.h>
/*
 * Translate a xenbus_state value into a printable name.  Values outside
 * the table yield "INVALID".
 */
const char *xenbus_strstate(enum xenbus_state state)
{
	static const char *const name[] = {
		[ XenbusStateUnknown      ] = "Unknown",
		[ XenbusStateInitialising ] = "Initialising",
		[ XenbusStateInitWait     ] = "InitWait",
		[ XenbusStateInitialised  ] = "Initialised",
		[ XenbusStateConnected    ] = "Connected",
		[ XenbusStateClosing      ] = "Closing",
		[ XenbusStateClosed       ] = "Closed",
	};

	if (state >= ARRAY_SIZE(name))
		return "INVALID";

	return name[state];
}
EXPORT_SYMBOL_GPL(xenbus_strstate);
/**
 * xenbus_watch_path - register a watch
 * @dev: xenbus device
 * @path: path to watch
 * @watch: watch to register
 * @callback: callback to register
 *
 * Stores @path and @callback in @watch and registers it with
 * register_xenbus_watch().  Returns 0 on success, or the error from the
 * registration.  On success @path is kept as @watch->node and remains the
 * caller's to free.  On error @watch->node and @watch->callback are reset
 * to NULL and the failure is reported through xenbus_dev_fatal().
 */
int xenbus_watch_path(struct xenbus_device *dev, const char *path,
		      struct xenbus_watch *watch,
		      void (*callback)(struct xenbus_watch *,
				       const char **, unsigned int))
{
	int err;

	watch->node = path;
	watch->callback = callback;

	err = register_xenbus_watch(watch);
	if (!err)
		return 0;

	/* Registration failed: leave the watch in a clean, unused state. */
	watch->node = NULL;
	watch->callback = NULL;
	xenbus_dev_fatal(dev, err, "adding watch on %s", path);

	return err;
}
EXPORT_SYMBOL_GPL(xenbus_watch_path);
/**
* xenbus_watch_pathfmt - register a watch on a sprintf-formatted path
* @dev: xenbus device
* @watch: watch to register
* @callback: callback to register
* @pathfmt: format of path to watch
*
* Register a watch on the given @path, using the given xenbus_watch
* structure for storage, and the given @callback function as the callback.
* Return 0 on success, or -errno on error. On success, the watched path
* (@path/@path2) will be saved as @watch->node, and becomes the caller's to
* kfree(). On error, watch->node will be NULL, so the caller has nothing to
* free, the device will switch to %XenbusStateClosing, and the error will be
* saved in the store.
*/
int xenbus_watch_pathfmt(struct xenbus_device *dev,
struct xenbus_watch *watch,
void (*callback)(struct xenbus_watch *,
const char **, unsigned int),
const char *pathfmt, ...)
{
int err;
va_list ap;
char *path;
va_start(ap, pathfmt);
path = kvasprintf(GFP_KERNEL, pathfmt, ap);
va_end(ap);
if (!path) {
xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
return -ENOMEM;
}
err = xenbus_watch_path(dev, path, watch, callback);
if (err)
kfree(path);
return err;
}
EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt);
/**
 * xenbus_switch_state
 * @dev: xenbus device
 * @state: new state
 *
 * Advertise in the store a change of the given driver to the given new
 * state.  Returns 0 on success (including when nothing needed writing), or
 * -errno if the write failed.  On a write error the failure is reported
 * through xenbus_dev_fatal(), unless the requested state already was
 * XenbusStateClosing.
 */
int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
{
	/*
	 * The state is only written when it differs from the cached
	 * dev->state, so that watches are not fired unnecessarily.
	 * Furthermore, if the "state" node can no longer be read, nothing is
	 * written: the device will be tearing down and that directory must
	 * not be resurrected.
	 *
	 * Because of the cached value, this function does not work inside a
	 * Xenstore transaction (something it tried to do in the past):
	 * dev->state would not get reset if the transaction was aborted.
	 */
	int stored_state;
	int err;

	if (dev->state == state)
		return 0;

	if (xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d",
			 &stored_state) != 1)
		return 0;

	err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state);
	if (!err) {
		dev->state = state;
		return 0;
	}

	if (state != XenbusStateClosing) /* Avoid looping */
		xenbus_dev_fatal(dev, err, "writing new state");

	return err;
}
EXPORT_SYMBOL_GPL(xenbus_switch_state);
/*
 * Switch @dev to XenbusStateClosed (the result of the switch is ignored)
 * and signal the dev->down completion.  Always returns 0.
 */
int xenbus_frontend_closed(struct xenbus_device *dev)
{
xenbus_switch_state(dev, XenbusStateClosed);
complete(&dev->down);
return 0;
}
EXPORT_SYMBOL_GPL(xenbus_frontend_closed);
/**
 * Return the path to the error node for the given device
 * ("error/<nodename>"), or NULL if the allocation fails.
 * If the value returned is non-NULL, then it is the caller's to kfree.
 */
static char *error_path(struct xenbus_device *dev)
{
return kasprintf(GFP_KERNEL, "error/%s", dev->nodename);
}
/*
 * Common back end of xenbus_dev_error() and xenbus_dev_fatal().
 *
 * Formats "<-err> <message>" into a temporary 4096-byte buffer, logs it
 * with dev_err() and writes it to the "error" key under the device's
 * error node.  A second dev_err() is emitted if the error node path cannot
 * be built or the store write fails.  If the message buffer itself cannot
 * be allocated, nothing is reported.  A message that does not fit the
 * buffer triggers BUG_ON().
 */
static void xenbus_va_dev_error(struct xenbus_device *dev, int err,
				const char *fmt, va_list ap)
{
	char *msg;
	char *node = NULL;
	unsigned int prefix_len;
	int body_len;

#define PRINTF_BUFFER_SIZE 4096
	msg = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
	if (msg == NULL)
		goto out;

	/* The errno is stored as a positive number in front of the text. */
	prefix_len = sprintf(msg, "%i ", -err);
	body_len = vsnprintf(msg+prefix_len, PRINTF_BUFFER_SIZE-prefix_len,
			     fmt, ap);

	BUG_ON(prefix_len + body_len > PRINTF_BUFFER_SIZE-1);

	dev_err(&dev->dev, "%s\n", msg);

	node = error_path(dev);
	if (node == NULL ||
	    xenbus_write(XBT_NIL, node, "error", msg) != 0)
		dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
			dev->nodename, msg);

out:
	kfree(msg);
	kfree(node);
}
/**
 * xenbus_dev_error
 * @dev: xenbus device
 * @err: error to report
 * @fmt: error message format
 *
 * Report the given negative errno into the store, along with the given
 * formatted message.  The device state is not changed; see
 * xenbus_dev_fatal() for the variant that also closes the device.
 */
void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
xenbus_va_dev_error(dev, err, fmt, ap);
va_end(ap);
}
EXPORT_SYMBOL_GPL(xenbus_dev_error);
/**
 * xenbus_dev_fatal
 * @dev: xenbus device
 * @err: error to report
 * @fmt: error message format
 *
 * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
 * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
 * closedown of this driver and its peer.
 */
void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
xenbus_va_dev_error(dev, err, fmt, ap);
va_end(ap);
xenbus_switch_state(dev, XenbusStateClosing);
}
EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
/**
 * xenbus_grant_ring
 * @dev: xenbus device
 * @ring_mfn: mfn of ring to grant
 *
 * Grant read/write access to the given @ring_mfn to the peer of the given
 * device.  Returns the value of gnttab_grant_foreign_access(): the grant
 * reference on success, or a negative errno on error.  On error the
 * failure is also reported through xenbus_dev_fatal().
 */
int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
{
	int gref = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0);

	if (gref < 0)
		xenbus_dev_fatal(dev, gref, "granting access to ring page");

	return gref;
}
EXPORT_SYMBOL_GPL(xenbus_grant_ring);
/**
 * Allocate an unbound event channel that the peer of the given
 * xenbus_device may bind to, and store the new local port in *port.
 * Returns 0 on success, or the hypercall's error code.  On error *port is
 * left untouched and the failure is reported through xenbus_dev_fatal().
 */
int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port)
{
	struct evtchn_alloc_unbound alloc_unbound;
	int err;

	alloc_unbound.dom = DOMID_SELF;
	alloc_unbound.remote_dom = dev->otherend_id;

	err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
					  &alloc_unbound);
	if (err) {
		xenbus_dev_fatal(dev, err, "allocating event channel");
		return err;
	}

	*port = alloc_unbound.port;
	return 0;
}
EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
/**
 * Bind to an existing interdomain event channel (@remote_port) in the
 * device's peer domain.  Returns 0 on success and stores the local port in
 * *port.  On error, returns the hypercall's error code, leaves *port
 * untouched and reports the failure through xenbus_dev_fatal().
 */
int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port)
{
	struct evtchn_bind_interdomain bind_interdomain;
	int err;

	bind_interdomain.remote_dom = dev->otherend_id;
	bind_interdomain.remote_port = remote_port;

	err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
					  &bind_interdomain);
	if (err) {
		xenbus_dev_fatal(dev, err,
				 "binding to event channel %d from domain %d",
				 remote_port, dev->otherend_id);
		return err;
	}

	*port = bind_interdomain.local_port;
	return 0;
}
EXPORT_SYMBOL_GPL(xenbus_bind_evtchn);
/**
 * Free an existing event channel (EVTCHNOP_close on @port).  Returns 0 on
 * success or the hypercall's error code.  An error is reported through
 * xenbus_dev_error(), i.e. without changing the device state.
 */
int xenbus_free_evtchn(struct xenbus_device *dev, int port)
{
struct evtchn_close close;
int err;
close.port = port;
err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
if (err)
xenbus_dev_error(dev, err, "freeing event channel %d", port);
return err;
}
EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
/**
 * xenbus_map_ring_valloc
 * @dev: xenbus device
 * @gnt_ref: grant reference
 * @vaddr: pointer to address to be filled out by mapping
 *
 * Based on Rusty Russell's skeleton driver's map_page.
 * Map a page of memory into this domain from another domain's grant table.
 * xenbus_map_ring_valloc allocates a page of virtual address space, maps the
 * page to that address, and sets *vaddr to that address.
 * Returns 0 on success, GNTST_* (see xen/include/interface/grant_table.h)
 * if the mapping is refused, or -ENOMEM if no virtual address space could
 * be allocated.  When the mapping is refused, the error is reported through
 * xenbus_dev_fatal().  *vaddr is NULL on every error path.  A failure of
 * the hypercall itself triggers BUG().
 */
int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
{
struct gnttab_map_grant_ref op = {
.flags = GNTMAP_host_map,
.ref = gnt_ref,
.dom = dev->otherend_id,
};
struct vm_struct *area;
*vaddr = NULL;
area = alloc_vm_area(PAGE_SIZE);
if (!area)
return -ENOMEM;
op.host_addr = (unsigned long)area->addr;
if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
BUG();
if (op.status != GNTST_okay) {
free_vm_area(area);
xenbus_dev_fatal(dev, op.status,
"mapping in shared page %d from domain %d",
gnt_ref, dev->otherend_id);
return op.status;
}
/* Stuff the handle in an unused field; xenbus_unmap_ring_vfree() reads
 * it back from area->phys_addr. */
area->phys_addr = (unsigned long)op.handle;
*vaddr = area->addr;
return 0;
}
EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
/**
 * xenbus_map_ring
 * @dev: xenbus device
 * @gnt_ref: grant reference
 * @handle: pointer to grant handle to be filled
 * @vaddr: address to be mapped to
 *
 * Map a page of memory into this domain from another domain's grant table.
 * xenbus_map_ring does not allocate the virtual address space (you must do
 * this yourself!). It only maps in the page to the specified address.
 * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
 * on error.  On error *handle is left untouched and the failure is reported
 * through xenbus_dev_fatal().  A failure of the hypercall itself triggers
 * BUG().
 */
int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
grant_handle_t *handle, void *vaddr)
{
struct gnttab_map_grant_ref op = {
.host_addr = (unsigned long)vaddr,
.flags = GNTMAP_host_map,
.ref = gnt_ref,
.dom = dev->otherend_id,
};
if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
BUG();
if (op.status != GNTST_okay) {
xenbus_dev_fatal(dev, op.status,
"mapping in shared page %d from domain %d",
gnt_ref, dev->otherend_id);
} else
*handle = op.handle;
return op.status;
}
EXPORT_SYMBOL_GPL(xenbus_map_ring);
/**
 * xenbus_unmap_ring_vfree
 * @dev: xenbus device
 * @vaddr: addr to unmap
 *
 * Based on Rusty Russell's skeleton driver's unmap_page.
 * Unmap a page of memory in this domain that was imported from another domain.
 * Use xenbus_unmap_ring_vfree if you mapped in your memory with
 * xenbus_map_ring_valloc (it will free the virtual address space).
 * Returns 0 on success and returns GNTST_* on error
 * (see xen/include/interface/grant_table.h); GNTST_bad_virt_addr is
 * returned when no vm area starts at @vaddr.  Errors are reported through
 * xenbus_dev_error(), and the virtual area is only freed when the unmap
 * succeeded.
 */
int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
{
struct vm_struct *area;
struct gnttab_unmap_grant_ref op = {
.host_addr = (unsigned long)vaddr,
};
/* It'd be nice if linux/vmalloc.h provided a find_vm_area(void *addr)
 * method so that we don't have to muck with vmalloc internals here.
 * We could force the user to hang on to their struct vm_struct from
 * xenbus_map_ring_valloc, but these 6 lines considerably simplify
 * this API.
 */
read_lock(&vmlist_lock);
for (area = vmlist; area != NULL; area = area->next) {
if (area->addr == vaddr)
break;
}
read_unlock(&vmlist_lock);
if (!area) {
xenbus_dev_error(dev, -ENOENT,
"can't find mapped virtual address %p", vaddr);
return GNTST_bad_virt_addr;
}
/* The grant handle was stashed in phys_addr by xenbus_map_ring_valloc(). */
op.handle = (grant_handle_t)area->phys_addr;
if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
BUG();
if (op.status == GNTST_okay)
free_vm_area(area);
else
xenbus_dev_error(dev, op.status,
"unmapping page at handle %d error %d",
(int16_t)area->phys_addr, op.status);
return op.status;
}
EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
/**
 * xenbus_unmap_ring
 * @dev: xenbus device
 * @handle: grant handle
 * @vaddr: addr to unmap
 *
 * Unmap a page of memory in this domain that was imported from another domain.
 * The virtual address space at @vaddr is not freed here.
 * Returns 0 on success and returns GNTST_* on error
 * (see xen/include/interface/grant_table.h).  Errors are reported through
 * xenbus_dev_error(); a failure of the hypercall itself triggers BUG().
 */
int xenbus_unmap_ring(struct xenbus_device *dev,
grant_handle_t handle, void *vaddr)
{
struct gnttab_unmap_grant_ref op = {
.host_addr = (unsigned long)vaddr,
.handle = handle,
};
if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
BUG();
if (op.status != GNTST_okay)
xenbus_dev_error(dev, op.status,
"unmapping page at handle %d error %d",
handle, op.status);
return op.status;
}
EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
/**
* xenbus_read_driver_state
* @path: path for driver
*
* Return the state of the driver rooted at the given store path, or
* XenbusStateUnknown if no state can be read.
*/
enum xenbus_state xenbus_read_driver_state(const char *path)
{
enum xenbus_state result;
int err = xenbus_gather(XBT_NIL, path, "state", "%d", &result, NULL);
if (err)
result = XenbusStateUnknown;
return result;
}
EXPORT_SYMBOL_GPL(xenbus_read_driver_state);

View File

@ -0,0 +1,233 @@
/******************************************************************************
* xenbus_comms.c
*
* Low level code that talks to Xen Store: ring buffer and event channel.
*
* Copyright (C) 2005 Rusty Russell, IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <linux/wait.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/err.h>
#include <xen/xenbus.h>
#include <asm/xen/hypervisor.h>
#include <xen/events.h>
#include <xen/page.h>
#include "xenbus_comms.h"
/* IRQ bound to the xenstore event channel by xb_init_comms(); 0 until the
 * first successful bind. */
static int xenbus_irq;
/* Work item that runs xenbus_probe; scheduled by wake_waiting(). */
static DECLARE_WORK(probe_work, xenbus_probe);
/* Wait queue on which xb_write()/xb_read() sleep; woken by wake_waiting(). */
static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
/*
 * Interrupt handler for the xenstore event channel (installed by
 * xb_init_comms()).
 *
 * The first event seen while xenstored_ready is 0 marks xenstored as ready
 * and schedules probe_work; every event wakes the sleepers on xb_waitq.
 * Both arguments are unused.
 */
static irqreturn_t wake_waiting(int irq, void *unused)
{
if (unlikely(xenstored_ready == 0)) {
xenstored_ready = 1;
schedule_work(&probe_work);
}
wake_up(&xb_waitq);
return IRQ_HANDLED;
}
/*
 * Sanity-check a consumer/producer index pair: returns non-zero when the
 * producer is at most XENSTORE_RING_SIZE ahead of the consumer.
 */
static int check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
{
return ((prod - cons) <= XENSTORE_RING_SIZE);
}
/*
 * Locate the next contiguous writable region of ring buffer @buf.
 *
 * Returns a pointer to the slot at the (masked) producer index and stores
 * in *@len how many bytes may be written there: the smaller of the
 * distance to the end of the buffer and the free space in the ring.
 */
static void *get_output_chunk(XENSTORE_RING_IDX cons,
			      XENSTORE_RING_IDX prod,
			      char *buf, uint32_t *len)
{
	/* Bytes from the producer position up to the end of the buffer. */
	uint32_t chunk = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);

	/* Never offer more than the ring currently has free. */
	if ((XENSTORE_RING_SIZE - (prod - cons)) < chunk)
		chunk = XENSTORE_RING_SIZE - (prod - cons);

	*len = chunk;
	return buf + MASK_XENSTORE_IDX(prod);
}
/*
 * Locate the next contiguous readable region of ring buffer @buf.
 *
 * Returns a pointer to the slot at the (masked) consumer index and stores
 * in *@len how many bytes may be read there: the smaller of the distance
 * to the end of the buffer and the number of bytes produced but not yet
 * consumed.
 */
static const void *get_input_chunk(XENSTORE_RING_IDX cons,
				   XENSTORE_RING_IDX prod,
				   const char *buf, uint32_t *len)
{
	/* Bytes from the consumer position up to the end of the buffer. */
	uint32_t chunk = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);

	/* Never offer more than has actually been produced. */
	if ((prod - cons) < chunk)
		chunk = prod - cons;

	*len = chunk;
	return buf + MASK_XENSTORE_IDX(cons);
}
/**
 * xb_write - low level write
 * @data: buffer to send
 * @len: length of buffer
 *
 * Copies @len bytes from @data into the request ring of the xenstore
 * interface, sleeping interruptibly on xb_waitq while the ring is full and
 * notifying the remote end after every chunk.
 *
 * Returns 0 on success, the negative value from
 * wait_event_interruptible() if the wait was interrupted, or -EIO if the
 * ring indexes are inconsistent (both are then reset to 0).  Data already
 * copied into the ring before an error is not withdrawn.
 */
int xb_write(const void *data, unsigned len)
{
struct xenstore_domain_interface *intf = xen_store_interface;
XENSTORE_RING_IDX cons, prod;
int rc;
while (len != 0) {
void *dst;
unsigned int avail;
/* Sleep until the request ring is no longer completely full. */
rc = wait_event_interruptible(
xb_waitq,
(intf->req_prod - intf->req_cons) !=
XENSTORE_RING_SIZE);
if (rc < 0)
return rc;
/* Read indexes, then verify. */
cons = intf->req_cons;
prod = intf->req_prod;
if (!check_indexes(cons, prod)) {
intf->req_cons = intf->req_prod = 0;
return -EIO;
}
dst = get_output_chunk(cons, prod, intf->req, &avail);
if (avail == 0)
continue;
if (avail > len)
avail = len;
/* Must write data /after/ reading the consumer index. */
mb();
memcpy(dst, data, avail);
data += avail;
len -= avail;
/* Other side must not see new producer until data is there. */
wmb();
intf->req_prod += avail;
/* Implies mb(): other side will see the updated producer. */
notify_remote_via_evtchn(xen_store_evtchn);
}
return 0;
}
/*
 * Return non-zero when the response ring's consumer and producer indexes
 * differ, i.e. there is something to read.
 */
int xb_data_to_read(void)
{
struct xenstore_domain_interface *intf = xen_store_interface;
return (intf->rsp_cons != intf->rsp_prod);
}
/*
 * Sleep interruptibly on xb_waitq until xb_data_to_read() is true.
 * Returns the result of wait_event_interruptible().
 */
int xb_wait_for_data_to_read(void)
{
return wait_event_interruptible(xb_waitq, xb_data_to_read());
}
/*
 * xb_read - low level read
 * @data: buffer to fill
 * @len: number of bytes to read
 *
 * Copies exactly @len bytes from the response ring of the xenstore
 * interface into @data, sleeping interruptibly until data is available and
 * notifying the remote end after every chunk consumed.
 *
 * Returns 0 on success, the negative value from
 * xb_wait_for_data_to_read() if the wait was interrupted, or -EIO if the
 * ring indexes are inconsistent (both are then reset to 0).
 */
int xb_read(void *data, unsigned len)
{
struct xenstore_domain_interface *intf = xen_store_interface;
XENSTORE_RING_IDX cons, prod;
int rc;
while (len != 0) {
unsigned int avail;
const char *src;
rc = xb_wait_for_data_to_read();
if (rc < 0)
return rc;
/* Read indexes, then verify. */
cons = intf->rsp_cons;
prod = intf->rsp_prod;
if (!check_indexes(cons, prod)) {
intf->rsp_cons = intf->rsp_prod = 0;
return -EIO;
}
src = get_input_chunk(cons, prod, intf->rsp, &avail);
if (avail == 0)
continue;
if (avail > len)
avail = len;
/* Must read data /after/ reading the producer index. */
rmb();
memcpy(data, src, avail);
data += avail;
len -= avail;
/* Other side must not see free space until we've copied out */
mb();
intf->rsp_cons += avail;
pr_debug("Finished read of %i bytes (%i to go)\n", avail, len);
/* Implies mb(): other side will see the updated consumer. */
notify_remote_via_evtchn(xen_store_evtchn);
}
return 0;
}
/**
 * xb_init_comms - Set up interrupt handler off store event channel.
 *
 * Complains if the request ring is not quiescent, discards any unread
 * responses (the consumer index is moved up to the producer), drops a
 * previously bound irq and binds xen_store_evtchn to wake_waiting().
 *
 * Returns 0 on success, or the non-positive value returned by
 * bind_evtchn_to_irqhandler() on failure.
 */
int xb_init_comms(void)
{
	struct xenstore_domain_interface *intf = xen_store_interface;
	int err;

	if (intf->req_prod != intf->req_cons)
		printk(KERN_ERR "XENBUS request ring is not quiescent "
		       "(%08x:%08x)!\n", intf->req_cons, intf->req_prod);

	if (intf->rsp_prod != intf->rsp_cons) {
		printk(KERN_WARNING "XENBUS response ring is not quiescent "
		       "(%08x:%08x): fixing up\n",
		       intf->rsp_cons, intf->rsp_prod);
		intf->rsp_cons = intf->rsp_prod;
	}

	if (xenbus_irq) {
		unbind_from_irqhandler(xenbus_irq, &xb_waitq);
		/*
		 * Forget the irq right away: if the bind below fails, a later
		 * call must not try to unbind the same irq a second time.
		 */
		xenbus_irq = 0;
	}

	err = bind_evtchn_to_irqhandler(
		xen_store_evtchn, wake_waiting,
		0, "xenbus", &xb_waitq);
	if (err <= 0) {
		printk(KERN_ERR "XENBUS request irq failed %i\n", err);
		return err;
	}

	xenbus_irq = err;

	return 0;
}

View File

@ -0,0 +1,46 @@
/*
* Private include for xenbus communications.
*
* Copyright (C) 2005 Rusty Russell, IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef _XENBUS_COMMS_H
#define _XENBUS_COMMS_H
/* Not defined in xenbus_comms.c; presumably provided by xenbus_xs.c —
 * TODO confirm. */
int xs_init(void);
/* Bind the xenstore event channel to its interrupt handler. */
int xb_init_comms(void);
/* Low level routines. */
/* Copy len bytes into the request ring; 0 on success, negative on error. */
int xb_write(const void *data, unsigned len);
/* Copy len bytes out of the response ring; 0 on success, negative on error. */
int xb_read(void *data, unsigned len);
/* Non-zero when the response ring holds unread data. */
int xb_data_to_read(void);
/* Sleep interruptibly until xb_data_to_read() is true. */
int xb_wait_for_data_to_read(void);
/* Not defined in xenbus_comms.c — NOTE(review): confirm where (or whether)
 * this is implemented. */
int xs_input_avail(void);
/* Shared xenstore ring page and its event channel (defined in
 * xenbus_probe.c). */
extern struct xenstore_domain_interface *xen_store_interface;
extern int xen_store_evtchn;
#endif /* _XENBUS_COMMS_H */

View File

@ -0,0 +1,935 @@
/******************************************************************************
* Talks to Xen Store to figure out what devices we have.
*
* Copyright (C) 2005 Rusty Russell, IBM Corporation
* Copyright (C) 2005 Mike Wray, Hewlett-Packard
* Copyright (C) 2005, 2006 XenSource Ltd
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/*
 * Debug trace helper: forwards to pr_debug(), prefixing the message with
 * "xenbus_probe", the calling function name and the line number, and
 * appending ".\n" to the caller's format string.
 */
#define DPRINTK(fmt, args...) \
pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
__func__, __LINE__, ##args)
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/io.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/xen/hypervisor.h>
#include <xen/xenbus.h>
#include <xen/events.h>
#include <xen/page.h>
#include "xenbus_comms.h"
#include "xenbus_probe.h"
/* Xenstore event channel; copied from xen_start_info in xenbus_probe_init(). */
int xen_store_evtchn;
/* Xenstore interface page; set in xenbus_probe_init() from xen_store_mfn. */
struct xenstore_domain_interface *xen_store_interface;
/* Frame number (mfn) of the xenstore page; copied from xen_start_info. */
static unsigned long xen_store_mfn;
/* Notifier chain called from xenbus_probe() once xenstore is up. */
static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
/* Forward declarations for functions defined later in this file. */
static void wait_for_devices(struct xenbus_driver *xendrv);
static int xenbus_probe_frontend(const char *type, const char *name);
static void xenbus_dev_shutdown(struct device *_dev);
/*
 * Walk a driver's id table looking for an entry whose devicetype string
 * equals the device's.  The table ends at the first entry with an empty
 * devicetype.  Returns the matching entry, or NULL if there is none.
 */
static const struct xenbus_device_id *
match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
{
	const struct xenbus_device_id *id = arr;

	while (*id->devicetype != '\0') {
		if (strcmp(id->devicetype, dev->devicetype) == 0)
			return id;
		id++;
	}
	return NULL;
}
/*
 * Bus match callback: returns 1 when the driver's id table contains the
 * device's type, 0 otherwise (including when the driver has no id table).
 */
int xenbus_match(struct device *_dev, struct device_driver *_drv)
{
	struct xenbus_driver *xdrv = to_xenbus_driver(_drv);

	if (xdrv->ids == NULL)
		return 0;

	return match_device(xdrv->ids, to_xenbus_device(_dev)) ? 1 : 0;
}
/*
 * Build a bus id from a frontend node name:
 *   device/<type>/<id> => <type>-<id>
 *
 * Everything after the first '/' in @nodename is copied into @bus_id and
 * the first '/' of that copy is replaced by '-'.
 *
 * Returns 0 on success, or -EINVAL when @nodename has no '/', when the part
 * after it does not fit in BUS_ID_SIZE, or when that part has no '/'.
 */
static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
{
	/* Keep @nodename intact so the warning can show the offending path. */
	const char *tail = strchr(nodename, '/');
	char *slash;

	if (!tail || strlen(tail + 1) >= BUS_ID_SIZE) {
		printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
		return -EINVAL;
	}

	strlcpy(bus_id, tail + 1, BUS_ID_SIZE);

	slash = strchr(bus_id, '/');
	if (!slash) {
		printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
		return -EINVAL;
	}
	*slash = '-';
	return 0;
}
/* Free the cached other-end path and clear the pointer. */
static void free_otherend_details(struct xenbus_device *dev)
{
kfree(dev->otherend);
dev->otherend = NULL;
}
static void free_otherend_watch(struct xenbus_device *dev)
{
if (dev->otherend_watch.node) {
unregister_xenbus_watch(&dev->otherend_watch);
kfree(dev->otherend_watch.node);
dev->otherend_watch.node = NULL;
}
}
/*
 * Read the other end's domain id (from @id_node) and store path (from
 * @path_node) under the device's own node, filling in otherend_id and
 * otherend.
 *
 * Returns 0 on success.  On a gather failure the error is reported with
 * xenbus_dev_fatal() and returned.  If the path read back is empty or does
 * not exist in the store, the details are freed again and -ENOENT is
 * returned.
 */
int read_otherend_details(struct xenbus_device *xendev,
			  char *id_node, char *path_node)
{
	int err;

	err = xenbus_gather(XBT_NIL, xendev->nodename,
			    id_node, "%i", &xendev->otherend_id,
			    path_node, NULL, &xendev->otherend,
			    NULL);
	if (err != 0) {
		xenbus_dev_fatal(xendev, err,
				 "reading other end details from %s",
				 xendev->nodename);
		return err;
	}

	/* A non-empty path that exists in the store is all we need. */
	if (xendev->otherend[0] != '\0' &&
	    xenbus_exists(XBT_NIL, xendev->otherend, ""))
		return 0;

	xenbus_dev_fatal(xendev, -ENOENT,
			 "unable to read other end from %s. "
			 "missing or inaccessible.",
			 xendev->nodename);
	free_otherend_details(xendev);
	return -ENOENT;
}
/*
 * Frontend flavour of read_otherend_details(): the other end is described
 * by the "backend-id" and "backend" nodes.
 */
static int read_backend_details(struct xenbus_device *xendev)
{
return read_otherend_details(xendev, "backend-id", "backend");
}
/*
 * Bus type for frontend drivers.  Devices live under the "device" root of
 * the store, two path levels deep, and are registered on the "xen" bus.
 */
static struct xen_bus_type xenbus_frontend = {
.root = "device",
.levels = 2, /* device/type/<id> */
.get_bus_id = frontend_bus_id,
.probe = xenbus_probe_frontend,
.bus = {
.name = "xen",
.match = xenbus_match,
.probe = xenbus_dev_probe,
.remove = xenbus_dev_remove,
.shutdown = xenbus_dev_shutdown,
},
};
/*
 * Watch callback fired when the other end's "state" node changes.
 *
 * Events whose path does not start with the currently recorded other-end
 * path are ignored.  Otherwise the other end's state is read and passed to
 * the driver's otherend_changed hook, except once system_state has gone
 * past SYSTEM_RUNNING: then the only action taken is to close a frontend
 * whose other end has entered Closing.
 */
static void otherend_changed(struct xenbus_watch *watch,
			     const char **vec, unsigned int len)
{
	struct xenbus_device *dev =
		container_of(watch, struct xenbus_device, otherend_watch);
	struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
	enum xenbus_state state;

	/*
	 * Guard against watches firing on old details when the other-end
	 * details change, e.g. immediately after a resume.
	 */
	if (!dev->otherend ||
	    strncmp(dev->otherend, vec[XS_WATCH_PATH],
		    strlen(dev->otherend)) != 0) {
		dev_dbg(&dev->dev, "Ignoring watch at %s", vec[XS_WATCH_PATH]);
		return;
	}

	state = xenbus_read_driver_state(dev->otherend);

	dev_dbg(&dev->dev, "state is %d, (%s), %s, %s",
		state, xenbus_strstate(state), dev->otherend_watch.node,
		vec[XS_WATCH_PATH]);

	/*
	 * Ignore xenbus transitions during shutdown.  This avoids doing work
	 * that can fail, e.g. when the rootfs is gone.
	 */
	if (system_state > SYSTEM_RUNNING) {
		struct xen_bus_type *xbus =
			container_of(dev->dev.bus, struct xen_bus_type, bus);

		/*
		 * A frontend drives its state machine to Closed, which
		 * should cause the backend to release our resources.
		 */
		if (xbus == &xenbus_frontend && state == XenbusStateClosing)
			xenbus_frontend_closed(dev);
		return;
	}

	if (drv->otherend_changed)
		drv->otherend_changed(dev, state);
}
/*
 * Discard any existing other-end watch and details, then re-read the
 * details through the driver's read_otherend_details hook.  Returns that
 * hook's result.
 */
static int talk_to_otherend(struct xenbus_device *dev)
{
	struct xenbus_driver *xdrv;

	free_otherend_watch(dev);
	free_otherend_details(dev);

	xdrv = to_xenbus_driver(dev->dev.driver);
	return xdrv->read_otherend_details(dev);
}
/*
 * Register otherend_changed() as a watch on "<otherend>/state", using the
 * device's embedded otherend_watch.  Returns xenbus_watch_pathfmt()'s result.
 */
static int watch_otherend(struct xenbus_device *dev)
{
return xenbus_watch_pathfmt(dev, &dev->otherend_watch, otherend_changed,
"%s/%s", dev->otherend, "state");
}
/*
 * Bus probe callback.  Checks that the driver has a probe hook and an id
 * matching the device, reads the other end's details, calls the driver's
 * probe and finally installs the other-end state watch.
 *
 * Returns 0 on success.  Failures of talk_to_otherend()/watch_otherend()
 * are logged and returned as-is.  A missing probe hook, a missing id match
 * or a driver probe failure is reported through xenbus_dev_error(), the
 * device is switched to Closed and -ENODEV is returned.
 */
int xenbus_dev_probe(struct device *_dev)
{
	struct xenbus_device *xdev = to_xenbus_device(_dev);
	struct xenbus_driver *xdrv = to_xenbus_driver(_dev->driver);
	const struct xenbus_device_id *id;
	int err = -ENODEV;

	DPRINTK("%s", xdev->nodename);

	if (!xdrv->probe)
		goto fail;

	id = match_device(xdrv->ids, xdev);
	if (!id)
		goto fail;

	err = talk_to_otherend(xdev);
	if (err) {
		dev_warn(&xdev->dev, "talk_to_otherend on %s failed.\n",
			 xdev->nodename);
		return err;
	}

	err = xdrv->probe(xdev, id);
	if (err)
		goto fail;

	err = watch_otherend(xdev);
	if (err) {
		dev_warn(&xdev->dev, "watch_otherend on %s failed.\n",
			 xdev->nodename);
		return err;
	}

	return 0;

fail:
	xenbus_dev_error(xdev, err, "xenbus_dev_probe on %s", xdev->nodename);
	xenbus_switch_state(xdev, XenbusStateClosed);
	return -ENODEV;
}
int xenbus_dev_remove(struct device *_dev)
{
struct xenbus_device *dev = to_xenbus_device(_dev);
struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
DPRINTK("%s", dev->nodename);
free_otherend_watch(dev);
free_otherend_details(dev);
if (drv->remove)
drv->remove(dev);
xenbus_switch_state(dev, XenbusStateClosed);
return 0;
}
/*
 * Bus shutdown callback.  A Connected device is switched to Closing and we
 * wait up to 5 seconds for its "down" completion, logging a timeout.
 * Devices in any other state are skipped with an informational message.
 * A device reference is held for the duration of the call.
 */
static void xenbus_dev_shutdown(struct device *_dev)
{
	struct xenbus_device *xdev = to_xenbus_device(_dev);
	unsigned long remaining;

	DPRINTK("%s", xdev->nodename);

	get_device(&xdev->dev);

	if (xdev->state == XenbusStateConnected) {
		xenbus_switch_state(xdev, XenbusStateClosing);
		remaining = wait_for_completion_timeout(&xdev->down, 5*HZ);
		if (remaining == 0)
			printk(KERN_INFO "%s: %s timeout closing device\n",
			       __func__, xdev->nodename);
	} else {
		printk(KERN_INFO "%s: %s: %s != Connected, skipping\n", __func__,
		       xdev->nodename, xenbus_strstate(xdev->state));
	}

	put_device(&xdev->dev);
}
int xenbus_register_driver_common(struct xenbus_driver *drv,
struct xen_bus_type *bus,
struct module *owner,
const char *mod_name)
{
drv->driver.name = drv->name;
drv->driver.bus = &bus->bus;
drv->driver.owner = owner;
drv->driver.mod_name = mod_name;
return driver_register(&drv->driver);
}
/*
 * Register a frontend driver on the frontend bus.  The driver's
 * read_otherend_details hook is pointed at read_backend_details().
 * Returns 0 on success or the registration error.
 */
int __xenbus_register_frontend(struct xenbus_driver *drv,
			       struct module *owner, const char *mod_name)
{
	int err;

	drv->read_otherend_details = read_backend_details;

	err = xenbus_register_driver_common(drv, &xenbus_frontend,
					    owner, mod_name);
	/* If this driver is loaded as a module wait for devices to attach. */
	if (err == 0)
		wait_for_devices(drv);

	return err;
}
EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
/* Unregister the driver's embedded struct device_driver. */
void xenbus_unregister_driver(struct xenbus_driver *drv)
{
driver_unregister(&drv->driver);
}
EXPORT_SYMBOL_GPL(xenbus_unregister_driver);
/*
 * Search context handed to the bus_for_each_dev() callbacks cmp_dev() and
 * cleanup_dev(): nodename is the path looked for, dev receives the match.
 */
struct xb_find_info
{
struct xenbus_device *dev;
const char *nodename;
};
/*
 * bus_for_each_dev() callback: if the device's nodename equals
 * info->nodename, record it in info->dev, take a device reference and
 * return 1; otherwise return 0.
 */
static int cmp_dev(struct device *dev, void *data)
{
	struct xb_find_info *info = data;
	struct xenbus_device *candidate = to_xenbus_device(dev);

	if (strcmp(candidate->nodename, info->nodename) != 0)
		return 0;

	info->dev = candidate;
	get_device(dev);
	return 1;
}
struct xenbus_device *xenbus_device_find(const char *nodename,
struct bus_type *bus)
{
struct xb_find_info info = { .dev = NULL, .nodename = nodename };
bus_for_each_dev(bus, NULL, &info, cmp_dev);
return info.dev;
}
/*
 * bus_for_each_dev() callback used when a store path disappears.  Matches a
 * device whose nodename is info->nodename itself or lies below it as a
 * subdirectory.  On a match the device is recorded in info->dev, a
 * reference is taken and 1 is returned; otherwise 0.
 */
static int cleanup_dev(struct device *dev, void *data)
{
	struct xenbus_device *xendev = to_xenbus_device(dev);
	struct xb_find_info *info = data;
	size_t prefix_len = strlen(info->nodename);
	char next;

	DPRINTK("%s", info->nodename);

	/* The device's nodename must start with the path being removed. */
	if (strncmp(xendev->nodename, info->nodename, prefix_len) != 0)
		return 0;

	/*
	 * After a prefix match the next character is either the end of the
	 * string (exact match) or must be '/' (a real subdirectory).
	 */
	next = xendev->nodename[prefix_len];
	if (next != '\0' && next != '/')
		return 0;

	info->dev = xendev;
	get_device(dev);
	return 1;
}
/*
 * Unregister every device on @bus located at or below @path.  Devices are
 * found one at a time with cleanup_dev(); each is unregistered and the
 * reference taken by the search is dropped, until no match remains.
 */
static void xenbus_cleanup_devices(const char *path, struct bus_type *bus)
{
	struct xb_find_info info = { .nodename = path };

	for (;;) {
		info.dev = NULL;
		bus_for_each_dev(bus, NULL, &info, cleanup_dev);
		if (!info.dev)
			break;

		device_unregister(&info.dev->dev);
		put_device(&info.dev->dev);
	}
}
/*
 * Device release callback: frees the xenbus_device that embeds @dev.
 * A NULL @dev is ignored.
 */
static void xenbus_dev_release(struct device *dev)
{
	if (!dev)
		return;

	kfree(to_xenbus_device(dev));
}
static ssize_t xendev_show_nodename(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
}
DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
static ssize_t xendev_show_devtype(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
}
DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
/*
 * Create and register a xenbus device for the store node @nodename of the
 * given @type on @bus.
 *
 * Nodes whose state is not Initialising are ignored (returns 0).  The
 * device, its nodename and its devicetype share one allocation, which is
 * released by xenbus_dev_release() when the last device reference is
 * dropped.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the error from
 * get_bus_id / device_register / device_create_file.
 */
int xenbus_probe_node(struct xen_bus_type *bus,
		      const char *type,
		      const char *nodename)
{
	int err;
	struct xenbus_device *xendev;
	size_t stringlen;
	char *tmpstring;
	enum xenbus_state state = xenbus_read_driver_state(nodename);

	if (state != XenbusStateInitialising) {
		/*
		 * Device is not new, so ignore it.  This can happen if a
		 * device is going away after switching to Closed.
		 */
		return 0;
	}

	stringlen = strlen(nodename) + 1 + strlen(type) + 1;
	xendev = kzalloc(sizeof(*xendev) + stringlen, GFP_KERNEL);
	if (!xendev)
		return -ENOMEM;

	xendev->state = XenbusStateInitialising;

	/* Copy the strings into the extra space. */
	tmpstring = (char *)(xendev + 1);
	strcpy(tmpstring, nodename);
	xendev->nodename = tmpstring;

	tmpstring += strlen(tmpstring) + 1;
	strcpy(tmpstring, type);
	xendev->devicetype = tmpstring;
	init_completion(&xendev->down);

	xendev->dev.bus = &bus->bus;
	xendev->dev.release = xenbus_dev_release;

	err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename);
	if (err)
		goto fail_free;

	/* Register with generic device framework. */
	err = device_register(&xendev->dev);
	if (err)
		goto fail_put;

	err = device_create_file(&xendev->dev, &dev_attr_nodename);
	if (err)
		goto fail_unregister;

	err = device_create_file(&xendev->dev, &dev_attr_devtype);
	if (err)
		goto fail_remove_file;

	return 0;

fail_remove_file:
	device_remove_file(&xendev->dev, &dev_attr_nodename);
fail_unregister:
	/*
	 * device_unregister() drops the registration reference; xendev is
	 * freed by xenbus_dev_release() once the last reference goes away.
	 * Freeing it here as well would be a double free.
	 */
	device_unregister(&xendev->dev);
	return err;
fail_put:
	/*
	 * Once device_register() has been called the device is reference
	 * counted, so give it up with put_device() rather than kfree().
	 */
	put_device(&xendev->dev);
	return err;
fail_free:
	/* The device was never handed to the driver core: plain free. */
	kfree(xendev);
	return err;
}
/* device/<typename>/<name> */
static int xenbus_probe_frontend(const char *type, const char *name)
{
char *nodename;
int err;
nodename = kasprintf(GFP_KERNEL, "%s/%s/%s",
xenbus_frontend.root, type, name);
if (!nodename)
return -ENOMEM;
DPRINTK("%s", nodename);
err = xenbus_probe_node(&xenbus_frontend, type, nodename);
kfree(nodename);
return err;
}
/*
 * Probe every entry listed under "<bus root>/<type>" with the bus's probe
 * hook, stopping at the first error.  Returns 0 or that first error; a
 * failure to list the directory is returned as well.
 */
static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
{
	char **entries;
	unsigned int idx;
	unsigned int count = 0;
	int err = 0;

	entries = xenbus_directory(XBT_NIL, bus->root, type, &count);
	if (IS_ERR(entries))
		return PTR_ERR(entries);

	for (idx = 0; idx < count && err == 0; idx++)
		err = bus->probe(type, entries[idx]);

	kfree(entries);
	return err;
}
/*
 * Enumerate the device types listed under the bus root and probe each one
 * with xenbus_probe_device_type(), stopping at the first error.  Returns 0
 * or that first error; a failure to list the root is returned as well.
 */
int xenbus_probe_devices(struct xen_bus_type *bus)
{
	char **types;
	unsigned int idx, count;
	int err = 0;

	types = xenbus_directory(XBT_NIL, bus->root, "", &count);
	if (IS_ERR(types))
		return PTR_ERR(types);

	for (idx = 0; idx < count && err == 0; idx++)
		err = xenbus_probe_device_type(bus, types[idx]);

	kfree(types);
	return err;
}
/* Count how many times the character @c occurs in the string @str. */
static unsigned int char_count(const char *str, char c)
{
	unsigned int hits = 0;
	const char *p;

	for (p = str; *p != '\0'; p++) {
		if (*p == c)
			hits++;
	}
	return hits;
}
/*
 * Return the length of the prefix of @str that spans @len separators @c,
 * i.e. the offset of the (@len + 1)-th occurrence of @c.  If the string
 * contains exactly @len separators the whole string length is returned;
 * if it contains fewer, -ERANGE.
 */
static int strsep_len(const char *str, char c, unsigned int len)
{
	unsigned int pos = 0;

	while (str[pos] != '\0') {
		if (str[pos] == c) {
			if (len == 0)
				return pos;
			len--;
		}
		pos++;
	}

	if (len != 0)
		return -ERANGE;
	return pos;
}
/*
 * React to a change at store path @node on @bus.
 *
 * Paths with fewer than two '/' are ignored.  If the node no longer exists,
 * every device at or below it is unregistered.  Otherwise the device root
 * (the first bus->levels + 1 path components) is computed and, unless a
 * device with that nodename already exists, a new one is probed.
 */
void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
{
	char type[BUS_ID_SIZE];
	struct xenbus_device *dev;
	const char *type_start, *root;
	int rootlen;

	if (char_count(node, '/') < 2)
		return;

	if (!xenbus_exists(XBT_NIL, node, "")) {
		xenbus_cleanup_devices(node, &bus->bus);
		return;
	}

	/* backend/<type>/... or device/<type>/... */
	type_start = strchr(node, '/') + 1;
	snprintf(type, BUS_ID_SIZE, "%.*s",
		 (int)strcspn(type_start, "/"), type_start);
	type[BUS_ID_SIZE-1] = '\0';

	rootlen = strsep_len(node, '/', bus->levels);
	if (rootlen < 0)
		return;

	root = kasprintf(GFP_KERNEL, "%.*s", rootlen, node);
	if (!root)
		return;

	dev = xenbus_device_find(root, &bus->bus);
	if (dev)
		put_device(&dev->dev);	/* already known: drop lookup ref */
	else
		xenbus_probe_node(bus, type, root);

	kfree(root);
}
/*
 * Watch callback for the frontend root: forwards the changed path to
 * xenbus_dev_changed() for the frontend bus.
 */
static void frontend_changed(struct xenbus_watch *watch,
const char **vec, unsigned int len)
{
DPRINTK("");
xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
}
/*
 * We watch for devices appearing and vanishing: a watch on the "device"
 * node, registered from xenbus_probe(), that calls frontend_changed().
 */
static struct xenbus_watch fe_watch = {
.node = "device",
.callback = frontend_changed,
};
/*
 * bus_for_each_dev() callback: call the bound driver's suspend hook, if
 * any.  A failure is logged but never propagated; always returns 0 so the
 * iteration continues.
 */
static int suspend_dev(struct device *dev, void *data)
{
	struct xenbus_driver *drv;
	struct xenbus_device *xdev;
	int err;

	DPRINTK("");

	if (!dev->driver)
		return 0;

	drv = to_xenbus_driver(dev->driver);
	xdev = container_of(dev, struct xenbus_device, dev);
	if (!drv->suspend)
		return 0;

	err = drv->suspend(xdev);
	if (err)
		printk(KERN_WARNING
		       "xenbus: suspend %s failed: %i\n", dev->bus_id, err);
	return 0;
}
/*
 * bus_for_each_dev() callback: call the bound driver's suspend_cancel hook,
 * if any.  A failure is logged but never propagated; always returns 0 so
 * the iteration continues.
 */
static int suspend_cancel_dev(struct device *dev, void *data)
{
	struct xenbus_driver *drv;
	struct xenbus_device *xdev;
	int err;

	DPRINTK("");

	if (!dev->driver)
		return 0;

	drv = to_xenbus_driver(dev->driver);
	xdev = container_of(dev, struct xenbus_device, dev);
	if (!drv->suspend_cancel)
		return 0;

	err = drv->suspend_cancel(xdev);
	if (err)
		printk(KERN_WARNING
		       "xenbus: suspend_cancel %s failed: %i\n",
		       dev->bus_id, err);
	return 0;
}
/*
 * bus_for_each_dev() callback run on resume for each device that has a
 * driver bound: re-read the other end's details, reset the local state to
 * Initialising, call the driver's resume hook (if any) and re-install the
 * other-end watch.  The first failing step is logged and its error
 * returned; 0 on success or when no driver is bound.
 */
static int resume_dev(struct device *dev, void *data)
{
	struct xenbus_driver *drv;
	struct xenbus_device *xdev;
	int err;

	DPRINTK("");

	if (!dev->driver)
		return 0;

	drv = to_xenbus_driver(dev->driver);
	xdev = container_of(dev, struct xenbus_device, dev);

	err = talk_to_otherend(xdev);
	if (err != 0) {
		printk(KERN_WARNING
		       "xenbus: resume (talk_to_otherend) %s failed: %i\n",
		       dev->bus_id, err);
		return err;
	}

	xdev->state = XenbusStateInitialising;

	err = drv->resume ? drv->resume(xdev) : 0;
	if (err != 0) {
		printk(KERN_WARNING
		       "xenbus: resume %s failed: %i\n",
		       dev->bus_id, err);
		return err;
	}

	err = watch_otherend(xdev);
	if (err != 0) {
		printk(KERN_WARNING
		       "xenbus_probe: resume (watch_otherend) %s failed: "
		       "%d.\n", dev->bus_id, err);
		return err;
	}

	return 0;
}
/*
 * Suspend entry point: run suspend_dev() on every frontend device, hand the
 * same callback to xenbus_backend_suspend(), then call xs_suspend().
 */
void xenbus_suspend(void)
{
DPRINTK("");
bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
xenbus_backend_suspend(suspend_dev);
xs_suspend();
}
EXPORT_SYMBOL_GPL(xenbus_suspend);
/*
 * Resume entry point: re-initialise the comms layer and xenstore client
 * (xb_init_comms(), xs_resume()) first, then run resume_dev() on every
 * frontend device and hand it to xenbus_backend_resume().
 */
void xenbus_resume(void)
{
xb_init_comms();
xs_resume();
bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
xenbus_backend_resume(resume_dev);
}
EXPORT_SYMBOL_GPL(xenbus_resume);
/*
 * Called when a suspend is cancelled: call xs_suspend_cancel(), then run
 * suspend_cancel_dev() on every frontend device and hand it to
 * xenbus_backend_resume().
 */
void xenbus_suspend_cancel(void)
{
xs_suspend_cancel();
bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev);
xenbus_backend_resume(suspend_cancel_dev);
}
EXPORT_SYMBOL_GPL(xenbus_suspend_cancel);
/*
 * A flag to determine if xenstored is 'ready' (i.e. has started).
 * Set to 1 in xenbus_probe_init() when not running as the initial domain.
 */
int xenstored_ready = 0;
/*
 * Ask to be told when xenstore is up.  If it already is (xenstored_ready
 * > 0) the notifier is called immediately and its result returned;
 * otherwise it is added to the xenstore notifier chain and 0 is returned.
 */
int register_xenstore_notifier(struct notifier_block *nb)
{
	if (xenstored_ready > 0)
		return nb->notifier_call(nb, 0, NULL);

	blocking_notifier_chain_register(&xenstore_chain, nb);
	return 0;
}
EXPORT_SYMBOL_GPL(register_xenstore_notifier);
/* Remove a notifier block from the xenstore notifier chain. */
void unregister_xenstore_notifier(struct notifier_block *nb)
{
blocking_notifier_chain_unregister(&xenstore_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
/*
 * Enumerate the devices currently in xenstore, start watching for changes
 * and notify everyone on the xenstore notifier chain.  Must only be called
 * once xenstored_ready is positive (enforced with BUG_ON).  The work_struct
 * argument is not used.
 */
void xenbus_probe(struct work_struct *unused)
{
BUG_ON((xenstored_ready <= 0));
/* Enumerate devices in xenstore and watch for changes. */
xenbus_probe_devices(&xenbus_frontend);
register_xenbus_watch(&fe_watch);
xenbus_backend_probe_and_watch();
/* Notify others that xenstore is up */
blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
}
/*
 * Initcall: register the frontend (and, if configured, backend) bus, locate
 * the xenstore page and event channel, bring up the xenstore client and,
 * when not running as the initial domain, probe the devices.
 *
 * Returns 0 on success, -ENODEV when not running on Xen, or the error of
 * the failing registration / xs_init() step (with earlier registrations
 * undone).
 */
static int __init xenbus_probe_init(void)
{
	int err;

	DPRINTK("");

	if (!is_running_on_xen())
		return -ENODEV;

	/* Register ourselves with the kernel bus subsystem */
	err = bus_register(&xenbus_frontend.bus);
	if (err)
		return err;

	err = xenbus_backend_bus_register();
	if (err)
		goto out_unreg_front;

	/*
	 * Domain0 doesn't have a store_evtchn or store_mfn yet, and dom0 is
	 * not yet supported, so only other domains pick them up here.
	 */
	if (!is_initial_xendomain()) {
		xenstored_ready = 1;
		xen_store_evtchn = xen_start_info->store_evtchn;
		xen_store_mfn = xen_start_info->store_mfn;
	}

	xen_store_interface = mfn_to_virt(xen_store_mfn);

	/* Initialize the interface to xenstore. */
	err = xs_init();
	if (err) {
		printk(KERN_WARNING
		       "XENBUS: Error initializing xenstore comms: %i\n", err);
		goto out_unreg_back;
	}

	if (!is_initial_xendomain())
		xenbus_probe(NULL);

	return 0;

out_unreg_back:
	xenbus_backend_bus_unregister();
out_unreg_front:
	bus_unregister(&xenbus_frontend.bus);
	return err;
}
postcore_initcall(xenbus_probe_init);
MODULE_LICENSE("GPL");
static int is_disconnected_device(struct device *dev, void *data)
{
struct xenbus_device *xendev = to_xenbus_device(dev);
struct device_driver *drv = data;
/*
* A device with no driver will never connect. We care only about
* devices which should currently be in the process of connecting.
*/
if (!dev->driver)
return 0;
/* Is this search limited to a particular driver? */
if (drv && (dev->driver != drv))
return 0;
return (xendev->state != XenbusStateConnected);
}
/*
 * Run is_disconnected_device() over the frontend bus, optionally limited
 * to @drv (NULL means any driver), and return bus_for_each_dev()'s result.
 */
static int exists_disconnected_device(struct device_driver *drv)
{
return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
is_disconnected_device);
}
/*
 * bus_for_each_dev() callback used after waiting for devices: log devices
 * that have no driver (info) or that failed to reach Connected (warning).
 * @data optionally restricts the report to one driver.  Always returns 0.
 */
static int print_device_status(struct device *dev, void *data)
{
	struct xenbus_device *xendev = to_xenbus_device(dev);
	struct device_driver *wanted = data;

	/* Is this operation limited to a particular driver? */
	if (wanted && dev->driver != wanted)
		return 0;

	if (!dev->driver) {
		/* Information only: is this too noisy? */
		printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
		       xendev->nodename);
		return 0;
	}

	if (xendev->state != XenbusStateConnected)
		printk(KERN_WARNING "XENBUS: Timeout connecting "
		       "to device: %s (state %d)\n",
		       xendev->nodename, xendev->state);
	return 0;
}
/*
 * We only wait for device setup after most initcalls have run: this flag
 * is set by boot_wait_for_devices() and checked by wait_for_devices().
 */
static int ready_to_wait_for_devices;
/*
 * Wait, for at most 10 seconds, until no frontend device with a driver
 * bound (optionally only those of @xendrv; NULL means all drivers) is left
 * in a state other than Connected, then report the devices that did not
 * make it.  This is needed to guarantee that the filesystems and / or
 * network devices required for boot are available before boot proceeds.
 *
 * Nothing happens until ready_to_wait_for_devices has been set (from a
 * late_initcall, i.e. after the frontend drivers have been initialised but
 * before the root fs is mounted) or when not running on Xen.
 *
 * A possible improvement would be a per-device flag in the store entry
 * saying whether the device is needed at boot time; that would let people
 * who know what they are doing speed up boot slightly, but it needs tools
 * or manual intervention to set those flags correctly.
 */
static void wait_for_devices(struct xenbus_driver *xendrv)
{
	unsigned long deadline = jiffies + 10*HZ;
	struct device_driver *drv = NULL;

	if (xendrv)
		drv = &xendrv->driver;

	if (!ready_to_wait_for_devices || !is_running_on_xen())
		return;

	/* Poll every HZ/10 ticks until all are connected or time is up. */
	while (exists_disconnected_device(drv) &&
	       !time_after(jiffies, deadline))
		schedule_timeout_interruptible(HZ/10);

	bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
			 print_device_status);
}
#ifndef MODULE
/*
 * Late initcall (built-in only): enable waiting and wait for the devices
 * of all drivers to connect.  Always returns 0.
 */
static int __init boot_wait_for_devices(void)
{
ready_to_wait_for_devices = 1;
wait_for_devices(NULL);
return 0;
}
late_initcall(boot_wait_for_devices);
#endif

View File

@ -0,0 +1,74 @@
/******************************************************************************
* xenbus_probe.h
*
* Talks to Xen Store to figure out what devices we have.
*
* Copyright (C) 2005 Rusty Russell, IBM Corporation
* Copyright (C) 2005 XenSource Ltd.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef _XENBUS_PROBE_H
#define _XENBUS_PROBE_H
/*
 * Backend bus hooks.  With CONFIG_XEN_BACKEND they are provided elsewhere;
 * without it they collapse to empty inline stubs (bus_register reports
 * success) so callers need no #ifdefs.
 */
#ifdef CONFIG_XEN_BACKEND
extern void xenbus_backend_suspend(int (*fn)(struct device *, void *));
extern void xenbus_backend_resume(int (*fn)(struct device *, void *));
extern void xenbus_backend_probe_and_watch(void);
extern int xenbus_backend_bus_register(void);
extern void xenbus_backend_bus_unregister(void);
#else
static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {}
static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {}
static inline void xenbus_backend_probe_and_watch(void) {}
static inline int xenbus_backend_bus_register(void) { return 0; }
static inline void xenbus_backend_bus_unregister(void) {}
#endif
/*
 * Describes one xenbus bus flavour: the store directory its devices live
 * under, how deep device nodes sit below it, and the hooks used to name
 * and probe them, wrapped around the generic struct bus_type.
 */
struct xen_bus_type
{
/* Top-level store directory for this bus. */
char *root;
/* Number of path levels used to locate a device node below root. */
unsigned int levels;
/* Derive a device bus id from its store node name; 0 or -errno. */
int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename);
/* Probe the entry @dir of device type @type; 0 or -errno. */
int (*probe)(const char *type, const char *dir);
struct bus_type bus;
};
/* Generic bus callbacks and helpers implemented in xenbus_probe.c. */
extern int xenbus_match(struct device *_dev, struct device_driver *_drv);
extern int xenbus_dev_probe(struct device *_dev);
extern int xenbus_dev_remove(struct device *_dev);
extern int xenbus_register_driver_common(struct xenbus_driver *drv,
struct xen_bus_type *bus,
struct module *owner,
const char *mod_name);
extern int xenbus_probe_node(struct xen_bus_type *bus,
const char *type,
const char *nodename);
extern int xenbus_probe_devices(struct xen_bus_type *bus);
extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus);
#endif

View File

@ -0,0 +1,861 @@
/******************************************************************************
* xenbus_xs.c
*
* This is the kernel equivalent of the "xs" library. We don't need everything
* and we use xenbus_comms for communication.
*
* Copyright (C) 2005 Rusty Russell, IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <linux/unistd.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/uio.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/fcntl.h>
#include <linux/kthread.h>
#include <linux/rwsem.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <xen/xenbus.h>
#include "xenbus_comms.h"
/*
 * A message received from xenstore and queued for later consumption: the
 * wire header plus either a reply body or a decoded watch event.
 */
struct xs_stored_msg {
struct list_head list;
struct xsd_sockmsg hdr;
union {
/* Queued replies. */
struct {
char *body;
} reply;
/* Queued watch events. */
struct {
struct xenbus_watch *handle;
char **vec;
unsigned int vec_size;
} watch;
} u;
};
/* State of the xenstore client: the reply queue and the locks around it. */
struct xs_handle {
/* A list of replies. Currently only one will ever be outstanding. */
struct list_head reply_list;
/* Protects reply_list. */
spinlock_t reply_lock;
/* Woken when a reply is queued on reply_list. */
wait_queue_head_t reply_waitq;
/*
* Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex.
* response_mutex is never taken simultaneously with the other three.
*/
/* One request at a time. */
struct mutex request_mutex;
/* Protect xenbus reader thread against save/restore. */
struct mutex response_mutex;
/* Protect transactions against save/restore. */
struct rw_semaphore transaction_mutex;
/* Protect watch (de)register against save/restore. */
struct rw_semaphore watch_mutex;
};
/* The single xenstore client state used by this file. */
static struct xs_handle xs_state;
/* List of registered watches, and a lock to protect it. */
static LIST_HEAD(watches);
static DEFINE_SPINLOCK(watches_lock);
/* List of pending watch callback events, and a lock to protect it. */
static LIST_HEAD(watch_events);
static DEFINE_SPINLOCK(watch_events_lock);
/*
* Details of the xenwatch callback kernel thread. The thread waits on the
* watch_events_waitq for work to do (queued on watch_events list). When it
* wakes up it acquires the xenwatch_mutex before reading the list and
* carrying out work.
*
* NOTE(review): xenwatch_pid is presumably the pid of that thread; its
* assignment is not visible in this part of the file.
*/
static pid_t xenwatch_pid;
static DEFINE_MUTEX(xenwatch_mutex);
static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
/*
 * Translate an error string returned by xenstore into the matching
 * (positive) errno value from the xsd_errors table.  Unknown strings are
 * logged and mapped to EINVAL.
 */
static int get_error(const char *errorstring)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(xsd_errors); i++) {
		if (strcmp(errorstring, xsd_errors[i].errstring) == 0)
			return xsd_errors[i].errnum;
	}

	printk(KERN_WARNING
	       "XENBUS xen store gave: unknown error %s",
	       errorstring);
	return EINVAL;
}
static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
{
struct xs_stored_msg *msg;
char *body;
spin_lock(&xs_state.reply_lock);
while (list_empty(&xs_state.reply_list)) {
spin_unlock(&xs_state.reply_lock);
/* XXX FIXME: Avoid synchronous wait for response here. */
wait_event(xs_state.reply_waitq,
!list_empty(&xs_state.reply_list));
spin_lock(&xs_state.reply_lock);
}
msg = list_entry(xs_state.reply_list.next,
struct xs_stored_msg, list);
list_del(&msg->list);
spin_unlock(&xs_state.reply_lock);
*type = msg->hdr.type;
if (len)
*len = msg->hdr.len;
body = msg->u.reply.body;
kfree(msg);
return body;
}
/*
 * Send a pre-built request (header immediately followed by msg->len bytes
 * of payload) to xenstore and wait for the reply.
 *
 * On return msg->type and msg->len describe the reply; the reply body is
 * returned.  If the write fails, msg->type is set to XS_ERROR and an
 * ERR_PTR() is returned.
 *
 * transaction_mutex is taken for reading when a transaction is started and
 * released again when the transaction start fails or when a reply of type
 * XS_TRANSACTION_END is seen.
 */
void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
{
void *ret;
/* Snapshot the request header: *msg is overwritten with the reply's. */
struct xsd_sockmsg req_msg = *msg;
int err;
if (req_msg.type == XS_TRANSACTION_START)
down_read(&xs_state.transaction_mutex);
mutex_lock(&xs_state.request_mutex);
err = xb_write(msg, sizeof(*msg) + msg->len);
if (err) {
msg->type = XS_ERROR;
ret = ERR_PTR(err);
} else
ret = read_reply(&msg->type, &msg->len);
mutex_unlock(&xs_state.request_mutex);
/*
* NOTE(review): the first test looks at the *reply* type, so an
* XS_TRANSACTION_END request that fails (reply or local type XS_ERROR)
* does not release transaction_mutex here -- confirm this is intended.
*/
if ((msg->type == XS_TRANSACTION_END) ||
((req_msg.type == XS_TRANSACTION_START) &&
(msg->type == XS_ERROR)))
up_read(&xs_state.transaction_mutex);
return ret;
}
/*
 * Send a request made of @num_vecs payload fragments to xenstore and return
 * the kmalloc'ed reply body, storing its length in *@len when the reply
 * layer is given a non-NULL @len.
 *
 * Returns ERR_PTR() on error: a write failure, an XS_ERROR reply (mapped
 * through get_error()), or -EINVAL when the reply type differs from the
 * request type.
 */
static void *xs_talkv(struct xenbus_transaction t,
		      enum xsd_sockmsg_type type,
		      const struct kvec *iovec,
		      unsigned int num_vecs,
		      unsigned int *len)
{
	struct xsd_sockmsg msg;
	void *ret = NULL;
	unsigned int i;
	int err;

	msg.tx_id = t.id;
	msg.req_id = 0;
	msg.type = type;
	msg.len = 0;
	for (i = 0; i < num_vecs; i++)
		msg.len += iovec[i].iov_len;

	mutex_lock(&xs_state.request_mutex);

	/* Header first, then each fragment; stop at the first failure. */
	err = xb_write(&msg, sizeof(msg));
	for (i = 0; !err && i < num_vecs; i++)
		err = xb_write(iovec[i].iov_base, iovec[i].iov_len);
	if (err) {
		mutex_unlock(&xs_state.request_mutex);
		return ERR_PTR(err);
	}

	ret = read_reply(&msg.type, len);

	mutex_unlock(&xs_state.request_mutex);

	if (IS_ERR(ret))
		return ret;

	if (msg.type == XS_ERROR) {
		/* The body of an error reply is the error name. */
		err = get_error(ret);
		kfree(ret);
		return ERR_PTR(-err);
	}

	if (msg.type == type)
		return ret;

	if (printk_ratelimit())
		printk(KERN_WARNING
		       "XENBUS unexpected type [%d], expected [%d]\n",
		       msg.type, type);
	kfree(ret);
	return ERR_PTR(-EINVAL);
}
/* Simplified version of xs_talkv: single message. */
static void *xs_single(struct xenbus_transaction t,
enum xsd_sockmsg_type type,
const char *string,
unsigned int *len)
{
struct kvec iovec;
iovec.iov_base = (void *)string;
iovec.iov_len = strlen(string) + 1;
return xs_talkv(t, type, &iovec, 1, len);
}
/*
 * Many commands only need an ack and don't care what it says: turn a reply
 * pointer into a status code.  An ERR_PTR() yields its error; any other
 * reply is freed and 0 is returned.
 */
static int xs_error(char *reply)
{
	int status = 0;

	if (IS_ERR(reply))
		status = PTR_ERR(reply);
	else
		kfree(reply);

	return status;
}
/*
 * Count the NUL-terminated strings packed back to back in the first @len
 * bytes of @strings.
 */
static unsigned int count_strings(const char *strings, unsigned int len)
{
	const char *end = strings + len;
	const char *cur = strings;
	unsigned int count = 0;

	while (cur < end) {
		count++;
		cur += strlen(cur) + 1;
	}
	return count;
}
/* Return the path to dir with /name appended. Buffer must be kfree()'ed. */
static char *join(const char *dir, const char *name)
{
char *buffer;
if (strlen(name) == 0)
buffer = kasprintf(GFP_KERNEL, "%s", dir);
else
buffer = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
return (!buffer) ? ERR_PTR(-ENOMEM) : buffer;
}
/*
 * Turn a buffer of @len bytes holding back-to-back NUL-terminated strings
 * into an array of string pointers.  The pointer array and a copy of the
 * string data share one allocation, so a single kfree() of the result
 * releases everything.  @strings is always freed.  *@num receives the
 * number of strings.  Returns ERR_PTR(-ENOMEM) on allocation failure.
 */
static char **split(char *strings, unsigned int len, unsigned int *num)
{
	char **vec;
	char *copy, *cur;
	unsigned int filled;

	/* Count the strings. */
	*num = count_strings(strings, len);

	/* Transfer to one big alloc for easy freeing. */
	vec = kmalloc(*num * sizeof(char *) + len, GFP_KERNEL);
	if (vec == NULL) {
		kfree(strings);
		return ERR_PTR(-ENOMEM);
	}

	/* The string data lives right behind the pointer array. */
	copy = (char *)&vec[*num];
	memcpy(copy, strings, len);
	kfree(strings);

	filled = 0;
	cur = copy;
	while (cur < copy + len) {
		vec[filled++] = cur;
		cur += strlen(cur) + 1;
	}
	*num = filled;

	return vec;
}
/*
 * List the entries of the store directory "<dir>/<node>".  Returns an array
 * of *@num names in a single allocation (release with one kfree()), or an
 * ERR_PTR() on failure.
 */
char **xenbus_directory(struct xenbus_transaction t,
			const char *dir, const char *node, unsigned int *num)
{
	unsigned int reply_len;
	char *reply;
	char *path = join(dir, node);

	if (IS_ERR(path))
		return (char **)path;

	reply = xs_single(t, XS_DIRECTORY, path, &reply_len);
	kfree(path);
	if (IS_ERR(reply))
		return (char **)reply;

	return split(reply, reply_len, num);
}
EXPORT_SYMBOL_GPL(xenbus_directory);
/*
 * Check if the path "<dir>/<node>" exists in the store.
 * Returns 1 if it does, 0 if listing it fails for any reason.
 */
int xenbus_exists(struct xenbus_transaction t,
		  const char *dir, const char *node)
{
	char **d;
	/* xenbus_directory() takes an unsigned int * for the entry count. */
	unsigned int dir_n;

	d = xenbus_directory(t, dir, node, &dir_n);
	if (IS_ERR(d))
		return 0;

	kfree(d);
	return 1;
}
EXPORT_SYMBOL_GPL(xenbus_exists);
/*
 * Get the value of the single store node "<dir>/<node>".
 * Returns a kmalloc'ed value (call kfree() on it after use) or an
 * ERR_PTR() on failure.  @len, when given, receives the length in bytes.
 */
void *xenbus_read(struct xenbus_transaction t,
		  const char *dir, const char *node, unsigned int *len)
{
	void *value;
	char *path = join(dir, node);

	if (IS_ERR(path))
		return (void *)path;

	value = xs_single(t, XS_READ, path, len);
	kfree(path);
	return value;
}
EXPORT_SYMBOL_GPL(xenbus_read);
/*
 * Write @string as the value of the store node "<dir>/<node>".  The path is
 * sent with its terminating NUL, the value without.
 * Returns 0 on success or -err on failure.
 */
int xenbus_write(struct xenbus_transaction t,
		 const char *dir, const char *node, const char *string)
{
	struct kvec parts[2];
	const char *path;
	int err;

	path = join(dir, node);
	if (IS_ERR(path))
		return PTR_ERR(path);

	parts[0].iov_base = (void *)path;
	parts[0].iov_len = strlen(path) + 1;
	parts[1].iov_base = (void *)string;
	parts[1].iov_len = strlen(string);

	err = xs_error(xs_talkv(t, XS_WRITE, parts, ARRAY_SIZE(parts), NULL));
	kfree(path);
	return err;
}
EXPORT_SYMBOL_GPL(xenbus_write);
/*
 * Create a new directory at dir/node.
 * Returns 0 on success, -err on failure.
 */
int xenbus_mkdir(struct xenbus_transaction t,
		 const char *dir, const char *node)
{
	int rc;
	char *full_path = join(dir, node);

	if (IS_ERR(full_path))
		return PTR_ERR(full_path);

	rc = xs_error(xs_single(t, XS_MKDIR, full_path, NULL));

	kfree(full_path);
	return rc;
}
EXPORT_SYMBOL_GPL(xenbus_mkdir);
/*
 * Destroy a file or directory (directories must be empty).
 * Returns 0 on success, -err on failure.
 */
int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node)
{
	int rc;
	char *full_path = join(dir, node);

	if (IS_ERR(full_path))
		return PTR_ERR(full_path);

	rc = xs_error(xs_single(t, XS_RM, full_path, NULL));

	kfree(full_path);
	return rc;
}
EXPORT_SYMBOL_GPL(xenbus_rm);
/*
 * Start a transaction: changes by others will not be seen during this
 * transaction, and changes will not be visible to others until end.
 *
 * On success the new transaction id is stored in t->id, 0 is returned and
 * xs_state.transaction_mutex stays read-held (xenbus_transaction_end()
 * drops it).  On failure the lock is released and -err is returned.
 */
int xenbus_transaction_start(struct xenbus_transaction *t)
{
	char *reply;

	down_read(&xs_state.transaction_mutex);

	reply = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL);
	if (!IS_ERR(reply)) {
		/* The reply body is the transaction id in ascii. */
		t->id = simple_strtoul(reply, NULL, 0);
		kfree(reply);
		return 0;
	}

	up_read(&xs_state.transaction_mutex);
	return PTR_ERR(reply);
}
EXPORT_SYMBOL_GPL(xenbus_transaction_start);
/*
 * End a transaction.
 * If abort is true, transaction is discarded instead of committed.
 * The read hold on xs_state.transaction_mutex is dropped whether or not
 * the request succeeds.  Returns 0 or -err.
 */
int xenbus_transaction_end(struct xenbus_transaction t, int abort)
{
	char abortstr[2];
	int rc;

	/* Request body is "F" to abort, "T" to commit. */
	abortstr[0] = abort ? 'F' : 'T';
	abortstr[1] = '\0';

	rc = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL));

	up_read(&xs_state.transaction_mutex);
	return rc;
}
EXPORT_SYMBOL_GPL(xenbus_transaction_end);
/*
 * Single read and scanf: read dir/node and parse it with fmt.
 * Returns -errno or the number of items scanned; a value on which nothing
 * matched yields -ERANGE.
 */
int xenbus_scanf(struct xenbus_transaction t,
		 const char *dir, const char *node, const char *fmt, ...)
{
	va_list ap;
	int matched;
	char *val = xenbus_read(t, dir, node, NULL);

	if (IS_ERR(val))
		return PTR_ERR(val);

	va_start(ap, fmt);
	matched = vsscanf(val, fmt, ap);
	va_end(ap);
	kfree(val);

	/* Distinctive errno. */
	return (matched == 0) ? -ERANGE : matched;
}
EXPORT_SYMBOL_GPL(xenbus_scanf);
/*
 * Single printf and write: format into a temporary buffer and store the
 * result at dir/node.  Returns -errno or 0.
 * Output that does not fit in PRINTF_BUFFER_SIZE bytes (including the
 * terminating NUL) trips BUG_ON().
 */
int xenbus_printf(struct xenbus_transaction t,
		  const char *dir, const char *node, const char *fmt, ...)
{
#define PRINTF_BUFFER_SIZE 4096
	va_list ap;
	char *text;
	int written, rc;

	text = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
	if (!text)
		return -ENOMEM;

	va_start(ap, fmt);
	written = vsnprintf(text, PRINTF_BUFFER_SIZE, fmt, ap);
	va_end(ap);

	/* vsnprintf reports the untruncated length; refuse truncation. */
	BUG_ON(written > PRINTF_BUFFER_SIZE-1);

	rc = xenbus_write(t, dir, node, text);
	kfree(text);
	return rc;
}
EXPORT_SYMBOL_GPL(xenbus_printf);
/*
 * Read several nodes below dir in one call.
 * The variable arguments are tuples of (name, scanf-style format, result
 * pointer), terminated by a NULL name.  With a NULL format the raw string
 * is stored through the result pointer (as char **) and the caller owns
 * it; otherwise the value is parsed with sscanf() and freed here.
 * Stops at the first failure: returns the read error, -EINVAL when a
 * format matched nothing, or 0 when every tuple was handled.
 */
int xenbus_gather(struct xenbus_transaction t, const char *dir, ...)
{
	va_list ap;
	int ret = 0;

	va_start(ap, dir);
	for (;;) {
		const char *name, *fmt;
		void *result;
		char *value;

		name = va_arg(ap, char *);
		if (name == NULL)
			break;
		fmt = va_arg(ap, char *);
		result = va_arg(ap, void *);

		value = xenbus_read(t, dir, name, NULL);
		if (IS_ERR(value)) {
			ret = PTR_ERR(value);
			break;
		}

		if (!fmt) {
			/* Hand the string itself to the caller. */
			*(char **)result = value;
			continue;
		}

		if (sscanf(value, fmt, result) == 0)
			ret = -EINVAL;
		kfree(value);
		if (ret != 0)
			break;
	}
	va_end(ap);

	return ret;
}
EXPORT_SYMBOL_GPL(xenbus_gather);
/*
 * Send an XS_WATCH request for `path`, tagged with `token`.
 * Both strings are sent with their terminating NUL.  Returns 0 or -err.
 */
static int xs_watch(const char *path, const char *token)
{
	struct kvec req[2] = {
		{ .iov_base = (void *)path,  .iov_len = strlen(path) + 1 },
		{ .iov_base = (void *)token, .iov_len = strlen(token) + 1 },
	};

	return xs_error(xs_talkv(XBT_NIL, XS_WATCH, req,
				 ARRAY_SIZE(req), NULL));
}
/*
 * Send an XS_UNWATCH request for the (path, token) pair.
 * Both strings are sent with their terminating NUL.  Returns 0 or -err.
 */
static int xs_unwatch(const char *path, const char *token)
{
	struct kvec req[2] = {
		{ .iov_base = (char *)path,  .iov_len = strlen(path) + 1 },
		{ .iov_base = (char *)token, .iov_len = strlen(token) + 1 },
	};

	return xs_error(xs_talkv(XBT_NIL, XS_UNWATCH, req,
				 ARRAY_SIZE(req), NULL));
}
static struct xenbus_watch *find_watch(const char *token)
{
struct xenbus_watch *i, *cmp;
cmp = (void *)simple_strtoul(token, NULL, 16);
list_for_each_entry(i, &watches, list)
if (i == cmp)
return i;
return NULL;
}
/*
 * Register callback to watch this node.
 * The watch is put on the local `watches` list first and then announced
 * to xenstore.  If xenstore refuses with anything other than -EEXIST the
 * watch is taken off the list again.  The xs_watch() result is returned
 * as is, so -EEXIST is passed on to the caller even though the watch
 * stays registered locally.
 */
int register_xenbus_watch(struct xenbus_watch *watch)
{
	/* Pointer in ascii is the token. */
	char token[sizeof(watch) * 2 + 1];
	int rc;

	sprintf(token, "%lX", (long)watch);

	down_read(&xs_state.watch_mutex);

	spin_lock(&watches_lock);
	BUG_ON(find_watch(token));
	list_add(&watch->list, &watches);
	spin_unlock(&watches_lock);

	rc = xs_watch(watch->node, token);

	/* Ignore errors due to multiple registration. */
	if (rc != 0 && rc != -EEXIST) {
		spin_lock(&watches_lock);
		list_del(&watch->list);
		spin_unlock(&watches_lock);
	}

	up_read(&xs_state.watch_mutex);

	return rc;
}
EXPORT_SYMBOL_GPL(register_xenbus_watch);
/*
 * Remove a watch: take it off the local `watches` list, ask xenstore to
 * drop it, then discard any events for it that are still queued on
 * watch_events.  Unregistering a watch that is not on the list is a bug.
 */
void unregister_xenbus_watch(struct xenbus_watch *watch)
{
struct xs_stored_msg *msg, *tmp;
/* Pointer in ascii is the token. */
char token[sizeof(watch) * 2 + 1];
int err;
sprintf(token, "%lX", (long)watch);
down_read(&xs_state.watch_mutex);
spin_lock(&watches_lock);
BUG_ON(!find_watch(token));
list_del(&watch->list);
spin_unlock(&watches_lock);
err = xs_unwatch(watch->node, token);
/* A failed unwatch is only logged; local teardown carries on. */
if (err)
printk(KERN_WARNING
"XENBUS Failed to release watch %s: %i\n",
watch->node, err);
up_read(&xs_state.watch_mutex);
/* Make sure there are no callbacks running currently (unless
it's us: xenwatch_thread holds xenwatch_mutex while it runs a
callback, so taking it again from that thread would deadlock) */
if (current->pid != xenwatch_pid)
mutex_lock(&xenwatch_mutex);
/* Cancel pending watch events. */
spin_lock(&watch_events_lock);
list_for_each_entry_safe(msg, tmp, &watch_events, list) {
if (msg->u.watch.handle != watch)
continue;
list_del(&msg->list);
/* vec is the single allocation made by split(). */
kfree(msg->u.watch.vec);
kfree(msg);
}
spin_unlock(&watch_events_lock);
if (current->pid != xenwatch_pid)
mutex_unlock(&xenwatch_mutex);
}
EXPORT_SYMBOL_GPL(unregister_xenbus_watch);
/*
 * Take every xenstore client lock, in the order transaction_mutex,
 * watch_mutex, request_mutex, response_mutex.  All four are still held on
 * return; the matching releases are in xs_resume() and
 * xs_suspend_cancel().
 */
void xs_suspend(void)
{
down_write(&xs_state.transaction_mutex);
down_write(&xs_state.watch_mutex);
mutex_lock(&xs_state.request_mutex);
mutex_lock(&xs_state.response_mutex);
}
/*
 * Release the locks taken by xs_suspend() and re-send an XS_WATCH request
 * for every watch on the `watches` list.  watch_mutex is kept write-held
 * until all watches have been re-sent; the other three locks are dropped
 * first.
 */
void xs_resume(void)
{
struct xenbus_watch *watch;
/* Pointer in ascii is the token, as in register_xenbus_watch(). */
char token[sizeof(watch) * 2 + 1];
mutex_unlock(&xs_state.response_mutex);
mutex_unlock(&xs_state.request_mutex);
up_write(&xs_state.transaction_mutex);
/* No need for watches_lock: the watch_mutex is sufficient. */
list_for_each_entry(watch, &watches, list) {
sprintf(token, "%lX", (long)watch);
/* The result of xs_watch() is not checked here. */
xs_watch(watch->node, token);
}
up_write(&xs_state.watch_mutex);
}
/*
 * Release the four locks taken by xs_suspend() without re-sending any
 * watches.
 */
void xs_suspend_cancel(void)
{
mutex_unlock(&xs_state.response_mutex);
mutex_unlock(&xs_state.request_mutex);
up_write(&xs_state.watch_mutex);
up_write(&xs_state.transaction_mutex);
}
static int xenwatch_thread(void *unused)
{
struct list_head *ent;
struct xs_stored_msg *msg;
for (;;) {
wait_event_interruptible(watch_events_waitq,
!list_empty(&watch_events));
if (kthread_should_stop())
break;
mutex_lock(&xenwatch_mutex);
spin_lock(&watch_events_lock);
ent = watch_events.next;
if (ent != &watch_events)
list_del(ent);
spin_unlock(&watch_events_lock);
if (ent != &watch_events) {
msg = list_entry(ent, struct xs_stored_msg, list);
msg->u.watch.handle->callback(
msg->u.watch.handle,
(const char **)msg->u.watch.vec,
msg->u.watch.vec_size);
kfree(msg->u.watch.vec);
kfree(msg);
}
mutex_unlock(&xenwatch_mutex);
}
return 0;
}
/*
 * Read one message from xenstored and dispatch it.
 *
 * Watch events (XS_WATCH_EVENT) are split into a string vector and, if
 * the token names a watch that is still registered, queued on
 * watch_events for xenwatch_thread; events for unknown watches are
 * dropped.  Every other message is queued on xs_state.reply_list and the
 * reply waiters are woken.
 *
 * Returns 0 on success, or a negative error from
 * xb_wait_for_data_to_read(), xb_read(), or a failed allocation.
 */
static int process_msg(void)
{
	struct xs_stored_msg *msg;
	char *body;
	int err;

	/*
	 * We must disallow save/restore while reading a xenstore message.
	 * A partial read across s/r leaves us out of sync with xenstored.
	 */
	for (;;) {
		err = xb_wait_for_data_to_read();
		if (err)
			return err;
		mutex_lock(&xs_state.response_mutex);
		if (xb_data_to_read())
			break;
		/* We raced with save/restore: pending data 'disappeared'. */
		mutex_unlock(&xs_state.response_mutex);
	}

	msg = kmalloc(sizeof(*msg), GFP_KERNEL);
	if (msg == NULL) {
		err = -ENOMEM;
		goto out;
	}

	err = xb_read(&msg->hdr, sizeof(msg->hdr));
	if (err) {
		kfree(msg);
		goto out;
	}

	/* One extra byte so the body can always be NUL-terminated. */
	body = kmalloc(msg->hdr.len + 1, GFP_KERNEL);
	if (body == NULL) {
		kfree(msg);
		err = -ENOMEM;
		goto out;
	}

	err = xb_read(body, msg->hdr.len);
	if (err) {
		kfree(body);
		kfree(msg);
		goto out;
	}
	body[msg->hdr.len] = '\0';

	if (msg->hdr.type == XS_WATCH_EVENT) {
		/* split() frees body on both its success and failure path. */
		msg->u.watch.vec = split(body, msg->hdr.len,
					 &msg->u.watch.vec_size);
		if (IS_ERR(msg->u.watch.vec)) {
			/*
			 * Fetch the error code before freeing msg: the
			 * pointer being decoded lives inside msg.
			 */
			err = PTR_ERR(msg->u.watch.vec);
			kfree(msg);
			goto out;
		}

		/*
		 * NOTE(review): vec[XS_WATCH_TOKEN] is read without checking
		 * vec_size; this relies on xenstored sending well-formed
		 * events - confirm.
		 */
		spin_lock(&watches_lock);
		msg->u.watch.handle = find_watch(
			msg->u.watch.vec[XS_WATCH_TOKEN]);
		if (msg->u.watch.handle != NULL) {
			spin_lock(&watch_events_lock);
			list_add_tail(&msg->list, &watch_events);
			wake_up(&watch_events_waitq);
			spin_unlock(&watch_events_lock);
		} else {
			/* Watch no longer registered: drop the event. */
			kfree(msg->u.watch.vec);
			kfree(msg);
		}
		spin_unlock(&watches_lock);
	} else {
		msg->u.reply.body = body;
		spin_lock(&xs_state.reply_lock);
		list_add_tail(&msg->list, &xs_state.reply_list);
		spin_unlock(&xs_state.reply_lock);
		wake_up(&xs_state.reply_waitq);
	}

 out:
	mutex_unlock(&xs_state.response_mutex);
	return err;
}
/*
 * Kernel thread that pulls messages from xenstored via process_msg().
 * A failing process_msg() is logged and the loop carries on; the thread
 * exits once kthread_should_stop() is true after an iteration.
 */
static int xenbus_thread(void *unused)
{
	int rc;

	do {
		rc = process_msg();
		if (rc)
			printk(KERN_WARNING "XENBUS error %d while reading "
			       "message\n", rc);
	} while (!kthread_should_stop());

	return 0;
}
/*
 * Set up the xenstore client: initialise xs_state's list, locks and wait
 * queue, bring up the communication rings, then start the "xenwatch" and
 * "xenbus" kernel threads.  Returns 0, or the error from
 * xb_init_comms() / kthread_run().
 */
int xs_init(void)
{
	struct task_struct *watch_task, *bus_task;
	int rc;

	INIT_LIST_HEAD(&xs_state.reply_list);
	spin_lock_init(&xs_state.reply_lock);
	init_waitqueue_head(&xs_state.reply_waitq);

	mutex_init(&xs_state.request_mutex);
	mutex_init(&xs_state.response_mutex);
	init_rwsem(&xs_state.transaction_mutex);
	init_rwsem(&xs_state.watch_mutex);

	/* Initialize the shared memory rings to talk to xenstored */
	rc = xb_init_comms();
	if (rc)
		return rc;

	watch_task = kthread_run(xenwatch_thread, NULL, "xenwatch");
	if (IS_ERR(watch_task))
		return PTR_ERR(watch_task);
	/* Remembered so unregister_xenbus_watch() can recognise this thread. */
	xenwatch_pid = watch_task->pid;

	bus_task = kthread_run(xenbus_thread, NULL, "xenbus");
	if (IS_ERR(bus_task))
		return PTR_ERR(bus_task);

	return 0;
}

View File

@ -209,7 +209,7 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
envp[2] = NULL;
ret = call_usermodehelper(argv[0], argv, envp, 1);
ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
if (ret < 0)
mlog_errno(ret);
}

View File

@ -41,6 +41,7 @@ extern int irqbalance_disable(char *str);
extern void fixup_irqs(cpumask_t map);
#endif
unsigned int do_IRQ(struct pt_regs *regs);
void init_IRQ(void);
void __init native_init_IRQ(void);

View File

@ -1,7 +1,7 @@
#ifndef _ASM_IRQ_VECTORS_LIMITS_H
#define _ASM_IRQ_VECTORS_LIMITS_H
#ifdef CONFIG_X86_IO_APIC
#if defined(CONFIG_X86_IO_APIC) || defined(CONFIG_PARAVIRT)
#define NR_IRQS 224
# if (224 >= 32 * NR_CPUS)
# define NR_IRQ_VECTORS NR_IRQS

View File

@ -32,6 +32,8 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
#endif
}
void leave_mm(unsigned long cpu);
static inline void switch_mm(struct mm_struct *prev,
struct mm_struct *next,
struct task_struct *tsk)

View File

@ -52,6 +52,8 @@ struct paravirt_ops
/* Basic arch-specific setup */
void (*arch_setup)(void);
char *(*memory_setup)(void);
void (*post_allocator_init)(void);
void (*init_IRQ)(void);
void (*time_init)(void);
@ -116,7 +118,7 @@ struct paravirt_ops
u64 (*read_tsc)(void);
u64 (*read_pmc)(void);
u64 (*get_scheduled_cycles)(void);
unsigned long long (*sched_clock)(void);
unsigned long (*get_cpu_khz)(void);
/* Segment descriptor handling */
@ -173,7 +175,7 @@ struct paravirt_ops
unsigned long va);
/* Hooks for allocating/releasing pagetable pages */
void (*alloc_pt)(u32 pfn);
void (*alloc_pt)(struct mm_struct *mm, u32 pfn);
void (*alloc_pd)(u32 pfn);
void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
void (*release_pt)(u32 pfn);
@ -260,6 +262,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *site, unsigned len)
unsigned paravirt_patch_insns(void *site, unsigned len,
const char *start, const char *end);
int paravirt_disable_iospace(void);
/*
* This generates an indirect call based on the operation type number.
@ -563,7 +566,10 @@ static inline u64 paravirt_read_tsc(void)
#define rdtscll(val) (val = paravirt_read_tsc())
#define get_scheduled_cycles(val) (val = paravirt_ops.get_scheduled_cycles())
/* Invoke the paravirt sched_clock operation via PVOP_CALL0 and return its unsigned long long result. */
static inline unsigned long long paravirt_sched_clock(void)
{
return PVOP_CALL0(unsigned long long, sched_clock);
}
#define calculate_cpu_khz() (paravirt_ops.get_cpu_khz())
#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
@ -669,6 +675,12 @@ static inline void setup_secondary_clock(void)
}
#endif
/* Run the optional post_allocator_init paravirt hook; a NULL hook is a no-op. */
static inline void paravirt_post_allocator_init(void)
{
	if (!paravirt_ops.post_allocator_init)
		return;

	paravirt_ops.post_allocator_init();
}
static inline void paravirt_pagetable_setup_start(pgd_t *base)
{
if (paravirt_ops.pagetable_setup_start)
@ -725,9 +737,9 @@ static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
PVOP_VCALL3(flush_tlb_others, &cpumask, mm, va);
}
static inline void paravirt_alloc_pt(unsigned pfn)
static inline void paravirt_alloc_pt(struct mm_struct *mm, unsigned pfn)
{
PVOP_VCALL1(alloc_pt, pfn);
PVOP_VCALL2(alloc_pt, mm, pfn);
}
static inline void paravirt_release_pt(unsigned pfn)
{

View File

@ -7,7 +7,7 @@
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
#define paravirt_alloc_pt(pfn) do { } while (0)
#define paravirt_alloc_pt(mm, pfn) do { } while (0)
#define paravirt_alloc_pd(pfn) do { } while (0)
#define paravirt_alloc_pd(pfn) do { } while (0)
#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
@ -17,13 +17,13 @@
#define pmd_populate_kernel(mm, pmd, pte) \
do { \
paravirt_alloc_pt(__pa(pte) >> PAGE_SHIFT); \
paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT); \
set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \
} while (0)
#define pmd_populate(mm, pmd, pte) \
do { \
paravirt_alloc_pt(page_to_pfn(pte)); \
paravirt_alloc_pt(mm, page_to_pfn(pte)); \
set_pmd(pmd, __pmd(_PAGE_TABLE + \
((unsigned long long)page_to_pfn(pte) << \
(unsigned long long) PAGE_SHIFT))); \

View File

@ -81,6 +81,10 @@ void __init add_memory_region(unsigned long long start,
extern unsigned long init_pg_tables_end;
#ifndef CONFIG_PARAVIRT
#define paravirt_post_allocator_init() do {} while (0)
#endif
#endif /* __ASSEMBLY__ */
#endif /* __KERNEL__ */

View File

@ -43,9 +43,12 @@ extern u8 x86_cpu_to_apicid[];
#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
extern void set_cpu_sibling_map(int cpu);
#ifdef CONFIG_HOTPLUG_CPU
extern void cpu_exit_clear(void);
extern void cpu_uninit(void);
extern void remove_siblinginfo(int cpu);
#endif
struct smp_ops
@ -129,6 +132,8 @@ extern int __cpu_disable(void);
extern void __cpu_die(unsigned int cpu);
extern unsigned int num_processors;
void __cpuinit smp_store_cpu_info(int id);
#endif /* !__ASSEMBLY__ */
#else /* CONFIG_SMP */

View File

@ -15,8 +15,38 @@ extern int no_sync_cmos_clock;
extern int recalibrate_cpu_khz(void);
#ifndef CONFIG_PARAVIRT
#define get_scheduled_cycles(val) rdtscll(val)
#define calculate_cpu_khz() native_calculate_cpu_khz()
#endif
/* Accelerators for sched_clock()
* convert from cycles(64bits) => nanoseconds (64bits)
* basic equation:
* ns = cycles / (freq / ns_per_sec)
* ns = cycles * (ns_per_sec / freq)
* ns = cycles * (10^9 / (cpu_khz * 10^3))
* ns = cycles * (10^6 / cpu_khz)
*
* Then we use scaling math (suggested by george@mvista.com) to get:
* ns = cycles * (10^6 * SC / cpu_khz) / SC
* ns = cycles * cyc2ns_scale / SC
*
* And since SC is a constant power of two, we can convert the div
* into a shift.
*
* We can use khz divisor instead of mhz to keep a better precision, since
* cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
* (mathieu.desnoyers@polymtl.ca)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/
extern unsigned long cyc2ns_scale __read_mostly;
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
/*
 * Convert a cycle count to nanoseconds: multiply by the precomputed
 * cyc2ns_scale, then divide by 2^CYC2NS_SCALE_FACTOR with a shift.
 */
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
	unsigned long long scaled = cyc * cyc2ns_scale;

	return scaled >> CYC2NS_SCALE_FACTOR;
}
#endif

View File

@ -49,7 +49,7 @@ extern struct vmi_timer_ops {
extern void __init vmi_time_init(void);
extern unsigned long vmi_get_wallclock(void);
extern int vmi_set_wallclock(unsigned long now);
extern unsigned long long vmi_get_sched_cycles(void);
extern unsigned long long vmi_sched_clock(void);
extern unsigned long vmi_cpu_khz(void);
#ifdef CONFIG_X86_LOCAL_APIC

View File

@ -0,0 +1,413 @@
/******************************************************************************
* hypercall.h
*
* Linux-specific hypervisor handling.
*
* Copyright (c) 2002-2004, K A Fraser
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef __HYPERCALL_H__
#define __HYPERCALL_H__
#include <linux/errno.h>
#include <linux/string.h>
#include <xen/interface/xen.h>
#include <xen/interface/sched.h>
#include <xen/interface/physdev.h>
extern struct { char _entry[32]; } hypercall_page[];
/*
 * Issue hypercall `name` with no arguments.
 * Calls the hypercall_page[] entry selected by __HYPERVISOR_<name>.  The
 * value left in %eax ("=a") is cast to `type` and is the value of the
 * whole expression.  The "memory" clobber tells the compiler that memory
 * may be read or written by the call.
 */
#define _hypercall0(type, name) \
({ \
long __res; \
asm volatile ( \
"call %[call]" \
: "=a" (__res) \
: [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
: "memory" ); \
(type)__res; \
})
/*
 * Issue hypercall `name` with one argument.
 * a1 is cast to long and loaded into %ebx: the "1" input constraint ties
 * it to output operand 1 ("=b"), whose result is discarded in __ign1, so
 * the compiler treats the register as modified by the call.  The result
 * comes back in %eax and is cast to `type`.
 */
#define _hypercall1(type, name, a1) \
({ \
long __res, __ign1; \
asm volatile ( \
"call %[call]" \
: "=a" (__res), "=b" (__ign1) \
: "1" ((long)(a1)), \
[call] "m" (hypercall_page[__HYPERVISOR_##name]) \
: "memory" ); \
(type)__res; \
})
/*
 * Issue hypercall `name` with two arguments.
 * a1 and a2 are cast to long and passed in %ebx and %ecx; both registers
 * are also declared as discarded outputs (__ign1, __ign2).  The result
 * comes back in %eax and is cast to `type`.
 */
#define _hypercall2(type, name, a1, a2) \
({ \
long __res, __ign1, __ign2; \
asm volatile ( \
"call %[call]" \
: "=a" (__res), "=b" (__ign1), "=c" (__ign2) \
: "1" ((long)(a1)), "2" ((long)(a2)), \
[call] "m" (hypercall_page[__HYPERVISOR_##name]) \
: "memory" ); \
(type)__res; \
})
/*
 * Issue hypercall `name` with three arguments.
 * a1..a3 are cast to long and passed in %ebx, %ecx and %edx; each
 * register is also declared as a discarded output (__ign1..__ign3).  The
 * result comes back in %eax and is cast to `type`.
 */
#define _hypercall3(type, name, a1, a2, a3) \
({ \
long __res, __ign1, __ign2, __ign3; \
asm volatile ( \
"call %[call]" \
: "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
"=d" (__ign3) \
: "1" ((long)(a1)), "2" ((long)(a2)), \
"3" ((long)(a3)), \
[call] "m" (hypercall_page[__HYPERVISOR_##name]) \
: "memory" ); \
(type)__res; \
})
/*
 * Issue hypercall `name` with four arguments.
 * a1..a4 are cast to long and passed in %ebx, %ecx, %edx and %esi; each
 * register is also declared as a discarded output (__ign1..__ign4).  The
 * result comes back in %eax and is cast to `type`.
 */
#define _hypercall4(type, name, a1, a2, a3, a4) \
({ \
long __res, __ign1, __ign2, __ign3, __ign4; \
asm volatile ( \
"call %[call]" \
: "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
"=d" (__ign3), "=S" (__ign4) \
: "1" ((long)(a1)), "2" ((long)(a2)), \
"3" ((long)(a3)), "4" ((long)(a4)), \
[call] "m" (hypercall_page[__HYPERVISOR_##name]) \
: "memory" ); \
(type)__res; \
})
/*
 * Issue hypercall `name` with five arguments.
 * a1..a5 are cast to long and passed in %ebx, %ecx, %edx, %esi and %edi;
 * each register is also declared as a discarded output (__ign1..__ign5).
 * The result comes back in %eax and is cast to `type`.
 */
#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
({ \
long __res, __ign1, __ign2, __ign3, __ign4, __ign5; \
asm volatile ( \
"call %[call]" \
: "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
"=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \
: "1" ((long)(a1)), "2" ((long)(a2)), \
"3" ((long)(a3)), "4" ((long)(a4)), \
"5" ((long)(a5)), \
[call] "m" (hypercall_page[__HYPERVISOR_##name]) \
: "memory" ); \
(type)__res; \
})
/* Issue the set_trap_table hypercall with `table` as its only argument; returns the hypercall result. */
static inline int
HYPERVISOR_set_trap_table(struct trap_info *table)
{
return _hypercall1(int, set_trap_table, table);
}
/*
 * Issue the mmu_update hypercall with (req, count, success_count, domid),
 * in that order; returns the hypercall result.
 */
static inline int
HYPERVISOR_mmu_update(struct mmu_update *req, int count,
int *success_count, domid_t domid)
{
return _hypercall4(int, mmu_update, req, count, success_count, domid);
}
/*
 * Issue the mmuext_op hypercall with (op, count, success_count, domid),
 * in that order; returns the hypercall result.
 */
static inline int
HYPERVISOR_mmuext_op(struct mmuext_op *op, int count,
int *success_count, domid_t domid)
{
return _hypercall4(int, mmuext_op, op, count, success_count, domid);
}
/* Issue the set_gdt hypercall with (frame_list, entries); returns the hypercall result. */
static inline int
HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
{
return _hypercall2(int, set_gdt, frame_list, entries);
}
/* Issue the stack_switch hypercall with (ss, esp); returns the hypercall result. */
static inline int
HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
{
return _hypercall2(int, stack_switch, ss, esp);
}
/*
 * Issue the set_callbacks hypercall.  Arguments are passed as two
 * selector/address pairs: the event callback first, then the failsafe
 * callback.  Returns the hypercall result.
 */
static inline int
HYPERVISOR_set_callbacks(unsigned long event_selector,
unsigned long event_address,
unsigned long failsafe_selector,
unsigned long failsafe_address)
{
return _hypercall4(int, set_callbacks,
event_selector, event_address,
failsafe_selector, failsafe_address);
}
/* Issue the fpu_taskswitch hypercall with `set` as its only argument; returns the hypercall result. */
static inline int
HYPERVISOR_fpu_taskswitch(int set)
{
return _hypercall1(int, fpu_taskswitch, set);
}
/* Issue the sched_op hypercall with (cmd, arg); returns the hypercall result. */
static inline int
HYPERVISOR_sched_op(int cmd, unsigned long arg)
{
return _hypercall2(int, sched_op, cmd, arg);
}
/*
 * Issue the set_timer_op hypercall.  The 64-bit timeout is passed as two
 * unsigned long halves: low word first, high word second.  Returns the
 * hypercall result.
 */
static inline long
HYPERVISOR_set_timer_op(u64 timeout)
{
	unsigned long lo = (unsigned long)timeout;
	unsigned long hi = (unsigned long)(timeout >> 32);

	return _hypercall2(long, set_timer_op, lo, hi);
}
/* Issue the set_debugreg hypercall with (reg, value); returns the hypercall result. */
static inline int
HYPERVISOR_set_debugreg(int reg, unsigned long value)
{
return _hypercall2(int, set_debugreg, reg, value);
}
/* Issue the get_debugreg hypercall for `reg`; the hypercall result is returned as an unsigned long. */
static inline unsigned long
HYPERVISOR_get_debugreg(int reg)
{
return _hypercall1(unsigned long, get_debugreg, reg);
}
/*
 * Issue the update_descriptor hypercall.  Each 64-bit value is passed as
 * two arguments, (value, value>>32); the _hypercall4 macro casts every
 * argument to long, which keeps only the low bits of each when long is
 * 32 bits wide.  Returns the hypercall result.
 */
static inline int
HYPERVISOR_update_descriptor(u64 ma, u64 desc)
{
return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
}
/* Issue the memory_op hypercall with (cmd, arg); returns the hypercall result. */
static inline int
HYPERVISOR_memory_op(unsigned int cmd, void *arg)
{
return _hypercall2(int, memory_op, cmd, arg);
}
/* Issue the multicall hypercall with (call_list, nr_calls); returns the hypercall result. */
static inline int
HYPERVISOR_multicall(void *call_list, int nr_calls)
{
return _hypercall2(int, multicall, call_list, nr_calls);
}
/*
 * Issue the update_va_mapping hypercall with (va, pte low word, pte high
 * word, flags).  The high word is new_val.pte_high under CONFIG_X86_PAE
 * and 0 otherwise.  Returns the hypercall result.
 */
static inline int
HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
			     unsigned long flags)
{
#ifdef CONFIG_X86_PAE
	unsigned long pte_hi = new_val.pte_high;
#else
	unsigned long pte_hi = 0;
#endif

	return _hypercall4(int, update_va_mapping, va,
			   new_val.pte_low, pte_hi, flags);
}
static inline int
HYPERVISOR_event_channel_op(int cmd, void *arg)
{
int rc = _hypercall2(int, event_channel_op, cmd, arg);
if (unlikely(rc == -ENOSYS)) {
struct evtchn_op op;
op.cmd = cmd;
memcpy(&op.u, arg, sizeof(op.u));
rc = _hypercall1(int, event_channel_op_compat, &op);
memcpy(arg, &op.u, sizeof(op.u));
}
return rc;
}
/* Issue the xen_version hypercall with (cmd, arg); returns the hypercall result. */
static inline int
HYPERVISOR_xen_version(int cmd, void *arg)
{
return _hypercall2(int, xen_version, cmd, arg);
}
/* Issue the console_io hypercall with (cmd, count, str); returns the hypercall result. */
static inline int
HYPERVISOR_console_io(int cmd, int count, char *str)
{
return _hypercall3(int, console_io, cmd, count, str);
}
static inline int
HYPERVISOR_physdev_op(int cmd, void *arg)
{
int rc = _hypercall2(int, physdev_op, cmd, arg);
if (unlikely(rc == -ENOSYS)) {
struct physdev_op op;
op.cmd = cmd;
memcpy(&op.u, arg, sizeof(op.u));
rc = _hypercall1(int, physdev_op_compat, &op);
memcpy(arg, &op.u, sizeof(op.u));
}
return rc;
}
/* Issue the grant_table_op hypercall with (cmd, uop, count); returns the hypercall result. */
static inline int
HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count)
{
return _hypercall3(int, grant_table_op, cmd, uop, count);
}
/*
 * Issue the update_va_mapping_otherdomain hypercall with (va, pte low
 * word, pte high word, flags, domid).  The high word is new_val.pte_high
 * under CONFIG_X86_PAE and 0 otherwise.  Returns the hypercall result.
 */
static inline int
HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val,
					 unsigned long flags, domid_t domid)
{
#ifdef CONFIG_X86_PAE
	unsigned long pte_hi = new_val.pte_high;
#else
	unsigned long pte_hi = 0;
#endif

	return _hypercall5(int, update_va_mapping_otherdomain, va,
			   new_val.pte_low, pte_hi, flags, domid);
}
/* Issue the vm_assist hypercall with (cmd, type); returns the hypercall result. */
static inline int
HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type)
{
return _hypercall2(int, vm_assist, cmd, type);
}
/* Issue the vcpu_op hypercall with (cmd, vcpuid, extra_args); returns the hypercall result. */
static inline int
HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args)
{
return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
}
/*
 * Request a suspend: issues the sched_op hypercall with
 * (SCHEDOP_shutdown, SHUTDOWN_suspend, srec).  Note this is the
 * three-argument form, unlike HYPERVISOR_sched_op().  Returns the
 * hypercall result.
 */
static inline int
HYPERVISOR_suspend(unsigned long srec)
{
return _hypercall3(int, sched_op, SCHEDOP_shutdown,
SHUTDOWN_suspend, srec);
}
/* Issue the nmi_op hypercall with (op, arg); returns the hypercall result. */
static inline int
HYPERVISOR_nmi_op(unsigned long op, unsigned long arg)
{
return _hypercall2(int, nmi_op, op, arg);
}
/*
 * Fill in *mcl as an update_va_mapping multicall entry:
 * args[0] = va, args[1] = pte low word, args[2] = pte high word
 * (0 without CONFIG_X86_PAE), args[3] = flags.
 */
static inline void
MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
			pte_t new_val, unsigned long flags)
{
	mcl->op = __HYPERVISOR_update_va_mapping;
	mcl->args[0] = va;
	mcl->args[1] = new_val.pte_low;
#ifdef CONFIG_X86_PAE
	mcl->args[2] = new_val.pte_high;
#else
	mcl->args[2] = 0;
#endif
	mcl->args[3] = flags;
}
/*
 * Fill in *mcl as a grant_table_op multicall entry:
 * args[0] = cmd, args[1] = uop (as unsigned long), args[2] = count.
 */
static inline void
MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd,
void *uop, unsigned int count)
{
mcl->op = __HYPERVISOR_grant_table_op;
mcl->args[0] = cmd;
mcl->args[1] = (unsigned long)uop;
mcl->args[2] = count;
}
/*
 * Fill in *mcl as an update_va_mapping_otherdomain multicall entry:
 * args[0] = va, args[1] = pte low word, args[2] = pte high word
 * (0 without CONFIG_X86_PAE), args[3] = flags, args[4] = domid.
 */
static inline void
MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long va,
				    pte_t new_val, unsigned long flags,
				    domid_t domid)
{
	mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
	mcl->args[0] = va;
	mcl->args[1] = new_val.pte_low;
#ifdef CONFIG_X86_PAE
	mcl->args[2] = new_val.pte_high;
#else
	mcl->args[2] = 0;
#endif
	mcl->args[3] = flags;
	mcl->args[4] = domid;
}
/*
 * Fill in *mcl as an update_descriptor multicall entry:
 * args[0] / args[1] = maddr and maddr >> 32,
 * args[2] / args[3] = the descriptor's `a` and `b` words.
 */
static inline void
MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
struct desc_struct desc)
{
mcl->op = __HYPERVISOR_update_descriptor;
mcl->args[0] = maddr;
mcl->args[1] = maddr >> 32;
mcl->args[2] = desc.a;
mcl->args[3] = desc.b;
}
/* Fill in *mcl as a memory_op multicall entry: args[0] = cmd, args[1] = arg (as unsigned long). */
static inline void
MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg)
{
mcl->op = __HYPERVISOR_memory_op;
mcl->args[0] = cmd;
mcl->args[1] = (unsigned long)arg;
}
/*
 * Fill in *mcl as an mmu_update multicall entry:
 * args[0] = req, args[1] = count, args[2] = success_count,
 * args[3] = domid (pointers stored as unsigned long).
 */
static inline void
MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
int count, int *success_count, domid_t domid)
{
mcl->op = __HYPERVISOR_mmu_update;
mcl->args[0] = (unsigned long)req;
mcl->args[1] = count;
mcl->args[2] = (unsigned long)success_count;
mcl->args[3] = domid;
}
/*
 * Fill in *mcl as an mmuext_op multicall entry:
 * args[0] = op, args[1] = count, args[2] = success_count,
 * args[3] = domid (pointers stored as unsigned long).
 */
static inline void
MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
int *success_count, domid_t domid)
{
mcl->op = __HYPERVISOR_mmuext_op;
mcl->args[0] = (unsigned long)op;
mcl->args[1] = count;
mcl->args[2] = (unsigned long)success_count;
mcl->args[3] = domid;
}
/* Fill in *mcl as a set_gdt multicall entry: args[0] = frames (as unsigned long), args[1] = entries. */
static inline void
MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
{
mcl->op = __HYPERVISOR_set_gdt;
mcl->args[0] = (unsigned long)frames;
mcl->args[1] = entries;
}
/* Fill in *mcl as a stack_switch multicall entry: args[0] = ss, args[1] = esp. */
static inline void
MULTI_stack_switch(struct multicall_entry *mcl,
unsigned long ss, unsigned long esp)
{
mcl->op = __HYPERVISOR_stack_switch;
mcl->args[0] = ss;
mcl->args[1] = esp;
}
#endif /* __HYPERCALL_H__ */

View File

@ -0,0 +1,73 @@
/******************************************************************************
* hypervisor.h
*
* Linux-specific hypervisor handling.
*
* Copyright (c) 2002-2004, K A Fraser
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef __HYPERVISOR_H__
#define __HYPERVISOR_H__
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/version.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <asm/ptrace.h>
#include <asm/page.h>
#include <asm/desc.h>
#if defined(__i386__)
# ifdef CONFIG_X86_PAE
# include <asm-generic/pgtable-nopud.h>
# else
# include <asm-generic/pgtable-nopmd.h>
# endif
#endif
#include <asm/xen/hypercall.h>
/* arch/i386/kernel/setup.c */
/* NOTE(review): the file-location comments in this header may be stale -
 * this tree builds its Xen code from arch/i386/xen/; confirm where these
 * symbols are actually defined. */
extern struct shared_info *HYPERVISOR_shared_info;
extern struct start_info *xen_start_info;
/* Non-zero when SIF_INITDOMAIN is set in the start info flags.
 * Dereferences xen_start_info without a NULL check, so it is only safe
 * where is_running_on_xen() is known to be true. */
#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
/* arch/i386/mach-xen/evtchn.c */
/* Force a proper event-channel callback from Xen. */
extern void force_evtchn_callback(void);
/* Turn jiffies into Xen system time. */
u64 jiffies_to_st(unsigned long jiffies);
/* Indices into multicall_entry.args[]: MULTI_update_va_mapping*() store
 * the flags in args[3] and the other-domain id in args[4]. */
#define MULTI_UVMFLAGS_INDEX 3
#define MULTI_UVMDOMID_INDEX 4
/* 1 when xen_start_info is non-NULL, 0 otherwise. */
#define is_running_on_xen() (xen_start_info ? 1 : 0)
#endif /* __HYPERVISOR_H__ */

View File

@ -0,0 +1,188 @@
/******************************************************************************
* arch-x86_32.h
*
* Guest OS interface to x86 32-bit Xen.
*
* Copyright (c) 2004, K A Fraser
*/
#ifndef __XEN_PUBLIC_ARCH_X86_32_H__
#define __XEN_PUBLIC_ARCH_X86_32_H__
/*
 * Guest handle types.  __DEFINE_GUEST_HANDLE(name, type) declares the
 * typedef __guest_handle_<name>: when __XEN__ is defined it is a struct
 * wrapping a `type *`, otherwise it is a plain `type *`.
 */
#ifdef __XEN__
#define __DEFINE_GUEST_HANDLE(name, type) \
typedef struct { type *p; } __guest_handle_ ## name
#else
#define __DEFINE_GUEST_HANDLE(name, type) \
typedef type * __guest_handle_ ## name
#endif
/* Handle for `struct name`. */
#define DEFINE_GUEST_HANDLE_STRUCT(name) \
__DEFINE_GUEST_HANDLE(name, struct name)
/* Handle for a type whose name is a single identifier. */
#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
/* Spell the handle type for `name` at a point of use. */
#define GUEST_HANDLE(name) __guest_handle_ ## name
#ifndef __ASSEMBLY__
/* Guest handles for primitive C types. */
__DEFINE_GUEST_HANDLE(uchar, unsigned char);
__DEFINE_GUEST_HANDLE(uint, unsigned int);
__DEFINE_GUEST_HANDLE(ulong, unsigned long);
DEFINE_GUEST_HANDLE(char);
DEFINE_GUEST_HANDLE(int);
DEFINE_GUEST_HANDLE(long);
DEFINE_GUEST_HANDLE(void);
#endif
/*
* SEGMENT DESCRIPTOR TABLES
*/
/*
* A number of GDT entries are reserved by Xen. These are not situated at the
* start of the GDT because some stupid OSes export hard-coded selector values
* in their ABI. These hard-coded values are always near the start of the GDT,
* so Xen places itself out of the way, at the far end of the GDT.
*/
/* The reserved area starts at GDT page 14: byte 57344 (0xe000), entry 7168. */
#define FIRST_RESERVED_GDT_PAGE 14
#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096)
#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
/*
* These flat segments are in the Xen-private section of every GDT. Since these
* are also present in the initial GDT, many OSes will be able to avoid
* installing their own GDT.
*
* Each selector is (GDT index << 3) | RPL; the indices below are
* FIRST_RESERVED_GDT_ENTRY + 3 .. + 6.
*/
#define FLAT_RING1_CS 0xe019 /* GDT index 7171, RPL 1 */
#define FLAT_RING1_DS 0xe021 /* GDT index 7172, RPL 1 */
#define FLAT_RING1_SS 0xe021 /* GDT index 7172, RPL 1 */
#define FLAT_RING3_CS 0xe02b /* GDT index 7173, RPL 3 */
#define FLAT_RING3_DS 0xe033 /* GDT index 7174, RPL 3 */
#define FLAT_RING3_SS 0xe033 /* GDT index 7174, RPL 3 */
/* The kernel aliases map to the ring-1 selectors, user to ring 3. */
#define FLAT_KERNEL_CS FLAT_RING1_CS
#define FLAT_KERNEL_DS FLAT_RING1_DS
#define FLAT_KERNEL_SS FLAT_RING1_SS
#define FLAT_USER_CS FLAT_RING3_CS
#define FLAT_USER_DS FLAT_RING3_DS
#define FLAT_USER_SS FLAT_RING3_SS
/* And the trap vector is... */
#define TRAP_INSTR "int $0x82"
/*
* Virtual addresses beyond this are not modifiable by guest OSes. The
* machine->physical mapping table starts at this address, read-only.
*/
/* The boundary is lower when CONFIG_X86_PAE is set. */
#ifdef CONFIG_X86_PAE
#define __HYPERVISOR_VIRT_START 0xF5800000
#else
#define __HYPERVISOR_VIRT_START 0xFC000000
#endif
/* Both macros below can be overridden by defining them before this point. */
/* NOTE(review): mk_unsigned_long() is not defined in this header;
 * presumably an including header supplies it - confirm. */
#ifndef HYPERVISOR_VIRT_START
#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
#endif
/* The machine->physical table is addressed as an array of unsigned long
 * starting at HYPERVISOR_VIRT_START. */
#ifndef machine_to_phys_mapping
#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
#endif
/* Maximum number of virtual CPUs in multi-processor guests. */
#define MAX_VIRT_CPUS 32
#ifndef __ASSEMBLY__
/*
* Send an array of these to HYPERVISOR_set_trap_table()
*/
/* Readers for trap_info.flags: the low two bits are the DPL, bit 2 is IF. */
#define TI_GET_DPL(_ti) ((_ti)->flags & 0x3)
#define TI_GET_IF(_ti) ((_ti)->flags & (1 << 2))
/* Writers OR into flags; a previously stored value is not cleared first. */
#define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl))
#define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((_if) ? (1 << 2) : 0))
/*
* One virtual IDT entry: the handler for exception 'vector' lives at
* cs:address. Use the TI_GET_xxx / TI_SET_xxx macros to access 'flags'.
*/
struct trap_info {
uint8_t vector; /* exception vector */
uint8_t flags; /* bits 0-1: privilege level (mask 3); bit 2: clear event enable? (mask 4) */
uint16_t cs; /* code selector */
unsigned long address; /* code offset */
};
DEFINE_GUEST_HANDLE_STRUCT(trap_info);
/*
* 32-bit guest register frame. Fields narrower than 32 bits are grouped
* or explicitly padded (_pad0.._pad5) so that every group fills one
* 32-bit slot.
*/
struct cpu_user_regs {
uint32_t ebx;
uint32_t ecx;
uint32_t edx;
uint32_t esi;
uint32_t edi;
uint32_t ebp;
uint32_t eax;
uint16_t error_code; /* private */
uint16_t entry_vector; /* private */
uint32_t eip;
/* cs, saved_upcall_mask and _pad0 together occupy one 32-bit slot. */
uint16_t cs;
uint8_t saved_upcall_mask;
uint8_t _pad0;
uint32_t eflags; /* eflags.IF == !saved_upcall_mask */
uint32_t esp;
/* Remaining selectors: 16-bit value plus 16 bits of padding each. */
uint16_t ss, _pad1;
uint16_t es, _pad2;
uint16_t ds, _pad3;
uint16_t fs, _pad4;
uint16_t gs, _pad5;
};
DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
/*
* The following is all CPU context. Note that the fpu_ctxt block is filled
* in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
*/
/*
* Complete CPU context of one virtual CPU: FPU image, general registers,
* virtual IDT, descriptor-table locations, control/debug registers and the
* guest's callback entry points.
*/
struct vcpu_guest_context {
/* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
/* Bit values for the 'flags' member below. */
#define VGCF_I387_VALID (1<<0)
#define VGCF_HVM_GUEST (1<<1)
#define VGCF_IN_KERNEL (1<<2)
unsigned long flags; /* VGCF_* flags */
struct cpu_user_regs user_regs; /* User-level CPU registers */
struct trap_info trap_ctxt[256]; /* Virtual IDT, one entry per vector */
unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
unsigned long debugreg[8]; /* DR0-DR7 (debug registers) */
unsigned long event_callback_cs; /* CS:EIP of event callback */
unsigned long event_callback_eip;
unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
unsigned long failsafe_callback_eip;
unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
};
DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
/*
* Architecture-specific shared fields. The first two describe where the
* pfn-to-mfn (p2m) table can be found.
*/
struct arch_shared_info {
unsigned long max_pfn; /* max pfn that appears in table */
/* Frame containing list of mfns containing list of mfns containing p2m. */
unsigned long pfn_to_mfn_frame_list_list;
/* NOTE(review): encoding of nmi_reason is not defined in this header. */
unsigned long nmi_reason;
};
/*
* Architecture-specific per-VCPU fields. The padding exists to bring the
* enclosing struct vcpu_info (declared elsewhere) to 64 bytes.
*/
struct arch_vcpu_info {
unsigned long cr2;
unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
};
#endif /* !__ASSEMBLY__ */
/*
* Prefix forces emulation of some non-trapping instructions.
* Currently only CPUID.
*/
/*
* The prefix bytes are 0x0f,0x0b (the ud2 opcode) followed by
* 0x78,0x65,0x6e, which is ASCII "xen". Assembly sources get a bare
* directive; C sources get the same text as a string literal, suitable
* for pasting into an asm() statement.
*/
#ifdef __ASSEMBLY__
#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
#define XEN_CPUID XEN_EMULATE_PREFIX cpuid
#else
#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid"
#endif
#endif

View File

@ -38,17 +38,25 @@
* e.g. ELFNOTE(XYZCo, 42, .asciz, "forty-two")
* ELFNOTE(XYZCo, 12, .long, 0xdeadbeef)
*/
#define ELFNOTE(name, type, desctype, descdata) \
.pushsection .note.name, "",@note ; \
.align 4 ; \
#define ELFNOTE_START(name, type, flags) \
.pushsection .note.name, flags,@note ; \
.balign 4 ; \
.long 2f - 1f /* namesz */ ; \
.long 4f - 3f /* descsz */ ; \
.long 4484f - 3f /* descsz */ ; \
.long type ; \
1:.asciz #name ; \
2:.align 4 ; \
3:desctype descdata ; \
4:.align 4 ; \
2:.balign 4 ; \
3:
#define ELFNOTE_END \
4484:.balign 4 ; \
.popsection ;
#define ELFNOTE(name, type, desc) \
ELFNOTE_START(name, type, "") \
desc ; \
ELFNOTE_END
#else /* !__ASSEMBLER__ */
#include <linux/elf.h>
/*

View File

@ -36,13 +36,57 @@ static inline int request_module(const char * name, ...) { return -ENOSYS; }
#define try_then_request_module(x, mod...) ((x) ?: (request_module(mod), (x)))
struct key;
extern int call_usermodehelper_keys(char *path, char *argv[], char *envp[],
struct key *session_keyring, int wait);
struct file;
struct subprocess_info;
/* Allocate a subprocess_info structure */
struct subprocess_info *call_usermodehelper_setup(char *path,
char **argv, char **envp);
/* Set various pieces of state into the subprocess_info structure */
void call_usermodehelper_setkeys(struct subprocess_info *info,
struct key *session_keyring);
int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
struct file **filp);
void call_usermodehelper_setcleanup(struct subprocess_info *info,
void (*cleanup)(char **argv, char **envp));
/* Wait policy handed to call_usermodehelper_exec(). */
enum umh_wait {
	/* don't wait at all */
	UMH_NO_WAIT = -1,
	/* wait for the exec, but not the process */
	UMH_WAIT_EXEC = 0,
	/* wait for the process to complete */
	UMH_WAIT_PROC = 1,
};
/* Actually execute the sub-process */
int call_usermodehelper_exec(struct subprocess_info *info, enum umh_wait wait);
/* Free the subprocess_info. This is only needed if you're not going
to call call_usermodehelper_exec */
void call_usermodehelper_freeinfo(struct subprocess_info *info);
static inline int
call_usermodehelper(char *path, char **argv, char **envp, int wait)
call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait)
{
return call_usermodehelper_keys(path, argv, envp, NULL, wait);
struct subprocess_info *info;
info = call_usermodehelper_setup(path, argv, envp);
if (info == NULL)
return -ENOMEM;
return call_usermodehelper_exec(info, wait);
}
/*
 * Convenience wrapper: set up a subprocess_info for path/argv/envp, attach
 * the given session keyring and execute it with the requested wait policy.
 * Returns -ENOMEM when the setup step yields NULL, otherwise the result of
 * call_usermodehelper_exec().
 */
static inline int
call_usermodehelper_keys(char *path, char **argv, char **envp,
			 struct key *session_keyring, enum umh_wait wait)
{
	struct subprocess_info *sub = call_usermodehelper_setup(path, argv, envp);

	if (!sub)
		return -ENOMEM;

	call_usermodehelper_setkeys(sub, session_keyring);
	return call_usermodehelper_exec(sub, wait);
}
extern void usermodehelper_init(void);

View File

@ -158,6 +158,8 @@
#define VXSPEC_MAJOR 200 /* VERITAS volume config driver */
#define VXDMP_MAJOR 201 /* VERITAS volume multipath driver */
#define XENVBD_MAJOR 202 /* Xen virtual block device */
#define MSR_MAJOR 202
#define CPUID_MAJOR 203

View File

@ -92,6 +92,7 @@
/* PG_owner_priv_1 users should have descriptive aliases */
#define PG_checked PG_owner_priv_1 /* Used by some filesystems */
#define PG_pinned PG_owner_priv_1 /* Xen pinned pagetable */
#if (BITS_PER_LONG > 32)
/*
@ -170,6 +171,10 @@ static inline void SetPageUptodate(struct page *page)
#define SetPageChecked(page) set_bit(PG_checked, &(page)->flags)
#define ClearPageChecked(page) clear_bit(PG_checked, &(page)->flags)
#define PagePinned(page) test_bit(PG_pinned, &(page)->flags)
#define SetPagePinned(page) set_bit(PG_pinned, &(page)->flags)
#define ClearPagePinned(page) clear_bit(PG_pinned, &(page)->flags)
#define PageReserved(page) test_bit(PG_reserved, &(page)->flags)
#define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags)
#define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags)

View File

@ -67,6 +67,11 @@ extern void kernel_power_off(void);
void ctrl_alt_del(void);
#define POWEROFF_CMD_PATH_LEN 256
extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN];
extern int orderly_poweroff(bool force);
/*
* Emergency restart, callable from an interrupt handler.
*/

View File

@ -105,8 +105,12 @@ extern void * memchr(const void *,int,__kernel_size_t);
#endif
extern char *kstrdup(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
extern void argv_free(char **argv);
#ifdef __cplusplus
}
#endif

View File

@ -70,6 +70,10 @@ extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
struct page ***pages);
extern void unmap_kernel_range(unsigned long addr, unsigned long size);
/* Allocate/destroy a 'vmalloc' VM area. */
extern struct vm_struct *alloc_vm_area(size_t size);
extern void free_vm_area(struct vm_struct *area);
/*
* Internals. Don't use..
*/

48
include/xen/events.h Normal file
View File

@ -0,0 +1,48 @@
#ifndef _XEN_EVENTS_H
#define _XEN_EVENTS_H
#include <linux/interrupt.h>
#include <xen/interface/event_channel.h>
#include <asm/xen/hypercall.h>
/* IPI kinds accepted by bind_ipi_to_irqhandler() and xen_send_IPI_one(). */
enum ipi_vector {
	XEN_RESCHEDULE_VECTOR = 0,
	XEN_CALL_FUNCTION_VECTOR,

	/* Number of IPI kinds above; must remain the last enumerator. */
	XEN_NR_IPIS
};
int bind_evtchn_to_irq(unsigned int evtchn);
int bind_evtchn_to_irqhandler(unsigned int evtchn,
irq_handler_t handler,
unsigned long irqflags, const char *devname,
void *dev_id);
int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
irq_handler_t handler,
unsigned long irqflags, const char *devname,
void *dev_id);
int bind_ipi_to_irqhandler(enum ipi_vector ipi,
unsigned int cpu,
irq_handler_t handler,
unsigned long irqflags,
const char *devname,
void *dev_id);
/*
* Common unbind function for all event sources. Takes IRQ to unbind from.
* Automatically closes the underlying event channel (even for bindings
* made with bind_evtchn_to_irqhandler()).
*/
void unbind_from_irqhandler(unsigned int irq, void *dev_id);
void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector);
static inline void notify_remote_via_evtchn(int port)
{
struct evtchn_send send = { .port = port };
(void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
}
extern void notify_remote_via_irq(int irq);
#endif /* _XEN_EVENTS_H */

23
include/xen/features.h Normal file
View File

@ -0,0 +1,23 @@
/******************************************************************************
* features.h
*
* Query the features reported by Xen.
*
* Copyright (c) 2006, Ian Campbell
*/
#ifndef __XEN_FEATURES_H__
#define __XEN_FEATURES_H__
#include <xen/interface/features.h>
void xen_setup_features(void);
extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32];
static inline int xen_feature(int flag)
{
return xen_features[flag];
}
#endif /* __XEN_FEATURES_H__ */

107
include/xen/grant_table.h Normal file
View File

@ -0,0 +1,107 @@
/******************************************************************************
* grant_table.h
*
* Two sets of functionality:
* 1. Granting foreign access to our memory reservation.
* 2. Accessing others' memory reservations via grant references.
* (i.e., mechanisms for both sender and recipient of grant references)
*
* Copyright (c) 2004-2005, K A Fraser
* Copyright (c) 2005, Christopher Clark
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef __ASM_GNTTAB_H__
#define __ASM_GNTTAB_H__
#include <asm/xen/hypervisor.h>
#include <xen/interface/grant_table.h>
/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
#define NR_GRANT_FRAMES 4
/*
* Callback record registered with gnttab_request_free_callback() and
* withdrawn with gnttab_cancel_free_callback().
*/
struct gnttab_free_callback {
struct gnttab_free_callback *next; /* link to the next record */
void (*fn)(void *); /* function to invoke */
void *arg; /* argument for fn */
/* NOTE(review): presumably the number of free grant refs to wait for - confirm in the implementation */
u16 count;
};
int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
int readonly);
/*
* End access through the given grant reference, iff the grant entry is no
* longer in use. Return 1 if the grant entry was freed, 0 if it is still in
* use.
*/
int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly);
/*
* Eventually end access through the given grant reference, and once that
* access has been ended, free the given page too. Access will be ended
* immediately iff the grant entry is not in use, otherwise it will happen
* some time later. page may be 0, in which case no freeing will occur.
*/
void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
unsigned long page);
int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
int gnttab_query_foreign_access(grant_ref_t ref);
/*
* operations on reserved batches of grant references
*/
int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
void gnttab_free_grant_reference(grant_ref_t ref);
void gnttab_free_grant_references(grant_ref_t head);
int gnttab_empty_grant_references(const grant_ref_t *pprivate_head);
int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
void gnttab_release_grant_reference(grant_ref_t *private_head,
grant_ref_t release);
void gnttab_request_free_callback(struct gnttab_free_callback *callback,
void (*fn)(void *), void *arg, u16 count);
void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
unsigned long frame, int readonly);
void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
unsigned long pfn);
/*
 * Yield the host_virt_addr member of a mapping descriptor as a void pointer.
 * The argument is parenthesised so that expressions such as *ptr or arr[i]
 * expand correctly.
 * NOTE(review): struct gnttab_map_grant_ref in xen/interface/grant_table.h
 * names its address field host_addr, not host_virt_addr - confirm which
 * structure this macro is meant to be used with.
 */
#define gnttab_map_vaddr(map) ((void *)((map).host_virt_addr))
#endif /* __ASM_GNTTAB_H__ */

View File

@ -0,0 +1,6 @@
#ifndef XEN_HVC_CONSOLE_H
#define XEN_HVC_CONSOLE_H
extern struct console xenboot_console;
#endif /* XEN_HVC_CONSOLE_H */

View File

@ -0,0 +1,133 @@
/******************************************************************************
* elfnote.h
*
* Definitions used for the Xen ELF notes.
*
* Copyright (c) 2006, Ian Campbell, XenSource Ltd.
*/
#ifndef __XEN_PUBLIC_ELFNOTE_H__
#define __XEN_PUBLIC_ELFNOTE_H__
/*
* The notes should live in a SHT_NOTE segment and have "Xen" in the
* name field.
*
* Numeric types are either 4 or 8 bytes depending on the content of
* the desc field.
*
* LEGACY indicates the fields in the legacy __xen_guest string which
* a note type replaces.
*/
/*
* NAME=VALUE pair (string).
*
* LEGACY: FEATURES and PAE
*/
#define XEN_ELFNOTE_INFO 0
/*
* The virtual address of the entry point (numeric).
*
* LEGACY: VIRT_ENTRY
*/
#define XEN_ELFNOTE_ENTRY 1
/* The virtual address of the hypercall transfer page (numeric).
*
* LEGACY: HYPERCALL_PAGE. (n.b. legacy value is a physical page
* number not a virtual address)
*/
#define XEN_ELFNOTE_HYPERCALL_PAGE 2
/* The virtual address where the kernel image should be mapped (numeric).
*
* Defaults to 0.
*
* LEGACY: VIRT_BASE
*/
#define XEN_ELFNOTE_VIRT_BASE 3
/*
* The offset of the ELF paddr field from the actual required
* pseudo-physical address (numeric).
*
* This is used to maintain backwards compatibility with older kernels
* which wrote __PAGE_OFFSET into that field. This field defaults to 0
* if not present.
*
* LEGACY: ELF_PADDR_OFFSET. (n.b. legacy default is VIRT_BASE)
*/
#define XEN_ELFNOTE_PADDR_OFFSET 4
/*
* The version of Xen that we work with (string).
*
* LEGACY: XEN_VER
*/
#define XEN_ELFNOTE_XEN_VERSION 5
/*
* The name of the guest operating system (string).
*
* LEGACY: GUEST_OS
*/
#define XEN_ELFNOTE_GUEST_OS 6
/*
* The version of the guest operating system (string).
*
* LEGACY: GUEST_VER
*/
#define XEN_ELFNOTE_GUEST_VERSION 7
/*
* The loader type (string).
*
* LEGACY: LOADER
*/
#define XEN_ELFNOTE_LOADER 8
/*
* The kernel supports PAE (x86/32 only, string = "yes" or "no").
*
* LEGACY: PAE (n.b. The legacy interface included a provision to
* indicate 'extended-cr3' support allowing L3 page tables to be
* placed above 4G. It is assumed that any kernel new enough to use
* these ELF notes will include this and therefore "yes" here is
* equivalent to "yes[extended-cr3]" in the __xen_guest interface).
*/
#define XEN_ELFNOTE_PAE_MODE 9
/*
* The features supported/required by this kernel (string).
*
* The string must consist of a list of feature names (as given in
* features.h, without the "XENFEAT_" prefix) separated by '|'
* characters. If a feature is required for the kernel to function
* then the feature name must be preceded by a '!' character.
*
* LEGACY: FEATURES
*/
#define XEN_ELFNOTE_FEATURES 10
/*
* The kernel requires the symbol table to be loaded (string = "yes" or "no")
* LEGACY: BSD_SYMTAB (n.b. The legacy treated the presence or absence
* of this string as a boolean flag rather than requiring "yes" or
* "no".
*/
#define XEN_ELFNOTE_BSD_SYMTAB 11
#endif /* __XEN_PUBLIC_ELFNOTE_H__ */
/*
* Local variables:
* mode: C
* c-set-style: "BSD"
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/

View File

@ -0,0 +1,195 @@
/******************************************************************************
* event_channel.h
*
* Event channels between domains.
*
* Copyright (c) 2003-2004, K A Fraser.
*/
#ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__
#define __XEN_PUBLIC_EVENT_CHANNEL_H__
typedef uint32_t evtchn_port_t;
DEFINE_GUEST_HANDLE(evtchn_port_t);
/*
* EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and mark as
* accepting interdomain bindings from domain <remote_dom>. A fresh port
* is allocated in <dom> and returned as <port>.
* NOTES:
* 1. If the caller is unprivileged then <dom> must be DOMID_SELF.
* 2. <remote_dom> may be DOMID_SELF, allowing loopback connections.
*/
#define EVTCHNOP_alloc_unbound 6
struct evtchn_alloc_unbound {
/* IN parameters */
domid_t dom, remote_dom;
/* OUT parameters */
evtchn_port_t port;
};
/*
* EVTCHNOP_bind_interdomain: Construct an interdomain event channel between
* the calling domain and <remote_dom>. <remote_dom,remote_port> must identify
* a port that is unbound and marked as accepting bindings from the calling
* domain. A fresh port is allocated in the calling domain and returned as
* <local_port>.
* NOTES:
* 1. <remote_dom> may be DOMID_SELF, allowing loopback connections.
*/
#define EVTCHNOP_bind_interdomain 0
struct evtchn_bind_interdomain {
/* IN parameters. */
domid_t remote_dom;
evtchn_port_t remote_port;
/* OUT parameters. */
evtchn_port_t local_port;
};
/*
* EVTCHNOP_bind_virq: Bind a local event channel to VIRQ <virq> on the
* specified <vcpu>. The allocated channel is returned as <port>.
* NOTES:
* 1. A virtual IRQ may be bound to at most one event channel per vcpu.
* 2. The allocated event channel is bound to the specified vcpu. The binding
* may not be changed.
*/
#define EVTCHNOP_bind_virq 1
struct evtchn_bind_virq {
/* IN parameters. */
uint32_t virq;
uint32_t vcpu;
/* OUT parameters. */
evtchn_port_t port;
};
/*
* EVTCHNOP_bind_pirq: Bind a local event channel to PIRQ <pirq>. The
* allocated channel is returned as <port>.
* NOTES:
* 1. A physical IRQ may be bound to at most one event channel per domain.
* 2. Only a sufficiently-privileged domain may bind to a physical IRQ.
*/
#define EVTCHNOP_bind_pirq 2
struct evtchn_bind_pirq {
/* IN parameters. */
uint32_t pirq;
/* Flag value accepted in 'flags' below. */
#define BIND_PIRQ__WILL_SHARE 1
uint32_t flags; /* BIND_PIRQ__* */
/* OUT parameters. */
evtchn_port_t port;
};
/*
* EVTCHNOP_bind_ipi: Bind a local event channel to receive events. The
* allocated channel is returned as <port>.
* NOTES:
* 1. The allocated event channel is bound to the specified vcpu. The binding
* may not be changed.
*/
#define EVTCHNOP_bind_ipi 7
struct evtchn_bind_ipi {
/* IN parameters. */
uint32_t vcpu;
/* OUT parameters. */
evtchn_port_t port;
};
/*
* EVTCHNOP_close: Close a local event channel <port>. If the channel is
* interdomain then the remote end is placed in the unbound state
* (EVTCHNSTAT_unbound), awaiting a new connection.
*/
#define EVTCHNOP_close 3
struct evtchn_close {
/* IN parameters. */
evtchn_port_t port; /* local port to close */
};
/*
* EVTCHNOP_send: Send an event to the remote end of the channel whose local
* endpoint is <port>.
*/
#define EVTCHNOP_send 4
struct evtchn_send {
/* IN parameters. */
evtchn_port_t port; /* local endpoint of the channel to signal */
};
/*
* EVTCHNOP_status: Get the current status of the communication channel which
* has an endpoint at <dom, port>.
* NOTES:
* 1. <dom> may be specified as DOMID_SELF.
* 2. Only a sufficiently-privileged domain may obtain the status of an event
* channel for which <dom> is not DOMID_SELF.
*/
#define EVTCHNOP_status 5
struct evtchn_status {
/* IN parameters */
domid_t dom;
evtchn_port_t port;
/* OUT parameters */
/* Values reported in 'status'. */
#define EVTCHNSTAT_closed 0 /* Channel is not in use. */
#define EVTCHNSTAT_unbound 1 /* Channel is waiting interdom connection.*/
#define EVTCHNSTAT_interdomain 2 /* Channel is connected to remote domain. */
#define EVTCHNSTAT_pirq 3 /* Channel is bound to a phys IRQ line. */
#define EVTCHNSTAT_virq 4 /* Channel is bound to a virtual IRQ line */
#define EVTCHNSTAT_ipi 5 /* Channel is bound to a virtual IPI line */
uint32_t status;
uint32_t vcpu; /* VCPU to which this channel is bound. */
/*
* Extra detail; the member to read is selected by 'status' as marked on
* each member. No member is provided for EVTCHNSTAT_closed or
* EVTCHNSTAT_ipi.
*/
union {
struct {
domid_t dom;
} unbound; /* EVTCHNSTAT_unbound */
struct {
domid_t dom;
evtchn_port_t port;
} interdomain; /* EVTCHNSTAT_interdomain */
uint32_t pirq; /* EVTCHNSTAT_pirq */
uint32_t virq; /* EVTCHNSTAT_virq */
} u;
};
/*
* EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an
* event is pending.
* NOTES:
* 1. IPI- and VIRQ-bound channels always notify the vcpu that initialised
* the binding. This binding cannot be changed.
* 2. All other channels notify vcpu0 by default. This default is set when
* the channel is allocated (a port that is freed and subsequently reused
* has its binding reset to vcpu0).
*/
#define EVTCHNOP_bind_vcpu 8
struct evtchn_bind_vcpu {
/* IN parameters. */
evtchn_port_t port; /* channel whose notification target is changed */
uint32_t vcpu; /* vcpu that should be notified */
};
/*
* EVTCHNOP_unmask: Unmask the specified local event-channel port and deliver
* a notification to the appropriate VCPU if an event is pending.
*/
#define EVTCHNOP_unmask 9
struct evtchn_unmask {
/* IN parameters. */
evtchn_port_t port; /* local port to unmask */
};
/*
* Combined form of an event-channel request: a command code plus a union
* holding the argument structure of every EVTCHNOP_* command. 'cmd' selects
* which member of 'u' is meaningful.
*/
struct evtchn_op {
uint32_t cmd; /* EVTCHNOP_* */
union {
struct evtchn_alloc_unbound alloc_unbound;
struct evtchn_bind_interdomain bind_interdomain;
struct evtchn_bind_virq bind_virq;
struct evtchn_bind_pirq bind_pirq;
struct evtchn_bind_ipi bind_ipi;
struct evtchn_close close;
struct evtchn_send send;
struct evtchn_status status;
struct evtchn_bind_vcpu bind_vcpu;
struct evtchn_unmask unmask;
} u;
};
DEFINE_GUEST_HANDLE_STRUCT(evtchn_op);
#endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */

View File

@ -0,0 +1,43 @@
/******************************************************************************
* features.h
*
* Feature flags, reported by XENVER_get_features.
*
* Copyright (c) 2006, Keir Fraser <keir@xensource.com>
*/
#ifndef __XEN_PUBLIC_FEATURES_H__
#define __XEN_PUBLIC_FEATURES_H__
/*
* If set, the guest does not need to write-protect its pagetables, and can
* update them via direct writes.
*/
#define XENFEAT_writable_page_tables 0
/*
* If set, the guest does not need to write-protect its segment descriptor
* tables, and can update them via direct writes.
*/
#define XENFEAT_writable_descriptor_tables 1
/*
* If set, translation between the guest's 'pseudo-physical' address space
* and the host's machine address space are handled by the hypervisor. In this
* mode the guest does not need to perform phys-to/from-machine translations
* when performing page table operations.
*/
#define XENFEAT_auto_translated_physmap 2
/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
#define XENFEAT_supervisor_mode_kernel 3
/*
* If set, the guest does not need to allocate x86 PAE page directories
* below 4GB. This flag is usually implied by auto_translated_physmap.
*/
#define XENFEAT_pae_pgdir_above_4gb 4
/*
* Number of feature submaps. The guest-side xen_features[] table is
* declared with XENFEAT_NR_SUBMAPS * 32 entries, i.e. 32 flags per submap.
*/
#define XENFEAT_NR_SUBMAPS 1
#endif /* __XEN_PUBLIC_FEATURES_H__ */

View File

@ -0,0 +1,375 @@
/******************************************************************************
* grant_table.h
*
* Interface for granting foreign access to page frames, and receiving
* page-ownership transfers.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
* Copyright (c) 2004, K A Fraser
*/
#ifndef __XEN_PUBLIC_GRANT_TABLE_H__
#define __XEN_PUBLIC_GRANT_TABLE_H__
/***********************************
* GRANT TABLE REPRESENTATION
*/
/* Some rough guidelines on accessing and updating grant-table entries
* in a concurrency-safe manner. For more information, Linux contains a
* reference implementation for guest OSes (arch/xen/kernel/grant_table.c).
*
* NB. WMB is a no-op on current-generation x86 processors. However, a
* compiler barrier will still be required.
*
* Introducing a valid entry into the grant table:
* 1. Write ent->domid.
* 2. Write ent->frame:
* GTF_permit_access: Frame to which access is permitted.
* GTF_accept_transfer: Pseudo-phys frame slot being filled by new
* frame, or zero if none.
* 3. Write memory barrier (WMB).
* 4. Write ent->flags, inc. valid type.
*
* Invalidating an unused GTF_permit_access entry:
* 1. flags = ent->flags.
* 2. Observe that !(flags & (GTF_reading|GTF_writing)).
* 3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
* NB. No need for WMB as reuse of entry is control-dependent on success of
* step 3, and all architectures guarantee ordering of ctrl-dep writes.
*
* Invalidating an in-use GTF_permit_access entry:
* This cannot be done directly. Request assistance from the domain controller
* which can set a timeout on the use of a grant entry and take necessary
* action. (NB. This is not yet implemented!).
*
* Invalidating an unused GTF_accept_transfer entry:
* 1. flags = ent->flags.
* 2. Observe that !(flags & GTF_transfer_committed). [*]
* 3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
* NB. No need for WMB as reuse of entry is control-dependent on success of
* step 3, and all architectures guarantee ordering of ctrl-dep writes.
* [*] If GTF_transfer_committed is set then the grant entry is 'committed'.
* The guest must /not/ modify the grant entry until the address of the
* transferred frame is written. It is safe for the guest to spin waiting
* for this to occur (detect by observing GTF_transfer_completed in
* ent->flags).
*
* Invalidating a committed GTF_accept_transfer entry:
* 1. Wait for (ent->flags & GTF_transfer_completed).
*
* Changing a GTF_permit_access from writable to read-only:
* Use SMP-safe CMPXCHG to set GTF_readonly, while checking !GTF_writing.
*
* Changing a GTF_permit_access from read-only to writable:
* Use SMP-safe bit-setting instruction.
*/
/*
* A grant table comprises a packed array of grant entries in one or more
* page frames shared between Xen and a guest.
* [XEN]: This field is written by Xen and read by the sharing guest.
* [GST]: This field is written by the guest and read by Xen.
*/
/*
* One entry of the shared grant table. 'flags' holds the entry type
* (GTF_type_mask) together with the type-specific subflags.
*/
struct grant_entry {
/* GTF_xxx: various type and flag information. [XEN,GST] */
uint16_t flags;
/* The domain being granted foreign privileges. [GST] */
domid_t domid;
/*
* GTF_permit_access: Frame that @domid is allowed to map and access. [GST]
* GTF_accept_transfer: Frame whose ownership transferred by @domid. [XEN]
*/
uint32_t frame;
};
/*
* Type of grant entry.
* GTF_invalid: This grant entry grants no privileges.
* GTF_permit_access: Allow @domid to map/access @frame.
* GTF_accept_transfer: Allow @domid to transfer ownership of one page frame
* to this guest. Xen writes the page number to @frame.
*/
#define GTF_invalid (0U<<0)
#define GTF_permit_access (1U<<0)
#define GTF_accept_transfer (2U<<0)
#define GTF_type_mask (3U<<0)
/*
* Subflags for GTF_permit_access.
* GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
* GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
* GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
*/
#define _GTF_readonly (2)
#define GTF_readonly (1U<<_GTF_readonly)
#define _GTF_reading (3)
#define GTF_reading (1U<<_GTF_reading)
#define _GTF_writing (4)
#define GTF_writing (1U<<_GTF_writing)
/*
* Subflags for GTF_accept_transfer:
* GTF_transfer_committed: Xen sets this flag to indicate that it is committed
* to transferring ownership of a page frame. When a guest sees this flag
* it must /not/ modify the grant entry until GTF_transfer_completed is
* set by Xen.
* GTF_transfer_completed: It is safe for the guest to spin-wait on this flag
* after reading GTF_transfer_committed. Xen will always write the frame
* address, followed by ORing this flag, in a timely manner.
*/
/*
* These occupy bit positions 2 and 3, the same positions as _GTF_readonly
* and _GTF_reading; which meaning applies depends on the entry type held
* in GTF_type_mask.
*/
#define _GTF_transfer_committed (2)
#define GTF_transfer_committed (1U<<_GTF_transfer_committed)
#define _GTF_transfer_completed (3)
#define GTF_transfer_completed (1U<<_GTF_transfer_completed)
/***********************************
* GRANT TABLE QUERIES AND USES
*/
/*
* Reference to a grant entry in a specified domain's grant table.
*/
typedef uint32_t grant_ref_t;
/*
* Handle to track a mapping created via a grant reference.
*/
typedef uint32_t grant_handle_t;
/*
* GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for access
* by devices and/or host CPUs. If successful, <handle> is a tracking number
* that must be presented later to destroy the mapping(s). On error, <handle>
* is a negative status code.
* NOTES:
* 1. If GNTMAP_device_map is specified then <dev_bus_addr> is the address
* via which I/O devices may access the granted frame.
* 2. If GNTMAP_host_map is specified then a mapping will be added at
* either a host virtual address in the current address space, or at
* a PTE at the specified machine address. The type of mapping to
* perform is selected through the GNTMAP_contains_pte flag, and the
* address is specified in <host_addr>.
* 3. Mappings should only be destroyed via GNTTABOP_unmap_grant_ref. If a
* host mapping is destroyed by other means then it is *NOT* guaranteed
* to be accounted to the correct grant reference!
*/
#define GNTTABOP_map_grant_ref 0
struct gnttab_map_grant_ref {
/* IN parameters. */
uint64_t host_addr; /* virtual address, or PTE machine address if GNTMAP_contains_pte */
uint32_t flags; /* GNTMAP_* */
grant_ref_t ref; /* with 'dom': the grant entry (<dom>,<ref>) to map */
domid_t dom;
/* OUT parameters. */
int16_t status; /* GNTST_* */
grant_handle_t handle; /* tracking number to present when unmapping */
uint64_t dev_bus_addr; /* device-visible address when GNTMAP_device_map is given */
};
/*
* GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings
* tracked by <handle>. If <host_addr> or <dev_bus_addr> is zero, that
* field is ignored. If non-zero, they must refer to a device/host mapping
* that is tracked by <handle>.
* NOTES:
* 1. The call may fail in an undefined manner if either mapping is not
* tracked by <handle>.
* 2. After executing a batch of unmaps, it is guaranteed that no stale
* mappings will remain in the device or host TLBs.
*/
#define GNTTABOP_unmap_grant_ref 1
struct gnttab_unmap_grant_ref {
/* IN parameters. */
uint64_t host_addr; /* zero means: ignore this field */
uint64_t dev_bus_addr; /* zero means: ignore this field */
grant_handle_t handle;
/* OUT parameters. */
int16_t status; /* GNTST_* */
};
/*
 * GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least
 * <nr_frames> pages. The frame addresses are written to the <frame_list>.
 * Only <nr_frames> addresses are written, even if the table is larger.
 * NOTES:
 *  1. <dom> may be specified as DOMID_SELF.
 *  2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
 *  3. Xen may not support more than a single grant-table page per domain.
 */
#define GNTTABOP_setup_table          2

struct gnttab_setup_table {
    /* IN parameters. */
    domid_t  dom;
    uint32_t nr_frames;
    /* OUT parameters. */
    int16_t  status;              /* GNTST_* */
    ulong *frame_list;            /* Receives the <nr_frames> frame addresses. */
};
/*
 * GNTTABOP_dump_table: Dump the contents of the grant table to the
 * xen console. Debugging use only.
 */
#define GNTTABOP_dump_table           3

struct gnttab_dump_table {
    /* IN parameters. */
    domid_t dom;
    /* OUT parameters. */
    int16_t status;               /* GNTST_* */
};
/*
 * GNTTABOP_transfer: Transfer <frame> to a foreign domain. The foreign
 * domain has previously registered its interest in the transfer via
 * <domid, ref>.
 *
 * Note that, even if the transfer fails, the specified page no longer belongs
 * to the calling domain *unless* the error is GNTST_bad_page.
 */
#define GNTTABOP_transfer             4

struct gnttab_transfer {
    /* IN parameters. */
    unsigned long mfn;
    domid_t       domid;
    grant_ref_t   ref;
    /* OUT parameters. */
    int16_t       status;
};
/*
* GNTTABOP_copy: Hypervisor based copy
* source and destinations can be either MFNs or, for foreign domains,
* grant references. The foreign domain has to grant read/write access
* in its grant table.
*
* The flags specify what type source and destinations are (either MFN
* or grant reference).
*
* Note that this can also be used to copy data between two domains
* via a third party if the source and destination domains have previously
* granted appropriate access to their pages to the third party.
*
* source_offset specifies an offset in the source frame, dest_offset
* the offset in the target frame and len specifies the number of
* bytes to be copied.
*/
/* gnttab_copy.flags: set when the source is given as a grant reference. */
#define _GNTCOPY_source_gref      (0)
#define GNTCOPY_source_gref       (1 << _GNTCOPY_source_gref)
/* gnttab_copy.flags: set when the destination is given as a grant reference. */
#define _GNTCOPY_dest_gref        (1)
#define GNTCOPY_dest_gref         (1 << _GNTCOPY_dest_gref)
#define GNTTABOP_copy                 5

/* Argument block for GNTTABOP_copy. */
struct gnttab_copy {
    /* IN parameters. */
    struct {
        union {
            grant_ref_t   ref;
            unsigned long gmfn;
        } u;
        domid_t  domid;
        uint16_t offset;          /* Offset within the frame. */
    } source, dest;
    uint16_t len;                 /* Number of bytes to be copied. */
    uint16_t flags;               /* GNTCOPY_* */
    /* OUT parameters. */
    int16_t  status;
};
/*
 * GNTTABOP_query_size: Query the current and maximum sizes of the shared
 * grant table.
 * NOTES:
 *  1. <dom> may be specified as DOMID_SELF.
 *  2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
 */
#define GNTTABOP_query_size           6

struct gnttab_query_size {
    /* IN parameters. */
    domid_t  dom;
    /* OUT parameters. */
    uint32_t nr_frames;
    uint32_t max_nr_frames;
    int16_t  status;              /* GNTST_* */
};
/*
 * Bitfield values for gnttab_map_grant_ref.flags.
 */

/* Map the grant entry for access by I/O devices. */
#define _GNTMAP_device_map      (0)
#define GNTMAP_device_map       (1 << _GNTMAP_device_map)

/* Map the grant entry for access by host CPUs. */
#define _GNTMAP_host_map        (1)
#define GNTMAP_host_map         (1 << _GNTMAP_host_map)

/* Accesses to the granted frame will be restricted to read-only access. */
#define _GNTMAP_readonly        (2)
#define GNTMAP_readonly         (1 << _GNTMAP_readonly)

/*
 * GNTMAP_host_map subflag:
 *  0 => The host mapping is usable only by the guest OS.
 *  1 => The host mapping is usable by guest OS + current application.
 */
#define _GNTMAP_application_map (3)
#define GNTMAP_application_map  (1 << _GNTMAP_application_map)

/*
 * GNTMAP_contains_pte subflag:
 *  0 => This map request contains a host virtual address.
 *  1 => This map request contains the machine address of the PTE to update.
 */
#define _GNTMAP_contains_pte    (4)
#define GNTMAP_contains_pte     (1 << _GNTMAP_contains_pte)
/*
 * Values for error status returns. All errors are -ve.
 */
#define GNTST_okay              (0)   /* Normal return.                        */
#define GNTST_general_error     (-1)  /* General undefined error.              */
#define GNTST_bad_domain        (-2)  /* Unrecognised domain id.               */
#define GNTST_bad_gntref        (-3)  /* Unrecognised or inappropriate gntref. */
#define GNTST_bad_handle        (-4)  /* Unrecognised or inappropriate handle. */
#define GNTST_bad_virt_addr     (-5)  /* Inappropriate virtual address to map. */
#define GNTST_bad_dev_addr      (-6)  /* Inappropriate device address to unmap.*/
#define GNTST_no_device_space   (-7)  /* Out of space in I/O MMU.              */
#define GNTST_permission_denied (-8)  /* Not enough privilege for operation.   */
#define GNTST_bad_page          (-9)  /* Specified page was invalid for op.    */
#define GNTST_bad_copy_arg      (-10) /* copy arguments cross page boundary    */

/*
 * Initialiser for a table of human-readable messages, ordered so that the
 * entry at index -GNTST_xxx describes status GNTST_xxx.
 */
#define GNTTABOP_error_msgs {                       \
    "okay",                                         \
    "undefined error",                              \
    "unrecognised domain id",                       \
    "invalid grant reference",                      \
    "invalid mapping handle",                       \
    "invalid virtual address",                      \
    "invalid device address",                       \
    "no spare translation slot in the I/O MMU",     \
    "permission denied",                            \
    "bad page",                                     \
    "copy arguments cross page boundary"            \
}
#endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */

View File

@ -0,0 +1,94 @@
/******************************************************************************
* blkif.h
*
* Unified block-device I/O interface for Xen guest OSes.
*
* Copyright (c) 2003-2004, Keir Fraser
*/
#ifndef __XEN_PUBLIC_IO_BLKIF_H__
#define __XEN_PUBLIC_IO_BLKIF_H__
#include "ring.h"
#include "../grant_table.h"
/*
* Front->back notifications: When enqueuing a new request, sending a
* notification can be made conditional on req_event (i.e., the generic
* hold-off mechanism provided by the ring macros). Backends must set
* req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()).
*
* Back->front notifications: When enqueuing a new response, sending a
* notification can be made conditional on rsp_event (i.e., the generic
* hold-off mechanism provided by the ring macros). Frontends must set
* rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()).
*/
/* Scalar types used by the block interface messages. */
typedef uint16_t blkif_vdev_t;      /* type of blkif_request.handle */
typedef uint64_t blkif_sector_t;    /* type of blkif_request.sector_number */
/*
 * REQUEST CODES.
 */
#define BLKIF_OP_READ              0
#define BLKIF_OP_WRITE             1

/*
 * Recognised only if "feature-barrier" is present in backend xenbus info.
 * The "feature-barrier" node contains a boolean indicating whether barrier
 * requests are likely to succeed or fail. Either way, a barrier request
 * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by
 * the underlying block-device hardware. The boolean simply indicates whether
 * or not it is worthwhile for the frontend to attempt barrier requests.
 * If a backend does not recognise BLKIF_OP_WRITE_BARRIER, it should *not*
 * create the "feature-barrier" node!
 */
#define BLKIF_OP_WRITE_BARRIER     2
/*
 * Maximum scatter/gather segments per request.
 * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE.
 * NB. This could be 12 if the ring indexes weren't stored in the same page.
 */
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11

struct blkif_request {
    uint8_t        operation;     /* BLKIF_OP_???                         */
    uint8_t        nr_segments;   /* number of segments                   */
    blkif_vdev_t   handle;        /* only for read/write requests         */
    uint64_t       id;            /* private guest value, echoed in resp  */
    blkif_sector_t sector_number; /* start sector idx on disk (r/w only)  */
    struct blkif_request_segment {
        grant_ref_t gref;         /* reference to I/O buffer frame        */
        uint8_t first_sect;       /* first sector in frame to transfer (inclusive) */
        uint8_t last_sect;        /* last sector in frame to transfer (inclusive)  */
    } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
};
/* Response message matching one struct blkif_request. */
struct blkif_response {
    uint64_t id;              /* copied from request */
    uint8_t  operation;       /* copied from request */
    int16_t  status;          /* BLKIF_RSP_???       */
};
/*
 * STATUS RETURN CODES.
 *
 * Negative values are parenthesised so that each macro expands to a single
 * primary expression wherever it is used, in the same way as the GNTST_*
 * status codes in grant_table.h.
 */
/* Operation not supported (only happens on barrier writes). */
#define BLKIF_RSP_EOPNOTSUPP  (-2)
/* Operation failed for some unspecified reason (-EIO). */
#define BLKIF_RSP_ERROR       (-1)
/* Operation completed successfully. */
#define BLKIF_RSP_OKAY          0
/*
* Generate blkif ring structures and types.
*
* DEFINE_RING_TYPES() (see ring.h) expands to union blkif_sring_entry,
* struct blkif_sring, struct blkif_front_ring and struct blkif_back_ring,
* with struct blkif_request / struct blkif_response as the slot payloads.
*/
DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
/*
 * Virtual disk attributes.
 * NOTE(review): the values are distinct single bits, so these are presumably
 * OR-able flags -- confirm against the users of this header.
 */
#define VDISK_CDROM        0x1
#define VDISK_REMOVABLE    0x2
#define VDISK_READONLY     0x4
#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */

View File

@ -0,0 +1,23 @@
/******************************************************************************
* console.h
*
* Console I/O interface for Xen guest OSes.
*
* Copyright (c) 2005, Keir Fraser
*/
#ifndef __XEN_PUBLIC_IO_CONSOLE_H__
#define __XEN_PUBLIC_IO_CONSOLE_H__
typedef uint32_t XENCONS_RING_IDX;

/*
 * Reduce an index modulo the size of the array <ring>. The mask is
 * sizeof(ring) - 1, so the array size must be a power of two.
 */
#define MASK_XENCONS_IDX(idx, ring) ((idx) & (sizeof(ring) - 1))

/* Console page layout: two byte rings followed by their indexes. */
struct xencons_interface {
    char in[1024];
    char out[2048];
    XENCONS_RING_IDX in_cons;
    XENCONS_RING_IDX in_prod;
    XENCONS_RING_IDX out_cons;
    XENCONS_RING_IDX out_prod;
};
#endif /* __XEN_PUBLIC_IO_CONSOLE_H__ */

View File

@ -0,0 +1,158 @@
/******************************************************************************
* netif.h
*
* Unified network-device I/O interface for Xen guest OSes.
*
* Copyright (c) 2003-2004, Keir Fraser
*/
#ifndef __XEN_PUBLIC_IO_NETIF_H__
#define __XEN_PUBLIC_IO_NETIF_H__
#include "ring.h"
#include "../grant_table.h"
/*
* Notifications after enqueuing any type of message should be conditional on
* the appropriate req_event or rsp_event field in the shared ring.
* If the client sends notification for rx requests then it should specify
* feature 'feature-rx-notify' via xenbus. Otherwise the backend will assume
* that it cannot safely queue packets (as it may not be kicked to send them).
*/
/*
* This is the 'wire' format for packets:
* Request 1: netif_tx_request -- NETTXF_* (any flags)
* [Request 2: netif_tx_extra] (only if request 1 has NETTXF_extra_info)
* [Request 3: netif_tx_extra] (only if request 2 has XEN_NETIF_EXTRA_FLAG_MORE)
* Request 4: netif_tx_request -- NETTXF_more_data
* Request 5: netif_tx_request -- NETTXF_more_data
* ...
* Request N: netif_tx_request -- 0
*/
/* Flag bits for xen_netif_tx_request.flags. */

/* Protocol checksum field is blank in the packet (hardware offload)? */
#define _NETTXF_csum_blank     (0)
#define  NETTXF_csum_blank     (1U << _NETTXF_csum_blank)

/* Packet data has been validated against protocol checksum. */
#define _NETTXF_data_validated (1)
#define  NETTXF_data_validated (1U << _NETTXF_data_validated)

/* Packet continues in the next request descriptor. */
#define _NETTXF_more_data      (2)
#define  NETTXF_more_data      (1U << _NETTXF_more_data)

/* Packet to be followed by extra descriptor(s). */
#define _NETTXF_extra_info     (3)
#define  NETTXF_extra_info     (1U << _NETTXF_extra_info)
/* One transmit request descriptor. */
struct xen_netif_tx_request {
    grant_ref_t gref;      /* Reference to buffer page     */
    uint16_t offset;       /* Offset within buffer page    */
    uint16_t flags;        /* NETTXF_*                     */
    uint16_t id;           /* Echoed in response message.  */
    uint16_t size;         /* Packet size in bytes.        */
};
/* Types of netif_extra_info descriptors. */
#define XEN_NETIF_EXTRA_TYPE_NONE   (0)  /* Never used - invalid */
#define XEN_NETIF_EXTRA_TYPE_GSO    (1)  /* u.gso */
#define XEN_NETIF_EXTRA_TYPE_MAX    (2)

/* netif_extra_info flags. */
#define _XEN_NETIF_EXTRA_FLAG_MORE  (0)
#define  XEN_NETIF_EXTRA_FLAG_MORE  (1U << _XEN_NETIF_EXTRA_FLAG_MORE)

/* GSO types - only TCPv4 currently supported. */
#define XEN_NETIF_GSO_TYPE_TCPV4    (1)
/*
 * This structure needs to fit within both netif_tx_request and
 * netif_rx_response for compatibility.
 */
struct xen_netif_extra_info {
    uint8_t type;   /* XEN_NETIF_EXTRA_TYPE_* */
    uint8_t flags;  /* XEN_NETIF_EXTRA_FLAG_* */

    union {
        struct {
            /*
             * Maximum payload size of each segment. For
             * example, for TCP this is just the path MSS.
             */
            uint16_t size;

            /*
             * GSO type. This determines the protocol of
             * the packet and any extra features required
             * to segment the packet properly.
             */
            uint8_t type;   /* XEN_NETIF_GSO_TYPE_* */

            /* Future expansion. */
            uint8_t pad;

            /*
             * GSO features. This specifies any extra GSO
             * features required to process this packet,
             * such as ECN support for TCPv4.
             */
            uint16_t features;  /* XEN_NETIF_GSO_FEAT_* */
        } gso;

        uint16_t pad[3];
    } u;
};
/* One transmit response descriptor. */
struct xen_netif_tx_response {
    uint16_t id;
    int16_t  status;       /* NETIF_RSP_* */
};
/* One receive request descriptor. */
struct xen_netif_rx_request {
    uint16_t    id;        /* Echoed in response message.        */
    grant_ref_t gref;      /* Reference to incoming granted frame */
};
/* Flag bits for xen_netif_rx_response.flags. */

/* Packet data has been validated against protocol checksum. */
#define _NETRXF_data_validated (0)
#define  NETRXF_data_validated (1U << _NETRXF_data_validated)

/* Protocol checksum field is blank in the packet (hardware offload)? */
#define _NETRXF_csum_blank     (1)
#define  NETRXF_csum_blank     (1U << _NETRXF_csum_blank)

/* Packet continues in the next request descriptor. */
#define _NETRXF_more_data      (2)
#define  NETRXF_more_data      (1U << _NETRXF_more_data)

/* Packet to be followed by extra descriptor(s). */
#define _NETRXF_extra_info     (3)
#define  NETRXF_extra_info     (1U << _NETRXF_extra_info)
/*
 * One receive response descriptor.
 *
 * <status> is overloaded: a negative value is one of the NETIF_RSP_* error
 * codes defined in this header, a positive value is the received packet size.
 */
struct xen_netif_rx_response {
    uint16_t id;
    uint16_t offset;       /* Offset in page of start of received packet  */
    uint16_t flags;        /* NETRXF_* */
    int16_t  status;       /* -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */
};
/*
* Generate netif ring structures and types.
*
* Each DEFINE_RING_TYPES() invocation (see ring.h) expands to a
* <tag>_sring_entry union plus <tag>_sring, <tag>_front_ring and
* <tag>_back_ring structures, for the tags xen_netif_tx and xen_netif_rx.
*/
DEFINE_RING_TYPES(xen_netif_tx,
struct xen_netif_tx_request,
struct xen_netif_tx_response);
DEFINE_RING_TYPES(xen_netif_rx,
struct xen_netif_rx_request,
struct xen_netif_rx_response);
/*
 * Response status codes. Negative values are parenthesised so that each
 * macro expands to a single primary expression wherever it is used.
 */
#define NETIF_RSP_DROPPED  (-2)
#define NETIF_RSP_ERROR    (-1)
#define NETIF_RSP_OKAY       0
/* No response: used for auxiliary requests (e.g., netif_tx_extra). */
#define NETIF_RSP_NULL       1
#endif

View File

@ -0,0 +1,260 @@
/******************************************************************************
* ring.h
*
* Shared producer-consumer ring macros.
*
* Tim Deegan and Andrew Warfield November 2004.
*/
#ifndef __XEN_PUBLIC_IO_RING_H__
#define __XEN_PUBLIC_IO_RING_H__
typedef unsigned int RING_IDX;

/*
 * Round a 32-bit unsigned constant down to the nearest power of two.
 * Each step inspects the upper half of its range and recurses into whichever
 * half holds the most significant set bit.
 */
#define __RD2(_x)  (((_x) & 0x00000002) ? 0x2 : ((_x) & 0x1))
#define __RD4(_x)  (((_x) & 0x0000000c) ? (__RD2((_x) >> 2) << 2)    : __RD2(_x))
#define __RD8(_x)  (((_x) & 0x000000f0) ? (__RD4((_x) >> 4) << 4)    : __RD4(_x))
#define __RD16(_x) (((_x) & 0x0000ff00) ? (__RD8((_x) >> 8) << 8)    : __RD8(_x))
#define __RD32(_x) (((_x) & 0xffff0000) ? (__RD16((_x) >> 16) << 16) : __RD16(_x))

/*
 * Calculate size of a shared ring, given the total available space for the
 * ring and indexes (_sz), and the name tag of the request/response structure.
 * A ring contains as many entries as will fit, rounded down to the nearest
 * power of two (so we can mask with (size-1) to loop around).
 *
 * The space taken by the header is the byte distance from the start of the
 * shared structure to its ring[] member.
 */
#define __RING_SIZE(_s, _sz)                                            \
    (__RD32(((_sz) - ((long)&(_s)->ring - (long)(_s))) /                \
            sizeof((_s)->ring[0])))
/*
* Macros to make the correct C datatypes for a new kind of ring.
*
* To make a new ring datatype, you need to have two message structures,
* let's say struct request, and struct response already defined.
*
* In a header where you want the ring datatype declared, you then do:
*
* DEFINE_RING_TYPES(mytag, struct request, struct response);
*
* These expand out to give you a set of types, as you can see below.
* The most important of these are:
*
* struct mytag_sring - The shared ring.
* struct mytag_front_ring - The 'front' half of the ring.
* struct mytag_back_ring - The 'back' half of the ring.
*
* To initialize a ring in your code you need to know the location and size
* of the shared memory area (PAGE_SIZE, for instance). To initialise
* the front half:
*
* struct mytag_front_ring front_ring;
* SHARED_RING_INIT((struct mytag_sring *)shared_page);
* FRONT_RING_INIT(&front_ring, (struct mytag_sring *)shared_page,
* PAGE_SIZE);
*
* Initializing the back follows similarly (note that only the front
* initializes the shared ring):
*
* struct mytag_back_ring back_ring;
* BACK_RING_INIT(&back_ring, (struct mytag_sring *)shared_page,
* PAGE_SIZE);
*/
#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t)                     \
                                                                        \
/* One ring slot: holds either a request or a response. */              \
union __name##_sring_entry {                                            \
    __req_t req;                                                        \
    __rsp_t rsp;                                                        \
};                                                                      \
                                                                        \
/* Shared ring page: indexes, padding, then the slots themselves. */    \
struct __name##_sring {                                                 \
    RING_IDX req_prod;                                                  \
    RING_IDX req_event;                                                 \
    RING_IDX rsp_prod;                                                  \
    RING_IDX rsp_event;                                                 \
    uint8_t pad[48];                                                    \
    union __name##_sring_entry ring[1]; /* variable-length */           \
};                                                                      \
                                                                        \
/* "Front" end's private variables */                                   \
struct __name##_front_ring {                                            \
    RING_IDX req_prod_pvt;                                              \
    RING_IDX rsp_cons;                                                  \
    unsigned int nr_ents;                                               \
    struct __name##_sring *sring;                                       \
};                                                                      \
                                                                        \
/* "Back" end's private variables */                                    \
struct __name##_back_ring {                                             \
    RING_IDX rsp_prod_pvt;                                              \
    RING_IDX req_cons;                                                  \
    unsigned int nr_ents;                                               \
    struct __name##_sring *sring;                                       \
};
/*
* Macros for manipulating rings.
*
* FRONT_RING_whatever works on the "front end" of a ring: here
* requests are pushed on to the ring and responses taken off it.
*
* BACK_RING_whatever works on the "back end" of a ring: here
* requests are taken off the ring and responses put on.
*
* N.B. these macros do NO INTERLOCKS OR FLOW CONTROL.
* This is OK in 1-for-1 request-response situations where the
* requestor (front end) never has more than RING_SIZE()-1
* outstanding requests.
*/
/*
 * Initialising empty rings.
 *
 * Resets both producer indexes to 0, both event indexes to 1, and clears
 * the padding bytes of the shared ring header.
 */
#define SHARED_RING_INIT(_s) do {                                       \
    (_s)->rsp_prod = 0;                                                 \
    (_s)->req_prod = 0;                                                 \
    (_s)->rsp_event = 1;                                                \
    (_s)->req_event = 1;                                                \
    memset((_s)->pad, 0, sizeof((_s)->pad));                            \
} while (0)
/*
 * Initialise the front end's private view of shared ring <_s>, which
 * occupies <__size> bytes: both private indexes start at zero.
 */
#define FRONT_RING_INIT(_r, _s, __size) do {                            \
    (_r)->req_prod_pvt = 0;                                             \
    (_r)->rsp_cons     = 0;                                             \
    (_r)->nr_ents      = __RING_SIZE(_s, __size);                       \
    (_r)->sring        = (_s);                                          \
} while (0)

/*
 * Initialise the back end's private view of shared ring <_s>, which
 * occupies <__size> bytes: both private indexes start at zero.
 */
#define BACK_RING_INIT(_r, _s, __size) do {                             \
    (_r)->rsp_prod_pvt = 0;                                             \
    (_r)->req_cons     = 0;                                             \
    (_r)->nr_ents      = __RING_SIZE(_s, __size);                       \
    (_r)->sring        = (_s);                                          \
} while (0)
/*
 * Initialize to existing shared indexes -- for recovery.
 *
 * The private indexes are seeded from the producer indexes currently stored
 * in the shared ring instead of being reset to zero.
 */
#define FRONT_RING_ATTACH(_r, _s, __size) do {                          \
    (_r)->sring        = (_s);                                          \
    (_r)->req_prod_pvt = (_s)->req_prod;                                \
    (_r)->rsp_cons     = (_s)->rsp_prod;                                \
    (_r)->nr_ents      = __RING_SIZE(_s, __size);                       \
} while (0)

#define BACK_RING_ATTACH(_r, _s, __size) do {                           \
    (_r)->sring        = (_s);                                          \
    (_r)->rsp_prod_pvt = (_s)->rsp_prod;                                \
    (_r)->req_cons     = (_s)->req_prod;                                \
    (_r)->nr_ents      = __RING_SIZE(_s, __size);                       \
} while (0)
/* How big is this ring? */
#define RING_SIZE(_r) ((_r)->nr_ents)

/* Number of free requests (for use on front side only). */
#define RING_FREE_REQUESTS(_r)                                          \
    (RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons))

/*
 * True when the front ring has no free request slot left.
 * (This is only meaningful from the front.)
 */
#define RING_FULL(_r) (!RING_FREE_REQUESTS(_r))
/* Test if there are outstanding messages to be processed on a ring. */

/* Number of responses published in the shared ring but not yet consumed. */
#define RING_HAS_UNCONSUMED_RESPONSES(_r)                               \
    ((_r)->sring->rsp_prod - (_r)->rsp_cons)

/*
 * Number of requests the back end may consume now: the smaller of the
 * requests published but not yet consumed, and the ring slots not occupied
 * by requests that were consumed but have no response queued yet.
 *
 * The temporaries carry a distinctive prefix so that an argument expression
 * mentioning a caller variable called "req" or "rsp" is not captured by them.
 */
#define RING_HAS_UNCONSUMED_REQUESTS(_r)                                \
    ({                                                                  \
        unsigned int __unconsumed_req =                                 \
            (_r)->sring->req_prod - (_r)->req_cons;                     \
        unsigned int __rsp_room = RING_SIZE(_r) -                       \
            ((_r)->req_cons - (_r)->rsp_prod_pvt);                      \
        __unconsumed_req < __rsp_room ? __unconsumed_req : __rsp_room;  \
    })
/*
 * Direct access to individual ring elements, by index.
 * The index is wrapped with (RING_SIZE - 1), so any index value is accepted.
 */
#define RING_GET_REQUEST(_r, _idx)                                      \
    (&(_r)->sring->ring[(_idx) & (RING_SIZE(_r) - 1)].req)

#define RING_GET_RESPONSE(_r, _idx)                                     \
    (&(_r)->sring->ring[(_idx) & (RING_SIZE(_r) - 1)].rsp)

/* Loop termination condition: Would the specified index overflow the ring? */
#define RING_REQUEST_CONS_OVERFLOW(_r, _cons)                           \
    (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
/*
* Publish the front end's private request producer index to the shared ring.
* The write barrier is issued first, so the request contents are visible
* before the index that announces them.
*/
#define RING_PUSH_REQUESTS(_r) do { \
wmb(); /* back sees requests /before/ updated producer index */ \
(_r)->sring->req_prod = (_r)->req_prod_pvt; \
} while (0)
/*
* Publish the back end's private response producer index to the shared ring,
* with the same barrier-then-store ordering as RING_PUSH_REQUESTS().
*/
#define RING_PUSH_RESPONSES(_r) do { \
wmb(); /* front sees responses /before/ updated producer index */ \
(_r)->sring->rsp_prod = (_r)->rsp_prod_pvt; \
} while (0)
/*
* Notification hold-off (req_event and rsp_event):
*
* When queueing requests or responses on a shared ring, it may not always be
* necessary to notify the remote end. For example, if requests are in flight
* in a backend, the front may be able to queue further requests without
* notifying the back (if the back checks for new requests when it queues
* responses).
*
* When enqueuing requests or responses:
*
* Use RING_PUSH_{REQUESTS,RESPONSES}_AND_CHECK_NOTIFY(). The second argument
* is a boolean return value. True indicates that the receiver requires an
* asynchronous notification.
*
* After dequeuing requests or responses (before sleeping the connection):
*
* Use RING_FINAL_CHECK_FOR_REQUESTS() or RING_FINAL_CHECK_FOR_RESPONSES().
* The second argument is a boolean return value. True indicates that there
* are pending messages on the ring (i.e., the connection should not be put
* to sleep).
*
* These macros will set the req_event/rsp_event field to trigger a
* notification on the very next message that is enqueued. If you want to
* create batches of work (i.e., only receive a notification after several
* messages have been enqueued) then you will need to create a customised
* version of the FINAL_CHECK macro in your own code, which sets the event
* field appropriately.
*/
/*
* Publish the private request producer index and report in <_notify> whether
* the back end asked to be notified. <_notify> becomes true when the shared
* req_event index lies within the range just published, i.e. after the old
* producer index and up to and including the new one (modulo wrap-around).
* The full barrier orders the index store before req_event is read.
*/
#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do { \
RING_IDX __old = (_r)->sring->req_prod; \
RING_IDX __new = (_r)->req_prod_pvt; \
wmb(); /* back sees requests /before/ updated producer index */ \
(_r)->sring->req_prod = __new; \
mb(); /* back sees new requests /before/ we check req_event */ \
(_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) < \
(RING_IDX)(__new - __old)); \
} while (0)
/*
* Response-side counterpart of RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(): publish
* the private response producer index and test it against rsp_event.
*/
#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do { \
RING_IDX __old = (_r)->sring->rsp_prod; \
RING_IDX __new = (_r)->rsp_prod_pvt; \
wmb(); /* front sees responses /before/ updated producer index */ \
(_r)->sring->rsp_prod = __new; \
mb(); /* front sees new responses /before/ we check rsp_event */ \
(_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) < \
(RING_IDX)(__new - __old)); \
} while (0)
/*
* Final check for requests before the back end goes idle. If work is already
* pending, <_work_to_do> is set and nothing else happens. Otherwise req_event
* is set to the index of the next request, and after a full barrier the ring
* is checked once more so that a request queued in between is still reported.
*/
#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do { \
(_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \
if (_work_to_do) break; \
(_r)->sring->req_event = (_r)->req_cons + 1; \
mb(); \
(_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \
} while (0)
/*
* Front-end counterpart of RING_FINAL_CHECK_FOR_REQUESTS(): arms rsp_event
* for the next response and re-checks for unconsumed responses.
*/
#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do { \
(_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \
if (_work_to_do) break; \
(_r)->sring->rsp_event = (_r)->rsp_cons + 1; \
mb(); \
(_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \
} while (0)
#endif /* __XEN_PUBLIC_IO_RING_H__ */

View File

@ -0,0 +1,44 @@
/*****************************************************************************
* xenbus.h
*
* Xenbus protocol details.
*
* Copyright (C) 2005 XenSource Ltd.
*/
#ifndef _XEN_PUBLIC_IO_XENBUS_H
#define _XEN_PUBLIC_IO_XENBUS_H
/*
 * The state of either end of the Xenbus, i.e. the current communication
 * status of initialisation across the bus.  States here imply nothing about
 * the state of the connection between the driver and the kernel's device
 * layers.
 */
enum xenbus_state {
    XenbusStateUnknown      = 0,
    XenbusStateInitialising = 1,

    /*
     * Finished early initialisation, but waiting for information from the
     * peer or hotplug scripts.
     */
    XenbusStateInitWait     = 2,

    /* Initialised and waiting for a connection from the peer. */
    XenbusStateInitialised  = 3,

    XenbusStateConnected    = 4,

    /* The device is being closed due to an error or an unplug event. */
    XenbusStateClosing      = 5,

    XenbusStateClosed       = 6
};
#endif /* _XEN_PUBLIC_IO_XENBUS_H */
/*
* Local variables:
* c-file-style: "linux"
* indent-tabs-mode: t
* c-indent-level: 8
* c-basic-offset: 8
* tab-width: 8
* End:
*/

View File

@ -0,0 +1,87 @@
/*
* Details of the "wire" protocol between Xen Store Daemon and client
* library or guest kernel.
* Copyright (C) 2005 Rusty Russell IBM Corporation
*/
#ifndef _XS_WIRE_H
#define _XS_WIRE_H
/*
 * Message types carried in xsd_sockmsg.type. The numeric values are spelled
 * out because they are part of the wire protocol.
 */
enum xsd_sockmsg_type {
    XS_DEBUG                = 0,
    XS_DIRECTORY            = 1,
    XS_READ                 = 2,
    XS_GET_PERMS            = 3,
    XS_WATCH                = 4,
    XS_UNWATCH              = 5,
    XS_TRANSACTION_START    = 6,
    XS_TRANSACTION_END      = 7,
    XS_INTRODUCE            = 8,
    XS_RELEASE              = 9,
    XS_GET_DOMAIN_PATH      = 10,
    XS_WRITE                = 11,
    XS_MKDIR                = 12,
    XS_RM                   = 13,
    XS_SET_PERMS            = 14,
    XS_WATCH_EVENT          = 15,
    XS_ERROR                = 16,
    XS_IS_DOMAIN_INTRODUCED = 17
};
/* NOTE(review): presumably mode strings accompanying XS_WRITE -- confirm. */
#define XS_WRITE_NONE          "NONE"
#define XS_WRITE_CREATE        "CREATE"
#define XS_WRITE_CREATE_EXCL   "CREATE|EXCL"
/* We hand errors as strings, for portability. */
struct xsd_errors {
    int errnum;
    const char *errstring;
};

/* Build one table entry: an errno value paired with its symbolic name. */
#define XSD_ERROR(x) { x, #x }

/* Every errno value that has a string form in this table. */
static struct xsd_errors xsd_errors[] __attribute__((unused)) = {
    XSD_ERROR(EINVAL),
    XSD_ERROR(EACCES),
    XSD_ERROR(EEXIST),
    XSD_ERROR(EISDIR),
    XSD_ERROR(ENOENT),
    XSD_ERROR(ENOMEM),
    XSD_ERROR(ENOSPC),
    XSD_ERROR(EIO),
    XSD_ERROR(ENOTEMPTY),
    XSD_ERROR(ENOSYS),
    XSD_ERROR(EROFS),
    XSD_ERROR(EBUSY),
    XSD_ERROR(EAGAIN),
    XSD_ERROR(EISCONN)
};
/* Fixed-size header of every message. */
struct xsd_sockmsg {
    uint32_t type;    /* XS_??? */
    uint32_t req_id;  /* Request identifier, echoed in daemon's response.  */
    uint32_t tx_id;   /* Transaction id (0 if not related to a transaction). */
    uint32_t len;     /* Length of data following this. */

    /* Generally followed by nul-terminated string(s). */
};
/*
 * NOTE(review): presumably the positions of the path and token strings in a
 * watch event -- confirm against the code that consumes XS_WATCH_EVENT.
 */
enum xs_watch_type {
    XS_WATCH_PATH  = 0,
    XS_WATCH_TOKEN = 1
};
/* Inter-domain shared memory communications. */
#define XENSTORE_RING_SIZE 1024
typedef uint32_t XENSTORE_RING_IDX;

/* Reduce an index modulo XENSTORE_RING_SIZE (a power of two). */
#define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE - 1))

struct xenstore_domain_interface {
    char req[XENSTORE_RING_SIZE]; /* Requests to xenstore daemon. */
    char rsp[XENSTORE_RING_SIZE]; /* Replies and async watch events. */
    XENSTORE_RING_IDX req_cons;
    XENSTORE_RING_IDX req_prod;
    XENSTORE_RING_IDX rsp_cons;
    XENSTORE_RING_IDX rsp_prod;
};
#endif /* _XS_WIRE_H */

View File

@ -0,0 +1,145 @@
/******************************************************************************
* memory.h
*
* Memory reservation and information.
*
* Copyright (c) 2005, Keir Fraser <keir@xensource.com>
*/
#ifndef __XEN_PUBLIC_MEMORY_H__
#define __XEN_PUBLIC_MEMORY_H__
/*
 * Increase or decrease the specified domain's memory reservation. Returns a
 * -ve errcode on failure, or the # extents successfully allocated or freed.
 * arg == addr of struct xen_memory_reservation.
 */
#define XENMEM_increase_reservation 0
#define XENMEM_decrease_reservation 1
#define XENMEM_populate_physmap     6

struct xen_memory_reservation {

    /*
     * XENMEM_increase_reservation:
     *   OUT: MFN (*not* GMFN) bases of extents that were allocated
     * XENMEM_decrease_reservation:
     *   IN:  GMFN bases of extents to free
     * XENMEM_populate_physmap:
     *   IN:  GPFN bases of extents to populate with memory
     *   OUT: GMFN bases of extents that were allocated
     *   (NB. This command also updates the mach_to_phys translation table)
     */
    GUEST_HANDLE(ulong) extent_start;

    /* Number of extents, and size/alignment of each (2^extent_order pages). */
    unsigned long  nr_extents;
    unsigned int   extent_order;

    /*
     * Maximum # bits addressable by the user of the allocated region (e.g.,
     * I/O devices often have a 32-bit limitation even in 64-bit systems). If
     * zero then the user has no addressing restriction.
     * This field is not used by XENMEM_decrease_reservation.
     */
    unsigned int   address_bits;

    /*
     * Domain whose reservation is being changed.
     * Unprivileged domains can specify only DOMID_SELF.
     */
    domid_t        domid;

};
DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation);
/*
 * Returns the maximum machine frame number of mapped RAM in this system.
 * This command always succeeds (it never returns an error code).
 * arg == NULL.
 */
#define XENMEM_maximum_ram_page     2

/*
 * Returns the current or maximum memory reservation, in pages, of the
 * specified domain (may be DOMID_SELF). Returns -ve errcode on failure.
 * arg == addr of domid_t.
 */
#define XENMEM_current_reservation  3
#define XENMEM_maximum_reservation  4
/*
 * Returns a list of MFN bases of 2MB extents comprising the machine_to_phys
 * mapping table. Architectures which do not have a m2p table do not implement
 * this command.
 * arg == addr of struct xen_machphys_mfn_list.
 */
#define XENMEM_machphys_mfn_list    5

struct xen_machphys_mfn_list {
    /*
     * Size of the 'extent_start' array. Fewer entries will be filled if the
     * machphys table is smaller than max_extents * 2MB.
     */
    unsigned int max_extents;

    /*
     * Pointer to buffer to fill with list of extent starts. If there are
     * any large discontiguities in the machine address space, 2MB gaps in
     * the machphys table will be represented by an MFN base of zero.
     */
    GUEST_HANDLE(ulong) extent_start;

    /*
     * Number of extents written to the above array. This will be smaller
     * than 'max_extents' if the machphys table is smaller than
     * max_extents * 2MB.
     */
    unsigned int nr_extents;
};
DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
/*
 * Sets the GPFN at which a particular page appears in the specified guest's
 * pseudophysical address space.
 * arg == addr of struct xen_add_to_physmap.
 */
#define XENMEM_add_to_physmap      7

struct xen_add_to_physmap {
    /* Which domain to change the mapping for. */
    domid_t domid;

    /* Source mapping space. */
#define XENMAPSPACE_shared_info 0 /* shared info page */
#define XENMAPSPACE_grant_table 1 /* grant table page */
    unsigned int space;

    /* Index into source mapping space. */
    unsigned long idx;

    /* GPFN where the source mapping page should appear. */
    unsigned long gpfn;
};
DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
/*
 * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
 * code on failure. This call only works for auto-translated guests.
 */
#define XENMEM_translate_gpfn_list  8

struct xen_translate_gpfn_list {
    /* Which domain to translate for? */
    domid_t domid;

    /* Length of list. */
    unsigned long nr_gpfns;

    /* List of GPFNs to translate. */
    GUEST_HANDLE(ulong) gpfn_list;

    /*
     * Output list to contain MFN translations. May be the same as the input
     * list (in which case each input GPFN is overwritten with the output MFN).
     */
    GUEST_HANDLE(ulong) mfn_list;
};
DEFINE_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
#endif /* __XEN_PUBLIC_MEMORY_H__ */

View File

@ -0,0 +1,145 @@
/*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef __XEN_PUBLIC_PHYSDEV_H__
#define __XEN_PUBLIC_PHYSDEV_H__
/*
* Prototype for this hypercall is:
* int physdev_op(int cmd, void *args)
* @cmd == PHYSDEVOP_??? (physdev operation).
* @args == Operation-specific extra arguments (NULL if none).
*/
/*
 * Notify end-of-interrupt (EOI) for the specified IRQ.
 * @arg == pointer to physdev_eoi structure.
 */
#define PHYSDEVOP_eoi                   12

struct physdev_eoi {
    /* IN */
    uint32_t irq;
};
/*
 * Query the status of an IRQ line.
 * @arg == pointer to physdev_irq_status_query structure.
 */
#define PHYSDEVOP_irq_status_query       5

struct physdev_irq_status_query {
    /* IN */
    uint32_t irq;
    /* OUT */
    uint32_t flags; /* XENIRQSTAT_* */
};

/* Need to call PHYSDEVOP_eoi when the IRQ has been serviced? */
#define _XENIRQSTAT_needs_eoi   (0)
#define  XENIRQSTAT_needs_eoi   (1U << _XENIRQSTAT_needs_eoi)

/* IRQ shared by multiple guests? */
#define _XENIRQSTAT_shared      (1)
#define  XENIRQSTAT_shared      (1U << _XENIRQSTAT_shared)
/*
 * Set the current VCPU's I/O privilege level.
 * @arg == pointer to physdev_set_iopl structure.
 */
#define PHYSDEVOP_set_iopl               6

struct physdev_set_iopl {
    /* IN */
    uint32_t iopl;
};
/*
 * Set the current VCPU's I/O-port permissions bitmap.
 * @arg == pointer to physdev_set_iobitmap structure.
 */
#define PHYSDEVOP_set_iobitmap           7

struct physdev_set_iobitmap {
    /* IN */
    uint8_t *bitmap;
    uint32_t nr_ports;
};
/*
 * Read or write an IO-APIC register.
 * @arg == pointer to physdev_apic structure.
 */
#define PHYSDEVOP_apic_read              8
#define PHYSDEVOP_apic_write             9

struct physdev_apic {
    /* IN */
    unsigned long apic_physbase;
    uint32_t reg;
    /* IN or OUT */
    uint32_t value;
};
/*
 * Allocate or free a physical upcall vector for the specified IRQ line.
 * @arg == pointer to physdev_irq structure.
 */
#define PHYSDEVOP_alloc_irq_vector      10
#define PHYSDEVOP_free_irq_vector       11

struct physdev_irq {
    /* IN */
    uint32_t irq;
    /* IN or OUT */
    uint32_t vector;
};
/*
 * Argument to physdev_op_compat() hypercall. Superseded by new physdev_op()
 * hypercall since 0x00030202.
 */
struct physdev_op {
    uint32_t cmd;
    union {
        struct physdev_irq_status_query irq_status_query;
        struct physdev_set_iopl         set_iopl;
        struct physdev_set_iobitmap     set_iobitmap;
        struct physdev_apic             apic_op;
        struct physdev_irq              irq_op;
    } u;
};
/*
 * PHYSDEVOP_IRQ_UNMASK_NOTIFY: report that some PIRQ-bound event channels
 * have been unmasked.
 * ** Obsolete since interface version 0x00030202; newer versions of Xen **
 * ** do not support this command.                                       **
 */
#define PHYSDEVOP_IRQ_UNMASK_NOTIFY 4
/*
 * Legacy all-capitals physdev operation names.  Since interface version
 * 0x00030202 they are superseded by the new names (defined above) and are
 * kept only as aliases for them.
 */
#define PHYSDEVOP_IRQ_STATUS_QUERY        PHYSDEVOP_irq_status_query
#define PHYSDEVOP_SET_IOPL                PHYSDEVOP_set_iopl
#define PHYSDEVOP_SET_IOBITMAP            PHYSDEVOP_set_iobitmap
#define PHYSDEVOP_APIC_READ               PHYSDEVOP_apic_read
#define PHYSDEVOP_APIC_WRITE              PHYSDEVOP_apic_write
#define PHYSDEVOP_ASSIGN_VECTOR           PHYSDEVOP_alloc_irq_vector
#define PHYSDEVOP_FREE_VECTOR             PHYSDEVOP_free_irq_vector
#define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY XENIRQSTAT_needs_eoi
#define PHYSDEVOP_IRQ_SHARED              XENIRQSTAT_shared
#endif /* __XEN_PUBLIC_PHYSDEV_H__ */

View File

@ -0,0 +1,77 @@
/******************************************************************************
* sched.h
*
* Scheduler state interactions
*
* Copyright (c) 2005, Keir Fraser <keir@xensource.com>
*/
#ifndef __XEN_PUBLIC_SCHED_H__
#define __XEN_PUBLIC_SCHED_H__
#include "event_channel.h"
/*
* The prototype for this hypercall is:
* long sched_op_new(int cmd, void *arg)
* @cmd == SCHEDOP_??? (scheduler operation).
* @arg == Operation-specific extra argument(s), as described below.
*
* **NOTE**:
* Versions of Xen prior to 3.0.2 provide only the following legacy version
* of this hypercall, supporting only the commands yield, block and shutdown:
* long sched_op(int cmd, unsigned long arg)
* @cmd == SCHEDOP_??? (scheduler operation).
* @arg == 0 (SCHEDOP_yield and SCHEDOP_block)
* == SHUTDOWN_* code (SCHEDOP_shutdown)
*/
/*
 * SCHEDOP_yield: give up the CPU voluntarily.
 * @arg == NULL.
 */
#define SCHEDOP_yield 0

/*
 * SCHEDOP_block: stop running this VCPU until an event arrives for
 * processing.  When invoked with event upcalls masked, the operation
 * atomically re-enables event delivery and checks for pending events before
 * the VCPU is blocked, which avoids a "wakeup waiting" race.
 * @arg == NULL.
 */
#define SCHEDOP_block 1
/*
 * SCHEDOP_shutdown: halt execution of this domain (all of its VCPUs) and
 * notify the system controller.
 * @arg == pointer to a sched_shutdown structure.
 */
#define SCHEDOP_shutdown 2

struct sched_shutdown {
	unsigned int reason;	/* one of the SHUTDOWN_* codes */
};
DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown);
/*
 * SCHEDOP_poll: poll a set of event-channel ports, returning once one or
 * more of them are pending.  A timeout may optionally be given.
 * @arg == pointer to a sched_poll structure.
 */
#define SCHEDOP_poll 3

struct sched_poll {
	GUEST_HANDLE(evtchn_port_t) ports;
	unsigned int nr_ports;
	uint64_t timeout;
};
DEFINE_GUEST_HANDLE_STRUCT(sched_poll);
/*
 * SHUTDOWN_*: reason codes passed to SCHEDOP_shutdown.  Control software
 * may interpret them to decide what to do next; for the most part Xen
 * itself does not care about the shutdown code.
 */
#define SHUTDOWN_poweroff 0	/* Domain exited normally. Clean up and kill. */
#define SHUTDOWN_reboot   1	/* Clean up, kill, and then restart. */
#define SHUTDOWN_suspend  2	/* Clean up, save suspend info, kill. */
#define SHUTDOWN_crash    3	/* Tell controller we've crashed. */
#endif /* __XEN_PUBLIC_SCHED_H__ */

View File

@ -0,0 +1,167 @@
/******************************************************************************
* vcpu.h
*
* VCPU initialisation, query, and hotplug.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
* Copyright (c) 2005, Keir Fraser <keir@xensource.com>
*/
#ifndef __XEN_PUBLIC_VCPU_H__
#define __XEN_PUBLIC_VCPU_H__
/*
* Prototype for this hypercall is:
* int vcpu_op(int cmd, int vcpuid, void *extra_args)
* @cmd == VCPUOP_??? (VCPU operation).
* @vcpuid == VCPU to operate on.
* @extra_args == Operation-specific extra arguments (NULL if none).
*/
/*
 * VCPUOP_initialise: initialise a VCPU.  A given VCPU can be initialised
 * only once, and once initialised it does not run until VCPUOP_up brings
 * it up.
 *
 * @extra_args == pointer to a vcpu_guest_context structure holding the
 *                initial state for the VCPU.
 */
#define VCPUOP_initialise 0

/*
 * VCPUOP_up: bring up a VCPU, i.e. make it runnable.  Fails if the VCPU has
 * not yet been initialised (VCPUOP_initialise).
 */
#define VCPUOP_up 1
/*
 * VCPUOP_down: bring down a VCPU (i.e., make it non-runnable).
 * Callers should be aware of a few caveats:
 *  1. The command is asynchronous: this operation may return, and
 *     VCPUOP_is_up may report false, before the VCPU has actually stopped
 *     running.  It is a good idea to make sure the VCPU has entered a
 *     non-critical loop before bringing it down.  Alternatively, the
 *     operation is guaranteed to be synchronous when the VCPU invokes it
 *     on itself.
 *  2. Once a VCPU has been initialised there is currently no way to drop
 *     all of its references to domain memory.  Even a VCPU that is down
 *     still holds memory references through its pagetable base pointer and
 *     GDT.  It is good practice to move a VCPU onto an 'idle' or default
 *     page table, LDT and GDT before bringing it down.
 */
#define VCPUOP_down 2

/* VCPUOP_is_up: returns 1 if the given VCPU is up. */
#define VCPUOP_is_up 3
/*
 * VCPUOP_get_runstate_info: fetch information about the state and running
 * time of a VCPU.
 * @extra_args == pointer to a vcpu_runstate_info structure.
 */
#define VCPUOP_get_runstate_info 4

struct vcpu_runstate_info {
	/* Current state of the VCPU (a RUNSTATE_* value). */
	int state;
	/* System time (ns) at which the current state was entered. */
	uint64_t state_entry_time;
	/*
	 * Time spent in each RUNSTATE_* (ns).  The sum of these times is
	 * guaranteed not to drift from system time.
	 */
	uint64_t time[4];
};
/* The VCPU is currently running on a physical CPU. */
#define RUNSTATE_running  0

/* The VCPU is runnable but not currently scheduled on any physical CPU. */
#define RUNSTATE_runnable 1

/* The VCPU is blocked (a.k.a. idle) and therefore not runnable. */
#define RUNSTATE_blocked  2

/*
 * The VCPU is not runnable, yet it is not blocked either.
 * This is a 'catch all' state covering things such as hotplug and pauses
 * requested by the system administrator (or critical sections in the
 * hypervisor).  RUNSTATE_blocked dominates this state (it is the preferred
 * state).
 */
#define RUNSTATE_offline  3
/*
 * VCPUOP_register_runstate_memory_area: register a shared memory area from
 * which the guest can read its own runstate information without having to
 * execute a hypercall.
 * Notes:
 *  1. Depending on the platform, the registered address may be virtual or
 *     physical.  On x86 systems the virtual address should be registered.
 *  2. Only one shared area may be registered per VCPU.  The hypervisor
 *     updates the shared area each time the VCPU is scheduled.  Thus
 *     runstate.state will always be RUNSTATE_running and
 *     runstate.state_entry_time will indicate the system time at which the
 *     VCPU was last scheduled to run.
 * @extra_args == pointer to a vcpu_register_runstate_memory_area structure.
 */
#define VCPUOP_register_runstate_memory_area 5

struct vcpu_register_runstate_memory_area {
	union {
		struct vcpu_runstate_info *v;
		uint64_t p;
	} addr;
};
/*
 * Periodic timer control.  Each VCPU has a single periodic timer, which
 * these commands set or stop.  Periods shorter than one millisecond may not
 * be supported.
 */
#define VCPUOP_set_periodic_timer  6	/* arg == struct vcpu_set_periodic_timer */
#define VCPUOP_stop_periodic_timer 7	/* arg == NULL */

struct vcpu_set_periodic_timer {
	uint64_t period_ns;
};
/*
 * Single-shot timer control.  Each VCPU has a single single-shot timer,
 * which these commands set or stop.
 */
#define VCPUOP_set_singleshot_timer  8	/* arg == struct vcpu_set_singleshot_timer */
#define VCPUOP_stop_singleshot_timer 9	/* arg == NULL */

struct vcpu_set_singleshot_timer {
	uint64_t timeout_abs_ns;
	uint32_t flags;		/* VCPU_SSHOTTMR_??? */
};

/*
 * Flags accepted by VCPUOP_set_singleshot_timer.
 *
 * VCPU_SSHOTTMR_future: the timeout is required to lie in the future
 * (-ETIME is returned if it has already passed).
 */
#define _VCPU_SSHOTTMR_future (0)
#define VCPU_SSHOTTMR_future  (1U << _VCPU_SSHOTTMR_future)
/*
 * VCPUOP_register_vcpu_info: register a memory location in the guest
 * address space for the vcpu_info structure.  This lets the guest put the
 * vcpu_info structure somewhere convenient, such as in a per-cpu data area.
 * The pointer need not be page aligned, but the structure must not cross a
 * page boundary.
 *
 * The argument layout has to match the hypervisor's definition of this
 * structure (Xen public vcpu.h): a 64-bit mfn, a 32-bit offset and 32 bits
 * of reserved space, 16 bytes in total.  A shorter structure would make the
 * hypervisor read past the end of the guest's argument.
 */
#define VCPUOP_register_vcpu_info 10	/* arg == struct vcpu_register_vcpu_info */

struct vcpu_register_vcpu_info {
	uint64_t mfn;		/* mfn of page to place vcpu_info */
	uint32_t offset;	/* offset within page */
	uint32_t rsvd;		/* unused */
};
#endif /* __XEN_PUBLIC_VCPU_H__ */

Some files were not shown because too many files have changed in this diff Show More