diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 6671fdc0afb1..a5607c571cb3 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2071,6 +2071,60 @@ KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm Note that the vcpu ioctl is asynchronous to vcpu execution. +4.78 KVM_PPC_GET_HTAB_FD + +Capability: KVM_CAP_PPC_HTAB_FD +Architectures: powerpc +Type: vm ioctl +Parameters: Pointer to struct kvm_get_htab_fd (in) +Returns: file descriptor number (>= 0) on success, -1 on error + +This returns a file descriptor that can be used either to read out the +entries in the guest's hashed page table (HPT), or to write entries to +initialize the HPT. The returned fd can only be written to if the +KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and +can only be read if that bit is clear. The argument struct looks like +this: + +/* For KVM_PPC_GET_HTAB_FD */ +struct kvm_get_htab_fd { + __u64 flags; + __u64 start_index; + __u64 reserved[2]; +}; + +/* Values for kvm_get_htab_fd.flags */ +#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1) +#define KVM_GET_HTAB_WRITE ((__u64)0x2) + +The `start_index' field gives the index in the HPT of the entry at +which to start reading. It is ignored when writing. + +Reads on the fd will initially supply information about all +"interesting" HPT entries. Interesting entries are those with the +bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise +all entries. When the end of the HPT is reached, the read() will +return. If read() is called again on the fd, it will start again from +the beginning of the HPT, but will only return HPT entries that have +changed since they were last read. + +Data read or written is structured as a header (8 bytes) followed by a +series of valid HPT entries (16 bytes) each. The header indicates how +many valid HPT entries there are and how many invalid entries follow +the valid entries. The invalid entries are not represented explicitly +in the stream. The header format is: + +struct kvm_get_htab_header { + __u32 index; + __u16 n_valid; + __u16 n_invalid; +}; + +Writes to the fd create HPT entries starting at the index given in the +header; first `n_valid' valid entries with contents from the data +written, then `n_invalid' invalid entries, invalidating any previously +valid entries found. + 5. The kvm_run structure ------------------------ diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index b322e5bd6964..38bec1dc9928 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -246,4 +246,26 @@ static inline bool slot_is_aligned(struct kvm_memory_slot *memslot, return !(memslot->base_gfn & mask) && !(memslot->npages & mask); } +/* + * This works for 4k, 64k and 16M pages on POWER7, + * and 4k and 16M pages on PPC970. + */ +static inline unsigned long slb_pgsize_encoding(unsigned long psize) +{ + unsigned long senc = 0; + + if (psize > 0x1000) { + senc = SLB_VSID_L; + if (psize == 0x10000) + senc |= SLB_VSID_LP_01; + } + return senc; +} + +static inline int is_vrma_hpte(unsigned long hpte_v) +{ + return (hpte_v & ~0xffffffUL) == + (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16))); +} + #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 609cca3e9426..1ca31e92ee75 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -164,6 +164,8 @@ extern void kvmppc_bookehv_exit(void); extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu); +extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *); + /* * Cuts out inst bits with ordering according to spec. * That means the leftmost bit is zero. All given bits are included. diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index b89ae4db45ce..514883dd311e 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -331,6 +331,31 @@ struct kvm_book3e_206_tlb_params { __u32 reserved[8]; }; +/* For KVM_PPC_GET_HTAB_FD */ +struct kvm_get_htab_fd { + __u64 flags; + __u64 start_index; + __u64 reserved[2]; +}; + +/* Values for kvm_get_htab_fd.flags */ +#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1) +#define KVM_GET_HTAB_WRITE ((__u64)0x2) + +/* + * Data read on the file descriptor is formatted as a series of + * records, each consisting of a header followed by a series of + * `n_valid' HPTEs (16 bytes each), which are all valid. Following + * those valid HPTEs there are `n_invalid' invalid HPTEs, which + * are not represented explicitly in the stream. The same format + * is used for writing. + */ +struct kvm_get_htab_header { + __u32 index; + __u16 n_valid; + __u16 n_invalid; +}; + #define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1) #define KVM_REG_PPC_IAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2) #define KVM_REG_PPC_IAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 6ee6516a0bee..0aa40734c8f6 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include @@ -1145,6 +1147,348 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va) put_page(page); } +/* + * Functions for reading and writing the hash table via reads and + * writes on a file descriptor. + * + * Reads return the guest view of the hash table, which has to be + * pieced together from the real hash table and the guest_rpte + * values in the revmap array. + * + * On writes, each HPTE written is considered in turn, and if it + * is valid, it is written to the HPT as if an H_ENTER with the + * exact flag set was done. When the invalid count is non-zero + * in the header written to the stream, the kernel will make + * sure that that many HPTEs are invalid, and invalidate them + * if not. + */ + +struct kvm_htab_ctx { + unsigned long index; + unsigned long flags; + struct kvm *kvm; + int first_pass; +}; + +#define HPTE_SIZE (2 * sizeof(unsigned long)) + +static long record_hpte(unsigned long flags, unsigned long *hptp, + unsigned long *hpte, struct revmap_entry *revp, + int want_valid, int first_pass) +{ + unsigned long v, r; + int ok = 1; + int valid, dirty; + + /* Unmodified entries are uninteresting except on the first pass */ + dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); + if (!first_pass && !dirty) + return 0; + + valid = 0; + if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) { + valid = 1; + if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && + !(hptp[0] & HPTE_V_BOLTED)) + valid = 0; + } + if (valid != want_valid) + return 0; + + v = r = 0; + if (valid || dirty) { + /* lock the HPTE so it's stable and read it */ + preempt_disable(); + while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) + cpu_relax(); + v = hptp[0]; + if (v & HPTE_V_ABSENT) { + v &= ~HPTE_V_ABSENT; + v |= HPTE_V_VALID; + } + /* re-evaluate valid and dirty from synchronized HPTE value */ + valid = !!(v & HPTE_V_VALID); + if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED)) + valid = 0; + r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C)); + dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); + /* only clear modified if this is the right sort of entry */ + if (valid == want_valid && dirty) { + r &= ~HPTE_GR_MODIFIED; + revp->guest_rpte = r; + } + asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); + hptp[0] &= ~HPTE_V_HVLOCK; + preempt_enable(); + if (!(valid == want_valid && (first_pass || dirty))) + ok = 0; + } + hpte[0] = v; + hpte[1] = r; + return ok; +} + +static ssize_t kvm_htab_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct kvm_htab_ctx *ctx = file->private_data; + struct kvm *kvm = ctx->kvm; + struct kvm_get_htab_header hdr; + unsigned long *hptp; + struct revmap_entry *revp; + unsigned long i, nb, nw; + unsigned long __user *lbuf; + struct kvm_get_htab_header __user *hptr; + unsigned long flags; + int first_pass; + unsigned long hpte[2]; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + first_pass = ctx->first_pass; + flags = ctx->flags; + + i = ctx->index; + hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + revp = kvm->arch.revmap + i; + lbuf = (unsigned long __user *)buf; + + nb = 0; + while (nb + sizeof(hdr) + HPTE_SIZE < count) { + /* Initialize header */ + hptr = (struct kvm_get_htab_header __user *)buf; + hdr.index = i; + hdr.n_valid = 0; + hdr.n_invalid = 0; + nw = nb; + nb += sizeof(hdr); + lbuf = (unsigned long __user *)(buf + sizeof(hdr)); + + /* Skip uninteresting entries, i.e. clean on not-first pass */ + if (!first_pass) { + while (i < kvm->arch.hpt_npte && + !(revp->guest_rpte & HPTE_GR_MODIFIED)) { + ++i; + hptp += 2; + ++revp; + } + } + + /* Grab a series of valid entries */ + while (i < kvm->arch.hpt_npte && + hdr.n_valid < 0xffff && + nb + HPTE_SIZE < count && + record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { + /* valid entry, write it out */ + ++hdr.n_valid; + if (__put_user(hpte[0], lbuf) || + __put_user(hpte[1], lbuf + 1)) + return -EFAULT; + nb += HPTE_SIZE; + lbuf += 2; + ++i; + hptp += 2; + ++revp; + } + /* Now skip invalid entries while we can */ + while (i < kvm->arch.hpt_npte && + hdr.n_invalid < 0xffff && + record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { + /* found an invalid entry */ + ++hdr.n_invalid; + ++i; + hptp += 2; + ++revp; + } + + if (hdr.n_valid || hdr.n_invalid) { + /* write back the header */ + if (__copy_to_user(hptr, &hdr, sizeof(hdr))) + return -EFAULT; + nw = nb; + buf = (char __user *)lbuf; + } else { + nb = nw; + } + + /* Check if we've wrapped around the hash table */ + if (i >= kvm->arch.hpt_npte) { + i = 0; + ctx->first_pass = 0; + break; + } + } + + ctx->index = i; + + return nb; +} + +static ssize_t kvm_htab_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct kvm_htab_ctx *ctx = file->private_data; + struct kvm *kvm = ctx->kvm; + struct kvm_get_htab_header hdr; + unsigned long i, j; + unsigned long v, r; + unsigned long __user *lbuf; + unsigned long *hptp; + unsigned long tmp[2]; + ssize_t nb; + long int err, ret; + int rma_setup; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + /* lock out vcpus from running while we're doing this */ + mutex_lock(&kvm->lock); + rma_setup = kvm->arch.rma_setup_done; + if (rma_setup) { + kvm->arch.rma_setup_done = 0; /* temporarily */ + /* order rma_setup_done vs. vcpus_running */ + smp_mb(); + if (atomic_read(&kvm->arch.vcpus_running)) { + kvm->arch.rma_setup_done = 1; + mutex_unlock(&kvm->lock); + return -EBUSY; + } + } + + err = 0; + for (nb = 0; nb + sizeof(hdr) <= count; ) { + err = -EFAULT; + if (__copy_from_user(&hdr, buf, sizeof(hdr))) + break; + + err = 0; + if (nb + hdr.n_valid * HPTE_SIZE > count) + break; + + nb += sizeof(hdr); + buf += sizeof(hdr); + + err = -EINVAL; + i = hdr.index; + if (i >= kvm->arch.hpt_npte || + i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte) + break; + + hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + lbuf = (unsigned long __user *)buf; + for (j = 0; j < hdr.n_valid; ++j) { + err = -EFAULT; + if (__get_user(v, lbuf) || __get_user(r, lbuf + 1)) + goto out; + err = -EINVAL; + if (!(v & HPTE_V_VALID)) + goto out; + lbuf += 2; + nb += HPTE_SIZE; + + if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) + kvmppc_do_h_remove(kvm, 0, i, 0, tmp); + err = -EIO; + ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, + tmp); + if (ret != H_SUCCESS) { + pr_err("kvm_htab_write ret %ld i=%ld v=%lx " + "r=%lx\n", ret, i, v, r); + goto out; + } + if (!rma_setup && is_vrma_hpte(v)) { + unsigned long psize = hpte_page_size(v, r); + unsigned long senc = slb_pgsize_encoding(psize); + unsigned long lpcr; + + kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | + (VRMA_VSID << SLB_VSID_SHIFT_1T); + lpcr = kvm->arch.lpcr & ~LPCR_VRMASD; + lpcr |= senc << (LPCR_VRMASD_SH - 4); + kvm->arch.lpcr = lpcr; + rma_setup = 1; + } + ++i; + hptp += 2; + } + + for (j = 0; j < hdr.n_invalid; ++j) { + if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) + kvmppc_do_h_remove(kvm, 0, i, 0, tmp); + ++i; + hptp += 2; + } + err = 0; + } + + out: + /* Order HPTE updates vs. rma_setup_done */ + smp_wmb(); + kvm->arch.rma_setup_done = rma_setup; + mutex_unlock(&kvm->lock); + + if (err) + return err; + return nb; +} + +static int kvm_htab_release(struct inode *inode, struct file *filp) +{ + struct kvm_htab_ctx *ctx = filp->private_data; + + filp->private_data = NULL; + if (!(ctx->flags & KVM_GET_HTAB_WRITE)) + atomic_dec(&ctx->kvm->arch.hpte_mod_interest); + kvm_put_kvm(ctx->kvm); + kfree(ctx); + return 0; +} + +static struct file_operations kvm_htab_fops = { + .read = kvm_htab_read, + .write = kvm_htab_write, + .llseek = default_llseek, + .release = kvm_htab_release, +}; + +int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) +{ + int ret; + struct kvm_htab_ctx *ctx; + int rwflag; + + /* reject flags we don't recognize */ + if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE)) + return -EINVAL; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + kvm_get_kvm(kvm); + ctx->kvm = kvm; + ctx->index = ghf->start_index; + ctx->flags = ghf->flags; + ctx->first_pass = 1; + + rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY; + ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag); + if (ret < 0) { + kvm_put_kvm(kvm); + return ret; + } + + if (rwflag == O_RDONLY) { + mutex_lock(&kvm->slots_lock); + atomic_inc(&kvm->arch.hpte_mod_interest); + /* make sure kvmppc_do_h_enter etc. see the increment */ + synchronize_srcu_expedited(&kvm->srcu); + mutex_unlock(&kvm->slots_lock); + } + + return ret; +} + void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) { struct kvmppc_mmu *mmu = &vcpu->arch.mmu; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 843eb754a1d5..a4f59dbcd800 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1563,18 +1563,6 @@ out: return r; } -static unsigned long slb_pgsize_encoding(unsigned long psize) -{ - unsigned long senc = 0; - - if (psize > 0x1000) { - senc = SLB_VSID_L; - if (psize == 0x10000) - senc |= SLB_VSID_LP_01; - } - return senc; -} - static void unpin_slot(struct kvm_memory_slot *memslot) { unsigned long *physp; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index d583ea15e151..70739a089560 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -354,6 +354,12 @@ int kvm_dev_ioctl_check_extension(long ext) r = 1; #else r = 0; + break; +#endif +#ifdef CONFIG_KVM_BOOK3S_64_HV + case KVM_CAP_PPC_HTAB_FD: + r = 1; + break; #endif break; case KVM_CAP_NR_VCPUS: @@ -954,6 +960,17 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + + case KVM_PPC_GET_HTAB_FD: { + struct kvm *kvm = filp->private_data; + struct kvm_get_htab_fd ghf; + + r = -EFAULT; + if (copy_from_user(&ghf, argp, sizeof(ghf))) + break; + r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf); + break; + } #endif /* CONFIG_KVM_BOOK3S_64_HV */ #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 494a84c37c3e..e6e5d4b13708 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -634,6 +634,7 @@ struct kvm_ppc_smmu_info { #endif #define KVM_CAP_IRQFD_RESAMPLE 82 #define KVM_CAP_PPC_BOOKE_WATCHDOG 83 +#define KVM_CAP_PPC_HTAB_FD 84 #ifdef KVM_CAP_IRQ_ROUTING @@ -859,6 +860,8 @@ struct kvm_s390_ucas_mapping { #define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce) /* Available with KVM_CAP_RMA */ #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) +/* Available with KVM_CAP_PPC_HTAB_FD */ +#define KVM_PPC_GET_HTAB_FD _IOW(KVMIO, 0xaa, struct kvm_get_htab_fd) /* * ioctls for vcpu fds