From fdb554136a2e16968ab3c902c38c27a84bcaf6df Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 21 Mar 2023 22:59:41 +0000 Subject: [PATCH] Allow writes via newly-readonly PTE for buggy Ubuntu guests When adding the vsyscall page at 0xffffffffff600000 to the userspace page tables, some early PV Linux guest kernels would make the "page table" read only before writing to it. When Xen and the guest agree that it's a page table, that's fine. Xen will trap the write and handle it appropriately as a PTE update. However, in the case of userspace page tables that have never been loaded into a %cr3 yet, Xen does not agree. The only reason this ever worked was because there was no TLB flush between the guest marking it read only, and the immediately subsequent write. However, in Xen-on-Nitro (or perhaps more relevantly on Skylake CPUs), on guests with lots of memory like m2.4xlarge with 68GiB, it became fairly deterministic and the affected guests would consistently fail to boot. Since this worked for years and only started to hurt when we changed to XoN, we need to work around this for them. Do so by remembering the last PTE to be flipped to read-only; then, in the write handler, *if* the write that faulted is via that PTE, *and* no explicit TLB flush has happened in the interim, permit the write anyway. Signed-off-by: David Woodhouse --- xen/arch/x86/flushtlb.c | 4 +++ xen/arch/x86/mm.c | 68 ++++++++++++++++++++++++++++++++++-- xen/include/asm-x86/domain.h | 3 ++ 3 files changed, 72 insertions(+), 3 deletions(-) diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c index f6d7ad1650..2fe9d68533 100644 --- a/xen/arch/x86/flushtlb.c +++ b/xen/arch/x86/flushtlb.c @@ -103,6 +103,10 @@ unsigned int flush_area_local(const void *va, unsigned int flags) /* This non-reentrant function is sometimes called in interrupt context.
*/ local_irq_save(irqfl); + if (system_state >= SYS_STATE_active) { + current->arch.pv_vcpu.last_ptep_readonly = NULL; + } + if ( flags & (FLUSH_TLB|FLUSH_TLB_GLOBAL) ) { if ( order == 0 ) diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 6a12358006..f92c1a3395 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -2156,6 +2156,22 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, return -EBUSY; } + /* + * If this is merely flipping the _PAGE_RW bit off, remember it. Some + * buggy guests will immediately try to write to the page, hoping that + * we know it's a PTE page. But when it's the vsyscall PTE page for + * userspace, being written before any userspace %cr3 has been loaded, + * we don't. And userspace is just one TLB flush (or natural expiry) + * away from getting the fault that it so truly deserves. + * Since said buggy guests always seemed to get away with it on true + * Xen and it only came to light when running on XoN/Skylake, we get + * to work around it. + */ + if ( !(l1e_get_intpte(nl1e) & _PAGE_RW) && + (l1e_get_intpte(nl1e) | _PAGE_RW) == l1e_get_intpte(ol1e)) { + pt_vcpu->arch.pv_vcpu.last_ptep_readonly = pl1e; + pt_vcpu->arch.pv_vcpu.last_pteval_readonly = l1e_get_intpte(nl1e); + } put_page_from_l1e(ol1e, pt_dom); return rc; } @@ -5566,12 +5582,23 @@ static int ptwr_emulated_update( /* We are looking only for read-only mappings of p.t. pages. */ ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT); ASSERT(mfn_valid(_mfn(mfn))); - ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table); + ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table || + (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page); ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0); ASSERT(page_get_owner(page) == d); /* Check the new PTE. 
*/ nl1e = l1e_from_intpte(val); + + /* + * If permitting a buggy guest to write to a PTE page that Xen doesn't + * yet think of as a PTE page, don't allow writable PTEs. That would + * mess up the refcounting. Read-only is fine. + */ + if ((page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && + l1e_get_flags(nl1e) & _PAGE_RW) { + return X86EMUL_UNHANDLEABLE; + } switch ( ret = get_page_from_l1e(nl1e, d, d) ) { default: @@ -5735,6 +5762,34 @@ int ptwr_do_page_fault(struct vcpu *v, unsigned long addr, page = l1e_get_page(pte); if ( !page_lock(page) ) { + /* + * If this was the last PTE flipped to read-only, and (obviously) + * no TLB flush has happened since then, it is reasonable to + * argue that the CPU *could* have continued to permit the write + * access instead of giving the guest the write fault that it so + * richly deserves. We do just that, because guests didn't start + * suffering from this until we hosted them on Skylake, thus + * making it "our fault" that something which had worked for + * years had suddenly broken. 
+ */ + if (current->arch.pv_vcpu.last_ptep_readonly) { + l1_pgentry_t *guest_l1e; + unsigned long gl1mfn; + guest_l1e = guest_map_l1e(addr, &gl1mfn); + if (guest_l1e) { + guest_unmap_l1e(guest_l1e); + } + if (guest_l1e == current->arch.pv_vcpu.last_ptep_readonly && + l1e_get_intpte(pte) == current->arch.pv_vcpu.last_pteval_readonly && + get_page_type(page, PGT_writable_page)) { + static bool_t warned_once; + if (!warned_once) { + warned_once = true; + printk("Allowing write to %lx via newly read-only PTE %lx\n", addr, l1e_get_intpte(pte)); + } + goto allow; + } + } put_page(page); goto bail; } @@ -5746,13 +5803,18 @@ int ptwr_do_page_fault(struct vcpu *v, unsigned long addr, goto bail; } +allow: ptwr_ctxt.cr2 = addr; ptwr_ctxt.pte = pte; rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops); - page_unlock(page); - put_page(page); + if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page ) + put_page_and_type(page); + else { + page_unlock(page); + put_page(page); + } switch ( rc ) { diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index cc1f7916c3..dc205f54e5 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -446,6 +446,9 @@ struct pv_vcpu struct trap_info *trap_ctxt; + unsigned long last_pteval_readonly; + void *last_ptep_readonly; + unsigned long gdt_frames[FIRST_RESERVED_GDT_PAGE]; unsigned long ldt_base; unsigned int gdt_ents, ldt_ents; -- 2.39.2