author     Florian Schmaus <flow@gentoo.org>  2022-07-01 20:20:49 +0200
committer  Florian Schmaus <flow@gentoo.org>  2022-07-01 20:26:39 +0200
commit     a16128cfac11b2f6462bbbc993cced2636abb312 (patch)
tree       19b04c0f36afd341ba8a0ffbedf91ada49a332f1
download   xen-upstream-patches-a16128cfac11b2f6462bbbc993cced2636abb312.tar.gz
           xen-upstream-patches-a16128cfac11b2f6462bbbc993cced2636abb312.tar.bz2
           xen-upstream-patches-a16128cfac11b2f6462bbbc993cced2636abb312.zip

Xen 4.16.2-pre-patchset-0 (tag: 4.16.2-pre-patchset-0)
Signed-off-by: Florian Schmaus <flow@gentoo.org>
-rw-r--r--  0001-update-Xen-version-to-4.16.2-pre.patch | 25
-rw-r--r--  0002-x86-irq-skip-unmap_domain_pirq-XSM-during-destructio.patch | 50
-rw-r--r--  0003-xen-fix-XEN_DOMCTL_gdbsx_guestmemio-crash.patch | 63
-rw-r--r--  0004-VT-d-refuse-to-use-IOMMU-with-reserved-CAP.ND-value.patch | 49
-rw-r--r--  0005-x86-mm-avoid-inadvertently-degrading-a-TLB-flush-to-.patch | 116
-rw-r--r--  0006-xen-build-Fix-dependency-for-the-MAP-rule.patch | 29
-rw-r--r--  0007-tools-libs-evtchn-don-t-set-errno-to-negative-values.patch | 74
-rw-r--r--  0008-tools-libs-ctrl-don-t-set-errno-to-a-negative-value.patch | 36
-rw-r--r--  0009-tools-libs-guest-don-t-set-errno-to-a-negative-value.patch | 32
-rw-r--r--  0010-tools-libs-light-don-t-set-errno-to-a-negative-value.patch | 32
-rw-r--r--  0011-xen-iommu-cleanup-iommu-related-domctl-handling.patch | 112
-rw-r--r--  0012-IOMMU-make-domctl-handler-tolerate-NULL-domain.patch | 36
-rw-r--r--  0013-IOMMU-x86-disallow-device-assignment-to-PoD-guests.patch | 229
-rw-r--r--  0014-x86-msr-handle-reads-to-MSR_P5_MC_-ADDR-TYPE.patch | 121
-rw-r--r--  0015-kconfig-detect-LD-implementation.patch | 46
-rw-r--r--  0016-linker-lld-do-not-generate-quoted-section-names.patch | 54
-rw-r--r--  0017-xen-io-Fix-race-between-sending-an-I-O-and-domain-sh.patch | 142
-rw-r--r--  0018-build-suppress-GNU-ld-warning-about-RWX-load-segment.patch | 35
-rw-r--r--  0019-build-silence-GNU-ld-warning-about-executable-stacks.patch | 35
-rw-r--r--  0020-ns16550-use-poll-mode-if-INTERRUPT_LINE-is-0xff.patch | 50
-rw-r--r--  0021-PCI-don-t-allow-pci-phantom-to-mark-real-devices-as-.patch | 56
-rw-r--r--  0022-x86-pv-Clean-up-_get_page_type.patch | 180
-rw-r--r--  0023-x86-pv-Fix-ABAC-cmpxchg-race-in-_get_page_type.patch | 201
-rw-r--r--  0024-x86-page-Introduce-_PAGE_-constants-for-memory-types.patch | 53
-rw-r--r--  0025-x86-Don-t-change-the-cacheability-of-the-directmap.patch | 223
-rw-r--r--  0026-x86-Split-cache_flush-out-of-cache_writeback.patch | 294
-rw-r--r--  0027-x86-amd-Work-around-CLFLUSH-ordering-on-older-parts.patch | 95
-rw-r--r--  0028-x86-pv-Track-and-flush-non-coherent-mappings-of-RAM.patch | 160
-rw-r--r--  0029-x86-mm-account-for-PGT_pae_xen_l2-in-recently-added-.patch | 37
-rw-r--r--  0030-x86-spec-ctrl-Make-VERW-flushing-runtime-conditional.patch | 258
-rw-r--r--  0031-x86-spec-ctrl-Enumeration-for-MMIO-Stale-Data-contro.patch | 98
-rw-r--r--  0032-x86-spec-ctrl-Add-spec-ctrl-unpriv-mmio.patch | 187
-rw-r--r--  LICENSE | 339
-rwxr-xr-x  create-patches | 60
-rw-r--r--  info.txt | 6
35 files changed, 3613 insertions, 0 deletions
diff --git a/0001-update-Xen-version-to-4.16.2-pre.patch b/0001-update-Xen-version-to-4.16.2-pre.patch
new file mode 100644
index 0000000..30411de
--- /dev/null
+++ b/0001-update-Xen-version-to-4.16.2-pre.patch
@@ -0,0 +1,25 @@
+From 5be9edb482ab20cf3e7acb05b511465294d1e19b Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 7 Jun 2022 13:55:17 +0200
+Subject: [PATCH 01/32] update Xen version to 4.16.2-pre
+
+---
+ xen/Makefile | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xen/Makefile b/xen/Makefile
+index 8abc71cf73aa..90a29782dbf4 100644
+--- a/xen/Makefile
++++ b/xen/Makefile
+@@ -2,7 +2,7 @@
+ # All other places this is stored (eg. compile.h) should be autogenerated.
+ export XEN_VERSION = 4
+ export XEN_SUBVERSION = 16
+-export XEN_EXTRAVERSION ?= .1$(XEN_VENDORVERSION)
++export XEN_EXTRAVERSION ?= .2-pre$(XEN_VENDORVERSION)
+ export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION)
+ -include xen-version
+
+--
+2.35.1
+
diff --git a/0002-x86-irq-skip-unmap_domain_pirq-XSM-during-destructio.patch b/0002-x86-irq-skip-unmap_domain_pirq-XSM-during-destructio.patch
new file mode 100644
index 0000000..fc6c2e1
--- /dev/null
+++ b/0002-x86-irq-skip-unmap_domain_pirq-XSM-during-destructio.patch
@@ -0,0 +1,50 @@
+From b58fb6e81bd55b6bd946abc3070770f7994c9ef9 Mon Sep 17 00:00:00 2001
+From: Jason Andryuk <jandryuk@gmail.com>
+Date: Tue, 7 Jun 2022 13:55:39 +0200
+Subject: [PATCH 02/32] x86/irq: skip unmap_domain_pirq XSM during destruction
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+xsm_unmap_domain_irq was seen denying unmap_domain_pirq when called from
+complete_domain_destroy as an RCU callback. The source context was an
+unexpected, random domain. Since this is a xen-internal operation,
+going through the XSM hook is inappropriate.
+
+Check d->is_dying and skip the XSM hook when set since this is a cleanup
+operation for a domain being destroyed.
+
+Suggested-by: Roger Pau Monné <roger.pau@citrix.com>
+Signed-off-by: Jason Andryuk <jandryuk@gmail.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 2e6f95a942d1927a53f077c301db0b799c54c05a
+master date: 2022-04-08 14:51:52 +0200
+---
+ xen/arch/x86/irq.c | 10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c
+index 67cbf6b979dc..47b86af5dce9 100644
+--- a/xen/arch/x86/irq.c
++++ b/xen/arch/x86/irq.c
+@@ -2342,8 +2342,14 @@ int unmap_domain_pirq(struct domain *d, int pirq)
+ nr = msi_desc->msi.nvec;
+ }
+
+- ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq,
+- msi_desc ? msi_desc->dev : NULL);
++ /*
++ * When called by complete_domain_destroy via RCU, current is a random
++ * domain. Skip the XSM check since this is a Xen-initiated action.
++ */
++ if ( !d->is_dying )
++ ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq,
++ msi_desc ? msi_desc->dev : NULL);
++
+ if ( ret )
+ goto done;
+
+--
+2.35.1
+
diff --git a/0003-xen-fix-XEN_DOMCTL_gdbsx_guestmemio-crash.patch b/0003-xen-fix-XEN_DOMCTL_gdbsx_guestmemio-crash.patch
new file mode 100644
index 0000000..905993b
--- /dev/null
+++ b/0003-xen-fix-XEN_DOMCTL_gdbsx_guestmemio-crash.patch
@@ -0,0 +1,63 @@
+From 6c6bbfdff9374ef41f84c4ebed7b8a7a40767ef6 Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Tue, 7 Jun 2022 13:56:54 +0200
+Subject: [PATCH 03/32] xen: fix XEN_DOMCTL_gdbsx_guestmemio crash
+
+A hypervisor built without CONFIG_GDBSX will crash when the
+XEN_DOMCTL_gdbsx_guestmemio domctl is called, as the call will
+end up in iommu_do_domctl() with d == NULL:
+
+ (XEN) CPU: 6
+ (XEN) RIP: e008:[<ffff82d040269984>] iommu_do_domctl+0x4/0x30
+ (XEN) RFLAGS: 0000000000010202 CONTEXT: hypervisor (d0v0)
+ (XEN) rax: 00000000000003e8 rbx: ffff830856277ef8 rcx: ffff830856277fff
+ ...
+ (XEN) Xen call trace:
+ (XEN) [<ffff82d040269984>] R iommu_do_domctl+0x4/0x30
+ (XEN) [<ffff82d04035cd5f>] S arch_do_domctl+0x7f/0x2330
+ (XEN) [<ffff82d040239e46>] S do_domctl+0xe56/0x1930
+ (XEN) [<ffff82d040238ff0>] S do_domctl+0/0x1930
+ (XEN) [<ffff82d0402f8c59>] S pv_hypercall+0x99/0x110
+ (XEN) [<ffff82d0402f5161>] S arch/x86/pv/domain.c#_toggle_guest_pt+0x11/0x90
+ (XEN) [<ffff82d040366288>] S lstar_enter+0x128/0x130
+ (XEN)
+ (XEN) Pagetable walk from 0000000000000144:
+ (XEN) L4[0x000] = 0000000000000000 ffffffffffffffff
+ (XEN)
+ (XEN) ****************************************
+ (XEN) Panic on CPU 6:
+ (XEN) FATAL PAGE FAULT
+ (XEN) [error_code=0000]
+ (XEN) Faulting linear address: 0000000000000144
+ (XEN) ****************************************
+
+It used to be permitted to pass DOMID_IDLE to dbg_rw_mem(), which is why the
+special case skipping the domid checks exists. Now that it is only permitted
+to pass proper domids, remove the special case, making 'd' always valid.
+
+Reported-by: Cheyenne Wills <cheyenne.wills@gmail.com>
+Fixes: e726a82ca0dc ("xen: make gdbsx support configurable")
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: f00daf1fb3213a9b0335d9dcd90fe9cb5c02b7a9
+master date: 2022-04-19 17:07:08 +0100
+---
+ xen/common/domctl.c | 1 -
+ 1 file changed, 1 deletion(-)
+
+diff --git a/xen/common/domctl.c b/xen/common/domctl.c
+index 271862ae587f..419e4070f59d 100644
+--- a/xen/common/domctl.c
++++ b/xen/common/domctl.c
+@@ -304,7 +304,6 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+ if ( op->domain == DOMID_INVALID )
+ {
+ case XEN_DOMCTL_createdomain:
+- case XEN_DOMCTL_gdbsx_guestmemio:
+ d = NULL;
+ break;
+ }
+--
+2.35.1
+
diff --git a/0004-VT-d-refuse-to-use-IOMMU-with-reserved-CAP.ND-value.patch b/0004-VT-d-refuse-to-use-IOMMU-with-reserved-CAP.ND-value.patch
new file mode 100644
index 0000000..c566888
--- /dev/null
+++ b/0004-VT-d-refuse-to-use-IOMMU-with-reserved-CAP.ND-value.patch
@@ -0,0 +1,49 @@
+From b378ee56c7e0bb5eeb35dcc55b3d29e5f50eb566 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 7 Jun 2022 13:58:16 +0200
+Subject: [PATCH 04/32] VT-d: refuse to use IOMMU with reserved CAP.ND value
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The field taking the value 7 (resulting in 18-bit DIDs when using the
+calculation in cap_ndoms(), while the DID fields are only 16 bits wide)
+is reserved. Instead of misbehaving in case we would encounter such an
+IOMMU, refuse to use it.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Kevin Tian <kevin.tian@intel.com>
+master commit: a1545fbf45c689aff39ce76a6eaa609d32ef72a7
+master date: 2022-04-20 10:54:26 +0200
+---
+ xen/drivers/passthrough/vtd/iommu.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index 93dd8aa643aa..8975c1de61bc 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -1279,8 +1279,11 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
+
+ quirk_iommu_caps(iommu);
+
++ nr_dom = cap_ndoms(iommu->cap);
++
+ if ( cap_fault_reg_offset(iommu->cap) +
+ cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE ||
++ ((nr_dom - 1) >> 16) /* I.e. cap.nd > 6 */ ||
+ ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE )
+ {
+ printk(XENLOG_ERR VTDPREFIX "IOMMU: unsupported\n");
+@@ -1305,7 +1308,6 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
+ vtd_ops.sync_cache = sync_cache;
+
+ /* allocate domain id bitmap */
+- nr_dom = cap_ndoms(iommu->cap);
+ iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom));
+ if ( !iommu->domid_bitmap )
+ return -ENOMEM;
+--
+2.35.1
+
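For context on the check above: VT-d derives the number of usable domain IDs from the 3-bit CAP.ND field as 2^(4 + 2*ND), so ND=6 yields exactly the 2^16 values a 16-bit DID can hold, while the reserved ND=7 would require 18 bits. A standalone sketch of the arithmetic (ndoms_from_nd() is a made-up helper modelling the formula; the real cap_ndoms() macro takes the whole capability register, as the diff shows):

```c
/*
 * Standalone sketch (not Xen code) of the reserved CAP.ND check:
 * (nr_dom - 1) >> 16 is nonzero exactly when the domain-ID space
 * no longer fits in the 16-bit DID fields, i.e. for ND == 7.
 */
#include <stdio.h>

static unsigned int ndoms_from_nd(unsigned int nd)
{
    return 1u << (4 + 2 * nd); /* 2^(4 + 2*ND) per the VT-d spec */
}

int main(void)
{
    for (unsigned int nd = 0; nd <= 7; nd++) {
        unsigned int nr_dom = ndoms_from_nd(nd);

        printf("ND=%u -> %6u domains, reserved: %s\n",
               nd, nr_dom, (nr_dom - 1) >> 16 ? "yes" : "no");
    }
    return 0;
}
```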
diff --git a/0005-x86-mm-avoid-inadvertently-degrading-a-TLB-flush-to-.patch b/0005-x86-mm-avoid-inadvertently-degrading-a-TLB-flush-to-.patch
new file mode 100644
index 0000000..6410aaa
--- /dev/null
+++ b/0005-x86-mm-avoid-inadvertently-degrading-a-TLB-flush-to-.patch
@@ -0,0 +1,116 @@
+From 7c003ab4a398ff4ddd54d15d4158cffb463134cc Mon Sep 17 00:00:00 2001
+From: David Vrabel <dvrabel@amazon.co.uk>
+Date: Tue, 7 Jun 2022 13:59:31 +0200
+Subject: [PATCH 05/32] x86/mm: avoid inadvertently degrading a TLB flush to
+ local only
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+If the direct map is incorrectly modified with interrupts disabled,
+the required TLB flushes are degraded to flushing the local CPU only.
+
+This could lead to very hard to diagnose problems as different CPUs will
+end up with different views of memory. Although, no such issues have yet
+been identified.
+
+Change the check in the flush_area() macro to look at system_state
+instead. This defers the switch from local to all later in the boot
+(see xen/arch/x86/setup.c:__start_xen()). This is fine because
+additional PCPUs are not brought up until after the system state is
+SYS_STATE_smp_boot.
+
+Signed-off-by: David Vrabel <dvrabel@amazon.co.uk>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+x86/flushtlb: remove flush_area check on system state
+
+Booting with Shadow Stacks leads to the following assert on a debug
+hypervisor:
+
+Assertion 'local_irq_is_enabled()' failed at arch/x86/smp.c:265
+----[ Xen-4.17.0-10.24-d x86_64 debug=y Not tainted ]----
+CPU: 0
+RIP: e008:[<ffff82d040345300>] flush_area_mask+0x40/0x13e
+[...]
+Xen call trace:
+ [<ffff82d040345300>] R flush_area_mask+0x40/0x13e
+ [<ffff82d040338a40>] F modify_xen_mappings+0xc5/0x958
+ [<ffff82d0404474f9>] F arch/x86/alternative.c#_alternative_instructions+0xb7/0xb9
+ [<ffff82d0404476cc>] F alternative_branches+0xf/0x12
+ [<ffff82d04044e37d>] F __start_xen+0x1ef4/0x2776
+ [<ffff82d040203344>] F __high_start+0x94/0xa0
+
+This is due to SYS_STATE_smp_boot being set before calling
+alternative_branches(), and the flush in modify_xen_mappings() then
+using flush_area_all() with interrupts disabled. Note that
+alternative_branches() is called before APs are started, so the flush
+must be a local one (and indeed the cpumask passed to
+flush_area_mask() just contains one CPU).
+
+Take the opportunity to simplify the logic a bit and make flush_area()
+an alias of flush_area_all() in mm.c, taking into account that
+cpu_online_map just contains the BSP before APs are started. This
+requires widening the assert in flush_area_mask() to allow being
+called with interrupts disabled as long as it's strictly a local only
+flush.
+
+The overall result is that a conditional can be removed from
+flush_area().
+
+While there, also introduce an ASSERT to check that a vCPU state flush
+is not issued for the local CPU only.
+
+Fixes: 78e072bc37 ('x86/mm: avoid inadvertently degrading a TLB flush to local only')
+Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 78e072bc375043e81691a59454e09f0b38241ddd
+master date: 2022-04-20 10:55:01 +0200
+master commit: 9f735ee4903f1b9f1966bb4ba5b5616b03ae08b5
+master date: 2022-05-25 11:09:46 +0200
+---
+ xen/arch/x86/mm.c | 10 ++--------
+ xen/arch/x86/smp.c | 5 ++++-
+ 2 files changed, 6 insertions(+), 9 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 4d799032dc82..e222d9aa98ee 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -5051,14 +5051,8 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
+ #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f))
+ #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
+
+-/*
+- * map_pages_to_xen() can be called with interrupts disabled during
+- * early bootstrap. In this case it is safe to use flush_area_local()
+- * and avoid locking because only the local CPU is online.
+- */
+-#define flush_area(v,f) (!local_irq_is_enabled() ? \
+- flush_area_local((const void *)v, f) : \
+- flush_area_all((const void *)v, f))
++/* flush_area_all() can be used prior to any other CPU being online. */
++#define flush_area(v, f) flush_area_all((const void *)(v), f)
+
+ #define L3T_INIT(page) (page) = ZERO_BLOCK_PTR
+
+diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c
+index eef0f9c6cbf4..3556ec116608 100644
+--- a/xen/arch/x86/smp.c
++++ b/xen/arch/x86/smp.c
+@@ -262,7 +262,10 @@ void flush_area_mask(const cpumask_t *mask, const void *va, unsigned int flags)
+ {
+ unsigned int cpu = smp_processor_id();
+
+- ASSERT(local_irq_is_enabled());
++ /* Local flushes can be performed with interrupts disabled. */
++ ASSERT(local_irq_is_enabled() || cpumask_subset(mask, cpumask_of(cpu)));
++ /* Exclude use of FLUSH_VCPU_STATE for the local CPU. */
++ ASSERT(!cpumask_test_cpu(cpu, mask) || !(flags & FLUSH_VCPU_STATE));
+
+ if ( (flags & ~(FLUSH_VCPU_STATE | FLUSH_ORDER_MASK)) &&
+ cpumask_test_cpu(cpu, mask) )
+--
+2.35.1
+
diff --git a/0006-xen-build-Fix-dependency-for-the-MAP-rule.patch b/0006-xen-build-Fix-dependency-for-the-MAP-rule.patch
new file mode 100644
index 0000000..6489cba
--- /dev/null
+++ b/0006-xen-build-Fix-dependency-for-the-MAP-rule.patch
@@ -0,0 +1,29 @@
+From 4bb8c34ba4241c2bf7845cd8b80c17530dbfb085 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Tue, 7 Jun 2022 14:00:09 +0200
+Subject: [PATCH 06/32] xen/build: Fix dependency for the MAP rule
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+master commit: e1e72198213b80b7a82bdc90f96ed05ae4f53e20
+master date: 2022-04-20 19:10:59 +0100
+---
+ xen/Makefile | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xen/Makefile b/xen/Makefile
+index 90a29782dbf4..ce4eca3ee4d7 100644
+--- a/xen/Makefile
++++ b/xen/Makefile
+@@ -507,7 +507,7 @@ cscope:
+ cscope -k -b -q
+
+ .PHONY: _MAP
+-_MAP:
++_MAP: $(TARGET)
+ $(NM) -n $(TARGET)-syms | grep -v '\(compiled\)\|\(\.o$$\)\|\( [aUw] \)\|\(\.\.ng$$\)\|\(LASH[RL]DI\)' > System.map
+
+ %.o %.i %.s: %.c FORCE
+--
+2.35.1
+
diff --git a/0007-tools-libs-evtchn-don-t-set-errno-to-negative-values.patch b/0007-tools-libs-evtchn-don-t-set-errno-to-negative-values.patch
new file mode 100644
index 0000000..2f02fcc
--- /dev/null
+++ b/0007-tools-libs-evtchn-don-t-set-errno-to-negative-values.patch
@@ -0,0 +1,74 @@
+From 13a29f3756bc4cab96c59f46c3875b483553fb8f Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Tue, 7 Jun 2022 14:00:31 +0200
+Subject: [PATCH 07/32] tools/libs/evtchn: don't set errno to negative values
+
+Setting errno to a negative value makes no sense.
+
+Fixes: 6b6500b3cbaa ("tools/libs/evtchn: Add support for restricting a handle")
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 60245b71c1cd001686fa7b7a26869cbcb80d074c
+master date: 2022-04-22 20:39:34 +0100
+---
+ tools/libs/evtchn/freebsd.c | 2 +-
+ tools/libs/evtchn/minios.c | 2 +-
+ tools/libs/evtchn/netbsd.c | 2 +-
+ tools/libs/evtchn/solaris.c | 2 +-
+ 4 files changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/tools/libs/evtchn/freebsd.c b/tools/libs/evtchn/freebsd.c
+index 7427ab240860..fa17a0f8dbb5 100644
+--- a/tools/libs/evtchn/freebsd.c
++++ b/tools/libs/evtchn/freebsd.c
+@@ -58,7 +58,7 @@ int osdep_evtchn_close(xenevtchn_handle *xce)
+
+ int osdep_evtchn_restrict(xenevtchn_handle *xce, domid_t domid)
+ {
+- errno = -EOPNOTSUPP;
++ errno = EOPNOTSUPP;
+
+ return -1;
+ }
+diff --git a/tools/libs/evtchn/minios.c b/tools/libs/evtchn/minios.c
+index e5dfdc5ef52e..c0bd5429eea2 100644
+--- a/tools/libs/evtchn/minios.c
++++ b/tools/libs/evtchn/minios.c
+@@ -97,7 +97,7 @@ int osdep_evtchn_close(xenevtchn_handle *xce)
+
+ int osdep_evtchn_restrict(xenevtchn_handle *xce, domid_t domid)
+ {
+- errno = -EOPNOTSUPP;
++ errno = EOPNOTSUPP;
+
+ return -1;
+ }
+diff --git a/tools/libs/evtchn/netbsd.c b/tools/libs/evtchn/netbsd.c
+index 1cebc21ffce0..56409513bc23 100644
+--- a/tools/libs/evtchn/netbsd.c
++++ b/tools/libs/evtchn/netbsd.c
+@@ -53,7 +53,7 @@ int osdep_evtchn_close(xenevtchn_handle *xce)
+
+ int osdep_evtchn_restrict(xenevtchn_handle *xce, domid_t domid)
+ {
+- errno = -EOPNOTSUPP;
++ errno = EOPNOTSUPP;
+
+ return -1;
+ }
+diff --git a/tools/libs/evtchn/solaris.c b/tools/libs/evtchn/solaris.c
+index df9579df1778..beaa7721425f 100644
+--- a/tools/libs/evtchn/solaris.c
++++ b/tools/libs/evtchn/solaris.c
+@@ -53,7 +53,7 @@ int osdep_evtchn_close(xenevtchn_handle *xce)
+
+ int osdep_evtchn_restrict(xenevtchn_handle *xce, domid_t domid)
+ {
+- errno = -EOPNOTSUPP;
++ errno = EOPNOTSUPP;
+ return -1;
+ }
+
+--
+2.35.1
+
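The same errno fix recurs in the next three patches (ctrl, guest, light). A minimal userspace illustration, independent of Xen, of why a negative errno misbehaves — strerror()/perror() only know the positive E* constants:

```c
#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    errno = -EOPNOTSUPP;                        /* the bug being fixed */
    printf("negative: %s\n", strerror(errno));  /* e.g. "Unknown error -95" */

    errno = EOPNOTSUPP;                         /* the corrected form */
    printf("positive: %s\n", strerror(errno));  /* "Operation not supported" */
    return 0;
}
```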
diff --git a/0008-tools-libs-ctrl-don-t-set-errno-to-a-negative-value.patch b/0008-tools-libs-ctrl-don-t-set-errno-to-a-negative-value.patch
new file mode 100644
index 0000000..acd7955
--- /dev/null
+++ b/0008-tools-libs-ctrl-don-t-set-errno-to-a-negative-value.patch
@@ -0,0 +1,36 @@
+From ba62afdbc31a8cfe897191efd25ed4449d9acd94 Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Tue, 7 Jun 2022 14:01:03 +0200
+Subject: [PATCH 08/32] tools/libs/ctrl: don't set errno to a negative value
+
+The claimed reason for setting errno to -1 is wrong. On x86
+xc_domain_pod_target() will set errno to a sane value in the error
+case.
+
+Fixes: ff1745d5882b ("tools: libxl: do not set the PoD target on ARM")
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: a0fb7e0e73483ed042d5ca34861a891a51ad337b
+master date: 2022-04-22 20:39:34 +0100
+---
+ tools/libs/ctrl/xc_domain.c | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+diff --git a/tools/libs/ctrl/xc_domain.c b/tools/libs/ctrl/xc_domain.c
+index b155d6afd2ef..9d675c8f21e1 100644
+--- a/tools/libs/ctrl/xc_domain.c
++++ b/tools/libs/ctrl/xc_domain.c
+@@ -1297,9 +1297,7 @@ int xc_domain_get_pod_target(xc_interface *xch,
+ uint64_t *pod_cache_pages,
+ uint64_t *pod_entries)
+ {
+- /* On x86 (above) xc_domain_pod_target will incorrectly return -1
+- * with errno==-1 on error. Do the same for least surprise. */
+- errno = -1;
++ errno = EOPNOTSUPP;
+ return -1;
+ }
+ #endif
+--
+2.35.1
+
diff --git a/0009-tools-libs-guest-don-t-set-errno-to-a-negative-value.patch b/0009-tools-libs-guest-don-t-set-errno-to-a-negative-value.patch
new file mode 100644
index 0000000..41eb1f1
--- /dev/null
+++ b/0009-tools-libs-guest-don-t-set-errno-to-a-negative-value.patch
@@ -0,0 +1,32 @@
+From a2cf30eec08db5df974a9e8bb7366fee8fc7fcd9 Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Tue, 7 Jun 2022 14:01:27 +0200
+Subject: [PATCH 09/32] tools/libs/guest: don't set errno to a negative value
+
+Setting errno to a negative error value makes no sense.
+
+Fixes: cb99a64029c9 ("libxc: arm: allow passing a device tree blob to the guest")
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 438e96ab479495a932391a22e219ee62fa8c4f47
+master date: 2022-04-22 20:39:34 +0100
+---
+ tools/libs/guest/xg_dom_core.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/libs/guest/xg_dom_core.c b/tools/libs/guest/xg_dom_core.c
+index 2e4c1330ea6b..65975a75da37 100644
+--- a/tools/libs/guest/xg_dom_core.c
++++ b/tools/libs/guest/xg_dom_core.c
+@@ -856,7 +856,7 @@ int xc_dom_devicetree_file(struct xc_dom_image *dom, const char *filename)
+ return -1;
+ return 0;
+ #else
+- errno = -EINVAL;
++ errno = EINVAL;
+ return -1;
+ #endif
+ }
+--
+2.35.1
+
diff --git a/0010-tools-libs-light-don-t-set-errno-to-a-negative-value.patch b/0010-tools-libs-light-don-t-set-errno-to-a-negative-value.patch
new file mode 100644
index 0000000..a83e1cc
--- /dev/null
+++ b/0010-tools-libs-light-don-t-set-errno-to-a-negative-value.patch
@@ -0,0 +1,32 @@
+From 15391de8e2bb6153eadd483154c53044ab53d98d Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Tue, 7 Jun 2022 14:01:44 +0200
+Subject: [PATCH 10/32] tools/libs/light: don't set errno to a negative value
+
+Setting errno to a negative value makes no sense.
+
+Fixes: e78e8b9bb649 ("libxl: Add interface for querying hypervisor about PCI topology")
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 2419a159fb943c24a6f2439604b9fdb1478fcd08
+master date: 2022-04-22 20:39:34 +0100
+---
+ tools/libs/light/libxl_linux.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/libs/light/libxl_linux.c b/tools/libs/light/libxl_linux.c
+index 8d62dfd255cb..27f2bce71837 100644
+--- a/tools/libs/light/libxl_linux.c
++++ b/tools/libs/light/libxl_linux.c
+@@ -288,7 +288,7 @@ int libxl__pci_topology_init(libxl__gc *gc,
+ if (i == num_devs) {
+ LOG(ERROR, "Too many devices");
+ err = ERROR_FAIL;
+- errno = -ENOSPC;
++ errno = ENOSPC;
+ goto out;
+ }
+
+--
+2.35.1
+
diff --git a/0011-xen-iommu-cleanup-iommu-related-domctl-handling.patch b/0011-xen-iommu-cleanup-iommu-related-domctl-handling.patch
new file mode 100644
index 0000000..b62ae9b
--- /dev/null
+++ b/0011-xen-iommu-cleanup-iommu-related-domctl-handling.patch
@@ -0,0 +1,112 @@
+From a6c32abd144ec6443c6a433b5a2ac00e2615aa86 Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Tue, 7 Jun 2022 14:02:08 +0200
+Subject: [PATCH 11/32] xen/iommu: cleanup iommu related domctl handling
+
+Today iommu_do_domctl() is being called from arch_do_domctl() in the
+"default:" case of a switch statement. This has led already to crashes
+due to unvalidated parameters.
+
+Fix that by moving the call of iommu_do_domctl() to the main switch
+statement of do_domctl().
+
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> # Arm
+master commit: 9cd7e31b3f584e97a138a770cfb031a91a867936
+master date: 2022-04-26 10:23:58 +0200
+---
+ xen/arch/arm/domctl.c | 11 +----------
+ xen/arch/x86/domctl.c | 2 +-
+ xen/common/domctl.c | 7 +++++++
+ xen/include/xen/iommu.h | 12 +++++++++---
+ 4 files changed, 18 insertions(+), 14 deletions(-)
+
+diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
+index 6245af6d0bab..1baf25c3d98b 100644
+--- a/xen/arch/arm/domctl.c
++++ b/xen/arch/arm/domctl.c
+@@ -176,16 +176,7 @@ long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
+ return rc;
+ }
+ default:
+- {
+- int rc;
+-
+- rc = subarch_do_domctl(domctl, d, u_domctl);
+-
+- if ( rc == -ENOSYS )
+- rc = iommu_do_domctl(domctl, d, u_domctl);
+-
+- return rc;
+- }
++ return subarch_do_domctl(domctl, d, u_domctl);
+ }
+ }
+
+diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
+index 7d102e0647ec..0fa51f2ebd10 100644
+--- a/xen/arch/x86/domctl.c
++++ b/xen/arch/x86/domctl.c
+@@ -1380,7 +1380,7 @@ long arch_do_domctl(
+ break;
+
+ default:
+- ret = iommu_do_domctl(domctl, d, u_domctl);
++ ret = -ENOSYS;
+ break;
+ }
+
+diff --git a/xen/common/domctl.c b/xen/common/domctl.c
+index 419e4070f59d..65d2a4588b71 100644
+--- a/xen/common/domctl.c
++++ b/xen/common/domctl.c
+@@ -870,6 +870,13 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+ copyback = 1;
+ break;
+
++ case XEN_DOMCTL_assign_device:
++ case XEN_DOMCTL_test_assign_device:
++ case XEN_DOMCTL_deassign_device:
++ case XEN_DOMCTL_get_device_group:
++ ret = iommu_do_domctl(op, d, u_domctl);
++ break;
++
+ default:
+ ret = arch_do_domctl(op, d, u_domctl);
+ break;
+diff --git a/xen/include/xen/iommu.h b/xen/include/xen/iommu.h
+index 92b2d23f0ba2..861579562e8a 100644
+--- a/xen/include/xen/iommu.h
++++ b/xen/include/xen/iommu.h
+@@ -342,8 +342,17 @@ struct domain_iommu {
+ /* Does the IOMMU pagetable need to be kept synchronized with the P2M */
+ #ifdef CONFIG_HAS_PASSTHROUGH
+ #define need_iommu_pt_sync(d) (dom_iommu(d)->need_sync)
++
++int iommu_do_domctl(struct xen_domctl *domctl, struct domain *d,
++ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl);
+ #else
+ #define need_iommu_pt_sync(d) ({ (void)(d); false; })
++
++static inline int iommu_do_domctl(struct xen_domctl *domctl, struct domain *d,
++ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
++{
++ return -ENOSYS;
++}
+ #endif
+
+ int __must_check iommu_suspend(void);
+@@ -357,9 +366,6 @@ int iommu_do_pci_domctl(struct xen_domctl *, struct domain *d,
+ XEN_GUEST_HANDLE_PARAM(xen_domctl_t));
+ #endif
+
+-int iommu_do_domctl(struct xen_domctl *, struct domain *d,
+- XEN_GUEST_HANDLE_PARAM(xen_domctl_t));
+-
+ void iommu_dev_iotlb_flush_timeout(struct domain *d, struct pci_dev *pdev);
+
+ /*
+--
+2.35.1
+
diff --git a/0012-IOMMU-make-domctl-handler-tolerate-NULL-domain.patch b/0012-IOMMU-make-domctl-handler-tolerate-NULL-domain.patch
new file mode 100644
index 0000000..ff26651
--- /dev/null
+++ b/0012-IOMMU-make-domctl-handler-tolerate-NULL-domain.patch
@@ -0,0 +1,36 @@
+From 4cf9a7c7bdb9d544fbac81105bbc1059ba3dd932 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 7 Jun 2022 14:02:30 +0200
+Subject: [PATCH 12/32] IOMMU: make domctl handler tolerate NULL domain
+
+Besides the reporter's issue of hitting a NULL deref when !CONFIG_GDBSX,
+XEN_DOMCTL_test_assign_device can legitimately end up having NULL passed
+here, when the domctl was passed DOMID_INVALID.
+
+Fixes: 71e617a6b8f6 ("use is_iommu_enabled() where appropriate...")
+Reported-by: Cheyenne Wills <cheyenne.wills@gmail.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+Reviewed-by: Juergen Gross <jgross@suse.com>
+master commit: fa4d84e6dd3c3bfd23a525b75a5483d4ce15adbb
+master date: 2022-04-26 10:25:54 +0200
+---
+ xen/drivers/passthrough/iommu.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
+index caaba62c8865..287f63fc736f 100644
+--- a/xen/drivers/passthrough/iommu.c
++++ b/xen/drivers/passthrough/iommu.c
+@@ -535,7 +535,7 @@ int iommu_do_domctl(
+ {
+ int ret = -ENODEV;
+
+- if ( !is_iommu_enabled(d) )
++ if ( !(d ? is_iommu_enabled(d) : iommu_enabled) )
+ return -EOPNOTSUPP;
+
+ #ifdef CONFIG_HAS_PCI
+--
+2.35.1
+
diff --git a/0013-IOMMU-x86-disallow-device-assignment-to-PoD-guests.patch b/0013-IOMMU-x86-disallow-device-assignment-to-PoD-guests.patch
new file mode 100644
index 0000000..efadef6
--- /dev/null
+++ b/0013-IOMMU-x86-disallow-device-assignment-to-PoD-guests.patch
@@ -0,0 +1,229 @@
+From 838f6c211f7f05f107e1acdfb0977ab61ec0bf2e Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 7 Jun 2022 14:03:20 +0200
+Subject: [PATCH 13/32] IOMMU/x86: disallow device assignment to PoD guests
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+While it is okay for IOMMU page tables to be set up for guests starting
+in PoD mode, actual device assignment may only occur once all PoD
+entries have been removed from the P2M. So far this was enforced only
+for boot-time assignment, and only in the tool stack.
+
+Also use the new function to replace p2m_pod_entry_count(): Its unlocked
+access to p2m->pod.entry_count wasn't really okay (irrespective of the
+result being stale by the time the caller gets to see it). Nor was the
+use of that function in line with the immediately preceding comment: A
+PoD guest isn't just one with a non-zero entry count, but also one with
+a non-empty cache (e.g. prior to actually launching the guest).
+
+To allow the tool stack to see a consistent snapshot of PoD state, move
+the tail of XENMEM_{get,set}_pod_target handling into a function, adding
+proper locking there.
+
+In libxl take the liberty to use the new local variable r also for a
+pre-existing call into libxc.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: ad4312d764e8b40a1e45b64aac6d840a60c59f13
+master date: 2022-05-02 08:48:02 +0200
+---
+ xen/arch/x86/mm.c | 6 +---
+ xen/arch/x86/mm/p2m-pod.c | 43 ++++++++++++++++++++++++++++-
+ xen/common/vm_event.c | 2 +-
+ xen/drivers/passthrough/x86/iommu.c | 3 +-
+ xen/include/asm-x86/p2m.h | 21 +++++++-------
+ 5 files changed, 57 insertions(+), 18 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index e222d9aa98ee..4ee2de11051d 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -4777,7 +4777,6 @@ long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
+ {
+ xen_pod_target_t target;
+ struct domain *d;
+- struct p2m_domain *p2m;
+
+ if ( copy_from_guest(&target, arg, 1) )
+ return -EFAULT;
+@@ -4812,10 +4811,7 @@ long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
+ }
+ else if ( rc >= 0 )
+ {
+- p2m = p2m_get_hostp2m(d);
+- target.tot_pages = domain_tot_pages(d);
+- target.pod_cache_pages = p2m->pod.count;
+- target.pod_entries = p2m->pod.entry_count;
++ p2m_pod_get_mem_target(d, &target);
+
+ if ( __copy_to_guest(arg, &target, 1) )
+ {
+diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c
+index d8d1a0ce7ed7..a3c9d8a97423 100644
+--- a/xen/arch/x86/mm/p2m-pod.c
++++ b/xen/arch/x86/mm/p2m-pod.c
+@@ -20,6 +20,7 @@
+ */
+
+ #include <xen/event.h>
++#include <xen/iocap.h>
+ #include <xen/ioreq.h>
+ #include <xen/mm.h>
+ #include <xen/sched.h>
+@@ -362,7 +363,10 @@ p2m_pod_set_mem_target(struct domain *d, unsigned long target)
+
+ ASSERT( pod_target >= p2m->pod.count );
+
+- ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
++ if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
++ ret = -ENOTEMPTY;
++ else
++ ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
+
+ out:
+ pod_unlock(p2m);
+@@ -370,6 +374,23 @@ out:
+ return ret;
+ }
+
++void p2m_pod_get_mem_target(const struct domain *d, xen_pod_target_t *target)
++{
++ struct p2m_domain *p2m = p2m_get_hostp2m(d);
++
++ ASSERT(is_hvm_domain(d));
++
++ pod_lock(p2m);
++ lock_page_alloc(p2m);
++
++ target->tot_pages = domain_tot_pages(d);
++ target->pod_cache_pages = p2m->pod.count;
++ target->pod_entries = p2m->pod.entry_count;
++
++ unlock_page_alloc(p2m);
++ pod_unlock(p2m);
++}
++
+ int p2m_pod_empty_cache(struct domain *d)
+ {
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+@@ -1387,6 +1408,9 @@ guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
+ if ( !paging_mode_translate(d) )
+ return -EINVAL;
+
++ if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
++ return -ENOTEMPTY;
++
+ do {
+ rc = mark_populate_on_demand(d, gfn, chunk_order);
+
+@@ -1408,3 +1432,20 @@ void p2m_pod_init(struct p2m_domain *p2m)
+ for ( i = 0; i < ARRAY_SIZE(p2m->pod.mrp.list); ++i )
+ p2m->pod.mrp.list[i] = gfn_x(INVALID_GFN);
+ }
++
++bool p2m_pod_active(const struct domain *d)
++{
++ struct p2m_domain *p2m;
++ bool res;
++
++ if ( !is_hvm_domain(d) )
++ return false;
++
++ p2m = p2m_get_hostp2m(d);
++
++ pod_lock(p2m);
++ res = p2m->pod.entry_count | p2m->pod.count;
++ pod_unlock(p2m);
++
++ return res;
++}
+diff --git a/xen/common/vm_event.c b/xen/common/vm_event.c
+index 70ab3ba406ff..21d2f0edf727 100644
+--- a/xen/common/vm_event.c
++++ b/xen/common/vm_event.c
+@@ -639,7 +639,7 @@ int vm_event_domctl(struct domain *d, struct xen_domctl_vm_event_op *vec)
+
+ rc = -EXDEV;
+ /* Disallow paging in a PoD guest */
+- if ( p2m_pod_entry_count(p2m_get_hostp2m(d)) )
++ if ( p2m_pod_active(d) )
+ break;
+
+ /* domain_pause() not required here, see XSA-99 */
+diff --git a/xen/drivers/passthrough/x86/iommu.c b/xen/drivers/passthrough/x86/iommu.c
+index a36a6bd4b249..dc9936e16930 100644
+--- a/xen/drivers/passthrough/x86/iommu.c
++++ b/xen/drivers/passthrough/x86/iommu.c
+@@ -502,11 +502,12 @@ bool arch_iommu_use_permitted(const struct domain *d)
+ {
+ /*
+ * Prevent device assign if mem paging, mem sharing or log-dirty
+- * have been enabled for this domain.
++ * have been enabled for this domain, or if PoD is still in active use.
+ */
+ return d == dom_io ||
+ (likely(!mem_sharing_enabled(d)) &&
+ likely(!mem_paging_enabled(d)) &&
++ likely(!p2m_pod_active(d)) &&
+ likely(!p2m_get_hostp2m(d)->global_logdirty));
+ }
+
+diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
+index 357a8087481e..f2af7a746ced 100644
+--- a/xen/include/asm-x86/p2m.h
++++ b/xen/include/asm-x86/p2m.h
+@@ -661,6 +661,12 @@ int p2m_pod_empty_cache(struct domain *d);
+ * domain matches target */
+ int p2m_pod_set_mem_target(struct domain *d, unsigned long target);
+
++/* Obtain a consistent snapshot of PoD related domain state. */
++void p2m_pod_get_mem_target(const struct domain *d, xen_pod_target_t *target);
++
++/* Check whether PoD is (still) active in a domain. */
++bool p2m_pod_active(const struct domain *d);
++
+ /* Scan pod cache when offline/broken page triggered */
+ int
+ p2m_pod_offline_or_broken_hit(struct page_info *p);
+@@ -669,11 +675,6 @@ p2m_pod_offline_or_broken_hit(struct page_info *p);
+ void
+ p2m_pod_offline_or_broken_replace(struct page_info *p);
+
+-static inline long p2m_pod_entry_count(const struct p2m_domain *p2m)
+-{
+- return p2m->pod.entry_count;
+-}
+-
+ void p2m_pod_init(struct p2m_domain *p2m);
+
+ #else
+@@ -689,6 +690,11 @@ static inline int p2m_pod_empty_cache(struct domain *d)
+ return 0;
+ }
+
++static inline bool p2m_pod_active(const struct domain *d)
++{
++ return false;
++}
++
+ static inline int p2m_pod_offline_or_broken_hit(struct page_info *p)
+ {
+ return 0;
+@@ -699,11 +705,6 @@ static inline void p2m_pod_offline_or_broken_replace(struct page_info *p)
+ ASSERT_UNREACHABLE();
+ }
+
+-static inline long p2m_pod_entry_count(const struct p2m_domain *p2m)
+-{
+- return 0;
+-}
+-
+ static inline void p2m_pod_init(struct p2m_domain *p2m) {}
+
+ #endif
+--
+2.35.1
+
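One detail worth noting in the new p2m_pod_active() above: it folds the two counters into a single test with a bitwise OR. A tiny standalone sketch of that idiom:

```c
#include <stdbool.h>

/*
 * Sketch of the test used in p2m_pod_active(): OR-ing the two counters
 * is nonzero exactly when at least one of them is nonzero, so a single
 * expression evaluated under the PoD lock covers both conditions.
 */
static bool pod_counts_active(long entry_count, long count)
{
    return entry_count | count; /* same truth value as: entry_count || count */
}
```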
diff --git a/0014-x86-msr-handle-reads-to-MSR_P5_MC_-ADDR-TYPE.patch b/0014-x86-msr-handle-reads-to-MSR_P5_MC_-ADDR-TYPE.patch
new file mode 100644
index 0000000..09f56f5
--- /dev/null
+++ b/0014-x86-msr-handle-reads-to-MSR_P5_MC_-ADDR-TYPE.patch
@@ -0,0 +1,121 @@
+From 9ebe2ba83644ec6cd33a93c68dab5f551adcbea0 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 7 Jun 2022 14:04:16 +0200
+Subject: [PATCH 14/32] x86/msr: handle reads to MSR_P5_MC_{ADDR,TYPE}
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Windows Server 2019 Essentials will unconditionally attempt to read
+the P5_MC_ADDR MSR at boot and throw a BSOD if a #GP is injected.
+
+Fix this by mapping MSR_P5_MC_{ADDR,TYPE} to
+MSR_IA32_MCi_{ADDR,STATUS}, as hardware is also reported to do in the Intel
+SDM "Mapping of the Pentium Processor Machine-Check Errors to the
+Machine-Check Architecture" section.
+
+Reported-by: Steffen Einsle <einsle@phptrix.de>
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: ce59e472b581e4923f6892172dde62b88c8aa8b7
+master date: 2022-05-02 08:49:12 +0200
+---
+ xen/arch/x86/cpu/mcheck/mce.h | 6 ++++++
+ xen/arch/x86/cpu/mcheck/mce_intel.c | 19 +++++++++++++++++++
+ xen/arch/x86/cpu/mcheck/vmce.c | 2 ++
+ xen/arch/x86/msr.c | 2 ++
+ xen/include/asm-x86/msr-index.h | 3 +++
+ 5 files changed, 32 insertions(+)
+
+diff --git a/xen/arch/x86/cpu/mcheck/mce.h b/xen/arch/x86/cpu/mcheck/mce.h
+index 195362691904..192315ecfa3d 100644
+--- a/xen/arch/x86/cpu/mcheck/mce.h
++++ b/xen/arch/x86/cpu/mcheck/mce.h
+@@ -169,6 +169,12 @@ static inline int mce_vendor_bank_msr(const struct vcpu *v, uint32_t msr)
+ if (msr >= MSR_IA32_MC0_CTL2 &&
+ msr < MSR_IA32_MCx_CTL2(v->arch.vmce.mcg_cap & MCG_CAP_COUNT) )
+ return 1;
++ fallthrough;
++
++ case X86_VENDOR_CENTAUR:
++ case X86_VENDOR_SHANGHAI:
++ if (msr == MSR_P5_MC_ADDR || msr == MSR_P5_MC_TYPE)
++ return 1;
+ break;
+
+ case X86_VENDOR_AMD:
+diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c
+index bb9f3a3ff795..d364e9bf5ad1 100644
+--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
++++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
+@@ -1001,8 +1001,27 @@ int vmce_intel_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
+
+ int vmce_intel_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
+ {
++ const struct cpuid_policy *cp = v->domain->arch.cpuid;
+ unsigned int bank = msr - MSR_IA32_MC0_CTL2;
+
++ switch ( msr )
++ {
++ case MSR_P5_MC_ADDR:
++ /*
++ * Bank 0 is used for the 'bank 0 quirk' on older processors.
++ * See vcpu_fill_mc_msrs() for reference.
++ */
++ *val = v->arch.vmce.bank[1].mci_addr;
++ return 1;
++
++ case MSR_P5_MC_TYPE:
++ *val = v->arch.vmce.bank[1].mci_status;
++ return 1;
++ }
++
++ if ( !(cp->x86_vendor & X86_VENDOR_INTEL) )
++ return 0;
++
+ if ( bank < GUEST_MC_BANK_NUM )
+ {
+ *val = v->arch.vmce.bank[bank].mci_ctl2;
+diff --git a/xen/arch/x86/cpu/mcheck/vmce.c b/xen/arch/x86/cpu/mcheck/vmce.c
+index eb6434a3ba20..0899df58bcbf 100644
+--- a/xen/arch/x86/cpu/mcheck/vmce.c
++++ b/xen/arch/x86/cpu/mcheck/vmce.c
+@@ -150,6 +150,8 @@ static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
+ default:
+ switch ( boot_cpu_data.x86_vendor )
+ {
++ case X86_VENDOR_CENTAUR:
++ case X86_VENDOR_SHANGHAI:
+ case X86_VENDOR_INTEL:
+ ret = vmce_intel_rdmsr(v, msr, val);
+ break;
+diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
+index aaedb2c31287..da305c7aa4c9 100644
+--- a/xen/arch/x86/msr.c
++++ b/xen/arch/x86/msr.c
+@@ -282,6 +282,8 @@ int guest_rdmsr(struct vcpu *v, uint32_t msr, uint64_t *val)
+ *val = msrs->misc_features_enables.raw;
+ break;
+
++ case MSR_P5_MC_ADDR:
++ case MSR_P5_MC_TYPE:
+ case MSR_IA32_MCG_CAP ... MSR_IA32_MCG_CTL: /* 0x179 -> 0x17b */
+ case MSR_IA32_MCx_CTL2(0) ... MSR_IA32_MCx_CTL2(31): /* 0x280 -> 0x29f */
+ case MSR_IA32_MCx_CTL(0) ... MSR_IA32_MCx_MISC(31): /* 0x400 -> 0x47f */
+diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
+index 3e038db618ff..31964b88af7a 100644
+--- a/xen/include/asm-x86/msr-index.h
++++ b/xen/include/asm-x86/msr-index.h
+@@ -15,6 +15,9 @@
+ * abbreviated name. Exceptions will be considered on a case-by-case basis.
+ */
+
++#define MSR_P5_MC_ADDR 0
++#define MSR_P5_MC_TYPE 0x00000001
++
+ #define MSR_APIC_BASE 0x0000001b
+ #define APIC_BASE_BSP (_AC(1, ULL) << 8)
+ #define APIC_BASE_EXTD (_AC(1, ULL) << 10)
+--
+2.35.1
+
diff --git a/0015-kconfig-detect-LD-implementation.patch b/0015-kconfig-detect-LD-implementation.patch
new file mode 100644
index 0000000..f2fc24a
--- /dev/null
+++ b/0015-kconfig-detect-LD-implementation.patch
@@ -0,0 +1,46 @@
+From 3754bd128d1a6b3d5864d1a3ee5d27b67d35387a Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 7 Jun 2022 14:05:06 +0200
+Subject: [PATCH 15/32] kconfig: detect LD implementation
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Detect GNU and LLVM ld implementations. This is required for further
+patches that will introduce diverging behaviour depending on the
+linker implementation in use.
+
+Note that LLVM ld returns "compatible with GNU linkers" as part of the
+version string, so be on the safe side and use '^' to only match at
+the start of the line in case LLVM ever decides to change the text to
+use "compatible with GNU ld" instead.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Michal Orzel <michal.orzel@arm.com>
+Acked-by: Julien Grall <jgrall@amazon.com>
+master commit: c70c4b624f85f7d4e28c70a804a0a3f20d73092b
+master date: 2022-05-02 08:50:39 +0200
+---
+ xen/Kconfig | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/xen/Kconfig b/xen/Kconfig
+index bcbd2758e5d3..0c89afd50fcf 100644
+--- a/xen/Kconfig
++++ b/xen/Kconfig
+@@ -23,6 +23,12 @@ config CLANG_VERSION
+ int
+ default $(shell,$(BASEDIR)/scripts/clang-version.sh $(CC))
+
++config LD_IS_GNU
++ def_bool $(success,$(LD) --version | head -n 1 | grep -q "^GNU ld")
++
++config LD_IS_LLVM
++ def_bool $(success,$(LD) --version | head -n 1 | grep -q "^LLD")
++
+ # -fvisibility=hidden reduces -fpic cost, if it's available
+ config CC_HAS_VISIBILITY_ATTRIBUTE
+ def_bool $(cc-option,-fvisibility=hidden)
+--
+2.35.1
+
diff --git a/0016-linker-lld-do-not-generate-quoted-section-names.patch b/0016-linker-lld-do-not-generate-quoted-section-names.patch
new file mode 100644
index 0000000..a42083e
--- /dev/null
+++ b/0016-linker-lld-do-not-generate-quoted-section-names.patch
@@ -0,0 +1,54 @@
+From 88b653f73928117461dc250acd1e830a47a14c2b Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 7 Jun 2022 14:05:24 +0200
+Subject: [PATCH 16/32] linker/lld: do not generate quoted section names
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+LLVM LD doesn't strip the quotes from the section names, and so the
+resulting binary ends up with section names like:
+
+ [ 1] ".text" PROGBITS ffff82d040200000 00008000
+ 000000000018cbc1 0000000000000000 AX 0 0 4096
+
+This confuses some tools (like gdb) and prevents proper parsing of the
+binary.
+
+The issue has already been reported and is being fixed in LLD. In
+order to workaround this issue and keep the GNU ld support define
+different DECL_SECTION macros depending on the used ld
+implementation.
+
+Drop the quotes from the definitions of the debug sections in
+DECL_DEBUG{2}, as those quotes are not required for GNU ld either.
+
+Fixes: 6254920587c3 ('x86: quote section names when defining them in linker script')
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 702c9a800eb3ecd4b8595998d37a769d470c5bb0
+master date: 2022-05-02 08:51:45 +0200
+---
+ xen/arch/x86/xen.lds.S | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
+index 4c58f3209c3d..bc9b9651b192 100644
+--- a/xen/arch/x86/xen.lds.S
++++ b/xen/arch/x86/xen.lds.S
+@@ -18,7 +18,11 @@ ENTRY(efi_start)
+ #else /* !EFI */
+
+ #define FORMAT "elf64-x86-64"
+-#define DECL_SECTION(x) #x : AT(ADDR(#x) - __XEN_VIRT_START)
++#ifdef CONFIG_LD_IS_GNU
++# define DECL_SECTION(x) x : AT(ADDR(#x) - __XEN_VIRT_START)
++#else
++# define DECL_SECTION(x) x : AT(ADDR(x) - __XEN_VIRT_START)
++#endif
+
+ ENTRY(start_pa)
+
+--
+2.35.1
+
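To see where the quotes come from, here is how the two variants expand for the .text section (the _GNU/_LLD suffixes below are illustrative, not names from the Xen tree):

```c
/*
 * Expansion sketch for the two DECL_SECTION() variants above; the
 * quotes originate from the preprocessor's #x stringification operator.
 */
#define DECL_SECTION_GNU(x) x : AT(ADDR(#x) - __XEN_VIRT_START)
#define DECL_SECTION_LLD(x) x : AT(ADDR(x) - __XEN_VIRT_START)

/*
 * DECL_SECTION_GNU(.text) expands to
 *     .text : AT(ADDR(".text") - __XEN_VIRT_START)
 * which GNU ld accepts (it strips the quotes inside ADDR()), while
 *
 * DECL_SECTION_LLD(.text) expands to
 *     .text : AT(ADDR(.text) - __XEN_VIRT_START)
 * so no quoted name ever reaches LLD.
 */
```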
diff --git a/0017-xen-io-Fix-race-between-sending-an-I-O-and-domain-sh.patch b/0017-xen-io-Fix-race-between-sending-an-I-O-and-domain-sh.patch
new file mode 100644
index 0000000..d226e97
--- /dev/null
+++ b/0017-xen-io-Fix-race-between-sending-an-I-O-and-domain-sh.patch
@@ -0,0 +1,142 @@
+From 982a314bd3000a16c3128afadb36a8ff41029adc Mon Sep 17 00:00:00 2001
+From: Julien Grall <jgrall@amazon.com>
+Date: Tue, 7 Jun 2022 14:06:11 +0200
+Subject: [PATCH 17/32] xen: io: Fix race between sending an I/O and domain
+ shutdown
+
+Xen provides hypercalls to shutdown (SCHEDOP_shutdown{,_code}) and
+resume a domain (XEN_DOMCTL_resumedomain). They can be used for checkpoint
+where the expectation is the domain should continue as nothing happened
+afterwards.
+
+hvmemul_do_io() and handle_pio() will act differently if the return
+code of hvm_send_ioreq() (resp. hvmemul_do_pio_buffer()) is X86EMUL_RETRY.
+
+In this case, the I/O state will be reset to STATE_IOREQ_NONE (i.e
+no I/O is pending) and/or the PC will not be advanced.
+
+If the shutdown request happens right after the I/O was sent to the
+IOREQ, then the emulation code will end up re-executing the instruction
+and therefore forwarding the same I/O again (at least when reading an IO port).
+
+This would be a problem if the access has a side-effect. A dumb example
+is a device implementing a counter which is incremented by one for every
+access. When running shutdown/resume in a loop, the value read by the
+OS may not be the old value + 1.
+
+Add an extra boolean to the structure vcpu_io to indicate whether
+the I/O was suspended. This is then used in place of checking the domain
+is shutting down in hvmemul_do_io() and handle_pio() as they should
+act on suspend (i.e. vcpu_start_shutdown_deferral() returns false) rather
+than shutdown.
+
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+Reviewed-by: Paul Durrant <paul@xen.org>
+master commit: b7e0d8978810b534725e94a321736496928f00a5
+master date: 2022-05-06 17:16:22 +0100
+---
+ xen/arch/arm/ioreq.c | 3 ++-
+ xen/arch/x86/hvm/emulate.c | 3 ++-
+ xen/arch/x86/hvm/io.c | 7 ++++---
+ xen/common/ioreq.c | 4 ++++
+ xen/include/xen/sched.h | 5 +++++
+ 5 files changed, 17 insertions(+), 5 deletions(-)
+
+diff --git a/xen/arch/arm/ioreq.c b/xen/arch/arm/ioreq.c
+index 308650b40051..fbccef212bf1 100644
+--- a/xen/arch/arm/ioreq.c
++++ b/xen/arch/arm/ioreq.c
+@@ -80,9 +80,10 @@ enum io_state try_fwd_ioserv(struct cpu_user_regs *regs,
+ return IO_ABORT;
+
+ vio->req = p;
++ vio->suspended = false;
+
+ rc = ioreq_send(s, &p, 0);
+- if ( rc != IO_RETRY || v->domain->is_shutting_down )
++ if ( rc != IO_RETRY || vio->suspended )
+ vio->req.state = STATE_IOREQ_NONE;
+ else if ( !ioreq_needs_completion(&vio->req) )
+ rc = IO_HANDLED;
+diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
+index 76a2ccfafe23..7da348b5d486 100644
+--- a/xen/arch/x86/hvm/emulate.c
++++ b/xen/arch/x86/hvm/emulate.c
+@@ -239,6 +239,7 @@ static int hvmemul_do_io(
+ ASSERT(p.count);
+
+ vio->req = p;
++ vio->suspended = false;
+
+ rc = hvm_io_intercept(&p);
+
+@@ -334,7 +335,7 @@ static int hvmemul_do_io(
+ else
+ {
+ rc = ioreq_send(s, &p, 0);
+- if ( rc != X86EMUL_RETRY || currd->is_shutting_down )
++ if ( rc != X86EMUL_RETRY || vio->suspended )
+ vio->req.state = STATE_IOREQ_NONE;
+ else if ( !ioreq_needs_completion(&vio->req) )
+ rc = X86EMUL_OKAY;
+diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c
+index 93f1d1503fa6..80915f27e488 100644
+--- a/xen/arch/x86/hvm/io.c
++++ b/xen/arch/x86/hvm/io.c
+@@ -138,10 +138,11 @@ bool handle_pio(uint16_t port, unsigned int size, int dir)
+
+ case X86EMUL_RETRY:
+ /*
+- * We should not advance RIP/EIP if the domain is shutting down or
+- * if X86EMUL_RETRY has been returned by an internal handler.
++ * We should not advance RIP/EIP if the vio was suspended (e.g.
++ * because the domain is shutting down) or if X86EMUL_RETRY has
++ * been returned by an internal handler.
+ */
+- if ( curr->domain->is_shutting_down || !vcpu_ioreq_pending(curr) )
++ if ( vio->suspended || !vcpu_ioreq_pending(curr) )
+ return false;
+ break;
+
+diff --git a/xen/common/ioreq.c b/xen/common/ioreq.c
+index d732dc045df9..42414b750bef 100644
+--- a/xen/common/ioreq.c
++++ b/xen/common/ioreq.c
+@@ -1256,6 +1256,7 @@ int ioreq_send(struct ioreq_server *s, ioreq_t *proto_p,
+ struct vcpu *curr = current;
+ struct domain *d = curr->domain;
+ struct ioreq_vcpu *sv;
++ struct vcpu_io *vio = &curr->io;
+
+ ASSERT(s);
+
+@@ -1263,7 +1264,10 @@ int ioreq_send(struct ioreq_server *s, ioreq_t *proto_p,
+ return ioreq_send_buffered(s, proto_p);
+
+ if ( unlikely(!vcpu_start_shutdown_deferral(curr)) )
++ {
++ vio->suspended = true;
+ return IOREQ_STATUS_RETRY;
++ }
+
+ list_for_each_entry ( sv,
+ &s->ioreq_vcpu_list,
+diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
+index 28146ee404e6..9671062360ac 100644
+--- a/xen/include/xen/sched.h
++++ b/xen/include/xen/sched.h
+@@ -159,6 +159,11 @@ enum vio_completion {
+ struct vcpu_io {
+ /* I/O request in flight to device model. */
+ enum vio_completion completion;
++ /*
++ * Indicate whether the I/O was not handled because the domain
++ * is about to be paused.
++ */
++ bool suspended;
+ ioreq_t req;
+ };
+
+--
+2.35.1
+
diff --git a/0018-build-suppress-GNU-ld-warning-about-RWX-load-segment.patch b/0018-build-suppress-GNU-ld-warning-about-RWX-load-segment.patch
new file mode 100644
index 0000000..87a0873
--- /dev/null
+++ b/0018-build-suppress-GNU-ld-warning-about-RWX-load-segment.patch
@@ -0,0 +1,35 @@
+From 4890031d224262a6cf43d3bef1af4a16c13db306 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 7 Jun 2022 14:06:51 +0200
+Subject: [PATCH 18/32] build: suppress GNU ld warning about RWX load segments
+
+We cannot really avoid such and we're also not really at risk because of
+them, as we control page table permissions ourselves rather than relying
+on a loader of some sort. Present GNU ld master started warning about
+such, and hence 2.39 is anticipated to have this warning.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Acked-by: Julien Grall <jgrall@amazon.com>
+master commit: 68f5aac012b9ae36ce9b65d9ca9cc9f232191ad3
+master date: 2022-05-18 11:17:19 +0200
+---
+ xen/Makefile | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/xen/Makefile b/xen/Makefile
+index ce4eca3ee4d7..4d9abe704628 100644
+--- a/xen/Makefile
++++ b/xen/Makefile
+@@ -260,6 +260,8 @@ endif
+
+ AFLAGS += -D__ASSEMBLY__
+
++LDFLAGS-$(call ld-option,--warn-rwx-segments) += --no-warn-rwx-segments
++
+ CFLAGS += $(CFLAGS-y)
+ # allow extra CFLAGS externally via EXTRA_CFLAGS_XEN_CORE
+ CFLAGS += $(EXTRA_CFLAGS_XEN_CORE)
+--
+2.35.1
+
diff --git a/0019-build-silence-GNU-ld-warning-about-executable-stacks.patch b/0019-build-silence-GNU-ld-warning-about-executable-stacks.patch
new file mode 100644
index 0000000..75e9f7e
--- /dev/null
+++ b/0019-build-silence-GNU-ld-warning-about-executable-stacks.patch
@@ -0,0 +1,35 @@
+From 1bc669a568a9f4bdab9e9ddb95823ba370dc0baf Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 7 Jun 2022 14:07:11 +0200
+Subject: [PATCH 19/32] build: silence GNU ld warning about executable stacks
+
+While for C files the compiler is supposed to arrange for emitting
+respective information, for assembly sources we're responsible ourselves.
+Present GNU ld master started warning about such, and hence 2.39 is
+anticipated to have this warning.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Acked-by: Julien Grall <jgrall@amazon.com>
+master commit: 62d22296a95d259c934ca2f39ac511d729cfbb68
+master date: 2022-05-18 11:18:45 +0200
+---
+ xen/Makefile | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/xen/Makefile b/xen/Makefile
+index 4d9abe704628..971028eda240 100644
+--- a/xen/Makefile
++++ b/xen/Makefile
+@@ -260,6 +260,8 @@ endif
+
+ AFLAGS += -D__ASSEMBLY__
+
++$(call cc-option-add,AFLAGS,CC,-Wa$(comma)--noexecstack)
++
+ LDFLAGS-$(call ld-option,--warn-rwx-segments) += --no-warn-rwx-segments
+
+ CFLAGS += $(CFLAGS-y)
+--
+2.35.1
+
diff --git a/0020-ns16550-use-poll-mode-if-INTERRUPT_LINE-is-0xff.patch b/0020-ns16550-use-poll-mode-if-INTERRUPT_LINE-is-0xff.patch
new file mode 100644
index 0000000..b83be9a
--- /dev/null
+++ b/0020-ns16550-use-poll-mode-if-INTERRUPT_LINE-is-0xff.patch
@@ -0,0 +1,50 @@
+From f1be0b62a03b90a40a03e21f965e4cbb89809bb1 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?=
+ <marmarek@invisiblethingslab.com>
+Date: Tue, 7 Jun 2022 14:07:34 +0200
+Subject: [PATCH 20/32] ns16550: use poll mode if INTERRUPT_LINE is 0xff
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Intel LPSS has INTERRUPT_LINE set to 0xff by default, which is declared
+by the PCI Local Bus Specification Revision 3.0 (from 2004) as
+"unknown"/"no connection". Fallback to poll mode in this case.
+The 0xff handling is x86-specific; the surrounding code is guarded with
+CONFIG_X86 anyway.
+
+Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 6a2ea1a2370a0c8a0210accac0ae62e68c185134
+master date: 2022-05-20 12:19:45 +0200
+---
+ xen/drivers/char/ns16550.c | 13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+diff --git a/xen/drivers/char/ns16550.c b/xen/drivers/char/ns16550.c
+index 30596d60d4ed..2d2bd2a02469 100644
+--- a/xen/drivers/char/ns16550.c
++++ b/xen/drivers/char/ns16550.c
+@@ -1221,6 +1221,19 @@ pci_uart_config(struct ns16550 *uart, bool_t skip_amt, unsigned int idx)
+ pci_conf_read8(PCI_SBDF(0, b, d, f),
+ PCI_INTERRUPT_LINE) : 0;
+
++#ifdef CONFIG_X86
++ /*
++ * PCI Local Bus Specification Revision 3.0 defines 0xff value
++ * as special only for X86.
++ */
++ if ( uart->irq == 0xff )
++ uart->irq = 0;
++#endif
++ if ( !uart->irq )
++ printk(XENLOG_INFO
++ "ns16550: %pp: no legacy IRQ, using poll mode\n",
++ &PCI_SBDF(0, b, d, f));
++
+ return 0;
+ }
+ }
+--
+2.35.1
+
diff --git a/0021-PCI-don-t-allow-pci-phantom-to-mark-real-devices-as-.patch b/0021-PCI-don-t-allow-pci-phantom-to-mark-real-devices-as-.patch
new file mode 100644
index 0000000..1264578
--- /dev/null
+++ b/0021-PCI-don-t-allow-pci-phantom-to-mark-real-devices-as-.patch
@@ -0,0 +1,56 @@
+From 8e11ec8fbf6f933f8854f4bc54226653316903f2 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 7 Jun 2022 14:08:06 +0200
+Subject: [PATCH 21/32] PCI: don't allow "pci-phantom=" to mark real devices as
+ phantom functions
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+IOMMU code mapping / unmapping devices and interrupts will misbehave if
+a wrong command line option declared a function "phantom" when there's a
+real device at that position. Warn about this and adjust the specified
+stride (in the worst case ignoring the option altogether).
+
+Requested-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 444b555dc9e09fa3ce90f066e0c88dec9b47f422
+master date: 2022-05-20 12:20:35 +0200
+---
+ xen/drivers/passthrough/pci.c | 19 ++++++++++++++++++-
+ 1 file changed, 18 insertions(+), 1 deletion(-)
+
+diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
+index 395958698e6a..e0491c908f10 100644
+--- a/xen/drivers/passthrough/pci.c
++++ b/xen/drivers/passthrough/pci.c
+@@ -382,7 +382,24 @@ static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
+ phantom_devs[i].slot == PCI_SLOT(devfn) &&
+ phantom_devs[i].stride > PCI_FUNC(devfn) )
+ {
+- pdev->phantom_stride = phantom_devs[i].stride;
++ pci_sbdf_t sbdf = pdev->sbdf;
++ unsigned int stride = phantom_devs[i].stride;
++
++ while ( (sbdf.fn += stride) > PCI_FUNC(devfn) )
++ {
++ if ( pci_conf_read16(sbdf, PCI_VENDOR_ID) == 0xffff &&
++ pci_conf_read16(sbdf, PCI_DEVICE_ID) == 0xffff )
++ continue;
++ stride <<= 1;
++ printk(XENLOG_WARNING
++ "%pp looks to be a real device; bumping %04x:%02x:%02x stride to %u\n",
++ &sbdf, phantom_devs[i].seg,
++ phantom_devs[i].bus, phantom_devs[i].slot,
++ stride);
++ sbdf = pdev->sbdf;
++ }
++ if ( PCI_FUNC(stride) )
++ pdev->phantom_stride = stride;
+ break;
+ }
+ }
+--
+2.35.1
+
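The stride adjustment above can be modelled in isolation: keep doubling the
stride until no phantom slot collides with a responding function, and ignore
the option entirely if the stride grows past the 8 functions of a PCI device.
A hedged stand-alone sketch, with real_device_at() as a hypothetical stand-in
for the vendor/device ID probe (0xffff/0xffff meaning "nothing there"):

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-in for pci_conf_read16(..., PCI_VENDOR_ID). */
    static bool real_device_at(unsigned int fn)
    {
        return fn == 2;           /* pretend a device sits at function 2 */
    }

    static unsigned int fit_stride(unsigned int base_fn, unsigned int stride)
    {
        for (unsigned int fn = base_fn + stride; fn < 8; fn += stride)
        {
            if (!real_device_at(fn))
                continue;
            stride <<= 1;         /* conflict: double the stride ... */
            fn = base_fn;         /* ... and restart the scan */
        }
        return stride < 8 ? stride : 0;   /* 0 => ignore the option */
    }

    int main(void)
    {
        printf("%u\n", fit_stride(0, 1)); /* prints 4: function 2 avoided */
        return 0;
    }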
diff --git a/0022-x86-pv-Clean-up-_get_page_type.patch b/0022-x86-pv-Clean-up-_get_page_type.patch
new file mode 100644
index 0000000..a6008b0
--- /dev/null
+++ b/0022-x86-pv-Clean-up-_get_page_type.patch
@@ -0,0 +1,180 @@
+From b152dfbc3ad71a788996440b18174d995c3bffc9 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Thu, 9 Jun 2022 15:27:19 +0200
+Subject: [PATCH 22/32] x86/pv: Clean up _get_page_type()
+
+Various fixes for clarity, ahead of making complicated changes.
+
+ * Split the overflow check out of the if/else chain for type handling, as
+ it's somewhat unrelated.
+ * Comment the main if/else chain to explain what is going on. Adjust one
+ ASSERT() and state the bit layout for validate-locked and partial states.
+ * Correct the comment about TLB flushing, as it's backwards. The problem
+ case is when writeable mappings are retained to a page becoming read-only,
+ as it allows the guest to bypass Xen's safety checks for updates.
+ * Reduce the scope of 'y'. It is an artefact of the cmpxchg loop and not
+ valid for use by subsequent logic. Switch to using ACCESS_ONCE() to treat
+ all reads as explicitly volatile. The only thing preventing the validated
+ wait-loop being infinite is the compiler barrier hidden in cpu_relax().
+ * Replace one page_get_owner(page) with 'd', which is already calculated and
+   in scope.
+
+No functional change.
+
+This is part of XSA-401 / CVE-2022-26362.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+master commit: 9186e96b199e4f7e52e033b238f9fe869afb69c7
+master date: 2022-06-09 14:20:36 +0200
+---
+ xen/arch/x86/mm.c | 72 +++++++++++++++++++++++++++++++++++++++--------
+ 1 file changed, 61 insertions(+), 11 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 4ee2de11051d..79ad7fdd2b82 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -2906,16 +2906,17 @@ static int _put_page_type(struct page_info *page, unsigned int flags,
+ static int _get_page_type(struct page_info *page, unsigned long type,
+ bool preemptible)
+ {
+- unsigned long nx, x, y = page->u.inuse.type_info;
++ unsigned long nx, x;
+ int rc = 0;
+
+ ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
+ ASSERT(!in_irq());
+
+- for ( ; ; )
++ for ( unsigned long y = ACCESS_ONCE(page->u.inuse.type_info); ; )
+ {
+ x = y;
+ nx = x + 1;
++
+ if ( unlikely((nx & PGT_count_mask) == 0) )
+ {
+ gdprintk(XENLOG_WARNING,
+@@ -2923,8 +2924,15 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+ mfn_x(page_to_mfn(page)));
+ return -EINVAL;
+ }
+- else if ( unlikely((x & PGT_count_mask) == 0) )
++
++ if ( unlikely((x & PGT_count_mask) == 0) )
+ {
++ /*
++ * Typeref 0 -> 1.
++ *
++ * Type changes are permitted when the typeref is 0. If the type
++ * actually changes, the page needs re-validating.
++ */
+ struct domain *d = page_get_owner(page);
+
+ if ( d && shadow_mode_enabled(d) )
+@@ -2935,8 +2943,8 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+ {
+ /*
+ * On type change we check to flush stale TLB entries. It is
+- * vital that no other CPUs are left with mappings of a frame
+- * which is about to become writeable to the guest.
++ * vital that no other CPUs are left with writeable mappings
++ * to a frame which is intending to become pgtable/segdesc.
+ */
+ cpumask_t *mask = this_cpu(scratch_cpumask);
+
+@@ -2948,7 +2956,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+
+ if ( unlikely(!cpumask_empty(mask)) &&
+ /* Shadow mode: track only writable pages. */
+- (!shadow_mode_enabled(page_get_owner(page)) ||
++ (!shadow_mode_enabled(d) ||
+ ((nx & PGT_type_mask) == PGT_writable_page)) )
+ {
+ perfc_incr(need_flush_tlb_flush);
+@@ -2979,7 +2987,14 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+ }
+ else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
+ {
+- /* Don't log failure if it could be a recursive-mapping attempt. */
++ /*
++ * else, we're trying to take a new reference, of the wrong type.
++ *
++ * This (being able to prohibit use of the wrong type) is what the
++ * typeref system exists for, but skip printing the failure if it
++ * looks like a recursive mapping, as subsequent logic might
++ * ultimately permit the attempt.
++ */
+ if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
+ (type == PGT_l1_page_table) )
+ return -EINVAL;
+@@ -2998,18 +3013,46 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+ }
+ else if ( unlikely(!(x & PGT_validated)) )
+ {
++ /*
++ * else, the count is non-zero, and we're grabbing the right type;
++ * but the page hasn't been validated yet.
++ *
++ * The page is in one of two states (depending on PGT_partial),
++ * and should have exactly one reference.
++ */
++ ASSERT((x & (PGT_type_mask | PGT_count_mask)) == (type | 1));
++
+ if ( !(x & PGT_partial) )
+ {
+- /* Someone else is updating validation of this page. Wait... */
++ /*
++ * The page has been left in the "validate locked" state
++ * (i.e. PGT_[type] | 1) which means that a concurrent caller
++ * of _get_page_type() is in the middle of validation.
++ *
++ * Spin waiting for the concurrent user to complete (partial
++ * or fully validated), then restart our attempt to acquire a
++ * type reference.
++ */
+ do {
+ if ( preemptible && hypercall_preempt_check() )
+ return -EINTR;
+ cpu_relax();
+- } while ( (y = page->u.inuse.type_info) == x );
++ } while ( (y = ACCESS_ONCE(page->u.inuse.type_info)) == x );
+ continue;
+ }
+- /* Type ref count was left at 1 when PGT_partial got set. */
+- ASSERT((x & PGT_count_mask) == 1);
++
++ /*
++ * The page has been left in the "partial" state
++ * (i.e., PGT_[type] | PGT_partial | 1).
++ *
++ * Rather than bumping the type count, we need to try to grab the
++ * validation lock; if we succeed, we need to validate the page,
++ * then drop the general ref associated with the PGT_partial bit.
++ *
++ * We grab the validation lock by setting nx to (PGT_[type] | 1)
++ * (i.e., non-zero type count, neither PGT_validated nor
++ * PGT_partial set).
++ */
+ nx = x & ~PGT_partial;
+ }
+
+@@ -3058,6 +3101,13 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+ }
+
+ out:
++ /*
++ * Did we drop the PGT_partial bit when acquiring the typeref? If so,
++ * drop the general reference that went along with it.
++ *
++ * N.B. validate_page() may have re-set PGT_partial, not reflected in
++ * nx, but will have taken an extra ref when doing so.
++ */
+ if ( (x & PGT_partial) && !(nx & PGT_partial) )
+ put_page(page);
+
+--
+2.35.1
+
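The ACCESS_ONCE() change above, reduced to a sketch: the volatile cast is
what forces a fresh load of the shared word on every iteration, with
cpu_relax() additionally acting as a compiler barrier. Names are generic
(not Xen's) and x86 is assumed for the PAUSE hint.

    #define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

    static unsigned long type_info;       /* written by other CPUs */

    static inline void cpu_relax(void)
    {
        __asm__ __volatile__("pause" ::: "memory");
    }

    /* Spin until the shared word moves away from 'x'; return the new value,
     * which is only a starting point for the caller's retry, never a value
     * to base later logic on. */
    static unsigned long wait_for_change(unsigned long x)
    {
        unsigned long y;

        do {
            cpu_relax();
        } while ( (y = ACCESS_ONCE(type_info)) == x );

        return y;
    }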
diff --git a/0023-x86-pv-Fix-ABAC-cmpxchg-race-in-_get_page_type.patch b/0023-x86-pv-Fix-ABAC-cmpxchg-race-in-_get_page_type.patch
new file mode 100644
index 0000000..2f4b734
--- /dev/null
+++ b/0023-x86-pv-Fix-ABAC-cmpxchg-race-in-_get_page_type.patch
@@ -0,0 +1,201 @@
+From 8dab3f79b122e69cbcdebca72cdc14f004ee2193 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Thu, 9 Jun 2022 15:27:37 +0200
+Subject: [PATCH 23/32] x86/pv: Fix ABAC cmpxchg() race in _get_page_type()
+
+_get_page_type() suffers from a race condition where it incorrectly assumes
+that because 'x' was read and a subsequent cmpxchg() succeeds, the type
+cannot have changed in-between. Consider:
+
+CPU A:
+ 1. Creates an L2e referencing pg
+ `-> _get_page_type(pg, PGT_l1_page_table), sees count 0, type PGT_writable_page
+ 2. Issues flush_tlb_mask()
+CPU B:
+ 3. Creates a writeable mapping of pg
+ `-> _get_page_type(pg, PGT_writable_page), count increases to 1
+ 4. Writes into new mapping, creating a TLB entry for pg
+ 5. Removes the writeable mapping of pg
+ `-> _put_page_type(pg), count goes back down to 0
+CPU A:
+ 6. Issues cmpxchg(), setting count 1, type PGT_l1_page_table
+
+CPU B now has a writeable mapping to pg, which Xen believes is a pagetable and
+suitably protected (i.e. read-only). The TLB flush in step 2 must be deferred
+until after the guest is prohibited from creating new writeable mappings,
+which is after step 6.
+
+Defer all safety actions until after the cmpxchg() has successfully taken the
+intended typeref, because that is what prevents concurrent users from using
+the old type.
+
+Also remove the early validation for writeable and shared pages. This removes
+race conditions where one half of a parallel mapping attempt can return
+successfully before:
+ * The IOMMU pagetables are in sync with the new page type
+ * Writeable mappings to shared pages have been torn down
+
+This is part of XSA-401 / CVE-2022-26362.
+
+Reported-by: Jann Horn <jannh@google.com>
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+master commit: 8cc5036bc385112a82f1faff27a0970e6440dfed
+master date: 2022-06-09 14:21:04 +0200
+---
+ xen/arch/x86/mm.c | 116 ++++++++++++++++++++++++++--------------------
+ 1 file changed, 67 insertions(+), 49 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 79ad7fdd2b82..c6429b0f749a 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -2933,56 +2933,12 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+ * Type changes are permitted when the typeref is 0. If the type
+ * actually changes, the page needs re-validating.
+ */
+- struct domain *d = page_get_owner(page);
+-
+- if ( d && shadow_mode_enabled(d) )
+- shadow_prepare_page_type_change(d, page, type);
+
+ ASSERT(!(x & PGT_pae_xen_l2));
+ if ( (x & PGT_type_mask) != type )
+ {
+- /*
+- * On type change we check to flush stale TLB entries. It is
+- * vital that no other CPUs are left with writeable mappings
+- * to a frame which is intending to become pgtable/segdesc.
+- */
+- cpumask_t *mask = this_cpu(scratch_cpumask);
+-
+- BUG_ON(in_irq());
+- cpumask_copy(mask, d->dirty_cpumask);
+-
+- /* Don't flush if the timestamp is old enough */
+- tlbflush_filter(mask, page->tlbflush_timestamp);
+-
+- if ( unlikely(!cpumask_empty(mask)) &&
+- /* Shadow mode: track only writable pages. */
+- (!shadow_mode_enabled(d) ||
+- ((nx & PGT_type_mask) == PGT_writable_page)) )
+- {
+- perfc_incr(need_flush_tlb_flush);
+- /*
+- * If page was a page table make sure the flush is
+- * performed using an IPI in order to avoid changing the
+- * type of a page table page under the feet of
+- * spurious_page_fault().
+- */
+- flush_mask(mask,
+- (x & PGT_type_mask) &&
+- (x & PGT_type_mask) <= PGT_root_page_table
+- ? FLUSH_TLB | FLUSH_FORCE_IPI
+- : FLUSH_TLB);
+- }
+-
+- /* We lose existing type and validity. */
+ nx &= ~(PGT_type_mask | PGT_validated);
+ nx |= type;
+-
+- /*
+- * No special validation needed for writable pages.
+- * Page tables and GDT/LDT need to be scanned for validity.
+- */
+- if ( type == PGT_writable_page || type == PGT_shared_page )
+- nx |= PGT_validated;
+ }
+ }
+ else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
+@@ -3063,6 +3019,56 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+ return -EINTR;
+ }
+
++ /*
++ * One typeref has been taken and is now globally visible.
++ *
++ * The page is either in the "validate locked" state (PGT_[type] | 1) or
++ * fully validated (PGT_[type] | PGT_validated | >0).
++ */
++
++ if ( unlikely((x & PGT_count_mask) == 0) )
++ {
++ struct domain *d = page_get_owner(page);
++
++ if ( d && shadow_mode_enabled(d) )
++ shadow_prepare_page_type_change(d, page, type);
++
++ if ( (x & PGT_type_mask) != type )
++ {
++ /*
++ * On type change we check to flush stale TLB entries. It is
++ * vital that no other CPUs are left with writeable mappings
++ * to a frame which is intending to become pgtable/segdesc.
++ */
++ cpumask_t *mask = this_cpu(scratch_cpumask);
++
++ BUG_ON(in_irq());
++ cpumask_copy(mask, d->dirty_cpumask);
++
++ /* Don't flush if the timestamp is old enough */
++ tlbflush_filter(mask, page->tlbflush_timestamp);
++
++ if ( unlikely(!cpumask_empty(mask)) &&
++ /* Shadow mode: track only writable pages. */
++ (!shadow_mode_enabled(d) ||
++ ((nx & PGT_type_mask) == PGT_writable_page)) )
++ {
++ perfc_incr(need_flush_tlb_flush);
++ /*
++ * If page was a page table make sure the flush is
++ * performed using an IPI in order to avoid changing the
++ * type of a page table page under the feet of
++ * spurious_page_fault().
++ */
++ flush_mask(mask,
++ (x & PGT_type_mask) &&
++ (x & PGT_type_mask) <= PGT_root_page_table
++ ? FLUSH_TLB | FLUSH_FORCE_IPI
++ : FLUSH_TLB);
++ }
++ }
++ }
++
+ if ( unlikely(((x & PGT_type_mask) == PGT_writable_page) !=
+ (type == PGT_writable_page)) )
+ {
+@@ -3091,13 +3097,25 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+
+ if ( unlikely(!(nx & PGT_validated)) )
+ {
+- if ( !(x & PGT_partial) )
++ /*
++ * No special validation needed for writable or shared pages. Page
++ * tables and GDT/LDT need to have their contents audited.
++ *
++ * per validate_page(), non-atomic updates are fine here.
++ */
++ if ( type == PGT_writable_page || type == PGT_shared_page )
++ page->u.inuse.type_info |= PGT_validated;
++ else
+ {
+- page->nr_validated_ptes = 0;
+- page->partial_flags = 0;
+- page->linear_pt_count = 0;
++ if ( !(x & PGT_partial) )
++ {
++ page->nr_validated_ptes = 0;
++ page->partial_flags = 0;
++ page->linear_pt_count = 0;
++ }
++
++ rc = validate_page(page, type, preemptible);
+ }
+- rc = validate_page(page, type, preemptible);
+ }
+
+ out:
+--
+2.35.1
+
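The essence of the reordering is small, even though the diff is large. A
minimal C11 sketch (types and names illustrative), showing where the safety
actions must sit relative to the cmpxchg():

    #include <stdatomic.h>
    #include <stdbool.h>

    struct page_sketch { _Atomic unsigned long type_info; };

    static bool acquire_typeref(struct page_sketch *pg,
                                unsigned long x, unsigned long nx)
    {
        /* Step 1: publish the new type/count.  From here on, concurrent
         * callers see the intended type and cannot take a stale typeref. */
        if ( !atomic_compare_exchange_strong(&pg->type_info, &x, nx) )
            return false;         /* raced: caller re-reads and retries */

        /* Step 2: only now perform the safety actions (TLB flush, shadow
         * preparation).  Doing them before step 1 is exactly the window
         * CPU B exploits in the scenario above. */
        return true;
    }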
diff --git a/0024-x86-page-Introduce-_PAGE_-constants-for-memory-types.patch b/0024-x86-page-Introduce-_PAGE_-constants-for-memory-types.patch
new file mode 100644
index 0000000..c8c2dda
--- /dev/null
+++ b/0024-x86-page-Introduce-_PAGE_-constants-for-memory-types.patch
@@ -0,0 +1,53 @@
+From 9cfd796ae05421ded8e4f70b2c55352491cfa841 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Thu, 9 Jun 2022 15:27:53 +0200
+Subject: [PATCH 24/32] x86/page: Introduce _PAGE_* constants for memory types
+
+... rather than opencoding the PAT/PCD/PWT attributes in __PAGE_HYPERVISOR_*
+constants. These are going to be needed by forthcoming logic.
+
+No functional change.
+
+This is part of XSA-402.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 1be8707c75bf4ba68447c74e1618b521dd432499
+master date: 2022-06-09 14:21:38 +0200
+---
+ xen/include/asm-x86/page.h | 12 ++++++++++--
+ 1 file changed, 10 insertions(+), 2 deletions(-)
+
+diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
+index 1d080cffbe84..2e542050f65a 100644
+--- a/xen/include/asm-x86/page.h
++++ b/xen/include/asm-x86/page.h
+@@ -331,6 +331,14 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
+
+ #define PAGE_CACHE_ATTRS (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
+
++/* Memory types, encoded under Xen's choice of MSR_PAT. */
++#define _PAGE_WB ( 0)
++#define _PAGE_WT ( _PAGE_PWT)
++#define _PAGE_UCM ( _PAGE_PCD )
++#define _PAGE_UC ( _PAGE_PCD | _PAGE_PWT)
++#define _PAGE_WC (_PAGE_PAT )
++#define _PAGE_WP (_PAGE_PAT | _PAGE_PWT)
++
+ /*
+ * Debug option: Ensure that granted mappings are not implicitly unmapped.
+ * WARNING: This will need to be disabled to run OSes that use the spare PTE
+@@ -349,8 +357,8 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
+ #define __PAGE_HYPERVISOR_RX (_PAGE_PRESENT | _PAGE_ACCESSED)
+ #define __PAGE_HYPERVISOR (__PAGE_HYPERVISOR_RX | \
+ _PAGE_DIRTY | _PAGE_RW)
+-#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_PCD)
+-#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_PCD | _PAGE_PWT)
++#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_UCM)
++#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_UC)
+ #define __PAGE_HYPERVISOR_SHSTK (__PAGE_HYPERVISOR_RO | _PAGE_DIRTY)
+
+ #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */
+--
+2.35.1
+
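Spelled out, the constants added above encode the PAT-index bits as follows
(read directly off the hunk; which memory type each index selects is a
property of Xen's MSR_PAT programming, as the new comment notes):

    PAT PCD PWT   constant     memory type
     0   0   0    _PAGE_WB     write-back
     0   0   1    _PAGE_WT     write-through
     0   1   0    _PAGE_UCM    uncacheable-minus (UC-)
     0   1   1    _PAGE_UC     uncacheable
     1   0   0    _PAGE_WC     write-combining
     1   0   1    _PAGE_WP     write-protect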
diff --git a/0025-x86-Don-t-change-the-cacheability-of-the-directmap.patch b/0025-x86-Don-t-change-the-cacheability-of-the-directmap.patch
new file mode 100644
index 0000000..582fc74
--- /dev/null
+++ b/0025-x86-Don-t-change-the-cacheability-of-the-directmap.patch
@@ -0,0 +1,223 @@
+From 74193f4292d9cfc2874866e941d9939d8f33fcef Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Thu, 9 Jun 2022 15:28:23 +0200
+Subject: [PATCH 25/32] x86: Don't change the cacheability of the directmap
+
+Changeset 55f97f49b7ce ("x86: Change cache attributes of Xen 1:1 page mappings
+in response to guest mapping requests") attempted to keep the cacheability
+consistent between different mappings of the same page.
+
+The reason wasn't described in the changelog, but it is understood to be in
+regards to a concern over machine check exceptions, owing to errata when using
+mixed cacheabilities. It did this primarily by updating Xen's mapping of the
+page in the direct map when the guest mapped a page with reduced cacheability.
+
+Unfortunately, the logic didn't actually prevent mixed cacheability from
+occurring:
+ * A guest could map a page normally, and then map the same page with
+ different cacheability; nothing prevented this.
+ * The cacheability of the directmap was always latest-takes-precedence in
+ terms of guest requests.
+ * Grant-mapped frames with lesser cacheability didn't adjust the page's
+ cacheattr settings.
+ * The map_domain_page() function still unconditionally created WB mappings,
+ irrespective of the page's cacheattr settings.
+
+Additionally, update_xen_mappings() had a bug where the alias calculation was
+wrong for mfns which were .init content, which should have been treated as
+fully guest pages, not Xen pages.
+
+Worse yet, the logic introduced a vulnerability whereby necessary
+pagetable/segdesc adjustments made by Xen in the validation logic could become
+non-coherent between the cache and main memory. The CPU could subsequently
+operate on the stale value in the cache, rather than the safe value in main
+memory.
+
+The directmap contains primarily mappings of RAM. PAT/MTRR conflict
+resolution is asymmetric, and generally for MTRR=WB ranges, PAT of lesser
+cacheability resolves to being coherent. The special case is WC mappings,
+which are non-coherent against MTRR=WB regions (except for fully-coherent
+CPUs).
+
+Xen must not have any WC cacheability in the directmap, to prevent Xen's
+actions from creating non-coherency. (Guest actions creating non-coherency are
+dealt with in subsequent patches.) As all memory types for MTRR=WB ranges
+inter-operate coherently, leave Xen's directmap mappings as WB.
+
+Only PV guests with access to devices can use reduced-cacheability mappings to
+begin with, and they're trusted not to mount DoSs against the system anyway.
+
+Drop PGC_cacheattr_{base,mask} entirely, and the logic to manipulate them.
+Shift the later PGC_* constants up, to gain 3 extra bits in the main reference
+count. Retain the check in get_page_from_l1e() for special_pages() because a
+guest has no business using reduced cacheability on these.
+
+This reverts changeset 55f97f49b7ce6c3520c555d19caac6cf3f9a5df0
+
+This is CVE-2022-26363, part of XSA-402.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+master commit: ae09597da34aee6bc5b76475c5eea6994457e854
+master date: 2022-06-09 14:22:08 +0200
+---
+ xen/arch/x86/mm.c | 84 ++++------------------------------------
+ xen/include/asm-x86/mm.h | 23 +++++------
+ 2 files changed, 17 insertions(+), 90 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index c6429b0f749a..ab32d13a1a0d 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -783,28 +783,6 @@ bool is_iomem_page(mfn_t mfn)
+ return (page_get_owner(page) == dom_io);
+ }
+
+-static int update_xen_mappings(unsigned long mfn, unsigned int cacheattr)
+-{
+- int err = 0;
+- bool alias = mfn >= PFN_DOWN(xen_phys_start) &&
+- mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START);
+- unsigned long xen_va =
+- XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
+-
+- if ( boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) )
+- return 0;
+-
+- if ( unlikely(alias) && cacheattr )
+- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, 0);
+- if ( !err )
+- err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), _mfn(mfn), 1,
+- PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
+- if ( unlikely(alias) && !cacheattr && !err )
+- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, PAGE_HYPERVISOR);
+-
+- return err;
+-}
+-
+ #ifndef NDEBUG
+ struct mmio_emul_range_ctxt {
+ const struct domain *d;
+@@ -1009,47 +987,14 @@ get_page_from_l1e(
+ goto could_not_pin;
+ }
+
+- if ( pte_flags_to_cacheattr(l1f) !=
+- ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
++ if ( (l1f & PAGE_CACHE_ATTRS) != _PAGE_WB && is_special_page(page) )
+ {
+- unsigned long x, nx, y = page->count_info;
+- unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
+- int err;
+-
+- if ( is_special_page(page) )
+- {
+- if ( write )
+- put_page_type(page);
+- put_page(page);
+- gdprintk(XENLOG_WARNING,
+- "Attempt to change cache attributes of Xen heap page\n");
+- return -EACCES;
+- }
+-
+- do {
+- x = y;
+- nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
+- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
+-
+- err = update_xen_mappings(mfn, cacheattr);
+- if ( unlikely(err) )
+- {
+- cacheattr = y & PGC_cacheattr_mask;
+- do {
+- x = y;
+- nx = (x & ~PGC_cacheattr_mask) | cacheattr;
+- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
+-
+- if ( write )
+- put_page_type(page);
+- put_page(page);
+-
+- gdprintk(XENLOG_WARNING, "Error updating mappings for mfn %" PRI_mfn
+- " (pfn %" PRI_pfn ", from L1 entry %" PRIpte ") for d%d\n",
+- mfn, get_gpfn_from_mfn(mfn),
+- l1e_get_intpte(l1e), l1e_owner->domain_id);
+- return err;
+- }
++ if ( write )
++ put_page_type(page);
++ put_page(page);
++ gdprintk(XENLOG_WARNING,
++ "Attempt to change cache attributes of Xen heap page\n");
++ return -EACCES;
+ }
+
+ return 0;
+@@ -2467,24 +2412,9 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
+ */
+ static int cleanup_page_mappings(struct page_info *page)
+ {
+- unsigned int cacheattr =
+- (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
+ int rc = 0;
+ unsigned long mfn = mfn_x(page_to_mfn(page));
+
+- /*
+- * If we've modified xen mappings as a result of guest cache
+- * attributes, restore them to the "normal" state.
+- */
+- if ( unlikely(cacheattr) )
+- {
+- page->count_info &= ~PGC_cacheattr_mask;
+-
+- BUG_ON(is_special_page(page));
+-
+- rc = update_xen_mappings(mfn, 0);
+- }
+-
+ /*
+ * If this may be in a PV domain's IOMMU, remove it.
+ *
+diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
+index cb9052749963..8a9a43bb0a9d 100644
+--- a/xen/include/asm-x86/mm.h
++++ b/xen/include/asm-x86/mm.h
+@@ -69,25 +69,22 @@
+ /* Set when is using a page as a page table */
+ #define _PGC_page_table PG_shift(3)
+ #define PGC_page_table PG_mask(1, 3)
+- /* 3-bit PAT/PCD/PWT cache-attribute hint. */
+-#define PGC_cacheattr_base PG_shift(6)
+-#define PGC_cacheattr_mask PG_mask(7, 6)
+ /* Page is broken? */
+-#define _PGC_broken PG_shift(7)
+-#define PGC_broken PG_mask(1, 7)
++#define _PGC_broken PG_shift(4)
++#define PGC_broken PG_mask(1, 4)
+ /* Mutually-exclusive page states: { inuse, offlining, offlined, free }. */
+-#define PGC_state PG_mask(3, 9)
+-#define PGC_state_inuse PG_mask(0, 9)
+-#define PGC_state_offlining PG_mask(1, 9)
+-#define PGC_state_offlined PG_mask(2, 9)
+-#define PGC_state_free PG_mask(3, 9)
++#define PGC_state PG_mask(3, 6)
++#define PGC_state_inuse PG_mask(0, 6)
++#define PGC_state_offlining PG_mask(1, 6)
++#define PGC_state_offlined PG_mask(2, 6)
++#define PGC_state_free PG_mask(3, 6)
+ #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st)
+ /* Page is not reference counted (see below for caveats) */
+-#define _PGC_extra PG_shift(10)
+-#define PGC_extra PG_mask(1, 10)
++#define _PGC_extra PG_shift(7)
++#define PGC_extra PG_mask(1, 7)
+
+ /* Count of references to this frame. */
+-#define PGC_count_width PG_shift(10)
++#define PGC_count_width PG_shift(7)
+ #define PGC_count_mask ((1UL<<PGC_count_width)-1)
+
+ /*
+--
+2.35.1
+
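Summarising the coherency argument from the description, for an MTRR=WB
range (i.e. ordinary RAM) aliased by Xen's WB directmap:

    PAT type of the other mapping   coherent with the WB alias?
    WB                              yes (same type)
    WT, WP                          yes (lesser cacheability resolves coherently)
    UC-, UC                         yes (lesser cacheability resolves coherently)
    WC                              no, except on fully-coherent CPUs

WC is therefore the one type Xen must never introduce into the directmap
itself, which is what this revert guarantees.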
diff --git a/0026-x86-Split-cache_flush-out-of-cache_writeback.patch b/0026-x86-Split-cache_flush-out-of-cache_writeback.patch
new file mode 100644
index 0000000..ffd8d7c
--- /dev/null
+++ b/0026-x86-Split-cache_flush-out-of-cache_writeback.patch
@@ -0,0 +1,294 @@
+From 8eafa2d871ae51d461256e4a14175e24df330c70 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Thu, 9 Jun 2022 15:28:48 +0200
+Subject: [PATCH 26/32] x86: Split cache_flush() out of cache_writeback()
+
+Subsequent changes will want a fully flushing version.
+
+Use the new helper rather than opencoding it in flush_area_local(). This
+resolves an outstanding issue where the conditional sfence is on the wrong
+side of the clflushopt loop. clflushopt is ordered with respect to older
+stores, not to younger stores.
+
+Rename gnttab_cache_flush()'s helper to avoid colliding in name.
+grant_table.c can see the prototype from cache.h so the build fails
+otherwise.
+
+This is part of XSA-402.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 9a67ffee3371506e1cbfdfff5b90658d4828f6a2
+master date: 2022-06-09 14:22:38 +0200
+---
+ xen/arch/x86/flushtlb.c | 84 ++++++++++++++++++++++++---
+ xen/common/grant_table.c | 4 +-
+ xen/drivers/passthrough/vtd/extern.h | 1 -
+ xen/drivers/passthrough/vtd/iommu.c | 53 +----------------
+ xen/drivers/passthrough/vtd/x86/vtd.c | 5 --
+ xen/include/asm-x86/cache.h | 7 +++
+ 6 files changed, 88 insertions(+), 66 deletions(-)
+
+diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
+index 25798df50f54..0c912b8669f8 100644
+--- a/xen/arch/x86/flushtlb.c
++++ b/xen/arch/x86/flushtlb.c
+@@ -234,7 +234,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
+ if ( flags & FLUSH_CACHE )
+ {
+ const struct cpuinfo_x86 *c = &current_cpu_data;
+- unsigned long i, sz = 0;
++ unsigned long sz = 0;
+
+ if ( order < (BITS_PER_LONG - PAGE_SHIFT) )
+ sz = 1UL << (order + PAGE_SHIFT);
+@@ -244,13 +244,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
+ c->x86_clflush_size && c->x86_cache_size && sz &&
+ ((sz >> 10) < c->x86_cache_size) )
+ {
+- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
+- for ( i = 0; i < sz; i += c->x86_clflush_size )
+- alternative_input(".byte " __stringify(NOP_DS_PREFIX) ";"
+- " clflush %0",
+- "data16 clflush %0", /* clflushopt */
+- X86_FEATURE_CLFLUSHOPT,
+- "m" (((const char *)va)[i]));
++ cache_flush(va, sz);
+ flags &= ~FLUSH_CACHE;
+ }
+ else
+@@ -265,6 +259,80 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
+ return flags;
+ }
+
++void cache_flush(const void *addr, unsigned int size)
++{
++ /*
++ * This function may be called before current_cpu_data is established.
++ * Hence a fallback is needed to prevent the loop below becoming infinite.
++ */
++ unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16;
++ const void *end = addr + size;
++
++ addr -= (unsigned long)addr & (clflush_size - 1);
++ for ( ; addr < end; addr += clflush_size )
++ {
++ /*
++ * Note regarding the "ds" prefix use: it's faster to do a clflush
++ * + prefix than a clflush + nop, and hence the prefix is added instead
++ * of letting the alternative framework fill the gap by appending nops.
++ */
++ alternative_io("ds; clflush %[p]",
++ "data16 clflush %[p]", /* clflushopt */
++ X86_FEATURE_CLFLUSHOPT,
++ /* no outputs */,
++ [p] "m" (*(const char *)(addr)));
++ }
++
++ alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
++}
++
++void cache_writeback(const void *addr, unsigned int size)
++{
++ unsigned int clflush_size;
++ const void *end = addr + size;
++
++ /* Fall back to CLFLUSH{,OPT} when CLWB isn't available. */
++ if ( !boot_cpu_has(X86_FEATURE_CLWB) )
++ return cache_flush(addr, size);
++
++ /*
++ * This function may be called before current_cpu_data is established.
++ * Hence a fallback is needed to prevent the loop below becoming infinite.
++ */
++ clflush_size = current_cpu_data.x86_clflush_size ?: 16;
++ addr -= (unsigned long)addr & (clflush_size - 1);
++ for ( ; addr < end; addr += clflush_size )
++ {
++/*
++ * The arguments to a macro must not include preprocessor directives. Doing so
++ * results in undefined behavior, so we have to create some defines here in
++ * order to avoid it.
++ */
++#if defined(HAVE_AS_CLWB)
++# define CLWB_ENCODING "clwb %[p]"
++#elif defined(HAVE_AS_XSAVEOPT)
++# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
++#else
++# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
++#endif
++
++#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
++#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
++# define INPUT BASE_INPUT
++#else
++# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
++#endif
++
++ asm volatile (CLWB_ENCODING :: INPUT(addr));
++
++#undef INPUT
++#undef BASE_INPUT
++#undef CLWB_ENCODING
++ }
++
++ asm volatile ("sfence" ::: "memory");
++}
++
+ unsigned int guest_flush_tlb_flags(const struct domain *d)
+ {
+ bool shadow = paging_mode_shadow(d);
+diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
+index 66f8ce71741c..4c742cd8fe81 100644
+--- a/xen/common/grant_table.c
++++ b/xen/common/grant_table.c
+@@ -3431,7 +3431,7 @@ gnttab_swap_grant_ref(XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) uop,
+ return 0;
+ }
+
+-static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref)
++static int _cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref)
+ {
+ struct domain *d, *owner;
+ struct page_info *page;
+@@ -3525,7 +3525,7 @@ gnttab_cache_flush(XEN_GUEST_HANDLE_PARAM(gnttab_cache_flush_t) uop,
+ return -EFAULT;
+ for ( ; ; )
+ {
+- int ret = cache_flush(&op, cur_ref);
++ int ret = _cache_flush(&op, cur_ref);
+
+ if ( ret < 0 )
+ return ret;
+diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
+index 01e010a10d61..401079299725 100644
+--- a/xen/drivers/passthrough/vtd/extern.h
++++ b/xen/drivers/passthrough/vtd/extern.h
+@@ -76,7 +76,6 @@ int __must_check qinval_device_iotlb_sync(struct vtd_iommu *iommu,
+ struct pci_dev *pdev,
+ u16 did, u16 size, u64 addr);
+
+-unsigned int get_cache_line_size(void);
+ void flush_all_cache(void);
+
+ uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node);
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index 8975c1de61bc..bc377c9bcfa4 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -31,6 +31,7 @@
+ #include <xen/pci.h>
+ #include <xen/pci_regs.h>
+ #include <xen/keyhandler.h>
++#include <asm/cache.h>
+ #include <asm/msi.h>
+ #include <asm/nops.h>
+ #include <asm/irq.h>
+@@ -206,54 +207,6 @@ static void check_cleanup_domid_map(const struct domain *d,
+ }
+ }
+
+-static void sync_cache(const void *addr, unsigned int size)
+-{
+- static unsigned long clflush_size = 0;
+- const void *end = addr + size;
+-
+- if ( clflush_size == 0 )
+- clflush_size = get_cache_line_size();
+-
+- addr -= (unsigned long)addr & (clflush_size - 1);
+- for ( ; addr < end; addr += clflush_size )
+-/*
+- * The arguments to a macro must not include preprocessor directives. Doing so
+- * results in undefined behavior, so we have to create some defines here in
+- * order to avoid it.
+- */
+-#if defined(HAVE_AS_CLWB)
+-# define CLWB_ENCODING "clwb %[p]"
+-#elif defined(HAVE_AS_XSAVEOPT)
+-# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
+-#else
+-# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
+-#endif
+-
+-#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
+-#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
+-# define INPUT BASE_INPUT
+-#else
+-# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
+-#endif
+- /*
+- * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush
+- * + prefix than a clflush + nop, and hence the prefix is added instead
+- * of letting the alternative framework fill the gap by appending nops.
+- */
+- alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]",
+- "data16 clflush %[p]", /* clflushopt */
+- X86_FEATURE_CLFLUSHOPT,
+- CLWB_ENCODING,
+- X86_FEATURE_CLWB, /* no outputs */,
+- INPUT(addr));
+-#undef INPUT
+-#undef BASE_INPUT
+-#undef CLWB_ENCODING
+-
+- alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT,
+- "sfence", X86_FEATURE_CLWB);
+-}
+-
+ /* Allocate page table, return its machine address */
+ uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node)
+ {
+@@ -273,7 +226,7 @@ uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node)
+ clear_page(vaddr);
+
+ if ( (iommu_ops.init ? &iommu_ops : &vtd_ops)->sync_cache )
+- sync_cache(vaddr, PAGE_SIZE);
++ cache_writeback(vaddr, PAGE_SIZE);
+ unmap_domain_page(vaddr);
+ cur_pg++;
+ }
+@@ -1305,7 +1258,7 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
+ iommu->nr_pt_levels = agaw_to_level(agaw);
+
+ if ( !ecap_coherent(iommu->ecap) )
+- vtd_ops.sync_cache = sync_cache;
++ vtd_ops.sync_cache = cache_writeback;
+
+ /* allocate domain id bitmap */
+ iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom));
+diff --git a/xen/drivers/passthrough/vtd/x86/vtd.c b/xen/drivers/passthrough/vtd/x86/vtd.c
+index 6681dccd6970..55f0faa521cb 100644
+--- a/xen/drivers/passthrough/vtd/x86/vtd.c
++++ b/xen/drivers/passthrough/vtd/x86/vtd.c
+@@ -47,11 +47,6 @@ void unmap_vtd_domain_page(const void *va)
+ unmap_domain_page(va);
+ }
+
+-unsigned int get_cache_line_size(void)
+-{
+- return ((cpuid_ebx(1) >> 8) & 0xff) * 8;
+-}
+-
+ void flush_all_cache()
+ {
+ wbinvd();
+diff --git a/xen/include/asm-x86/cache.h b/xen/include/asm-x86/cache.h
+index 1f7173d8c72c..e4770efb22b9 100644
+--- a/xen/include/asm-x86/cache.h
++++ b/xen/include/asm-x86/cache.h
+@@ -11,4 +11,11 @@
+
+ #define __read_mostly __section(".data.read_mostly")
+
++#ifndef __ASSEMBLY__
++
++void cache_flush(const void *addr, unsigned int size);
++void cache_writeback(const void *addr, unsigned int size);
++
++#endif
++
+ #endif
+--
+2.35.1
+
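The ordering point being fixed can be shown in isolation. A sketch assuming
CLFLUSHOPT is available and known to the assembler (the real code uses the
alternatives framework to pick CLFLUSH or CLFLUSHOPT at boot): the SFENCE
belongs after the loop, because CLFLUSHOPT is only ordered against older
stores, not younger ones.

    #include <stdint.h>

    static void cache_flush_sketch(const char *addr, unsigned int size,
                                   unsigned int clflush_size)
    {
        const char *end = addr + size;

        /* Round down to the start of the first cache line. */
        addr = (const char *)((uintptr_t)addr & ~(uintptr_t)(clflush_size - 1));
        for ( ; addr < end; addr += clflush_size )
            __asm__ __volatile__("clflushopt %0" :: "m" (*addr));

        /* Order the flushes ahead of whatever the caller does next. */
        __asm__ __volatile__("sfence" ::: "memory");
    }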
diff --git a/0027-x86-amd-Work-around-CLFLUSH-ordering-on-older-parts.patch b/0027-x86-amd-Work-around-CLFLUSH-ordering-on-older-parts.patch
new file mode 100644
index 0000000..a3ab379
--- /dev/null
+++ b/0027-x86-amd-Work-around-CLFLUSH-ordering-on-older-parts.patch
@@ -0,0 +1,95 @@
+From c4815be949aae6583a9a22897beb96b095b4f1a2 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Thu, 9 Jun 2022 15:29:13 +0200
+Subject: [PATCH 27/32] x86/amd: Work around CLFLUSH ordering on older parts
+
+On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything,
+including reads and writes to the address, and LFENCE/SFENCE instructions.
+
+This creates a multitude of problematic corner cases, laid out in the manual.
+Arrange to use MFENCE on both sides of the CLFLUSH to force proper ordering.
+
+This is part of XSA-402.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 062868a5a8b428b85db589fa9a6d6e43969ffeb9
+master date: 2022-06-09 14:23:07 +0200
+---
+ xen/arch/x86/cpu/amd.c | 8 ++++++++
+ xen/arch/x86/flushtlb.c | 13 ++++++++++++-
+ xen/include/asm-x86/cpufeatures.h | 1 +
+ 3 files changed, 21 insertions(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
+index a8e37dbb1f5c..b3b9a0df5fed 100644
+--- a/xen/arch/x86/cpu/amd.c
++++ b/xen/arch/x86/cpu/amd.c
+@@ -812,6 +812,14 @@ static void init_amd(struct cpuinfo_x86 *c)
+ if (!cpu_has_lfence_dispatch)
+ __set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability);
+
++ /*
++ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with
++ * everything, including reads and writes to address, and
++ * LFENCE/SFENCE instructions.
++ */
++ if (!cpu_has_clflushopt)
++ setup_force_cpu_cap(X86_BUG_CLFLUSH_MFENCE);
++
+ switch(c->x86)
+ {
+ case 0xf ... 0x11:
+diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
+index 0c912b8669f8..dcbb4064012e 100644
+--- a/xen/arch/x86/flushtlb.c
++++ b/xen/arch/x86/flushtlb.c
+@@ -259,6 +259,13 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
+ return flags;
+ }
+
++/*
++ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything,
++ * including reads and writes to address, and LFENCE/SFENCE instructions.
++ *
++ * This function only works safely after alternatives have run. Luckily, at
++ * the time of writing, we don't flush the caches that early.
++ */
+ void cache_flush(const void *addr, unsigned int size)
+ {
+ /*
+@@ -268,6 +275,8 @@ void cache_flush(const void *addr, unsigned int size)
+ unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16;
+ const void *end = addr + size;
+
++ alternative("", "mfence", X86_BUG_CLFLUSH_MFENCE);
++
+ addr -= (unsigned long)addr & (clflush_size - 1);
+ for ( ; addr < end; addr += clflush_size )
+ {
+@@ -283,7 +292,9 @@ void cache_flush(const void *addr, unsigned int size)
+ [p] "m" (*(const char *)(addr)));
+ }
+
+- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
++ alternative_2("",
++ "sfence", X86_FEATURE_CLFLUSHOPT,
++ "mfence", X86_BUG_CLFLUSH_MFENCE);
+ }
+
+ void cache_writeback(const void *addr, unsigned int size)
+diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
+index 7413febd7ad8..ff3157d52d13 100644
+--- a/xen/include/asm-x86/cpufeatures.h
++++ b/xen/include/asm-x86/cpufeatures.h
+@@ -47,6 +47,7 @@ XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch
+
+ #define X86_BUG_FPU_PTRS X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */
+ #define X86_BUG_NULL_SEG X86_BUG( 1) /* NULL-ing a selector preserves the base and limit. */
++#define X86_BUG_CLFLUSH_MFENCE X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */
+
+ /* Total number of capability words, inc synth and bug words. */
+ #define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */
+--
+2.35.1
+
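The shape of the workaround, in isolation: on affected AMD parts the CLFLUSH
loop is bracketed by MFENCE on both sides, since there CLFLUSH is not ordered
even by LFENCE/SFENCE or by accesses to the flushed line. A hedged sketch
(the real code patches the fences in via the alternatives framework):

    static void clflush_bracketed_sketch(const char *addr, const char *end,
                                         unsigned int clflush_size)
    {
        __asm__ __volatile__("mfence" ::: "memory");

        for ( ; addr < end; addr += clflush_size )
            __asm__ __volatile__("clflush %0" :: "m" (*addr));

        __asm__ __volatile__("mfence" ::: "memory");
    }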
diff --git a/0028-x86-pv-Track-and-flush-non-coherent-mappings-of-RAM.patch b/0028-x86-pv-Track-and-flush-non-coherent-mappings-of-RAM.patch
new file mode 100644
index 0000000..66cd741
--- /dev/null
+++ b/0028-x86-pv-Track-and-flush-non-coherent-mappings-of-RAM.patch
@@ -0,0 +1,160 @@
+From dc020d8d1ba420e2dd0e7a40f5045db897f3c4f4 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Thu, 9 Jun 2022 15:29:38 +0200
+Subject: [PATCH 28/32] x86/pv: Track and flush non-coherent mappings of RAM
+
+There are legitimate uses of WC mappings of RAM, e.g. for DMA buffers with
+devices that make non-coherent writes. The Linux sound subsystem makes
+extensive use of this technique.
+
+For such usecases, the guest's DMA buffer is mapped and consistently used as
+WC, and Xen doesn't interact with the buffer.
+
+However, a mischievous guest can use WC mappings to deliberately create
+non-coherency between the cache and RAM, and use this to trick Xen into
+validating a pagetable which isn't actually safe.
+
+Allocate a new PGT_non_coherent to track the non-coherency of mappings. Set
+it whenever a non-coherent writeable mapping is created. If the page is used
+as anything other than PGT_writable_page, force a cache flush before
+validation. Also force a cache flush before the page is returned to the heap.
+
+This is CVE-2022-26364, part of XSA-402.
+
+Reported-by: Jann Horn <jannh@google.com>
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: c1c9cae3a9633054b177c5de21ad7268162b2f2c
+master date: 2022-06-09 14:23:37 +0200
+---
+ xen/arch/x86/mm.c | 38 +++++++++++++++++++++++++++++++++++
+ xen/arch/x86/pv/grant_table.c | 21 +++++++++++++++++++
+ xen/include/asm-x86/mm.h | 6 +++++-
+ 3 files changed, 64 insertions(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index ab32d13a1a0d..bab9624fabb7 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -997,6 +997,15 @@ get_page_from_l1e(
+ return -EACCES;
+ }
+
++ /*
++ * Track writeable non-coherent mappings to RAM pages, to trigger a cache
++ * flush later if the target is used as anything but a PGT_writeable page.
++ * We care about all writeable mappings, including foreign mappings.
++ */
++ if ( !boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) &&
++ (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW) )
++ set_bit(_PGT_non_coherent, &page->u.inuse.type_info);
++
+ return 0;
+
+ could_not_pin:
+@@ -2454,6 +2463,19 @@ static int cleanup_page_mappings(struct page_info *page)
+ }
+ }
+
++ /*
++ * Flush the cache if there were previously non-coherent writeable
++ * mappings of this page. This forces the page to be coherent before it
++ * is freed back to the heap.
++ */
++ if ( __test_and_clear_bit(_PGT_non_coherent, &page->u.inuse.type_info) )
++ {
++ void *addr = __map_domain_page(page);
++
++ cache_flush(addr, PAGE_SIZE);
++ unmap_domain_page(addr);
++ }
++
+ return rc;
+ }
+
+@@ -3027,6 +3049,22 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+
+ if ( unlikely(!(nx & PGT_validated)) )
+ {
++ /*
++ * Flush the cache if there were previously non-coherent mappings of
++ * this page, and we're trying to use it as anything other than a
++ * writeable page. This forces the page to be coherent before we
++ * validate its contents for safety.
++ */
++ if ( (nx & PGT_non_coherent) && type != PGT_writable_page )
++ {
++ void *addr = __map_domain_page(page);
++
++ cache_flush(addr, PAGE_SIZE);
++ unmap_domain_page(addr);
++
++ page->u.inuse.type_info &= ~PGT_non_coherent;
++ }
++
+ /*
+ * No special validation needed for writable or shared pages. Page
+ * tables and GDT/LDT need to have their contents audited.
+diff --git a/xen/arch/x86/pv/grant_table.c b/xen/arch/x86/pv/grant_table.c
+index 0325618c9883..81c72e61ed55 100644
+--- a/xen/arch/x86/pv/grant_table.c
++++ b/xen/arch/x86/pv/grant_table.c
+@@ -109,7 +109,17 @@ int create_grant_pv_mapping(uint64_t addr, mfn_t frame,
+
+ ol1e = *pl1e;
+ if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
++ {
++ /*
++ * We always create mappings in this path. However, our caller,
++ * map_grant_ref(), only passes potentially non-zero cache_flags for
++ * MMIO frames, so this path doesn't create non-coherent mappings of
++ * RAM frames and there's no need to calculate PGT_non_coherent.
++ */
++ ASSERT(!cache_flags || is_iomem_page(frame));
++
+ rc = GNTST_okay;
++ }
+
+ out_unlock:
+ page_unlock(page);
+@@ -294,7 +304,18 @@ int replace_grant_pv_mapping(uint64_t addr, mfn_t frame,
+ l1e_get_flags(ol1e), addr, grant_pte_flags);
+
+ if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
++ {
++ /*
++ * Generally, replace_grant_pv_mapping() is used to destroy mappings
++ * (nl1e = l1e_empty()), but it can be a present mapping on the
++ * GNTABOP_unmap_and_replace path.
++ *
++ * In such cases, the PTE is fully transplanted from its old location
++ * via steal_linear_addr(), so we need not perform PGT_non_coherent
++ * checking here.
++ */
+ rc = GNTST_okay;
++ }
+
+ out_unlock:
+ page_unlock(page);
+diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
+index 8a9a43bb0a9d..7464167ae192 100644
+--- a/xen/include/asm-x86/mm.h
++++ b/xen/include/asm-x86/mm.h
+@@ -53,8 +53,12 @@
+ #define _PGT_partial PG_shift(8)
+ #define PGT_partial PG_mask(1, 8)
+
++/* Has this page been mapped writeable with a non-coherent memory type? */
++#define _PGT_non_coherent PG_shift(9)
++#define PGT_non_coherent PG_mask(1, 9)
++
+ /* Count of uses of this frame as its current type. */
+-#define PGT_count_width PG_shift(8)
++#define PGT_count_width PG_shift(9)
+ #define PGT_count_mask ((1UL<<PGT_count_width)-1)
+
+ /* Are the 'type mask' bits identical? */
+--
+2.35.1
+
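The tracking scheme reduces to two operations: set a sticky bit when a
writeable non-coherent mapping is created, and flush-and-clear it before the
page is used as anything else or freed. A stand-alone C11 sketch (the bit
position and names are illustrative; cache_flush() is the helper split out
in patch 26):

    #include <stdatomic.h>

    void cache_flush(const void *addr, unsigned int size);

    #define NON_COHERENT_BIT (1ul << 62)      /* illustrative position */

    struct page_sketch { _Atomic unsigned long type_info; };

    /* A writeable WC mapping of the page has just been created. */
    static void note_non_coherent_map(struct page_sketch *pg)
    {
        atomic_fetch_or(&pg->type_info, NON_COHERENT_BIT);
    }

    /* About to validate the page as non-writable, or to free it: force it
     * coherent first if any non-coherent mapping was ever made. */
    static void make_coherent(struct page_sketch *pg,
                              void *mapping, unsigned int bytes)
    {
        if ( atomic_fetch_and(&pg->type_info, ~NON_COHERENT_BIT) &
             NON_COHERENT_BIT )
            cache_flush(mapping, bytes);
    }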
diff --git a/0029-x86-mm-account-for-PGT_pae_xen_l2-in-recently-added-.patch b/0029-x86-mm-account-for-PGT_pae_xen_l2-in-recently-added-.patch
new file mode 100644
index 0000000..0076984
--- /dev/null
+++ b/0029-x86-mm-account-for-PGT_pae_xen_l2-in-recently-added-.patch
@@ -0,0 +1,37 @@
+From 0b4e62847c5af1a59eea8d17093feccd550d1c26 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Fri, 10 Jun 2022 10:28:28 +0200
+Subject: [PATCH 29/32] x86/mm: account for PGT_pae_xen_l2 in recently added
+ assertion
+
+While PGT_pae_xen_l2 will be zapped once the type refcount of an L2 page
+reaches zero, it'll be retained as long as the type refcount is non-
+zero. Hence any checking against the requested type needs to either zap
+the bit from the type or include it in the used mask.
+
+Fixes: 9186e96b199e ("x86/pv: Clean up _get_page_type()")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: c2095ac76be0f4a1940346c9ffb49fb967345060
+master date: 2022-06-10 10:21:06 +0200
+---
+ xen/arch/x86/mm.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index bab9624fabb7..c1b9a3bb102a 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -2928,7 +2928,8 @@ static int _get_page_type(struct page_info *page, unsigned long type,
+ * The page is in one of two states (depending on PGT_partial),
+ * and should have exactly one reference.
+ */
+- ASSERT((x & (PGT_type_mask | PGT_count_mask)) == (type | 1));
++ ASSERT((x & (PGT_type_mask | PGT_pae_xen_l2 | PGT_count_mask)) ==
++ (type | 1));
+
+ if ( !(x & PGT_partial) )
+ {
+--
+2.35.1
+
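Concretely, consider an L2 pinned as PGT_l2_page_table | PGT_pae_xen_l2 and
left validate-locked (count 1, PGT_validated clear), with a caller requesting
that same type:

    x    = PGT_l2_page_table | PGT_pae_xen_l2 | 1
    type = PGT_l2_page_table | PGT_pae_xen_l2

    old: x & (PGT_type_mask | PGT_count_mask)
           = PGT_l2_page_table | 1
         type | 1
           = PGT_l2_page_table | PGT_pae_xen_l2 | 1    -> spurious failure

    new: x & (PGT_type_mask | PGT_pae_xen_l2 | PGT_count_mask)
           = PGT_l2_page_table | PGT_pae_xen_l2 | 1    -> matches type | 1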
diff --git a/0030-x86-spec-ctrl-Make-VERW-flushing-runtime-conditional.patch b/0030-x86-spec-ctrl-Make-VERW-flushing-runtime-conditional.patch
new file mode 100644
index 0000000..8556452
--- /dev/null
+++ b/0030-x86-spec-ctrl-Make-VERW-flushing-runtime-conditional.patch
@@ -0,0 +1,258 @@
+From 0e80f9f61168d4e4f008da75762cee0118f802ed Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 13 Jun 2022 16:19:01 +0100
+Subject: [PATCH 30/32] x86/spec-ctrl: Make VERW flushing runtime conditional
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Currently, VERW flushing to mitigate MDS is boot time conditional per domain
+type. However, to provide mitigations for DRPW (CVE-2022-21166), we need to
+conditionally use VERW based on the trustworthiness of the guest, and the
+devices passed through.
+
+Remove the PV/HVM alternatives and instead issue a VERW on the return-to-guest
+path depending on the SCF_verw bit in cpuinfo spec_ctrl_flags.
+
+Introduce spec_ctrl_init_domain() and d->arch.verw to calculate the VERW
+disposition at domain creation time, and context switch the SCF_verw bit.
+
+For now, VERW flushing is used and controlled exactly as before, but later
+patches will add per-domain cases too.
+
+No change in behaviour.
+
+This is part of XSA-404.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+(cherry picked from commit e06b95c1d44ab80da255219fc9f1e2fc423edcb6)
+---
+ docs/misc/xen-command-line.pandoc | 5 ++---
+ xen/arch/x86/domain.c | 12 ++++++++++--
+ xen/arch/x86/hvm/vmx/entry.S | 2 +-
+ xen/arch/x86/spec_ctrl.c | 30 +++++++++++++++++------------
+ xen/include/asm-x86/cpufeatures.h | 3 +--
+ xen/include/asm-x86/domain.h | 3 +++
+ xen/include/asm-x86/spec_ctrl.h | 2 ++
+ xen/include/asm-x86/spec_ctrl_asm.h | 16 +++++++++++++--
+ 8 files changed, 51 insertions(+), 22 deletions(-)
+
+diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
+index 1d08fb7e9aa6..d5cb09f86541 100644
+--- a/docs/misc/xen-command-line.pandoc
++++ b/docs/misc/xen-command-line.pandoc
+@@ -2258,9 +2258,8 @@ in place for guests to use.
+ Use of a positive boolean value for either of these options is invalid.
+
+ The booleans `pv=`, `hvm=`, `msr-sc=`, `rsb=` and `md-clear=` offer fine
+-grained control over the alternative blocks used by Xen. These impact Xen's
+-ability to protect itself, and Xen's ability to virtualise support for guests
+-to use.
++grained control over the primitives used by Xen. These impact Xen's ability to
++protect itself, and Xen's ability to virtualise support for guests to use.
+
+ * `pv=` and `hvm=` offer control over all suboptions for PV and HVM guests
+ respectively.
+diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
+index ef1812dc1402..1fe6644a71ae 100644
+--- a/xen/arch/x86/domain.c
++++ b/xen/arch/x86/domain.c
+@@ -863,6 +863,8 @@ int arch_domain_create(struct domain *d,
+
+ d->arch.msr_relaxed = config->arch.misc_flags & XEN_X86_MSR_RELAXED;
+
++ spec_ctrl_init_domain(d);
++
+ return 0;
+
+ fail:
+@@ -2017,14 +2019,15 @@ static void __context_switch(void)
+ void context_switch(struct vcpu *prev, struct vcpu *next)
+ {
+ unsigned int cpu = smp_processor_id();
++ struct cpu_info *info = get_cpu_info();
+ const struct domain *prevd = prev->domain, *nextd = next->domain;
+ unsigned int dirty_cpu = read_atomic(&next->dirty_cpu);
+
+ ASSERT(prev != next);
+ ASSERT(local_irq_is_enabled());
+
+- get_cpu_info()->use_pv_cr3 = false;
+- get_cpu_info()->xen_cr3 = 0;
++ info->use_pv_cr3 = false;
++ info->xen_cr3 = 0;
+
+ if ( unlikely(dirty_cpu != cpu) && dirty_cpu != VCPU_CPU_CLEAN )
+ {
+@@ -2088,6 +2091,11 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
+ *last_id = next_id;
+ }
+ }
++
++ /* Update the top-of-stack block with the VERW disposition. */
++ info->spec_ctrl_flags &= ~SCF_verw;
++ if ( nextd->arch.verw )
++ info->spec_ctrl_flags |= SCF_verw;
+ }
+
+ sched_context_switched(prev, next);
+diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S
+index 49651f3c435a..5f5de45a1309 100644
+--- a/xen/arch/x86/hvm/vmx/entry.S
++++ b/xen/arch/x86/hvm/vmx/entry.S
+@@ -87,7 +87,7 @@ UNLIKELY_END(realmode)
+
+ /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
+ /* SPEC_CTRL_EXIT_TO_VMX Req: %rsp=regs/cpuinfo Clob: */
+- ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)), X86_FEATURE_SC_VERW_HVM
++ DO_SPEC_CTRL_COND_VERW
+
+ mov VCPU_hvm_guest_cr2(%rbx),%rax
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index c19464da70ce..21730aa03071 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -36,8 +36,8 @@ static bool __initdata opt_msr_sc_pv = true;
+ static bool __initdata opt_msr_sc_hvm = true;
+ static int8_t __initdata opt_rsb_pv = -1;
+ static bool __initdata opt_rsb_hvm = true;
+-static int8_t __initdata opt_md_clear_pv = -1;
+-static int8_t __initdata opt_md_clear_hvm = -1;
++static int8_t __read_mostly opt_md_clear_pv = -1;
++static int8_t __read_mostly opt_md_clear_hvm = -1;
+
+ /* Cmdline controls for Xen's speculative settings. */
+ static enum ind_thunk {
+@@ -932,6 +932,13 @@ static __init void mds_calculations(uint64_t caps)
+ }
+ }
+
++void spec_ctrl_init_domain(struct domain *d)
++{
++ bool pv = is_pv_domain(d);
++
++ d->arch.verw = pv ? opt_md_clear_pv : opt_md_clear_hvm;
++}
++
+ void __init init_speculation_mitigations(void)
+ {
+ enum ind_thunk thunk = THUNK_DEFAULT;
+@@ -1196,21 +1203,20 @@ void __init init_speculation_mitigations(void)
+ boot_cpu_has(X86_FEATURE_MD_CLEAR));
+
+ /*
+- * Enable MDS defences as applicable. The PV blocks need using all the
+- * time, and the Idle blocks need using if either PV or HVM defences are
+- * used.
++ * Enable MDS defences as applicable. The Idle blocks need using if
++ * either PV or HVM defences are used.
+ *
+ * HVM is more complicated. The MD_CLEAR microcode extends L1D_FLUSH with
+- * equivelent semantics to avoid needing to perform both flushes on the
+- * HVM path. The HVM blocks don't need activating if our hypervisor told
+- * us it was handling L1D_FLUSH, or we are using L1D_FLUSH ourselves.
++ * equivalent semantics to avoid needing to perform both flushes on the
++ * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH.
++ *
++ * After calculating the appropriate idle setting, simplify
++ * opt_md_clear_hvm to mean just "should we VERW on the way into HVM
++ * guests", so spec_ctrl_init_domain() can calculate suitable settings.
+ */
+- if ( opt_md_clear_pv )
+- setup_force_cpu_cap(X86_FEATURE_SC_VERW_PV);
+ if ( opt_md_clear_pv || opt_md_clear_hvm )
+ setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE);
+- if ( opt_md_clear_hvm && !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush )
+- setup_force_cpu_cap(X86_FEATURE_SC_VERW_HVM);
++ opt_md_clear_hvm &= !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush;
+
+ /*
+ * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT
+diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
+index ff3157d52d13..bd45a144ee78 100644
+--- a/xen/include/asm-x86/cpufeatures.h
++++ b/xen/include/asm-x86/cpufeatures.h
+@@ -35,8 +35,7 @@ XEN_CPUFEATURE(SC_RSB_HVM, X86_SYNTH(19)) /* RSB overwrite needed for HVM
+ XEN_CPUFEATURE(XEN_SELFSNOOP, X86_SYNTH(20)) /* SELFSNOOP gets used by Xen itself */
+ XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* (SC_MSR_PV || SC_MSR_HVM) && default_xen_spec_ctrl */
+ XEN_CPUFEATURE(XEN_LBR, X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */
+-XEN_CPUFEATURE(SC_VERW_PV, X86_SYNTH(23)) /* VERW used by Xen for PV */
+-XEN_CPUFEATURE(SC_VERW_HVM, X86_SYNTH(24)) /* VERW used by Xen for HVM */
++/* Bits 23,24 unused. */
+ XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */
+ XEN_CPUFEATURE(XEN_SHSTK, X86_SYNTH(26)) /* Xen uses CET Shadow Stacks */
+ XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch Tracking */
+diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
+index 92d54de0b9a1..2398a1d99da9 100644
+--- a/xen/include/asm-x86/domain.h
++++ b/xen/include/asm-x86/domain.h
+@@ -319,6 +319,9 @@ struct arch_domain
+ uint32_t pci_cf8;
+ uint8_t cmos_idx;
+
++ /* Use VERW on return-to-guest for its flushing side effect. */
++ bool verw;
++
+ union {
+ struct pv_domain pv;
+ struct hvm_domain hvm;
+diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
+index f76029523610..751355f471f4 100644
+--- a/xen/include/asm-x86/spec_ctrl.h
++++ b/xen/include/asm-x86/spec_ctrl.h
+@@ -24,6 +24,7 @@
+ #define SCF_use_shadow (1 << 0)
+ #define SCF_ist_wrmsr (1 << 1)
+ #define SCF_ist_rsb (1 << 2)
++#define SCF_verw (1 << 3)
+
+ #ifndef __ASSEMBLY__
+
+@@ -32,6 +33,7 @@
+ #include <asm/msr-index.h>
+
+ void init_speculation_mitigations(void);
++void spec_ctrl_init_domain(struct domain *d);
+
+ extern bool opt_ibpb;
+ extern bool opt_ssbd;
+diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h
+index 02b3b18ce69f..5a590bac44aa 100644
+--- a/xen/include/asm-x86/spec_ctrl_asm.h
++++ b/xen/include/asm-x86/spec_ctrl_asm.h
+@@ -136,6 +136,19 @@
+ #endif
+ .endm
+
++.macro DO_SPEC_CTRL_COND_VERW
++/*
++ * Requires %rsp=cpuinfo
++ *
++ * Issue a VERW for its flushing side effect, if indicated. This is a Spectre
++ * v1 gadget, but the IRET/VMEntry is serialising.
++ */
++ testb $SCF_verw, CPUINFO_spec_ctrl_flags(%rsp)
++ jz .L\@_verw_skip
++ verw CPUINFO_verw_sel(%rsp)
++.L\@_verw_skip:
++.endm
++
+ .macro DO_SPEC_CTRL_ENTRY maybexen:req
+ /*
+ * Requires %rsp=regs (also cpuinfo if !maybexen)
+@@ -231,8 +244,7 @@
+ #define SPEC_CTRL_EXIT_TO_PV \
+ ALTERNATIVE "", \
+ DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV; \
+- ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)), \
+- X86_FEATURE_SC_VERW_PV
++ DO_SPEC_CTRL_COND_VERW
+
+ /*
+ * Use in IST interrupt/exception context. May interrupt Xen or PV context.
+--
+2.35.1
+
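
As an aside on the mechanics of patch 0030 above: it retires the SC_VERW_PV/SC_VERW_HVM synthetic features in favour of a per-domain `verw` boolean, folded into a per-CPU flags byte (`SCF_verw`) that DO_SPEC_CTRL_COND_VERW tests on each return to guest. A minimal C sketch of that plumbing, using a hypothetical helper name and simplified stand-in types rather than the real Xen structures:

    #include <stdbool.h>
    #include <stdint.h>

    #define SCF_verw (1 << 3)                /* as defined in spec_ctrl.h */

    struct domain   { bool verw; };          /* stand-in for arch_domain */
    struct cpu_info { uint8_t spec_ctrl_flags; };

    /* On context switch, fold the incoming domain's policy into the
     * per-CPU flags tested by DO_SPEC_CTRL_COND_VERW on exit to guest. */
    static void sync_verw_flag(struct cpu_info *ci, const struct domain *d)
    {
        ci->spec_ctrl_flags &= ~SCF_verw;
        if ( d->verw )
            ci->spec_ctrl_flags |= SCF_verw;
    }
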
diff --git a/0031-x86-spec-ctrl-Enumeration-for-MMIO-Stale-Data-contro.patch b/0031-x86-spec-ctrl-Enumeration-for-MMIO-Stale-Data-contro.patch
new file mode 100644
index 0000000..6934800
--- /dev/null
+++ b/0031-x86-spec-ctrl-Enumeration-for-MMIO-Stale-Data-contro.patch
@@ -0,0 +1,98 @@
+From a83108736db0ddaa5855f5abda6dcc8ae4fe25e9 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 20 Sep 2021 18:47:49 +0100
+Subject: [PATCH 31/32] x86/spec-ctrl: Enumeration for MMIO Stale Data controls
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The three *_NO bits indicate non-susceptibility to the SSDP, FBSDP and PSDP
+data movement primitives.
+
+FB_CLEAR indicates that the VERW instruction has re-gained its Fill Buffer
+flushing side effect. This is only enumerated on parts where VERW had
+previously lost its flushing side effect due to the MDS/TAA vulnerabilities
+being fixed in hardware.
+
+FB_CLEAR_CTRL is available on a subset of FB_CLEAR parts where the Fill Buffer
+clearing side effect of VERW can be turned off for performance reasons.
+
+This is part of XSA-404.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+(cherry picked from commit 2ebe8fe9b7e0d36e9ec3cfe4552b2b197ef0dcec)
+---
+ xen/arch/x86/spec_ctrl.c | 11 ++++++++---
+ xen/include/asm-x86/msr-index.h | 6 ++++++
+ 2 files changed, 14 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 21730aa03071..d285538bde9f 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -323,7 +323,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ * Hardware read-only information, stating immunity to certain issues, or
+ * suggestions of which mitigation to use.
+ */
+- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s\n",
++ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
+ (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "",
+ (caps & ARCH_CAPS_IBRS_ALL) ? " IBRS_ALL" : "",
+ (caps & ARCH_CAPS_RSBA) ? " RSBA" : "",
+@@ -332,13 +332,16 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "",
+ (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "",
+ (caps & ARCH_CAPS_TAA_NO) ? " TAA_NO" : "",
++ (caps & ARCH_CAPS_SBDR_SSDP_NO) ? " SBDR_SSDP_NO" : "",
++ (caps & ARCH_CAPS_FBSDP_NO) ? " FBSDP_NO" : "",
++ (caps & ARCH_CAPS_PSDP_NO) ? " PSDP_NO" : "",
+ (e8b & cpufeat_mask(X86_FEATURE_IBRS_ALWAYS)) ? " IBRS_ALWAYS" : "",
+ (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "",
+ (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "",
+ (e8b & cpufeat_mask(X86_FEATURE_IBRS_SAME_MODE)) ? " IBRS_SAME_MODE" : "");
+
+ /* Hardware features which need driving to mitigate issues. */
+- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s\n",
++ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n",
+ (e8b & cpufeat_mask(X86_FEATURE_IBPB)) ||
+ (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBPB" : "",
+ (e8b & cpufeat_mask(X86_FEATURE_IBRS)) ||
+@@ -353,7 +356,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ (_7d0 & cpufeat_mask(X86_FEATURE_MD_CLEAR)) ? " MD_CLEAR" : "",
+ (_7d0 & cpufeat_mask(X86_FEATURE_SRBDS_CTRL)) ? " SRBDS_CTRL" : "",
+ (e8b & cpufeat_mask(X86_FEATURE_VIRT_SSBD)) ? " VIRT_SSBD" : "",
+- (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "");
++ (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "",
++ (caps & ARCH_CAPS_FB_CLEAR) ? " FB_CLEAR" : "",
++ (caps & ARCH_CAPS_FB_CLEAR_CTRL) ? " FB_CLEAR_CTRL" : "");
+
+ /* Compiled-in support which pertains to mitigations. */
+ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) )
+diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
+index 31964b88af7a..72bc32ba04ff 100644
+--- a/xen/include/asm-x86/msr-index.h
++++ b/xen/include/asm-x86/msr-index.h
+@@ -66,6 +66,11 @@
+ #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6)
+ #define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7)
+ #define ARCH_CAPS_TAA_NO (_AC(1, ULL) << 8)
++#define ARCH_CAPS_SBDR_SSDP_NO (_AC(1, ULL) << 13)
++#define ARCH_CAPS_FBSDP_NO (_AC(1, ULL) << 14)
++#define ARCH_CAPS_PSDP_NO (_AC(1, ULL) << 15)
++#define ARCH_CAPS_FB_CLEAR (_AC(1, ULL) << 17)
++#define ARCH_CAPS_FB_CLEAR_CTRL (_AC(1, ULL) << 18)
+
+ #define MSR_FLUSH_CMD 0x0000010b
+ #define FLUSH_CMD_L1D (_AC(1, ULL) << 0)
+@@ -83,6 +88,7 @@
+ #define MCU_OPT_CTRL_RNGDS_MITG_DIS (_AC(1, ULL) << 0)
+ #define MCU_OPT_CTRL_RTM_ALLOW (_AC(1, ULL) << 1)
+ #define MCU_OPT_CTRL_RTM_LOCKED (_AC(1, ULL) << 2)
++#define MCU_OPT_CTRL_FB_CLEAR_DIS (_AC(1, ULL) << 3)
+
+ #define MSR_RTIT_OUTPUT_BASE 0x00000560
+ #define MSR_RTIT_OUTPUT_MASK 0x00000561
+--
+2.35.1
+
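
The enumeration rules in the commit message above boil down to one question per part: does VERW currently flush the fill buffers? On MDS-vulnerable hardware the answer depends on the MD_CLEAR microcode; on MDS_NO hardware it is yes only where FB_CLEAR is enumerated. A small illustrative predicate (not a function from the Xen tree):

    #include <stdbool.h>
    #include <stdint.h>

    #define ARCH_CAPS_MDS_NO   (1ull << 5)
    #define ARCH_CAPS_FB_CLEAR (1ull << 17)

    /* Does VERW flush the fill buffers on this part?  Encodes the
     * enumeration rules from the commit message above. */
    static bool verw_flushes_fill_buffers(uint64_t caps, bool has_md_clear)
    {
        if ( !(caps & ARCH_CAPS_MDS_NO) )
            return has_md_clear;           /* pre-fix part: needs MD_CLEAR */
        return caps & ARCH_CAPS_FB_CLEAR;  /* fixed part: only if re-added */
    }
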
diff --git a/0032-x86-spec-ctrl-Add-spec-ctrl-unpriv-mmio.patch b/0032-x86-spec-ctrl-Add-spec-ctrl-unpriv-mmio.patch
new file mode 100644
index 0000000..a5ac3e9
--- /dev/null
+++ b/0032-x86-spec-ctrl-Add-spec-ctrl-unpriv-mmio.patch
@@ -0,0 +1,187 @@
+From 2e82446cb252f6c8ac697e81f4155872c69afde4 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 13 Jun 2022 19:18:32 +0100
+Subject: [PATCH 32/32] x86/spec-ctrl: Add spec-ctrl=unpriv-mmio
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Per Xen's support statement, PCI passthrough should only be done to trusted
+domains, because the overall system security depends on factors outside of
+Xen's control.
+
+As such, Xen, in a supported configuration, is not vulnerable to DRPW/SBDR.
+
+However, users who have risk-assessed their configuration may be happy with
+the risk of DoS, but unhappy with the risk of cross-domain data leakage. Such
+users should enable this option.
+
+On CPUs vulnerable to MDS, the existing mitigations are the best we can do to
+mitigate MMIO cross-domain data leakage.
+
+On CPUs fixed against MDS but vulnerable to MMIO stale data leakage, this
+option:
+
+ * On CPUs susceptible to FBSDP, mitigates cross-domain fill buffer leakage
+ using FB_CLEAR.
+ * On CPUs susceptible to SBDR, mitigates RNG data recovery by engaging the
+ srb-lock, previously used to mitigate SRBDS.
+
+Both mitigations require microcode from IPU 2022.1, May 2022.
+
+This is part of XSA-404.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+(cherry picked from commit 8c24b70fedcb52633b2370f834d8a2be3f7fa38e)
+---
+ docs/misc/xen-command-line.pandoc | 14 +++++++--
+ xen/arch/x86/spec_ctrl.c | 48 ++++++++++++++++++++++++-------
+ 2 files changed, 48 insertions(+), 14 deletions(-)
+
+diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
+index d5cb09f86541..a642e43476a2 100644
+--- a/docs/misc/xen-command-line.pandoc
++++ b/docs/misc/xen-command-line.pandoc
+@@ -2235,7 +2235,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`).
+ ### spec-ctrl (x86)
+ > `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb,md-clear}=<bool>,
+ > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu,
+-> l1d-flush,branch-harden,srb-lock}=<bool> ]`
++> l1d-flush,branch-harden,srb-lock,unpriv-mmio}=<bool> ]`
+
+ Controls for speculative execution sidechannel mitigations. By default, Xen
+ will pick the most appropriate mitigations based on compiled in support,
+@@ -2314,8 +2314,16 @@ Xen will enable this mitigation.
+ On hardware supporting SRBDS_CTRL, the `srb-lock=` option can be used to force
+ or prevent Xen from protecting the Special Register Buffer from leaking stale
+ data. By default, Xen will enable this mitigation, except on parts where MDS
+-is fixed and TAA is fixed/mitigated (in which case, there is believed to be no
+-way for an attacker to obtain the stale data).
++is fixed and TAA is fixed/mitigated and there are no unprivileged MMIO
++mappings (in which case, there is believed to be no way for an attacker to
++obtain stale data).
++
++The `unpriv-mmio=` boolean indicates whether the system has (or will have)
++less than fully privileged domains granted access to MMIO devices. By
++default, this option is disabled. If enabled, Xen will use the `FB_CLEAR`
++and/or `SRBDS_CTRL` functionality available in the Intel May 2022 microcode
++release to mitigate cross-domain leakage of data via the MMIO Stale Data
++vulnerabilities.
+
+ ### sync_console
+ > `= <boolean>`
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index d285538bde9f..099113ba41e6 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -67,6 +67,8 @@ static bool __initdata cpu_has_bug_msbds_only; /* => minimal HT impact. */
+ static bool __initdata cpu_has_bug_mds; /* Any other M{LP,SB,FB}DS combination. */
+
+ static int8_t __initdata opt_srb_lock = -1;
++static bool __initdata opt_unpriv_mmio;
++static bool __read_mostly opt_fb_clear_mmio;
+
+ static int __init parse_spec_ctrl(const char *s)
+ {
+@@ -184,6 +186,8 @@ static int __init parse_spec_ctrl(const char *s)
+ opt_branch_harden = val;
+ else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 )
+ opt_srb_lock = val;
++ else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 )
++ opt_unpriv_mmio = val;
+ else
+ rc = -EINVAL;
+
+@@ -392,7 +396,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
+ opt_srb_lock ? " SRB_LOCK+" : " SRB_LOCK-",
+ opt_ibpb ? " IBPB" : "",
+ opt_l1d_flush ? " L1D_FLUSH" : "",
+- opt_md_clear_pv || opt_md_clear_hvm ? " VERW" : "",
++ opt_md_clear_pv || opt_md_clear_hvm ||
++ opt_fb_clear_mmio ? " VERW" : "",
+ opt_branch_harden ? " BRANCH_HARDEN" : "");
+
+ /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */
+@@ -941,7 +946,9 @@ void spec_ctrl_init_domain(struct domain *d)
+ {
+ bool pv = is_pv_domain(d);
+
+- d->arch.verw = pv ? opt_md_clear_pv : opt_md_clear_hvm;
++ d->arch.verw =
++ (pv ? opt_md_clear_pv : opt_md_clear_hvm) ||
++ (opt_fb_clear_mmio && is_iommu_enabled(d));
+ }
+
+ void __init init_speculation_mitigations(void)
+@@ -1195,6 +1202,18 @@ void __init init_speculation_mitigations(void)
+
+ mds_calculations(caps);
+
++ /*
++ * Parts which enumerate FB_CLEAR are those which are post-MDS_NO and have
++ * reintroduced the VERW fill buffer flushing side effect because of a
++ * susceptibility to FBSDP.
++ *
++ * If unprivileged guests have (or will have) MMIO mappings, we can
++ * mitigate cross-domain leakage of fill buffer data by issuing VERW on
++ * the return-to-guest path.
++ */
++ if ( opt_unpriv_mmio )
++ opt_fb_clear_mmio = caps & ARCH_CAPS_FB_CLEAR;
++
+ /*
+ * By default, enable PV and HVM mitigations on MDS-vulnerable hardware.
+ * This will only be a token effort for MLPDS/MFBDS when HT is enabled,
+@@ -1208,18 +1227,20 @@ void __init init_speculation_mitigations(void)
+ boot_cpu_has(X86_FEATURE_MD_CLEAR));
+
+ /*
+- * Enable MDS defences as applicable. The Idle blocks need using if
+- * either PV or HVM defences are used.
++ * Enable MDS/MMIO defences as applicable. The Idle blocks need using if
++ * either the PV or HVM MDS defences are used, or if we may give MMIO
++ * access to untrusted guests.
+ *
+ * HVM is more complicated. The MD_CLEAR microcode extends L1D_FLUSH with
+ * equivalent semantics to avoid needing to perform both flushes on the
+- * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH.
++ * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH (for
++ * MDS mitigations; L1D_FLUSH is not safe for MMIO mitigations).
+ *
+ * After calculating the appropriate idle setting, simplify
+ * opt_md_clear_hvm to mean just "should we VERW on the way into HVM
+ * guests", so spec_ctrl_init_domain() can calculate suitable settings.
+ */
+- if ( opt_md_clear_pv || opt_md_clear_hvm )
++ if ( opt_md_clear_pv || opt_md_clear_hvm || opt_fb_clear_mmio )
+ setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE);
+ opt_md_clear_hvm &= !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush;
+
+@@ -1284,14 +1305,19 @@ void __init init_speculation_mitigations(void)
+ * On some SRBDS-affected hardware, it may be safe to relax srb-lock by
+ * default.
+ *
+- * On parts which enumerate MDS_NO and not TAA_NO, TSX is the only known
+- * way to access the Fill Buffer. If TSX isn't available (inc. SKU
+- * reasons on some models), or TSX is explicitly disabled, then there is
+- * no need for the extra overhead to protect RDRAND/RDSEED.
++ * All parts with SRBDS_CTRL suffer SSDP, the mechanism by which stale RNG
++ * data becomes available to other contexts. To recover the data, an
++ * attacker needs to use:
++ * - SBDS (MDS or TAA to sample the core's fill buffer)
++ * - SBDR (Architecturally retrieve stale transaction buffer contents)
++ * - DRPW (Architecturally latch stale fill buffer data)
++ *
++ * On MDS_NO parts, with TAA_NO or TSX unavailable/disabled, and with no
++ * unprivileged MMIO access, the RNG data doesn't need protecting.
+ */
+ if ( cpu_has_srbds_ctrl )
+ {
+- if ( opt_srb_lock == -1 &&
++ if ( opt_srb_lock == -1 && !opt_unpriv_mmio &&
+ (caps & (ARCH_CAPS_MDS_NO|ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO &&
+ (!cpu_has_hle || ((caps & ARCH_CAPS_TSX_CTRL) && rtm_disabled)) )
+ opt_srb_lock = 0;
+--
+2.35.1
+
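
Taken together, the two decisions this patch adds can be expressed as small predicates over the inputs involved. The following C sketch mirrors the hunks above in spec_ctrl_init_domain() and the srb-lock relaxation, with simplified parameters standing in for the real globals:

    #include <stdbool.h>
    #include <stdint.h>

    #define ARCH_CAPS_MDS_NO   (1ull << 5)
    #define ARCH_CAPS_TSX_CTRL (1ull << 7)
    #define ARCH_CAPS_TAA_NO   (1ull << 8)

    /* Should this domain get VERW on the return-to-guest path?  Mirrors
     * the spec_ctrl_init_domain() hunk above. */
    static bool domain_wants_verw(bool pv, bool md_clear_pv,
                                  bool md_clear_hvm, bool fb_clear_mmio,
                                  bool iommu_enabled)
    {
        return (pv ? md_clear_pv : md_clear_hvm) ||
               (fb_clear_mmio && iommu_enabled);
    }

    /* May srb-lock default to off?  Mirrors the relaxed condition above:
     * MDS_NO enumerated, TAA_NO not enumerated, TSX unavailable or
     * disabled, and no unprivileged MMIO access. */
    static bool srb_lock_may_default_off(uint64_t caps, bool unpriv_mmio,
                                         bool cpu_has_hle, bool rtm_disabled)
    {
        return !unpriv_mmio &&
               (caps & (ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO)) ==
               ARCH_CAPS_MDS_NO &&
               (!cpu_has_hle ||
                ((caps & ARCH_CAPS_TSX_CTRL) && rtm_disabled));
    }
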
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..0e845b5
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/create-patches b/create-patches
new file mode 100755
index 0000000..8e8c9fa
--- /dev/null
+++ b/create-patches
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+cd "${SCRIPT_DIR}"
+
+if [[ ! -v XEN_REPO_DIR ]]; then
+ XEN_REPO_DIR="${HOME}/repos/xen"
+fi
+
+XEN_VERSION="${1}"
+OUR_PATCHES_VERSION="${2}"
+
+XEN_VER_COMPONENTS=( ${XEN_VERSION//./ } )
+XEN_MAJOR_MINOR_VERSION="${XEN_VER_COMPONENTS[0]}.${XEN_VER_COMPONENTS[1]}"
+
+
+git -C "${XEN_REPO_DIR}" fetch origin
+
+readarray -d '' CURRENT_PATCHES < <(find . -maxdepth 1 -type f -name "*.patch" -print0)
+if [[ ${#CURRENT_PATCHES[@]} -gt 0 ]]; then
+ git rm -f *.patch
+fi
+
+PATCH_RANGE_START="RELEASE-${XEN_VERSION}"
+PATCH_RANGE_END="staging-${XEN_MAJOR_MINOR_VERSION}"
+git -C "${XEN_REPO_DIR}" format-patch \
+ -o "${SCRIPT_DIR}" \
+ ${PATCH_RANGE_START}..origin/${PATCH_RANGE_END}
+
+XEN_NEXT_PATCHLEVEL=$((XEN_VER_COMPONENTS[2]+1))
+XEN_NEXT_VERSION="${XEN_MAJOR_MINOR_VERSION}.${XEN_NEXT_PATCHLEVEL}"
+
+PATCH_RANGE_START_ID=$(git -C "${XEN_REPO_DIR}" rev-parse ${PATCH_RANGE_START})
+PATCH_RANGE_END_ID=$(git -C "${XEN_REPO_DIR}" rev-parse ${PATCH_RANGE_END})
+
+cat <<EOF > "info.txt"
+Xen upstream patchset #${OUR_PATCHES_VERSION} for ${XEN_NEXT_VERSION}-pre
+
+Containing patches from
+$PATCH_RANGE_START ($PATCH_RANGE_START_ID)
+to
+$PATCH_RANGE_END ($PATCH_RANGE_END_ID)
+EOF
+
+git add \
+ info.txt \
+ *.patch
+
+TAG="${XEN_NEXT_VERSION}-pre-patchset-${OUR_PATCHES_VERSION}"
+DESCRIPTION="Xen ${TAG}"
+
+git commit \
+ --signoff \
+ -m "${DESCRIPTION}"
+
+git tag \
+ -s \
+ -m "${DESCRIPTION}" \
+ "${TAG}"
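
As a usage note: given the defaults in the script above, the info.txt below would be produced by an invocation along the lines of (checkout path assumed):

    XEN_REPO_DIR="${HOME}/repos/xen" ./create-patches 4.16.1 0
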
diff --git a/info.txt b/info.txt
new file mode 100644
index 0000000..2310ace
--- /dev/null
+++ b/info.txt
@@ -0,0 +1,6 @@
+Xen Upstream Patchset #0 for 4.16.2-pre
+
+Containing patches from
+RELEASE-4.16.1 (13fee86475f3831d7a1ecf6d7e0acbc2ac779f7e)
+to
+staging-4.16 (2e82446cb252f6c8ac697e81f4155872c69afde4)