author | Mike Pagano <mpagano@gentoo.org> | 2019-11-12 16:00:29 -0500
committer | Mike Pagano <mpagano@gentoo.org> | 2019-11-12 16:00:29 -0500
commit | 87cd611e311f55748b99b57248370a5289a2f86a (patch)
tree | 330565fb8d4e311126b8056a4e30229ec89270e1
parent | Linux patch 4.19.83 (diff)
Linux patch 4.19.84
Signed-off-by: Mike Pagano <mpagano@gentoo.org>
-rw-r--r-- | 0000_README | 4
-rw-r--r-- | 1083_linux-4.19.84.patch | 11705
2 files changed, 11709 insertions, 0 deletions
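
The patch adds two new sysfs vulnerability files, /sys/devices/system/cpu/vulnerabilities/itlb_multihit and /sys/devices/system/cpu/vulnerabilities/tsx_async_abort. As a quick sanity check on a kernel built with this patch, the short C sketch below reads those files back from userspace. The file paths are taken from the sysfs ABI documentation added by the patch itself; the program around them is illustrative only and not part of the commit.

```c
/*
 * Minimal sketch (not part of the patch): print the iTLB multihit and
 * TSX Async Abort status files added in Linux 4.19.84. Only the sysfs
 * paths come from the patch; everything else is illustrative.
 */
#include <stdio.h>

int main(void)
{
	static const char *const files[] = {
		"/sys/devices/system/cpu/vulnerabilities/itlb_multihit",
		"/sys/devices/system/cpu/vulnerabilities/tsx_async_abort",
	};
	char line[256];

	for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
		FILE *f = fopen(files[i], "r");

		if (!f) {
			/* Kernels without this patch do not expose these files. */
			printf("%s: not available\n", files[i]);
			continue;
		}
		if (fgets(line, sizeof(line), f))
			printf("%s: %s", files[i], line);
		fclose(f);
	}
	return 0;
}
```

On an unpatched 4.19.83 kernel the two files are simply absent, which the sketch reports rather than treating as an error.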
diff --git a/0000_README b/0000_README index 4a7a9266..3d259d67 100644 --- a/0000_README +++ b/0000_README @@ -371,6 +371,10 @@ Patch: 1082_linux-4.19.83.patch From: https://www.kernel.org Desc: Linux 4.19.83 +Patch: 1083_linux-4.19.84.patch +From: https://www.kernel.org +Desc: Linux 4.19.84 + Patch: 1500_XATTR_USER_PREFIX.patch From: https://bugs.gentoo.org/show_bug.cgi?id=470644 Desc: Support for namespace user.pax.* on tmpfs. diff --git a/1083_linux-4.19.84.patch b/1083_linux-4.19.84.patch new file mode 100644 index 00000000..86272119 --- /dev/null +++ b/1083_linux-4.19.84.patch @@ -0,0 +1,11705 @@ +diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu +index 8718d4ad227b..b492fb6057c9 100644 +--- a/Documentation/ABI/testing/sysfs-devices-system-cpu ++++ b/Documentation/ABI/testing/sysfs-devices-system-cpu +@@ -478,6 +478,8 @@ What: /sys/devices/system/cpu/vulnerabilities + /sys/devices/system/cpu/vulnerabilities/spec_store_bypass + /sys/devices/system/cpu/vulnerabilities/l1tf + /sys/devices/system/cpu/vulnerabilities/mds ++ /sys/devices/system/cpu/vulnerabilities/tsx_async_abort ++ /sys/devices/system/cpu/vulnerabilities/itlb_multihit + Date: January 2018 + Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> + Description: Information about CPU vulnerabilities +diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst +index 49311f3da6f2..0795e3c2643f 100644 +--- a/Documentation/admin-guide/hw-vuln/index.rst ++++ b/Documentation/admin-guide/hw-vuln/index.rst +@@ -12,3 +12,5 @@ are configurable at compile, boot or run time. + spectre + l1tf + mds ++ tsx_async_abort ++ multihit.rst +diff --git a/Documentation/admin-guide/hw-vuln/multihit.rst b/Documentation/admin-guide/hw-vuln/multihit.rst +new file mode 100644 +index 000000000000..ba9988d8bce5 +--- /dev/null ++++ b/Documentation/admin-guide/hw-vuln/multihit.rst +@@ -0,0 +1,163 @@ ++iTLB multihit ++============= ++ ++iTLB multihit is an erratum where some processors may incur a machine check ++error, possibly resulting in an unrecoverable CPU lockup, when an ++instruction fetch hits multiple entries in the instruction TLB. This can ++occur when the page size is changed along with either the physical address ++or cache type. A malicious guest running on a virtualized system can ++exploit this erratum to perform a denial of service attack. ++ ++ ++Affected processors ++------------------- ++ ++Variations of this erratum are present on most Intel Core and Xeon processor ++models. The erratum is not present on: ++ ++ - non-Intel processors ++ ++ - Some Atoms (Airmont, Bonnell, Goldmont, GoldmontPlus, Saltwell, Silvermont) ++ ++ - Intel processors that have the PSCHANGE_MC_NO bit set in the ++ IA32_ARCH_CAPABILITIES MSR. ++ ++ ++Related CVEs ++------------ ++ ++The following CVE entry is related to this issue: ++ ++ ============== ================================================= ++ CVE-2018-12207 Machine Check Error Avoidance on Page Size Change ++ ============== ================================================= ++ ++ ++Problem ++------- ++ ++Privileged software, including OS and virtual machine managers (VMM), are in ++charge of memory management. A key component in memory management is the control ++of the page tables. Modern processors use virtual memory, a technique that creates ++the illusion of a very large memory for processors. This virtual space is split ++into pages of a given size. 
Page tables translate virtual addresses to physical ++addresses. ++ ++To reduce latency when performing a virtual to physical address translation, ++processors include a structure, called TLB, that caches recent translations. ++There are separate TLBs for instruction (iTLB) and data (dTLB). ++ ++Under this errata, instructions are fetched from a linear address translated ++using a 4 KB translation cached in the iTLB. Privileged software modifies the ++paging structure so that the same linear address using large page size (2 MB, 4 ++MB, 1 GB) with a different physical address or memory type. After the page ++structure modification but before the software invalidates any iTLB entries for ++the linear address, a code fetch that happens on the same linear address may ++cause a machine-check error which can result in a system hang or shutdown. ++ ++ ++Attack scenarios ++---------------- ++ ++Attacks against the iTLB multihit erratum can be mounted from malicious ++guests in a virtualized system. ++ ++ ++iTLB multihit system information ++-------------------------------- ++ ++The Linux kernel provides a sysfs interface to enumerate the current iTLB ++multihit status of the system:whether the system is vulnerable and which ++mitigations are active. The relevant sysfs file is: ++ ++/sys/devices/system/cpu/vulnerabilities/itlb_multihit ++ ++The possible values in this file are: ++ ++.. list-table:: ++ ++ * - Not affected ++ - The processor is not vulnerable. ++ * - KVM: Mitigation: Split huge pages ++ - Software changes mitigate this issue. ++ * - KVM: Vulnerable ++ - The processor is vulnerable, but no mitigation enabled ++ ++ ++Enumeration of the erratum ++-------------------------------- ++ ++A new bit has been allocated in the IA32_ARCH_CAPABILITIES (PSCHANGE_MC_NO) msr ++and will be set on CPU's which are mitigated against this issue. ++ ++ ======================================= =========== =============================== ++ IA32_ARCH_CAPABILITIES MSR Not present Possibly vulnerable,check model ++ IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO] '0' Likely vulnerable,check model ++ IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO] '1' Not vulnerable ++ ======================================= =========== =============================== ++ ++ ++Mitigation mechanism ++------------------------- ++ ++This erratum can be mitigated by restricting the use of large page sizes to ++non-executable pages. This forces all iTLB entries to be 4K, and removes ++the possibility of multiple hits. ++ ++In order to mitigate the vulnerability, KVM initially marks all huge pages ++as non-executable. If the guest attempts to execute in one of those pages, ++the page is broken down into 4K pages, which are then marked executable. ++ ++If EPT is disabled or not available on the host, KVM is in control of TLB ++flushes and the problematic situation cannot happen. However, the shadow ++EPT paging mechanism used by nested virtualization is vulnerable, because ++the nested guest can trigger multiple iTLB hits by modifying its own ++(non-nested) page tables. For simplicity, KVM will make large pages ++non-executable in all shadow paging modes. ++ ++Mitigation control on the kernel command line and KVM - module parameter ++------------------------------------------------------------------------ ++ ++The KVM hypervisor mitigation mechanism for marking huge pages as ++non-executable can be controlled with a module parameter "nx_huge_pages=". 
++The kernel command line allows to control the iTLB multihit mitigations at ++boot time with the option "kvm.nx_huge_pages=". ++ ++The valid arguments for these options are: ++ ++ ========== ================================================================ ++ force Mitigation is enabled. In this case, the mitigation implements ++ non-executable huge pages in Linux kernel KVM module. All huge ++ pages in the EPT are marked as non-executable. ++ If a guest attempts to execute in one of those pages, the page is ++ broken down into 4K pages, which are then marked executable. ++ ++ off Mitigation is disabled. ++ ++ auto Enable mitigation only if the platform is affected and the kernel ++ was not booted with the "mitigations=off" command line parameter. ++ This is the default option. ++ ========== ================================================================ ++ ++ ++Mitigation selection guide ++-------------------------- ++ ++1. No virtualization in use ++^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++ The system is protected by the kernel unconditionally and no further ++ action is required. ++ ++2. Virtualization with trusted guests ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++ If the guest comes from a trusted source, you may assume that the guest will ++ not attempt to maliciously exploit these errata and no further action is ++ required. ++ ++3. Virtualization with untrusted guests ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ If the guest comes from an untrusted source, the guest host kernel will need ++ to apply iTLB multihit mitigation via the kernel command line or kvm ++ module parameter. +diff --git a/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst +new file mode 100644 +index 000000000000..fddbd7579c53 +--- /dev/null ++++ b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst +@@ -0,0 +1,276 @@ ++.. SPDX-License-Identifier: GPL-2.0 ++ ++TAA - TSX Asynchronous Abort ++====================================== ++ ++TAA is a hardware vulnerability that allows unprivileged speculative access to ++data which is available in various CPU internal buffers by using asynchronous ++aborts within an Intel TSX transactional region. ++ ++Affected processors ++------------------- ++ ++This vulnerability only affects Intel processors that support Intel ++Transactional Synchronization Extensions (TSX) when the TAA_NO bit (bit 8) ++is 0 in the IA32_ARCH_CAPABILITIES MSR. On processors where the MDS_NO bit ++(bit 5) is 0 in the IA32_ARCH_CAPABILITIES MSR, the existing MDS mitigations ++also mitigate against TAA. ++ ++Whether a processor is affected or not can be read out from the TAA ++vulnerability file in sysfs. See :ref:`tsx_async_abort_sys_info`. ++ ++Related CVEs ++------------ ++ ++The following CVE entry is related to this TAA issue: ++ ++ ============== ===== =================================================== ++ CVE-2019-11135 TAA TSX Asynchronous Abort (TAA) condition on some ++ microprocessors utilizing speculative execution may ++ allow an authenticated user to potentially enable ++ information disclosure via a side channel with ++ local access. ++ ============== ===== =================================================== ++ ++Problem ++------- ++ ++When performing store, load or L1 refill operations, processors write ++data into temporary microarchitectural structures (buffers). The data in ++those buffers can be forwarded to load operations as an optimization. 
++ ++Intel TSX is an extension to the x86 instruction set architecture that adds ++hardware transactional memory support to improve performance of multi-threaded ++software. TSX lets the processor expose and exploit concurrency hidden in an ++application due to dynamically avoiding unnecessary synchronization. ++ ++TSX supports atomic memory transactions that are either committed (success) or ++aborted. During an abort, operations that happened within the transactional region ++are rolled back. An asynchronous abort takes place, among other options, when a ++different thread accesses a cache line that is also used within the transactional ++region when that access might lead to a data race. ++ ++Immediately after an uncompleted asynchronous abort, certain speculatively ++executed loads may read data from those internal buffers and pass it to dependent ++operations. This can be then used to infer the value via a cache side channel ++attack. ++ ++Because the buffers are potentially shared between Hyper-Threads cross ++Hyper-Thread attacks are possible. ++ ++The victim of a malicious actor does not need to make use of TSX. Only the ++attacker needs to begin a TSX transaction and raise an asynchronous abort ++which in turn potenitally leaks data stored in the buffers. ++ ++More detailed technical information is available in the TAA specific x86 ++architecture section: :ref:`Documentation/x86/tsx_async_abort.rst <tsx_async_abort>`. ++ ++ ++Attack scenarios ++---------------- ++ ++Attacks against the TAA vulnerability can be implemented from unprivileged ++applications running on hosts or guests. ++ ++As for MDS, the attacker has no control over the memory addresses that can ++be leaked. Only the victim is responsible for bringing data to the CPU. As ++a result, the malicious actor has to sample as much data as possible and ++then postprocess it to try to infer any useful information from it. ++ ++A potential attacker only has read access to the data. Also, there is no direct ++privilege escalation by using this technique. ++ ++ ++.. _tsx_async_abort_sys_info: ++ ++TAA system information ++----------------------- ++ ++The Linux kernel provides a sysfs interface to enumerate the current TAA status ++of mitigated systems. The relevant sysfs file is: ++ ++/sys/devices/system/cpu/vulnerabilities/tsx_async_abort ++ ++The possible values in this file are: ++ ++.. list-table:: ++ ++ * - 'Vulnerable' ++ - The CPU is affected by this vulnerability and the microcode and kernel mitigation are not applied. ++ * - 'Vulnerable: Clear CPU buffers attempted, no microcode' ++ - The system tries to clear the buffers but the microcode might not support the operation. ++ * - 'Mitigation: Clear CPU buffers' ++ - The microcode has been updated to clear the buffers. TSX is still enabled. ++ * - 'Mitigation: TSX disabled' ++ - TSX is disabled. ++ * - 'Not affected' ++ - The CPU is not affected by this issue. ++ ++.. _ucode_needed: ++ ++Best effort mitigation mode ++^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++If the processor is vulnerable, but the availability of the microcode-based ++mitigation mechanism is not advertised via CPUID the kernel selects a best ++effort mitigation mode. This mode invokes the mitigation instructions ++without a guarantee that they clear the CPU buffers. ++ ++This is done to address virtualization scenarios where the host has the ++microcode update applied, but the hypervisor is not yet updated to expose the ++CPUID to the guest. 
If the host has updated microcode the protection takes ++effect; otherwise a few CPU cycles are wasted pointlessly. ++ ++The state in the tsx_async_abort sysfs file reflects this situation ++accordingly. ++ ++ ++Mitigation mechanism ++-------------------- ++ ++The kernel detects the affected CPUs and the presence of the microcode which is ++required. If a CPU is affected and the microcode is available, then the kernel ++enables the mitigation by default. ++ ++ ++The mitigation can be controlled at boot time via a kernel command line option. ++See :ref:`taa_mitigation_control_command_line`. ++ ++.. _virt_mechanism: ++ ++Virtualization mitigation ++^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++Affected systems where the host has TAA microcode and TAA is mitigated by ++having disabled TSX previously, are not vulnerable regardless of the status ++of the VMs. ++ ++In all other cases, if the host either does not have the TAA microcode or ++the kernel is not mitigated, the system might be vulnerable. ++ ++ ++.. _taa_mitigation_control_command_line: ++ ++Mitigation control on the kernel command line ++--------------------------------------------- ++ ++The kernel command line allows to control the TAA mitigations at boot time with ++the option "tsx_async_abort=". The valid arguments for this option are: ++ ++ ============ ============================================================= ++ off This option disables the TAA mitigation on affected platforms. ++ If the system has TSX enabled (see next parameter) and the CPU ++ is affected, the system is vulnerable. ++ ++ full TAA mitigation is enabled. If TSX is enabled, on an affected ++ system it will clear CPU buffers on ring transitions. On ++ systems which are MDS-affected and deploy MDS mitigation, ++ TAA is also mitigated. Specifying this option on those ++ systems will have no effect. ++ ++ full,nosmt The same as tsx_async_abort=full, with SMT disabled on ++ vulnerable CPUs that have TSX enabled. This is the complete ++ mitigation. When TSX is disabled, SMT is not disabled because ++ CPU is not vulnerable to cross-thread TAA attacks. ++ ============ ============================================================= ++ ++Not specifying this option is equivalent to "tsx_async_abort=full". ++ ++The kernel command line also allows to control the TSX feature using the ++parameter "tsx=" on CPUs which support TSX control. MSR_IA32_TSX_CTRL is used ++to control the TSX feature and the enumeration of the TSX feature bits (RTM ++and HLE) in CPUID. ++ ++The valid options are: ++ ++ ============ ============================================================= ++ off Disables TSX on the system. ++ ++ Note that this option takes effect only on newer CPUs which are ++ not vulnerable to MDS, i.e., have MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 ++ and which get the new IA32_TSX_CTRL MSR through a microcode ++ update. This new MSR allows for the reliable deactivation of ++ the TSX functionality. ++ ++ on Enables TSX. ++ ++ Although there are mitigations for all known security ++ vulnerabilities, TSX has been known to be an accelerator for ++ several previous speculation-related CVEs, and so there may be ++ unknown security risks associated with leaving it enabled. ++ ++ auto Disables TSX if X86_BUG_TAA is present, otherwise enables TSX ++ on the system. ++ ============ ============================================================= ++ ++Not specifying this option is equivalent to "tsx=off". ++ ++The following combinations of the "tsx_async_abort" and "tsx" are possible. 
For ++affected platforms tsx=auto is equivalent to tsx=off and the result will be: ++ ++ ========= ========================== ========================================= ++ tsx=on tsx_async_abort=full The system will use VERW to clear CPU ++ buffers. Cross-thread attacks are still ++ possible on SMT machines. ++ tsx=on tsx_async_abort=full,nosmt As above, cross-thread attacks on SMT ++ mitigated. ++ tsx=on tsx_async_abort=off The system is vulnerable. ++ tsx=off tsx_async_abort=full TSX might be disabled if microcode ++ provides a TSX control MSR. If so, ++ system is not vulnerable. ++ tsx=off tsx_async_abort=full,nosmt Ditto ++ tsx=off tsx_async_abort=off ditto ++ ========= ========================== ========================================= ++ ++ ++For unaffected platforms "tsx=on" and "tsx_async_abort=full" does not clear CPU ++buffers. For platforms without TSX control (MSR_IA32_ARCH_CAPABILITIES.MDS_NO=0) ++"tsx" command line argument has no effect. ++ ++For the affected platforms below table indicates the mitigation status for the ++combinations of CPUID bit MD_CLEAR and IA32_ARCH_CAPABILITIES MSR bits MDS_NO ++and TSX_CTRL_MSR. ++ ++ ======= ========= ============= ======================================== ++ MDS_NO MD_CLEAR TSX_CTRL_MSR Status ++ ======= ========= ============= ======================================== ++ 0 0 0 Vulnerable (needs microcode) ++ 0 1 0 MDS and TAA mitigated via VERW ++ 1 1 0 MDS fixed, TAA vulnerable if TSX enabled ++ because MD_CLEAR has no meaning and ++ VERW is not guaranteed to clear buffers ++ 1 X 1 MDS fixed, TAA can be mitigated by ++ VERW or TSX_CTRL_MSR ++ ======= ========= ============= ======================================== ++ ++Mitigation selection guide ++-------------------------- ++ ++1. Trusted userspace and guests ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++If all user space applications are from a trusted source and do not execute ++untrusted code which is supplied externally, then the mitigation can be ++disabled. The same applies to virtualized environments with trusted guests. ++ ++ ++2. Untrusted userspace and guests ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++If there are untrusted applications or guests on the system, enabling TSX ++might allow a malicious actor to leak data from the host or from other ++processes running on the same physical core. ++ ++If the microcode is available and the TSX is disabled on the host, attacks ++are prevented in a virtualized environment as well, even if the VMs do not ++explicitly enable the mitigation. ++ ++ ++.. _taa_default_mitigations: ++ ++Default mitigations ++------------------- ++ ++The kernel's default action for vulnerable processors is: ++ ++ - Deploy TSX disable mitigation (tsx_async_abort=full tsx=off). +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index a855f83defa6..475ed980b25b 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -1956,6 +1956,25 @@ + KVM MMU at runtime. + Default is 0 (off) + ++ kvm.nx_huge_pages= ++ [KVM] Controls the software workaround for the ++ X86_BUG_ITLB_MULTIHIT bug. ++ force : Always deploy workaround. ++ off : Never deploy workaround. ++ auto : Deploy workaround based on the presence of ++ X86_BUG_ITLB_MULTIHIT. ++ ++ Default is 'auto'. ++ ++ If the software workaround is enabled for the host, ++ guests do need not to enable it for nested guests. 
++ ++ kvm.nx_huge_pages_recovery_ratio= ++ [KVM] Controls how many 4KiB pages are periodically zapped ++ back to huge pages. 0 disables the recovery, otherwise if ++ the value is N KVM will zap 1/Nth of the 4KiB pages every ++ minute. The default is 60. ++ + kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM. + Default is 1 (enabled) + +@@ -2523,6 +2542,13 @@ + ssbd=force-off [ARM64] + l1tf=off [X86] + mds=off [X86] ++ tsx_async_abort=off [X86] ++ kvm.nx_huge_pages=off [X86] ++ ++ Exceptions: ++ This does not have any effect on ++ kvm.nx_huge_pages when ++ kvm.nx_huge_pages=force. + + auto (default) + Mitigate all CPU vulnerabilities, but leave SMT +@@ -2538,6 +2564,7 @@ + be fully mitigated, even if it means losing SMT. + Equivalent to: l1tf=flush,nosmt [X86] + mds=full,nosmt [X86] ++ tsx_async_abort=full,nosmt [X86] + + mminit_loglevel= + [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this +@@ -4690,6 +4717,71 @@ + marks the TSC unconditionally unstable at bootup and + avoids any further wobbles once the TSC watchdog notices. + ++ tsx= [X86] Control Transactional Synchronization ++ Extensions (TSX) feature in Intel processors that ++ support TSX control. ++ ++ This parameter controls the TSX feature. The options are: ++ ++ on - Enable TSX on the system. Although there are ++ mitigations for all known security vulnerabilities, ++ TSX has been known to be an accelerator for ++ several previous speculation-related CVEs, and ++ so there may be unknown security risks associated ++ with leaving it enabled. ++ ++ off - Disable TSX on the system. (Note that this ++ option takes effect only on newer CPUs which are ++ not vulnerable to MDS, i.e., have ++ MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 and which get ++ the new IA32_TSX_CTRL MSR through a microcode ++ update. This new MSR allows for the reliable ++ deactivation of the TSX functionality.) ++ ++ auto - Disable TSX if X86_BUG_TAA is present, ++ otherwise enable TSX on the system. ++ ++ Not specifying this option is equivalent to tsx=off. ++ ++ See Documentation/admin-guide/hw-vuln/tsx_async_abort.rst ++ for more details. ++ ++ tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async ++ Abort (TAA) vulnerability. ++ ++ Similar to Micro-architectural Data Sampling (MDS) ++ certain CPUs that support Transactional ++ Synchronization Extensions (TSX) are vulnerable to an ++ exploit against CPU internal buffers which can forward ++ information to a disclosure gadget under certain ++ conditions. ++ ++ In vulnerable processors, the speculatively forwarded ++ data can be used in a cache side channel attack, to ++ access data to which the attacker does not have direct ++ access. ++ ++ This parameter controls the TAA mitigation. The ++ options are: ++ ++ full - Enable TAA mitigation on vulnerable CPUs ++ if TSX is enabled. ++ ++ full,nosmt - Enable TAA mitigation and disable SMT on ++ vulnerable CPUs. If TSX is disabled, SMT ++ is not disabled because CPU is not ++ vulnerable to cross-thread TAA attacks. ++ off - Unconditionally disable TAA mitigation ++ ++ Not specifying this option is equivalent to ++ tsx_async_abort=full. On CPUs which are MDS affected ++ and deploy MDS mitigation, TAA mitigation is not ++ required and doesn't provide any additional ++ mitigation. 
++ ++ For details see: ++ Documentation/admin-guide/hw-vuln/tsx_async_abort.rst ++ + turbografx.map[2|3]= [HW,JOY] + TurboGraFX parallel port interface + Format: +diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt +index f6b1873f68ab..de583fbbfe42 100644 +--- a/Documentation/scheduler/sched-bwc.txt ++++ b/Documentation/scheduler/sched-bwc.txt +@@ -90,6 +90,51 @@ There are two ways in which a group may become throttled: + In case b) above, even though the child may have runtime remaining it will not + be allowed to until the parent's runtime is refreshed. + ++CFS Bandwidth Quota Caveats ++--------------------------- ++Once a slice is assigned to a cpu it does not expire. However all but 1ms of ++the slice may be returned to the global pool if all threads on that cpu become ++unrunnable. This is configured at compile time by the min_cfs_rq_runtime ++variable. This is a performance tweak that helps prevent added contention on ++the global lock. ++ ++The fact that cpu-local slices do not expire results in some interesting corner ++cases that should be understood. ++ ++For cgroup cpu constrained applications that are cpu limited this is a ++relatively moot point because they will naturally consume the entirety of their ++quota as well as the entirety of each cpu-local slice in each period. As a ++result it is expected that nr_periods roughly equal nr_throttled, and that ++cpuacct.usage will increase roughly equal to cfs_quota_us in each period. ++ ++For highly-threaded, non-cpu bound applications this non-expiration nuance ++allows applications to briefly burst past their quota limits by the amount of ++unused slice on each cpu that the task group is running on (typically at most ++1ms per cpu or as defined by min_cfs_rq_runtime). This slight burst only ++applies if quota had been assigned to a cpu and then not fully used or returned ++in previous periods. This burst amount will not be transferred between cores. ++As a result, this mechanism still strictly limits the task group to quota ++average usage, albeit over a longer time window than a single period. This ++also limits the burst ability to no more than 1ms per cpu. This provides ++better more predictable user experience for highly threaded applications with ++small quota limits on high core count machines. It also eliminates the ++propensity to throttle these applications while simultanously using less than ++quota amounts of cpu. Another way to say this, is that by allowing the unused ++portion of a slice to remain valid across periods we have decreased the ++possibility of wastefully expiring quota on cpu-local silos that don't need a ++full slice's amount of cpu time. ++ ++The interaction between cpu-bound and non-cpu-bound-interactive applications ++should also be considered, especially when single core usage hits 100%. If you ++gave each of these applications half of a cpu-core and they both got scheduled ++on the same CPU it is theoretically possible that the non-cpu bound application ++will use up to 1ms additional quota in some periods, thereby preventing the ++cpu-bound application from fully using its quota by that same amount. In these ++instances it will be up to the CFS algorithm (see sched-design-CFS.rst) to ++decide which application is chosen to run, as they will both be runnable and ++have remaining quota. This runtime discrepancy will be made up in the following ++periods when the interactive application idles. ++ + Examples + -------- + 1. Limit a group to 1 CPU worth of runtime. 
+diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt +index 1bb8bcaf8497..635cd6eaf714 100644 +--- a/Documentation/virtual/kvm/locking.txt ++++ b/Documentation/virtual/kvm/locking.txt +@@ -15,8 +15,6 @@ The acquisition orders for mutexes are as follows: + + On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock. + +-For spinlocks, kvm_lock is taken outside kvm->mmu_lock. +- + Everything else is a leaf: no other lock is taken inside the critical + sections. + +@@ -169,7 +167,7 @@ which time it will be set using the Dirty tracking mechanism described above. + ------------ + + Name: kvm_lock +-Type: spinlock_t ++Type: mutex + Arch: any + Protects: - vm_list + +diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst +index ef389dcf1b1d..0780d55c5aa8 100644 +--- a/Documentation/x86/index.rst ++++ b/Documentation/x86/index.rst +@@ -6,3 +6,4 @@ x86 architecture specifics + :maxdepth: 1 + + mds ++ tsx_async_abort +diff --git a/Documentation/x86/tsx_async_abort.rst b/Documentation/x86/tsx_async_abort.rst +new file mode 100644 +index 000000000000..583ddc185ba2 +--- /dev/null ++++ b/Documentation/x86/tsx_async_abort.rst +@@ -0,0 +1,117 @@ ++.. SPDX-License-Identifier: GPL-2.0 ++ ++TSX Async Abort (TAA) mitigation ++================================ ++ ++.. _tsx_async_abort: ++ ++Overview ++-------- ++ ++TSX Async Abort (TAA) is a side channel attack on internal buffers in some ++Intel processors similar to Microachitectural Data Sampling (MDS). In this ++case certain loads may speculatively pass invalid data to dependent operations ++when an asynchronous abort condition is pending in a Transactional ++Synchronization Extensions (TSX) transaction. This includes loads with no ++fault or assist condition. Such loads may speculatively expose stale data from ++the same uarch data structures as in MDS, with same scope of exposure i.e. ++same-thread and cross-thread. This issue affects all current processors that ++support TSX. ++ ++Mitigation strategy ++------------------- ++ ++a) TSX disable - one of the mitigations is to disable TSX. A new MSR ++IA32_TSX_CTRL will be available in future and current processors after ++microcode update which can be used to disable TSX. In addition, it ++controls the enumeration of the TSX feature bits (RTM and HLE) in CPUID. ++ ++b) Clear CPU buffers - similar to MDS, clearing the CPU buffers mitigates this ++vulnerability. More details on this approach can be found in ++:ref:`Documentation/admin-guide/hw-vuln/mds.rst <mds>`. ++ ++Kernel internal mitigation modes ++-------------------------------- ++ ++ ============= ============================================================ ++ off Mitigation is disabled. Either the CPU is not affected or ++ tsx_async_abort=off is supplied on the kernel command line. ++ ++ tsx disabled Mitigation is enabled. TSX feature is disabled by default at ++ bootup on processors that support TSX control. ++ ++ verw Mitigation is enabled. CPU is affected and MD_CLEAR is ++ advertised in CPUID. ++ ++ ucode needed Mitigation is enabled. CPU is affected and MD_CLEAR is not ++ advertised in CPUID. That is mainly for virtualization ++ scenarios where the host has the updated microcode but the ++ hypervisor does not expose MD_CLEAR in CPUID. It's a best ++ effort approach without guarantee. 
++ ============= ============================================================ ++ ++If the CPU is affected and the "tsx_async_abort" kernel command line parameter is ++not provided then the kernel selects an appropriate mitigation depending on the ++status of RTM and MD_CLEAR CPUID bits. ++ ++Below tables indicate the impact of tsx=on|off|auto cmdline options on state of ++TAA mitigation, VERW behavior and TSX feature for various combinations of ++MSR_IA32_ARCH_CAPABILITIES bits. ++ ++1. "tsx=off" ++ ++========= ========= ============ ============ ============== =================== ====================== ++MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=off ++---------------------------------- ------------------------------------------------------------------------- ++TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation ++ after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full ++========= ========= ============ ============ ============== =================== ====================== ++ 0 0 0 HW default Yes Same as MDS Same as MDS ++ 0 0 1 Invalid case Invalid case Invalid case Invalid case ++ 0 1 0 HW default No Need ucode update Need ucode update ++ 0 1 1 Disabled Yes TSX disabled TSX disabled ++ 1 X 1 Disabled X None needed None needed ++========= ========= ============ ============ ============== =================== ====================== ++ ++2. "tsx=on" ++ ++========= ========= ============ ============ ============== =================== ====================== ++MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=on ++---------------------------------- ------------------------------------------------------------------------- ++TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation ++ after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full ++========= ========= ============ ============ ============== =================== ====================== ++ 0 0 0 HW default Yes Same as MDS Same as MDS ++ 0 0 1 Invalid case Invalid case Invalid case Invalid case ++ 0 1 0 HW default No Need ucode update Need ucode update ++ 0 1 1 Enabled Yes None Same as MDS ++ 1 X 1 Enabled X None needed None needed ++========= ========= ============ ============ ============== =================== ====================== ++ ++3. "tsx=auto" ++ ++========= ========= ============ ============ ============== =================== ====================== ++MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=auto ++---------------------------------- ------------------------------------------------------------------------- ++TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation ++ after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full ++========= ========= ============ ============ ============== =================== ====================== ++ 0 0 0 HW default Yes Same as MDS Same as MDS ++ 0 0 1 Invalid case Invalid case Invalid case Invalid case ++ 0 1 0 HW default No Need ucode update Need ucode update ++ 0 1 1 Disabled Yes TSX disabled TSX disabled ++ 1 X 1 Enabled X None needed None needed ++========= ========= ============ ============ ============== =================== ====================== ++ ++In the tables, TSX_CTRL_MSR is a new bit in MSR_IA32_ARCH_CAPABILITIES that ++indicates whether MSR_IA32_TSX_CTRL is supported. 
++ ++There are two control bits in IA32_TSX_CTRL MSR: ++ ++ Bit 0: When set it disables the Restricted Transactional Memory (RTM) ++ sub-feature of TSX (will force all transactions to abort on the ++ XBEGIN instruction). ++ ++ Bit 1: When set it disables the enumeration of the RTM and HLE feature ++ (i.e. it will make CPUID(EAX=7).EBX{bit4} and ++ CPUID(EAX=7).EBX{bit11} read as 0). +diff --git a/Makefile b/Makefile +index c2c0cf2b1bd7..1ca0b8f37951 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,7 +1,7 @@ + # SPDX-License-Identifier: GPL-2.0 + VERSION = 4 + PATCHLEVEL = 19 +-SUBLEVEL = 83 ++SUBLEVEL = 84 + EXTRAVERSION = + NAME = "People's Front" + +diff --git a/arch/arm/mach-sunxi/mc_smp.c b/arch/arm/mach-sunxi/mc_smp.c +index b4037b603897..ff173e67eed2 100644 +--- a/arch/arm/mach-sunxi/mc_smp.c ++++ b/arch/arm/mach-sunxi/mc_smp.c +@@ -478,14 +478,18 @@ static void sunxi_mc_smp_cpu_die(unsigned int l_cpu) + static int sunxi_cpu_powerdown(unsigned int cpu, unsigned int cluster) + { + u32 reg; ++ int gating_bit = cpu; + + pr_debug("%s: cluster %u cpu %u\n", __func__, cluster, cpu); + if (cpu >= SUNXI_CPUS_PER_CLUSTER || cluster >= SUNXI_NR_CLUSTERS) + return -EINVAL; + ++ if (is_a83t && cpu == 0) ++ gating_bit = 4; ++ + /* gate processor power */ + reg = readl(prcm_base + PRCM_PWROFF_GATING_REG(cluster)); +- reg |= PRCM_PWROFF_GATING_REG_CORE(cpu); ++ reg |= PRCM_PWROFF_GATING_REG_CORE(gating_bit); + writel(reg, prcm_base + PRCM_PWROFF_GATING_REG(cluster)); + udelay(20); + +diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h +index 212a48826655..7ae553c15b9a 100644 +--- a/arch/arm64/include/asm/pgtable.h ++++ b/arch/arm64/include/asm/pgtable.h +@@ -274,23 +274,6 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, + set_pte(ptep, pte); + } + +-#define __HAVE_ARCH_PTE_SAME +-static inline int pte_same(pte_t pte_a, pte_t pte_b) +-{ +- pteval_t lhs, rhs; +- +- lhs = pte_val(pte_a); +- rhs = pte_val(pte_b); +- +- if (pte_present(pte_a)) +- lhs &= ~PTE_RDONLY; +- +- if (pte_present(pte_b)) +- rhs &= ~PTE_RDONLY; +- +- return (lhs == rhs); +-} +- + /* + * Huge pte definitions. + */ +diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c +index fac1d4eaa426..3c317bc6b799 100644 +--- a/arch/s390/kvm/kvm-s390.c ++++ b/arch/s390/kvm/kvm-s390.c +@@ -2110,13 +2110,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) + kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags); + if (!kvm->arch.sca) + goto out_err; +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + sca_offset += 16; + if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE) + sca_offset = 0; + kvm->arch.sca = (struct bsca_block *) + ((char *) kvm->arch.sca + sca_offset); +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + + sprintf(debug_name, "kvm-%u", current->pid); + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index e76d16ac2776..5726b264036f 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1903,6 +1903,51 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS + + If unsure, say y. + ++choice ++ prompt "TSX enable mode" ++ depends on CPU_SUP_INTEL ++ default X86_INTEL_TSX_MODE_OFF ++ help ++ Intel's TSX (Transactional Synchronization Extensions) feature ++ allows to optimize locking protocols through lock elision which ++ can lead to a noticeable performance boost. ++ ++ On the other hand it has been shown that TSX can be exploited ++ to form side channel attacks (e.g. 
TAA) and chances are there ++ will be more of those attacks discovered in the future. ++ ++ Therefore TSX is not enabled by default (aka tsx=off). An admin ++ might override this decision by tsx=on the command line parameter. ++ Even with TSX enabled, the kernel will attempt to enable the best ++ possible TAA mitigation setting depending on the microcode available ++ for the particular machine. ++ ++ This option allows to set the default tsx mode between tsx=on, =off ++ and =auto. See Documentation/admin-guide/kernel-parameters.txt for more ++ details. ++ ++ Say off if not sure, auto if TSX is in use but it should be used on safe ++ platforms or on if TSX is in use and the security aspect of tsx is not ++ relevant. ++ ++config X86_INTEL_TSX_MODE_OFF ++ bool "off" ++ help ++ TSX is disabled if possible - equals to tsx=off command line parameter. ++ ++config X86_INTEL_TSX_MODE_ON ++ bool "on" ++ help ++ TSX is always enabled on TSX capable HW - equals the tsx=on command ++ line parameter. ++ ++config X86_INTEL_TSX_MODE_AUTO ++ bool "auto" ++ help ++ TSX is enabled on TSX capable HW that is believed to be safe against ++ side channel attacks- equals the tsx=auto command line parameter. ++endchoice ++ + config EFI + bool "EFI runtime service support" + depends on ACPI +diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c +index 80c6d84cad67..07bf5517d9d8 100644 +--- a/arch/x86/events/amd/ibs.c ++++ b/arch/x86/events/amd/ibs.c +@@ -389,7 +389,8 @@ static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) + { + config &= ~perf_ibs->cnt_mask; +- wrmsrl(hwc->config_base, config); ++ if (boot_cpu_data.x86 == 0x10) ++ wrmsrl(hwc->config_base, config); + config &= ~perf_ibs->enable_mask; + wrmsrl(hwc->config_base, config); + } +@@ -564,7 +565,8 @@ static struct perf_ibs perf_ibs_op = { + }, + .msr = MSR_AMD64_IBSOPCTL, + .config_mask = IBS_OP_CONFIG_MASK, +- .cnt_mask = IBS_OP_MAX_CNT, ++ .cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT | ++ IBS_OP_CUR_CNT_RAND, + .enable_mask = IBS_OP_ENABLE, + .valid_mask = IBS_OP_VAL, + .max_period = IBS_OP_MAX_CNT << 4, +@@ -625,7 +627,7 @@ fail: + if (event->attr.sample_type & PERF_SAMPLE_RAW) + offset_max = perf_ibs->offset_max; + else if (check_rip) +- offset_max = 2; ++ offset_max = 3; + else + offset_max = 1; + do { +diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c +index 2690135bf83f..7098b9b05d56 100644 +--- a/arch/x86/events/intel/uncore.c ++++ b/arch/x86/events/intel/uncore.c +@@ -485,10 +485,8 @@ void uncore_pmu_event_start(struct perf_event *event, int flags) + local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); + uncore_enable_event(box, event); + +- if (box->n_active == 1) { +- uncore_enable_box(box); ++ if (box->n_active == 1) + uncore_pmu_start_hrtimer(box); +- } + } + + void uncore_pmu_event_stop(struct perf_event *event, int flags) +@@ -512,10 +510,8 @@ void uncore_pmu_event_stop(struct perf_event *event, int flags) + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + +- if (box->n_active == 0) { +- uncore_disable_box(box); ++ if (box->n_active == 0) + uncore_pmu_cancel_hrtimer(box); +- } + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { +@@ -769,6 +765,40 @@ static int uncore_pmu_event_init(struct perf_event *event) + return ret; + } + ++static void uncore_pmu_enable(struct pmu *pmu) ++{ ++ struct intel_uncore_pmu *uncore_pmu; ++ struct intel_uncore_box *box; ++ ++ uncore_pmu = 
container_of(pmu, struct intel_uncore_pmu, pmu); ++ if (!uncore_pmu) ++ return; ++ ++ box = uncore_pmu_to_box(uncore_pmu, smp_processor_id()); ++ if (!box) ++ return; ++ ++ if (uncore_pmu->type->ops->enable_box) ++ uncore_pmu->type->ops->enable_box(box); ++} ++ ++static void uncore_pmu_disable(struct pmu *pmu) ++{ ++ struct intel_uncore_pmu *uncore_pmu; ++ struct intel_uncore_box *box; ++ ++ uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu); ++ if (!uncore_pmu) ++ return; ++ ++ box = uncore_pmu_to_box(uncore_pmu, smp_processor_id()); ++ if (!box) ++ return; ++ ++ if (uncore_pmu->type->ops->disable_box) ++ uncore_pmu->type->ops->disable_box(box); ++} ++ + static ssize_t uncore_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -794,6 +824,8 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) + pmu->pmu = (struct pmu) { + .attr_groups = pmu->type->attr_groups, + .task_ctx_nr = perf_invalid_context, ++ .pmu_enable = uncore_pmu_enable, ++ .pmu_disable = uncore_pmu_disable, + .event_init = uncore_pmu_event_init, + .add = uncore_pmu_event_add, + .del = uncore_pmu_event_del, +diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h +index 42fa3974c421..40e040ec31b5 100644 +--- a/arch/x86/events/intel/uncore.h ++++ b/arch/x86/events/intel/uncore.h +@@ -412,18 +412,6 @@ static inline int uncore_freerunning_hw_config(struct intel_uncore_box *box, + return -EINVAL; + } + +-static inline void uncore_disable_box(struct intel_uncore_box *box) +-{ +- if (box->pmu->type->ops->disable_box) +- box->pmu->type->ops->disable_box(box); +-} +- +-static inline void uncore_enable_box(struct intel_uncore_box *box) +-{ +- if (box->pmu->type->ops->enable_box) +- box->pmu->type->ops->enable_box(box); +-} +- + static inline void uncore_disable_event(struct intel_uncore_box *box, + struct perf_event *event) + { +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 759f0a176612..8c13b99b9507 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -389,5 +389,7 @@ + #define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ + #define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ + #define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ ++#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ ++#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ + + #endif /* _ASM_X86_CPUFEATURES_H */ +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 0d3f5cf3ff3e..155be8adb934 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -281,6 +281,7 @@ struct kvm_rmap_head { + struct kvm_mmu_page { + struct list_head link; + struct hlist_node hash_link; ++ struct list_head lpage_disallowed_link; + + /* + * The following two entries are used to key the shadow page in the +@@ -293,6 +294,7 @@ struct kvm_mmu_page { + /* hold the gfn of each spte inside spt */ + gfn_t *gfns; + bool unsync; ++ bool lpage_disallowed; /* Can't be replaced by an equiv large page */ + int root_count; /* Currently serving as active root */ + unsigned int unsync_children; + struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ +@@ -807,6 +809,7 @@ struct kvm_arch { + */ + struct list_head active_mmu_pages; + struct list_head zapped_obsolete_pages; ++ 
struct list_head lpage_disallowed_mmu_pages; + struct kvm_page_track_notifier_node mmu_sp_tracker; + struct kvm_page_track_notifier_head track_notifier_head; + +@@ -877,6 +880,8 @@ struct kvm_arch { + bool x2apic_broadcast_quirk_disabled; + + bool guest_can_read_msr_platform_info; ++ ++ struct task_struct *nx_lpage_recovery_thread; + }; + + struct kvm_vm_stat { +@@ -890,6 +895,7 @@ struct kvm_vm_stat { + ulong mmu_unsync; + ulong remote_tlb_flush; + ulong lpages; ++ ulong nx_lpage_splits; + ulong max_mmu_page_hash_collisions; + }; + +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index a1d22e4428f6..0f4feee6d082 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -84,6 +84,18 @@ + * Microarchitectural Data + * Sampling (MDS) vulnerabilities. + */ ++#define ARCH_CAP_PSCHANGE_MC_NO BIT(6) /* ++ * The processor is not susceptible to a ++ * machine check error due to modifying the ++ * code page size along with either the ++ * physical address or cache type ++ * without TLB invalidation. ++ */ ++#define ARCH_CAP_TSX_CTRL_MSR BIT(7) /* MSR for TSX control is available. */ ++#define ARCH_CAP_TAA_NO BIT(8) /* ++ * Not susceptible to ++ * TSX Async Abort (TAA) vulnerabilities. ++ */ + + #define MSR_IA32_FLUSH_CMD 0x0000010b + #define L1D_FLUSH BIT(0) /* +@@ -94,6 +106,10 @@ + #define MSR_IA32_BBL_CR_CTL 0x00000119 + #define MSR_IA32_BBL_CR_CTL3 0x0000011e + ++#define MSR_IA32_TSX_CTRL 0x00000122 ++#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */ ++#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */ ++ + #define MSR_IA32_SYSENTER_CS 0x00000174 + #define MSR_IA32_SYSENTER_ESP 0x00000175 + #define MSR_IA32_SYSENTER_EIP 0x00000176 +diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h +index 28cb2b31527a..09c7466c4880 100644 +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -323,7 +323,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear); + #include <asm/segment.h> + + /** +- * mds_clear_cpu_buffers - Mitigation for MDS vulnerability ++ * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability + * + * This uses the otherwise unused and obsolete VERW instruction in + * combination with microcode which triggers a CPU buffer flush when the +@@ -346,7 +346,7 @@ static inline void mds_clear_cpu_buffers(void) + } + + /** +- * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability ++ * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability + * + * Clear CPU buffers if the corresponding static key is enabled + */ +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index b54f25697beb..efb44bd3a714 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -1003,4 +1003,11 @@ enum mds_mitigations { + MDS_MITIGATION_VMWERV, + }; + ++enum taa_mitigations { ++ TAA_MITIGATION_OFF, ++ TAA_MITIGATION_UCODE_NEEDED, ++ TAA_MITIGATION_VERW, ++ TAA_MITIGATION_TSX_DISABLED, ++}; ++ + #endif /* _ASM_X86_PROCESSOR_H */ +diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c +index dfdd1caf0d55..1ca76ca944ba 100644 +--- a/arch/x86/kernel/apic/apic.c ++++ b/arch/x86/kernel/apic/apic.c +@@ -1528,9 +1528,6 @@ static void setup_local_APIC(void) + { + int cpu = smp_processor_id(); + unsigned int value; +-#ifdef CONFIG_X86_32 +- int logical_apicid, ldr_apicid; +-#endif + + + if (disable_apic) { +@@ -1571,16 +1568,21 @@ static void setup_local_APIC(void) + 
apic->init_apic_ldr(); + + #ifdef CONFIG_X86_32 +- /* +- * APIC LDR is initialized. If logical_apicid mapping was +- * initialized during get_smp_config(), make sure it matches the +- * actual value. +- */ +- logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); +- ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); +- WARN_ON(logical_apicid != BAD_APICID && logical_apicid != ldr_apicid); +- /* always use the value from LDR */ +- early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; ++ if (apic->dest_logical) { ++ int logical_apicid, ldr_apicid; ++ ++ /* ++ * APIC LDR is initialized. If logical_apicid mapping was ++ * initialized during get_smp_config(), make sure it matches ++ * the actual value. ++ */ ++ logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); ++ ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); ++ if (logical_apicid != BAD_APICID) ++ WARN_ON(logical_apicid != ldr_apicid); ++ /* Always use the value from LDR. */ ++ early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; ++ } + #endif + + /* +diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile +index 347137e80bf5..320769b4807b 100644 +--- a/arch/x86/kernel/cpu/Makefile ++++ b/arch/x86/kernel/cpu/Makefile +@@ -28,7 +28,7 @@ obj-y += cpuid-deps.o + obj-$(CONFIG_PROC_FS) += proc.o + obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o + +-obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o ++obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o tsx.o + obj-$(CONFIG_CPU_SUP_AMD) += amd.o + obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o + obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index ee7d17611ead..4684ad7ba793 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -32,11 +32,14 @@ + #include <asm/e820/api.h> + #include <asm/hypervisor.h> + ++#include "cpu.h" ++ + static void __init spectre_v1_select_mitigation(void); + static void __init spectre_v2_select_mitigation(void); + static void __init ssb_select_mitigation(void); + static void __init l1tf_select_mitigation(void); + static void __init mds_select_mitigation(void); ++static void __init taa_select_mitigation(void); + + /* The base value of the SPEC_CTRL MSR that always has to be preserved. 
*/ + u64 x86_spec_ctrl_base; +@@ -103,6 +106,7 @@ void __init check_bugs(void) + ssb_select_mitigation(); + l1tf_select_mitigation(); + mds_select_mitigation(); ++ taa_select_mitigation(); + + arch_smt_update(); + +@@ -266,6 +270,100 @@ static int __init mds_cmdline(char *str) + } + early_param("mds", mds_cmdline); + ++#undef pr_fmt ++#define pr_fmt(fmt) "TAA: " fmt ++ ++/* Default mitigation for TAA-affected CPUs */ ++static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; ++static bool taa_nosmt __ro_after_init; ++ ++static const char * const taa_strings[] = { ++ [TAA_MITIGATION_OFF] = "Vulnerable", ++ [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", ++ [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", ++ [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", ++}; ++ ++static void __init taa_select_mitigation(void) ++{ ++ u64 ia32_cap; ++ ++ if (!boot_cpu_has_bug(X86_BUG_TAA)) { ++ taa_mitigation = TAA_MITIGATION_OFF; ++ return; ++ } ++ ++ /* TSX previously disabled by tsx=off */ ++ if (!boot_cpu_has(X86_FEATURE_RTM)) { ++ taa_mitigation = TAA_MITIGATION_TSX_DISABLED; ++ goto out; ++ } ++ ++ if (cpu_mitigations_off()) { ++ taa_mitigation = TAA_MITIGATION_OFF; ++ return; ++ } ++ ++ /* TAA mitigation is turned off on the cmdline (tsx_async_abort=off) */ ++ if (taa_mitigation == TAA_MITIGATION_OFF) ++ goto out; ++ ++ if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) ++ taa_mitigation = TAA_MITIGATION_VERW; ++ else ++ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; ++ ++ /* ++ * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. ++ * A microcode update fixes this behavior to clear CPU buffers. It also ++ * adds support for MSR_IA32_TSX_CTRL which is enumerated by the ++ * ARCH_CAP_TSX_CTRL_MSR bit. ++ * ++ * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode ++ * update is required. ++ */ ++ ia32_cap = x86_read_arch_cap_msr(); ++ if ( (ia32_cap & ARCH_CAP_MDS_NO) && ++ !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) ++ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; ++ ++ /* ++ * TSX is enabled, select alternate mitigation for TAA which is ++ * the same as MDS. Enable MDS static branch to clear CPU buffers. ++ * ++ * For guests that can't determine whether the correct microcode is ++ * present on host, enable the mitigation for UCODE_NEEDED as well. ++ */ ++ static_branch_enable(&mds_user_clear); ++ ++ if (taa_nosmt || cpu_mitigations_auto_nosmt()) ++ cpu_smt_disable(false); ++ ++out: ++ pr_info("%s\n", taa_strings[taa_mitigation]); ++} ++ ++static int __init tsx_async_abort_parse_cmdline(char *str) ++{ ++ if (!boot_cpu_has_bug(X86_BUG_TAA)) ++ return 0; ++ ++ if (!str) ++ return -EINVAL; ++ ++ if (!strcmp(str, "off")) { ++ taa_mitigation = TAA_MITIGATION_OFF; ++ } else if (!strcmp(str, "full")) { ++ taa_mitigation = TAA_MITIGATION_VERW; ++ } else if (!strcmp(str, "full,nosmt")) { ++ taa_mitigation = TAA_MITIGATION_VERW; ++ taa_nosmt = true; ++ } ++ ++ return 0; ++} ++early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); ++ + #undef pr_fmt + #define pr_fmt(fmt) "Spectre V1 : " fmt + +@@ -772,13 +870,10 @@ static void update_mds_branch_idle(void) + } + + #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" ++#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. 
See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" + + void arch_smt_update(void) + { +- /* Enhanced IBRS implies STIBP. No update required. */ +- if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) +- return; +- + mutex_lock(&spec_ctrl_mutex); + + switch (spectre_v2_user) { +@@ -804,6 +899,17 @@ void arch_smt_update(void) + break; + } + ++ switch (taa_mitigation) { ++ case TAA_MITIGATION_VERW: ++ case TAA_MITIGATION_UCODE_NEEDED: ++ if (sched_smt_active()) ++ pr_warn_once(TAA_MSG_SMT); ++ break; ++ case TAA_MITIGATION_TSX_DISABLED: ++ case TAA_MITIGATION_OFF: ++ break; ++ } ++ + mutex_unlock(&spec_ctrl_mutex); + } + +@@ -1119,6 +1225,9 @@ void x86_spec_ctrl_setup_ap(void) + x86_amd_ssb_disable(); + } + ++bool itlb_multihit_kvm_mitigation; ++EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); ++ + #undef pr_fmt + #define pr_fmt(fmt) "L1TF: " fmt + +@@ -1274,11 +1383,24 @@ static ssize_t l1tf_show_state(char *buf) + l1tf_vmx_states[l1tf_vmx_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); + } ++ ++static ssize_t itlb_multihit_show_state(char *buf) ++{ ++ if (itlb_multihit_kvm_mitigation) ++ return sprintf(buf, "KVM: Mitigation: Split huge pages\n"); ++ else ++ return sprintf(buf, "KVM: Vulnerable\n"); ++} + #else + static ssize_t l1tf_show_state(char *buf) + { + return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); + } ++ ++static ssize_t itlb_multihit_show_state(char *buf) ++{ ++ return sprintf(buf, "Processor vulnerable\n"); ++} + #endif + + static ssize_t mds_show_state(char *buf) +@@ -1298,6 +1420,21 @@ static ssize_t mds_show_state(char *buf) + sched_smt_active() ? "vulnerable" : "disabled"); + } + ++static ssize_t tsx_async_abort_show_state(char *buf) ++{ ++ if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || ++ (taa_mitigation == TAA_MITIGATION_OFF)) ++ return sprintf(buf, "%s\n", taa_strings[taa_mitigation]); ++ ++ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { ++ return sprintf(buf, "%s; SMT Host state unknown\n", ++ taa_strings[taa_mitigation]); ++ } ++ ++ return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], ++ sched_smt_active() ? 
"vulnerable" : "disabled"); ++} ++ + static char *stibp_state(void) + { + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) +@@ -1366,6 +1503,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr + case X86_BUG_MDS: + return mds_show_state(buf); + ++ case X86_BUG_TAA: ++ return tsx_async_abort_show_state(buf); ++ ++ case X86_BUG_ITLB_MULTIHIT: ++ return itlb_multihit_show_state(buf); ++ + default: + break; + } +@@ -1402,4 +1545,14 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu + { + return cpu_show_common(dev, attr, buf, X86_BUG_MDS); + } ++ ++ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ return cpu_show_common(dev, attr, buf, X86_BUG_TAA); ++} ++ ++ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); ++} + #endif +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index b33fdfa0ff49..1e07814f02bc 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -946,13 +946,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) + #endif + } + +-#define NO_SPECULATION BIT(0) +-#define NO_MELTDOWN BIT(1) +-#define NO_SSB BIT(2) +-#define NO_L1TF BIT(3) +-#define NO_MDS BIT(4) +-#define MSBDS_ONLY BIT(5) +-#define NO_SWAPGS BIT(6) ++#define NO_SPECULATION BIT(0) ++#define NO_MELTDOWN BIT(1) ++#define NO_SSB BIT(2) ++#define NO_L1TF BIT(3) ++#define NO_MDS BIT(4) ++#define MSBDS_ONLY BIT(5) ++#define NO_SWAPGS BIT(6) ++#define NO_ITLB_MULTIHIT BIT(7) + + #define VULNWL(_vendor, _family, _model, _whitelist) \ + { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } +@@ -970,26 +971,26 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { + VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), + + /* Intel Family 6 */ +- VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), +- VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), +- VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), +- VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), +- VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), +- +- VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), +- VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), +- VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), +- VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), +- VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), +- VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), ++ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), ++ ++ VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | 
MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(CORE_YONAH, NO_SSB), + +- VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS), ++ VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + +- VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS), +- VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS), +- VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS), ++ VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + + /* + * Technically, swapgs isn't serializing on AMD (despite it previously +@@ -999,14 +1000,16 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { + * good enough for our purposes. + */ + ++ VULNWL_INTEL(ATOM_TREMONT_X, NO_ITLB_MULTIHIT), ++ + /* AMD Family 0xf - 0x12 */ +- VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), +- VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), +- VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), +- VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), ++ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + + /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ +- VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), ++ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + {} + }; + +@@ -1017,19 +1020,30 @@ static bool __init cpu_matches(unsigned long which) + return m && !!(m->driver_data & which); + } + +-static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) ++u64 x86_read_arch_cap_msr(void) + { + u64 ia32_cap = 0; + ++ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) ++ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); ++ ++ return ia32_cap; ++} ++ ++static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) ++{ ++ u64 ia32_cap = x86_read_arch_cap_msr(); ++ ++ /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ ++ if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) ++ setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); ++ + if (cpu_matches(NO_SPECULATION)) + return; + + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + +- if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) +- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); +- + if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && + !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) + setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); +@@ -1046,6 +1060,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) + if (!cpu_matches(NO_SWAPGS)) + setup_force_cpu_bug(X86_BUG_SWAPGS); + ++ /* ++ * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when: ++ * - TSX is supported or ++ * - TSX_CTRL is present ++ * ++ * TSX_CTRL check is needed for cases when TSX could be disabled before ++ * the kernel boot e.g. kexec. ++ * TSX_CTRL check alone is not sufficient for cases when the microcode ++ * update is not present or running as guest that don't get TSX_CTRL. 
++ */ ++ if (!(ia32_cap & ARCH_CAP_TAA_NO) && ++ (cpu_has(c, X86_FEATURE_RTM) || ++ (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) ++ setup_force_cpu_bug(X86_BUG_TAA); ++ + if (cpu_matches(NO_MELTDOWN)) + return; + +@@ -1475,6 +1504,7 @@ void __init identify_boot_cpu(void) + enable_sep_cpu(); + #endif + cpu_detect_tlb(&boot_cpu_data); ++ tsx_init(); + } + + void identify_secondary_cpu(struct cpuinfo_x86 *c) +diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h +index 7b229afa0a37..236582c90d3f 100644 +--- a/arch/x86/kernel/cpu/cpu.h ++++ b/arch/x86/kernel/cpu/cpu.h +@@ -45,6 +45,22 @@ struct _tlb_table { + extern const struct cpu_dev *const __x86_cpu_dev_start[], + *const __x86_cpu_dev_end[]; + ++#ifdef CONFIG_CPU_SUP_INTEL ++enum tsx_ctrl_states { ++ TSX_CTRL_ENABLE, ++ TSX_CTRL_DISABLE, ++ TSX_CTRL_NOT_SUPPORTED, ++}; ++ ++extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; ++ ++extern void __init tsx_init(void); ++extern void tsx_enable(void); ++extern void tsx_disable(void); ++#else ++static inline void tsx_init(void) { } ++#endif /* CONFIG_CPU_SUP_INTEL */ ++ + extern void get_cpu_cap(struct cpuinfo_x86 *c); + extern void get_cpu_address_sizes(struct cpuinfo_x86 *c); + extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); +@@ -65,4 +81,6 @@ unsigned int aperfmperf_get_khz(int cpu); + + extern void x86_spec_ctrl_setup_ap(void); + ++extern u64 x86_read_arch_cap_msr(void); ++ + #endif /* ARCH_X86_CPU_H */ +diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c +index fc3c07fe7df5..a5287b18a63f 100644 +--- a/arch/x86/kernel/cpu/intel.c ++++ b/arch/x86/kernel/cpu/intel.c +@@ -766,6 +766,11 @@ static void init_intel(struct cpuinfo_x86 *c) + init_intel_energy_perf(c); + + init_intel_misc_features(c); ++ ++ if (tsx_ctrl_state == TSX_CTRL_ENABLE) ++ tsx_enable(); ++ if (tsx_ctrl_state == TSX_CTRL_DISABLE) ++ tsx_disable(); + } + + #ifdef CONFIG_X86_32 +diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c +new file mode 100644 +index 000000000000..3e20d322bc98 +--- /dev/null ++++ b/arch/x86/kernel/cpu/tsx.c +@@ -0,0 +1,140 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Intel Transactional Synchronization Extensions (TSX) control. ++ * ++ * Copyright (C) 2019 Intel Corporation ++ * ++ * Author: ++ * Pawan Gupta <pawan.kumar.gupta@linux.intel.com> ++ */ ++ ++#include <linux/cpufeature.h> ++ ++#include <asm/cmdline.h> ++ ++#include "cpu.h" ++ ++enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; ++ ++void tsx_disable(void) ++{ ++ u64 tsx; ++ ++ rdmsrl(MSR_IA32_TSX_CTRL, tsx); ++ ++ /* Force all transactions to immediately abort */ ++ tsx |= TSX_CTRL_RTM_DISABLE; ++ ++ /* ++ * Ensure TSX support is not enumerated in CPUID. ++ * This is visible to userspace and will ensure they ++ * do not waste resources trying TSX transactions that ++ * will always abort. ++ */ ++ tsx |= TSX_CTRL_CPUID_CLEAR; ++ ++ wrmsrl(MSR_IA32_TSX_CTRL, tsx); ++} ++ ++void tsx_enable(void) ++{ ++ u64 tsx; ++ ++ rdmsrl(MSR_IA32_TSX_CTRL, tsx); ++ ++ /* Enable the RTM feature in the cpu */ ++ tsx &= ~TSX_CTRL_RTM_DISABLE; ++ ++ /* ++ * Ensure TSX support is enumerated in CPUID. ++ * This is visible to userspace and will ensure they ++ * can enumerate and use the TSX feature. ++ */ ++ tsx &= ~TSX_CTRL_CPUID_CLEAR; ++ ++ wrmsrl(MSR_IA32_TSX_CTRL, tsx); ++} ++ ++static bool __init tsx_ctrl_is_supported(void) ++{ ++ u64 ia32_cap = x86_read_arch_cap_msr(); ++ ++ /* ++ * TSX is controlled via MSR_IA32_TSX_CTRL. 
However, support for this ++ * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. ++ * ++ * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a ++ * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES ++ * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get ++ * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, ++ * tsx= cmdline requests will do nothing on CPUs without ++ * MSR_IA32_TSX_CTRL support. ++ */ ++ return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); ++} ++ ++static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) ++{ ++ if (boot_cpu_has_bug(X86_BUG_TAA)) ++ return TSX_CTRL_DISABLE; ++ ++ return TSX_CTRL_ENABLE; ++} ++ ++void __init tsx_init(void) ++{ ++ char arg[5] = {}; ++ int ret; ++ ++ if (!tsx_ctrl_is_supported()) ++ return; ++ ++ ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); ++ if (ret >= 0) { ++ if (!strcmp(arg, "on")) { ++ tsx_ctrl_state = TSX_CTRL_ENABLE; ++ } else if (!strcmp(arg, "off")) { ++ tsx_ctrl_state = TSX_CTRL_DISABLE; ++ } else if (!strcmp(arg, "auto")) { ++ tsx_ctrl_state = x86_get_tsx_auto_mode(); ++ } else { ++ tsx_ctrl_state = TSX_CTRL_DISABLE; ++ pr_err("tsx: invalid option, defaulting to off\n"); ++ } ++ } else { ++ /* tsx= not provided */ ++ if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO)) ++ tsx_ctrl_state = x86_get_tsx_auto_mode(); ++ else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF)) ++ tsx_ctrl_state = TSX_CTRL_DISABLE; ++ else ++ tsx_ctrl_state = TSX_CTRL_ENABLE; ++ } ++ ++ if (tsx_ctrl_state == TSX_CTRL_DISABLE) { ++ tsx_disable(); ++ ++ /* ++ * tsx_disable() will change the state of the ++ * RTM CPUID bit. Clear it here since it is now ++ * expected to be not set. ++ */ ++ setup_clear_cpu_cap(X86_FEATURE_RTM); ++ } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) { ++ ++ /* ++ * HW defaults TSX to be enabled at bootup. ++ * We may still need the TSX enable support ++ * during init for special cases like ++ * kexec after TSX is disabled. ++ */ ++ tsx_enable(); ++ ++ /* ++ * tsx_enable() will change the state of the ++ * RTM CPUID bit. Force it here since it is now ++ * expected to be set. ++ */ ++ setup_force_cpu_cap(X86_FEATURE_RTM); ++ } ++} +diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c +index b810102a9cfa..970e261ef3e8 100644 +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -501,8 +501,16 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + /* PKU is not yet implemented for shadow paging. */ + if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) + entry->ecx &= ~F(PKU); ++ + entry->edx &= kvm_cpuid_7_0_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_7_EDX); ++ if (boot_cpu_has(X86_FEATURE_IBPB) && ++ boot_cpu_has(X86_FEATURE_IBRS)) ++ entry->edx |= F(SPEC_CTRL); ++ if (boot_cpu_has(X86_FEATURE_STIBP)) ++ entry->edx |= F(INTEL_STIBP); ++ if (boot_cpu_has(X86_FEATURE_SSBD)) ++ entry->edx |= F(SPEC_CTRL_SSBD); + /* + * We emulate ARCH_CAPABILITIES in software even + * if the host doesn't support it. 
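The hunks above add the tsx= and tsx_async_abort= boot parameters and the tsx_async_abort_show_state() / itlb_multihit_show_state() helpers in bugs.c; the matching sysfs attributes are wired up in drivers/base/cpu.c further down in this patch. Purely as an illustrative sketch, not part of the upstream patch, and assuming the usual /sys/devices/system/cpu/vulnerabilities/ layout (the show() helper below is made up for the example; the file names match the new DEVICE_ATTR entries added later in the patch), a quick user-space check of the new files could look like:

/*
 * Illustrative sketch only, not part of the 4.19.84 patch: read the
 * vulnerability files backed by tsx_async_abort_show_state() and
 * itlb_multihit_show_state() and print whatever the kernel reports.
 */
#include <stdio.h>

static void show(const char *name)
{
	char path[256];
	char line[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/vulnerabilities/%s", name);

	f = fopen(path, "r");
	if (!f) {
		/* Older kernel without this patch, or file not exposed. */
		printf("%-16s: (not present)\n", name);
		return;
	}

	if (fgets(line, sizeof(line), f))
		printf("%-16s: %s", name, line);
	fclose(f);
}

int main(void)
{
	show("tsx_async_abort");
	show("itlb_multihit");
	return 0;
}

On a TAA-affected CPU booted with tsx=off this is expected to print "Mitigation: TSX disabled" for the first file, matching taa_strings[] above, while unaffected hardware falls back to the weak cpu_show_*() handlers and reports "Not affected".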
+diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c +index 88940261fb53..d7db7608de5f 100644 +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -40,6 +40,7 @@ + #include <linux/uaccess.h> + #include <linux/hash.h> + #include <linux/kern_levels.h> ++#include <linux/kthread.h> + + #include <asm/page.h> + #include <asm/pat.h> +@@ -49,6 +50,30 @@ + #include <asm/kvm_page_track.h> + #include "trace.h" + ++extern bool itlb_multihit_kvm_mitigation; ++ ++static int __read_mostly nx_huge_pages = -1; ++static uint __read_mostly nx_huge_pages_recovery_ratio = 60; ++ ++static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); ++static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); ++ ++static struct kernel_param_ops nx_huge_pages_ops = { ++ .set = set_nx_huge_pages, ++ .get = param_get_bool, ++}; ++ ++static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { ++ .set = set_nx_huge_pages_recovery_ratio, ++ .get = param_get_uint, ++}; ++ ++module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); ++__MODULE_PARM_TYPE(nx_huge_pages, "bool"); ++module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, ++ &nx_huge_pages_recovery_ratio, 0644); ++__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); ++ + /* + * When setting this variable to true it enables Two-Dimensional-Paging + * where the hardware walks 2 page tables: +@@ -140,9 +165,6 @@ module_param(dbg, bool, 0644); + + #include <trace/events/kvm.h> + +-#define CREATE_TRACE_POINTS +-#include "mmutrace.h" +- + #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) + #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) + +@@ -261,9 +283,14 @@ static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; + + + static void mmu_spte_set(u64 *sptep, u64 spte); ++static bool is_executable_pte(u64 spte); + static union kvm_mmu_page_role + kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); + ++#define CREATE_TRACE_POINTS ++#include "mmutrace.h" ++ ++ + void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) + { + BUG_ON((mmio_mask & mmio_value) != mmio_value); +@@ -283,6 +310,11 @@ static inline bool spte_ad_enabled(u64 spte) + return !(spte & shadow_acc_track_value); + } + ++static bool is_nx_huge_page_enabled(void) ++{ ++ return READ_ONCE(nx_huge_pages); ++} ++ + static inline u64 spte_shadow_accessed_mask(u64 spte) + { + MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); +@@ -1027,10 +1059,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) + + static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) + { +- if (sp->role.direct) +- BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); +- else ++ if (!sp->role.direct) { + sp->gfns[index] = gfn; ++ return; ++ } ++ ++ if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) ++ pr_err_ratelimited("gfn mismatch under direct page %llx " ++ "(expected %llx, got %llx)\n", ++ sp->gfn, ++ kvm_mmu_page_get_gfn(sp, index), gfn); + } + + /* +@@ -1089,6 +1127,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) + kvm_mmu_gfn_disallow_lpage(slot, gfn); + } + ++static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) ++{ ++ if (sp->lpage_disallowed) ++ return; ++ ++ ++kvm->stat.nx_lpage_splits; ++ list_add_tail(&sp->lpage_disallowed_link, ++ &kvm->arch.lpage_disallowed_mmu_pages); ++ sp->lpage_disallowed = true; ++} ++ + static void unaccount_shadowed(struct kvm *kvm, struct 
kvm_mmu_page *sp) + { + struct kvm_memslots *slots; +@@ -1106,6 +1155,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) + kvm_mmu_gfn_allow_lpage(slot, gfn); + } + ++static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) ++{ ++ --kvm->stat.nx_lpage_splits; ++ sp->lpage_disallowed = false; ++ list_del(&sp->lpage_disallowed_link); ++} ++ + static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, + struct kvm_memory_slot *slot) + { +@@ -2658,6 +2714,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + kvm_reload_remote_mmus(kvm); + } + ++ if (sp->lpage_disallowed) ++ unaccount_huge_nx_page(kvm, sp); ++ + sp->role.invalid = 1; + return ret; + } +@@ -2866,6 +2925,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, + if (!speculative) + spte |= spte_shadow_accessed_mask(spte); + ++ if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) && ++ is_nx_huge_page_enabled()) { ++ pte_access &= ~ACC_EXEC_MASK; ++ } ++ + if (pte_access & ACC_EXEC_MASK) + spte |= shadow_x_mask; + else +@@ -2986,10 +3050,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, + ret = RET_PF_EMULATE; + + pgprintk("%s: setting spte %llx\n", __func__, *sptep); +- pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", +- is_large_pte(*sptep)? "2MB" : "4kB", +- *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn, +- *sptep, sptep); ++ trace_kvm_mmu_set_spte(level, gfn, sptep); + if (!was_rmapped && is_large_pte(*sptep)) + ++vcpu->kvm->stat.lpages; + +@@ -3001,8 +3062,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, + } + } + +- kvm_release_pfn_clean(pfn); +- + return ret; + } + +@@ -3037,9 +3096,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, + if (ret <= 0) + return -1; + +- for (i = 0; i < ret; i++, gfn++, start++) ++ for (i = 0; i < ret; i++, gfn++, start++) { + mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, + page_to_pfn(pages[i]), true, true); ++ put_page(pages[i]); ++ } + + return 0; + } +@@ -3087,40 +3148,71 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) + __direct_pte_prefetch(vcpu, sp, sptep); + } + +-static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, +- int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) ++static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, ++ gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) + { +- struct kvm_shadow_walk_iterator iterator; ++ int level = *levelp; ++ u64 spte = *it.sptep; ++ ++ if (it.level == level && level > PT_PAGE_TABLE_LEVEL && ++ is_nx_huge_page_enabled() && ++ is_shadow_present_pte(spte) && ++ !is_large_pte(spte)) { ++ /* ++ * A small SPTE exists for this pfn, but FNAME(fetch) ++ * and __direct_map would like to create a large PTE ++ * instead: just force them to go down another level, ++ * patching back for them into pfn the next 9 bits of ++ * the address. 
++ */ ++ u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); ++ *pfnp |= gfn & page_mask; ++ (*levelp)--; ++ } ++} ++ ++static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, ++ int map_writable, int level, kvm_pfn_t pfn, ++ bool prefault, bool lpage_disallowed) ++{ ++ struct kvm_shadow_walk_iterator it; + struct kvm_mmu_page *sp; +- int emulate = 0; +- gfn_t pseudo_gfn; ++ int ret; ++ gfn_t gfn = gpa >> PAGE_SHIFT; ++ gfn_t base_gfn = gfn; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) +- return 0; ++ return RET_PF_RETRY; + +- for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { +- if (iterator.level == level) { +- emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, +- write, level, gfn, pfn, prefault, +- map_writable); +- direct_pte_prefetch(vcpu, iterator.sptep); +- ++vcpu->stat.pf_fixed; +- break; +- } ++ trace_kvm_mmu_spte_requested(gpa, level, pfn); ++ for_each_shadow_entry(vcpu, gpa, it) { ++ /* ++ * We cannot overwrite existing page tables with an NX ++ * large page, as the leaf could be executable. ++ */ ++ disallowed_hugepage_adjust(it, gfn, &pfn, &level); + +- drop_large_spte(vcpu, iterator.sptep); +- if (!is_shadow_present_pte(*iterator.sptep)) { +- u64 base_addr = iterator.addr; ++ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); ++ if (it.level == level) ++ break; + +- base_addr &= PT64_LVL_ADDR_MASK(iterator.level); +- pseudo_gfn = base_addr >> PAGE_SHIFT; +- sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, +- iterator.level - 1, 1, ACC_ALL); ++ drop_large_spte(vcpu, it.sptep); ++ if (!is_shadow_present_pte(*it.sptep)) { ++ sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, ++ it.level - 1, true, ACC_ALL); + +- link_shadow_page(vcpu, iterator.sptep, sp); ++ link_shadow_page(vcpu, it.sptep, sp); ++ if (lpage_disallowed) ++ account_huge_nx_page(vcpu->kvm, sp); + } + } +- return emulate; ++ ++ ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, ++ write, level, base_gfn, pfn, prefault, ++ map_writable); ++ direct_pte_prefetch(vcpu, it.sptep); ++ ++vcpu->stat.pf_fixed; ++ return ret; + } + + static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) +@@ -3156,11 +3248,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) + } + + static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, +- gfn_t *gfnp, kvm_pfn_t *pfnp, ++ gfn_t gfn, kvm_pfn_t *pfnp, + int *levelp) + { + kvm_pfn_t pfn = *pfnp; +- gfn_t gfn = *gfnp; + int level = *levelp; + + /* +@@ -3187,8 +3278,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, + mask = KVM_PAGES_PER_HPAGE(level) - 1; + VM_BUG_ON((gfn & mask) != (pfn & mask)); + if (pfn & mask) { +- gfn &= ~mask; +- *gfnp = gfn; + kvm_release_pfn_clean(pfn); + pfn &= ~mask; + kvm_get_pfn(pfn); +@@ -3415,11 +3504,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, + { + int r; + int level; +- bool force_pt_level = false; ++ bool force_pt_level; + kvm_pfn_t pfn; + unsigned long mmu_seq; + bool map_writable, write = error_code & PFERR_WRITE_MASK; ++ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && ++ is_nx_huge_page_enabled(); + ++ force_pt_level = lpage_disallowed; + level = mapping_level(vcpu, gfn, &force_pt_level); + if (likely(!force_pt_level)) { + /* +@@ -3445,22 +3537,20 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, + if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) + return r; + ++ r = RET_PF_RETRY; + spin_lock(&vcpu->kvm->mmu_lock); + if 
(mmu_notifier_retry(vcpu->kvm, mmu_seq)) + goto out_unlock; + if (make_mmu_pages_available(vcpu) < 0) + goto out_unlock; + if (likely(!force_pt_level)) +- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); +- r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); +- spin_unlock(&vcpu->kvm->mmu_lock); +- +- return r; +- ++ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); ++ r = __direct_map(vcpu, v, write, map_writable, level, pfn, ++ prefault, false); + out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); +- return RET_PF_RETRY; ++ return r; + } + + static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, +@@ -4050,6 +4140,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, + unsigned long mmu_seq; + int write = error_code & PFERR_WRITE_MASK; + bool map_writable; ++ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && ++ is_nx_huge_page_enabled(); + + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + +@@ -4060,8 +4152,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, + if (r) + return r; + +- force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, +- PT_DIRECTORY_LEVEL); ++ force_pt_level = ++ lpage_disallowed || ++ !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL); + level = mapping_level(vcpu, gfn, &force_pt_level); + if (likely(!force_pt_level)) { + if (level > PT_DIRECTORY_LEVEL && +@@ -4082,22 +4175,20 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, + if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) + return r; + ++ r = RET_PF_RETRY; + spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + goto out_unlock; + if (make_mmu_pages_available(vcpu) < 0) + goto out_unlock; + if (likely(!force_pt_level)) +- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); +- r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); +- spin_unlock(&vcpu->kvm->mmu_lock); +- +- return r; +- ++ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); ++ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, ++ prefault, lpage_disallowed); + out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); +- return RET_PF_RETRY; ++ return r; + } + + static void nonpaging_init_context(struct kvm_vcpu *vcpu, +@@ -5819,7 +5910,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) + int nr_to_scan = sc->nr_to_scan; + unsigned long freed = 0; + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) { + int idx; +@@ -5869,7 +5960,7 @@ unlock: + break; + } + +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + return freed; + } + +@@ -5891,10 +5982,60 @@ static void mmu_destroy_caches(void) + kmem_cache_destroy(mmu_page_header_cache); + } + ++static bool get_nx_auto_mode(void) ++{ ++ /* Return true when CPU has the bug, and mitigations are ON */ ++ return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off(); ++} ++ ++static void __set_nx_huge_pages(bool val) ++{ ++ nx_huge_pages = itlb_multihit_kvm_mitigation = val; ++} ++ ++static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) ++{ ++ bool old_val = nx_huge_pages; ++ bool new_val; ++ ++ /* In "auto" mode deploy workaround only if CPU has the bug. 
*/ ++ if (sysfs_streq(val, "off")) ++ new_val = 0; ++ else if (sysfs_streq(val, "force")) ++ new_val = 1; ++ else if (sysfs_streq(val, "auto")) ++ new_val = get_nx_auto_mode(); ++ else if (strtobool(val, &new_val) < 0) ++ return -EINVAL; ++ ++ __set_nx_huge_pages(new_val); ++ ++ if (new_val != old_val) { ++ struct kvm *kvm; ++ int idx; ++ ++ mutex_lock(&kvm_lock); ++ ++ list_for_each_entry(kvm, &vm_list, vm_list) { ++ idx = srcu_read_lock(&kvm->srcu); ++ kvm_mmu_invalidate_zap_all_pages(kvm); ++ srcu_read_unlock(&kvm->srcu, idx); ++ ++ wake_up_process(kvm->arch.nx_lpage_recovery_thread); ++ } ++ mutex_unlock(&kvm_lock); ++ } ++ ++ return 0; ++} ++ + int kvm_mmu_module_init(void) + { + int ret = -ENOMEM; + ++ if (nx_huge_pages == -1) ++ __set_nx_huge_pages(get_nx_auto_mode()); ++ + kvm_mmu_reset_all_pte_masks(); + + pte_list_desc_cache = kmem_cache_create("pte_list_desc", +@@ -5961,3 +6102,116 @@ void kvm_mmu_module_exit(void) + unregister_shrinker(&mmu_shrinker); + mmu_audit_disable(); + } ++ ++static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp) ++{ ++ unsigned int old_val; ++ int err; ++ ++ old_val = nx_huge_pages_recovery_ratio; ++ err = param_set_uint(val, kp); ++ if (err) ++ return err; ++ ++ if (READ_ONCE(nx_huge_pages) && ++ !old_val && nx_huge_pages_recovery_ratio) { ++ struct kvm *kvm; ++ ++ mutex_lock(&kvm_lock); ++ ++ list_for_each_entry(kvm, &vm_list, vm_list) ++ wake_up_process(kvm->arch.nx_lpage_recovery_thread); ++ ++ mutex_unlock(&kvm_lock); ++ } ++ ++ return err; ++} ++ ++static void kvm_recover_nx_lpages(struct kvm *kvm) ++{ ++ int rcu_idx; ++ struct kvm_mmu_page *sp; ++ unsigned int ratio; ++ LIST_HEAD(invalid_list); ++ ulong to_zap; ++ ++ rcu_idx = srcu_read_lock(&kvm->srcu); ++ spin_lock(&kvm->mmu_lock); ++ ++ ratio = READ_ONCE(nx_huge_pages_recovery_ratio); ++ to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; ++ while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { ++ /* ++ * We use a separate list instead of just using active_mmu_pages ++ * because the number of lpage_disallowed pages is expected to ++ * be relatively small compared to the total. ++ */ ++ sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, ++ struct kvm_mmu_page, ++ lpage_disallowed_link); ++ WARN_ON_ONCE(!sp->lpage_disallowed); ++ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); ++ WARN_ON_ONCE(sp->lpage_disallowed); ++ ++ if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { ++ kvm_mmu_commit_zap_page(kvm, &invalid_list); ++ if (to_zap) ++ cond_resched_lock(&kvm->mmu_lock); ++ } ++ } ++ ++ spin_unlock(&kvm->mmu_lock); ++ srcu_read_unlock(&kvm->srcu, rcu_idx); ++} ++ ++static long get_nx_lpage_recovery_timeout(u64 start_time) ++{ ++ return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio) ++ ? 
start_time + 60 * HZ - get_jiffies_64() ++ : MAX_SCHEDULE_TIMEOUT; ++} ++ ++static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) ++{ ++ u64 start_time; ++ long remaining_time; ++ ++ while (true) { ++ start_time = get_jiffies_64(); ++ remaining_time = get_nx_lpage_recovery_timeout(start_time); ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ while (!kthread_should_stop() && remaining_time > 0) { ++ schedule_timeout(remaining_time); ++ remaining_time = get_nx_lpage_recovery_timeout(start_time); ++ set_current_state(TASK_INTERRUPTIBLE); ++ } ++ ++ set_current_state(TASK_RUNNING); ++ ++ if (kthread_should_stop()) ++ return 0; ++ ++ kvm_recover_nx_lpages(kvm); ++ } ++} ++ ++int kvm_mmu_post_init_vm(struct kvm *kvm) ++{ ++ int err; ++ ++ err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, ++ "kvm-nx-lpage-recovery", ++ &kvm->arch.nx_lpage_recovery_thread); ++ if (!err) ++ kthread_unpark(kvm->arch.nx_lpage_recovery_thread); ++ ++ return err; ++} ++ ++void kvm_mmu_pre_destroy_vm(struct kvm *kvm) ++{ ++ if (kvm->arch.nx_lpage_recovery_thread) ++ kthread_stop(kvm->arch.nx_lpage_recovery_thread); ++} +diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h +index 65892288bf51..f7b2de7b6382 100644 +--- a/arch/x86/kvm/mmu.h ++++ b/arch/x86/kvm/mmu.h +@@ -216,4 +216,8 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); + bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, + struct kvm_memory_slot *slot, u64 gfn); + int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); ++ ++int kvm_mmu_post_init_vm(struct kvm *kvm); ++void kvm_mmu_pre_destroy_vm(struct kvm *kvm); ++ + #endif +diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h +index c73bf4e4988c..918b0d5bf272 100644 +--- a/arch/x86/kvm/mmutrace.h ++++ b/arch/x86/kvm/mmutrace.h +@@ -325,6 +325,65 @@ TRACE_EVENT( + __entry->kvm_gen == __entry->spte_gen + ) + ); ++ ++TRACE_EVENT( ++ kvm_mmu_set_spte, ++ TP_PROTO(int level, gfn_t gfn, u64 *sptep), ++ TP_ARGS(level, gfn, sptep), ++ ++ TP_STRUCT__entry( ++ __field(u64, gfn) ++ __field(u64, spte) ++ __field(u64, sptep) ++ __field(u8, level) ++ /* These depend on page entry type, so compute them now. */ ++ __field(bool, r) ++ __field(bool, x) ++ __field(u8, u) ++ ), ++ ++ TP_fast_assign( ++ __entry->gfn = gfn; ++ __entry->spte = *sptep; ++ __entry->sptep = virt_to_phys(sptep); ++ __entry->level = level; ++ __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK); ++ __entry->x = is_executable_pte(__entry->spte); ++ __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1; ++ ), ++ ++ TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx", ++ __entry->gfn, __entry->spte, ++ __entry->r ? "r" : "-", ++ __entry->spte & PT_WRITABLE_MASK ? "w" : "-", ++ __entry->x ? "x" : "-", ++ __entry->u == -1 ? "" : (__entry->u ? 
"u" : "-"), ++ __entry->level, __entry->sptep ++ ) ++); ++ ++TRACE_EVENT( ++ kvm_mmu_spte_requested, ++ TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn), ++ TP_ARGS(addr, level, pfn), ++ ++ TP_STRUCT__entry( ++ __field(u64, gfn) ++ __field(u64, pfn) ++ __field(u8, level) ++ ), ++ ++ TP_fast_assign( ++ __entry->gfn = addr >> PAGE_SHIFT; ++ __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); ++ __entry->level = level; ++ ), ++ ++ TP_printk("gfn %llx pfn %llx level %d", ++ __entry->gfn, __entry->pfn, __entry->level ++ ) ++); ++ + #endif /* _TRACE_KVMMMU_H */ + + #undef TRACE_INCLUDE_PATH +diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h +index 14ffd973df54..adf42dc8d38b 100644 +--- a/arch/x86/kvm/paging_tmpl.h ++++ b/arch/x86/kvm/paging_tmpl.h +@@ -522,6 +522,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn, + true, true); + ++ kvm_release_pfn_clean(pfn); + return true; + } + +@@ -595,12 +596,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, + static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *gw, + int write_fault, int hlevel, +- kvm_pfn_t pfn, bool map_writable, bool prefault) ++ kvm_pfn_t pfn, bool map_writable, bool prefault, ++ bool lpage_disallowed) + { + struct kvm_mmu_page *sp = NULL; + struct kvm_shadow_walk_iterator it; + unsigned direct_access, access = gw->pt_access; + int top_level, ret; ++ gfn_t gfn, base_gfn; + + direct_access = gw->pte_access; + +@@ -645,35 +648,48 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + link_shadow_page(vcpu, it.sptep, sp); + } + +- for (; +- shadow_walk_okay(&it) && it.level > hlevel; +- shadow_walk_next(&it)) { +- gfn_t direct_gfn; ++ /* ++ * FNAME(page_fault) might have clobbered the bottom bits of ++ * gw->gfn, restore them from the virtual address. ++ */ ++ gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); ++ base_gfn = gfn; + ++ trace_kvm_mmu_spte_requested(addr, gw->level, pfn); ++ ++ for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { + clear_sp_write_flooding_count(it.sptep); +- validate_direct_spte(vcpu, it.sptep, direct_access); + +- drop_large_spte(vcpu, it.sptep); ++ /* ++ * We cannot overwrite existing page tables with an NX ++ * large page, as the leaf could be executable. 
++ */ ++ disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel); + +- if (is_shadow_present_pte(*it.sptep)) +- continue; ++ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); ++ if (it.level == hlevel) ++ break; ++ ++ validate_direct_spte(vcpu, it.sptep, direct_access); + +- direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); ++ drop_large_spte(vcpu, it.sptep); + +- sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, +- true, direct_access); +- link_shadow_page(vcpu, it.sptep, sp); ++ if (!is_shadow_present_pte(*it.sptep)) { ++ sp = kvm_mmu_get_page(vcpu, base_gfn, addr, ++ it.level - 1, true, direct_access); ++ link_shadow_page(vcpu, it.sptep, sp); ++ if (lpage_disallowed) ++ account_huge_nx_page(vcpu->kvm, sp); ++ } + } + +- clear_sp_write_flooding_count(it.sptep); + ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, +- it.level, gw->gfn, pfn, prefault, map_writable); ++ it.level, base_gfn, pfn, prefault, map_writable); + FNAME(pte_prefetch)(vcpu, gw, it.sptep); +- ++ ++vcpu->stat.pf_fixed; + return ret; + + out_gpte_changed: +- kvm_release_pfn_clean(pfn); + return RET_PF_RETRY; + } + +@@ -740,9 +756,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, + int r; + kvm_pfn_t pfn; + int level = PT_PAGE_TABLE_LEVEL; +- bool force_pt_level = false; + unsigned long mmu_seq; + bool map_writable, is_self_change_mapping; ++ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && ++ is_nx_huge_page_enabled(); ++ bool force_pt_level = lpage_disallowed; + + pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); + +@@ -821,6 +839,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, + walker.pte_access &= ~ACC_EXEC_MASK; + } + ++ r = RET_PF_RETRY; + spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + goto out_unlock; +@@ -829,19 +848,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, + if (make_mmu_pages_available(vcpu) < 0) + goto out_unlock; + if (!force_pt_level) +- transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); ++ transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); + r = FNAME(fetch)(vcpu, addr, &walker, write_fault, +- level, pfn, map_writable, prefault); +- ++vcpu->stat.pf_fixed; ++ level, pfn, map_writable, prefault, lpage_disallowed); + kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); +- spin_unlock(&vcpu->kvm->mmu_lock); +- +- return r; + + out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); +- return RET_PF_RETRY; ++ return r; + } + + static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c +index ac2cc2ed7a85..7657dcd72134 100644 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -736,8 +736,14 @@ static int get_npt_level(struct kvm_vcpu *vcpu) + static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) + { + vcpu->arch.efer = efer; +- if (!npt_enabled && !(efer & EFER_LMA)) +- efer &= ~EFER_LME; ++ ++ if (!npt_enabled) { ++ /* Shadow paging assumes NX to be available. 
*/ ++ efer |= EFER_NX; ++ ++ if (!(efer & EFER_LMA)) ++ efer &= ~EFER_LME; ++ } + + to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; + mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 6f7b3acdab26..4eda2a9c234a 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -2785,17 +2785,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) + u64 guest_efer = vmx->vcpu.arch.efer; + u64 ignore_bits = 0; + +- if (!enable_ept) { +- /* +- * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing +- * host CPUID is more efficient than testing guest CPUID +- * or CR4. Host SMEP is anyway a requirement for guest SMEP. +- */ +- if (boot_cpu_has(X86_FEATURE_SMEP)) +- guest_efer |= EFER_NX; +- else if (!(guest_efer & EFER_NX)) +- ignore_bits |= EFER_NX; +- } ++ /* Shadow paging assumes NX to be available. */ ++ if (!enable_ept) ++ guest_efer |= EFER_NX; + + /* + * LMA and LME handled by hardware; SCE meaningless outside long mode. +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 6ae8a013af31..e536503ac788 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -92,8 +92,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); + static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); + #endif + +-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM +-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU ++#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ ++#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ + + #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ + KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) +@@ -205,7 +205,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { + { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, + { "mmu_unsync", VM_STAT(mmu_unsync) }, + { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, +- { "largepages", VM_STAT(lpages) }, ++ { "largepages", VM_STAT(lpages, .mode = 0444) }, ++ { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) }, + { "max_mmu_page_hash_collisions", + VM_STAT(max_mmu_page_hash_collisions) }, + { NULL } +@@ -1130,6 +1131,14 @@ u64 kvm_get_arch_capabilities(void) + + rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data); + ++ /* ++ * If nx_huge_pages is enabled, KVM's shadow paging will ensure that ++ * the nested hypervisor runs with NX huge pages. If it is not, ++ * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other ++ * L1 guests, so it need not worry about its own (L2) guests. ++ */ ++ data |= ARCH_CAP_PSCHANGE_MC_NO; ++ + /* + * If we're doing cache flushes (either "always" or "cond") + * we will do one whenever the guest does a vmlaunch/vmresume. +@@ -1142,8 +1151,35 @@ u64 kvm_get_arch_capabilities(void) + if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) + data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; + ++ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) ++ data |= ARCH_CAP_RDCL_NO; ++ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) ++ data |= ARCH_CAP_SSB_NO; ++ if (!boot_cpu_has_bug(X86_BUG_MDS)) ++ data |= ARCH_CAP_MDS_NO; ++ ++ /* ++ * On TAA affected systems, export MDS_NO=0 when: ++ * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1. ++ * - Updated microcode is present. This is detected by ++ * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures ++ * that VERW clears CPU buffers. 
++ * ++ * When MDS_NO=0 is exported, guests deploy clear CPU buffer ++ * mitigation and don't complain: ++ * ++ * "Vulnerable: Clear CPU buffers attempted, no microcode" ++ * ++ * If TSX is disabled on the system, guests are also mitigated against ++ * TAA and clear CPU buffer mitigation is not required for guests. ++ */ ++ if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) && ++ (data & ARCH_CAP_TSX_CTRL_MSR)) ++ data &= ~ARCH_CAP_MDS_NO; ++ + return data; + } ++ + EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities); + + static int kvm_get_msr_feature(struct kvm_msr_entry *msr) +@@ -6502,7 +6538,7 @@ static void kvm_hyperv_tsc_notifier(void) + struct kvm_vcpu *vcpu; + int cpu; + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_make_mclock_inprogress_request(kvm); + +@@ -6528,7 +6564,7 @@ static void kvm_hyperv_tsc_notifier(void) + + spin_unlock(&ka->pvclock_gtod_sync_lock); + } +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + } + #endif + +@@ -6586,17 +6622,17 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va + + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->cpu != freq->cpu) + continue; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); +- if (vcpu->cpu != smp_processor_id()) ++ if (vcpu->cpu != raw_smp_processor_id()) + send_ipi = 1; + } + } +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + + if (freq->old < freq->new && send_ipi) { + /* +@@ -6722,12 +6758,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work) + struct kvm_vcpu *vcpu; + int i; + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + atomic_set(&kvm_guest_has_master_clock, 0); +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + } + + static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); +@@ -8949,6 +8985,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) + INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); ++ INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); + atomic_set(&kvm->arch.noncoherent_dma_count, 0); + +@@ -8980,6 +9017,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) + return 0; + } + ++int kvm_arch_post_init_vm(struct kvm *kvm) ++{ ++ return kvm_mmu_post_init_vm(kvm); ++} ++ + static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) + { + vcpu_load(vcpu); +@@ -9081,6 +9123,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) + } + EXPORT_SYMBOL_GPL(x86_set_memory_region); + ++void kvm_arch_pre_destroy_vm(struct kvm *kvm) ++{ ++ kvm_mmu_pre_destroy_vm(kvm); ++} ++ + void kvm_arch_destroy_vm(struct kvm *kvm) + { + if (current->mm == kvm->mm) { +diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c +index 527524134693..a06547fe6f6b 100644 +--- a/block/blk-cgroup.c ++++ b/block/blk-cgroup.c +@@ -955,9 +955,14 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) + int i; + bool has_stats = false; + ++ spin_lock_irq(blkg->q->queue_lock); ++ ++ if (!blkg->online) ++ goto skip; ++ + dname = blkg_dev_name(blkg); + if (!dname) +- continue; ++ goto skip; + + /* + * Hooray string manipulation, 
count is the size written NOT +@@ -967,8 +972,6 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) + */ + off += scnprintf(buf+off, size-off, "%s ", dname); + +- spin_lock_irq(blkg->q->queue_lock); +- + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); +@@ -981,8 +984,6 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) + wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); + +- spin_unlock_irq(blkg->q->queue_lock); +- + if (rbytes || wbytes || rios || wios) { + has_stats = true; + off += scnprintf(buf+off, size-off, +@@ -1023,6 +1024,8 @@ next: + seq_commit(sf, -1); + } + } ++ skip: ++ spin_unlock_irq(blkg->q->queue_lock); + } + + rcu_read_unlock(); +diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c +index 2fd6ca1021c2..f3ecf7418ed4 100644 +--- a/drivers/base/cpu.c ++++ b/drivers/base/cpu.c +@@ -552,12 +552,27 @@ ssize_t __weak cpu_show_mds(struct device *dev, + return sprintf(buf, "Not affected\n"); + } + ++ssize_t __weak cpu_show_tsx_async_abort(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ return sprintf(buf, "Not affected\n"); ++} ++ ++ssize_t __weak cpu_show_itlb_multihit(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "Not affected\n"); ++} ++ + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); + static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); + static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); + static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL); + static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL); + static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL); ++static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL); ++static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL); + + static struct attribute *cpu_root_vulnerabilities_attrs[] = { + &dev_attr_meltdown.attr, +@@ -566,6 +581,8 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { + &dev_attr_spec_store_bypass.attr, + &dev_attr_l1tf.attr, + &dev_attr_mds.attr, ++ &dev_attr_tsx_async_abort.attr, ++ &dev_attr_itlb_multihit.attr, + NULL + }; + +diff --git a/drivers/dma/sprd-dma.c b/drivers/dma/sprd-dma.c +index 1ed1c7efa288..9e8ce56a83d8 100644 +--- a/drivers/dma/sprd-dma.c ++++ b/drivers/dma/sprd-dma.c +@@ -181,6 +181,7 @@ struct sprd_dma_dev { + struct sprd_dma_chn channels[0]; + }; + ++static void sprd_dma_free_desc(struct virt_dma_desc *vd); + static bool sprd_dma_filter_fn(struct dma_chan *chan, void *param); + static struct of_dma_filter_info sprd_dma_info = { + .filter_fn = sprd_dma_filter_fn, +@@ -493,12 +494,19 @@ static int sprd_dma_alloc_chan_resources(struct dma_chan *chan) + static void sprd_dma_free_chan_resources(struct dma_chan *chan) + { + struct sprd_dma_chn *schan = to_sprd_dma_chan(chan); ++ struct virt_dma_desc *cur_vd = NULL; + unsigned long flags; + + spin_lock_irqsave(&schan->vc.lock, flags); ++ if (schan->cur_desc) ++ cur_vd = &schan->cur_desc->vd; ++ + sprd_dma_stop(schan); + spin_unlock_irqrestore(&schan->vc.lock, flags); + ++ if (cur_vd) ++ sprd_dma_free_desc(cur_vd); ++ + vchan_free_chan_resources(&schan->vc); + pm_runtime_put(chan->device->dev); + } +@@ -814,15 +822,22 @@ static int sprd_dma_resume(struct dma_chan *chan) + static int sprd_dma_terminate_all(struct dma_chan *chan) + { + struct sprd_dma_chn *schan = to_sprd_dma_chan(chan); 
++ struct virt_dma_desc *cur_vd = NULL; + unsigned long flags; + LIST_HEAD(head); + + spin_lock_irqsave(&schan->vc.lock, flags); ++ if (schan->cur_desc) ++ cur_vd = &schan->cur_desc->vd; ++ + sprd_dma_stop(schan); + + vchan_get_all_descriptors(&schan->vc, &head); + spin_unlock_irqrestore(&schan->vc.lock, flags); + ++ if (cur_vd) ++ sprd_dma_free_desc(cur_vd); ++ + vchan_dma_desc_free_list(&schan->vc, &head); + return 0; + } +diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c +index c12442312595..8aec137b4fca 100644 +--- a/drivers/dma/xilinx/xilinx_dma.c ++++ b/drivers/dma/xilinx/xilinx_dma.c +@@ -72,6 +72,9 @@ + #define XILINX_DMA_DMACR_CIRC_EN BIT(1) + #define XILINX_DMA_DMACR_RUNSTOP BIT(0) + #define XILINX_DMA_DMACR_FSYNCSRC_MASK GENMASK(6, 5) ++#define XILINX_DMA_DMACR_DELAY_MASK GENMASK(31, 24) ++#define XILINX_DMA_DMACR_FRAME_COUNT_MASK GENMASK(23, 16) ++#define XILINX_DMA_DMACR_MASTER_MASK GENMASK(11, 8) + + #define XILINX_DMA_REG_DMASR 0x0004 + #define XILINX_DMA_DMASR_EOL_LATE_ERR BIT(15) +@@ -2112,8 +2115,10 @@ int xilinx_vdma_channel_set_config(struct dma_chan *dchan, + chan->config.gen_lock = cfg->gen_lock; + chan->config.master = cfg->master; + ++ dmacr &= ~XILINX_DMA_DMACR_GENLOCK_EN; + if (cfg->gen_lock && chan->genlock) { + dmacr |= XILINX_DMA_DMACR_GENLOCK_EN; ++ dmacr &= ~XILINX_DMA_DMACR_MASTER_MASK; + dmacr |= cfg->master << XILINX_DMA_DMACR_MASTER_SHIFT; + } + +@@ -2129,11 +2134,13 @@ int xilinx_vdma_channel_set_config(struct dma_chan *dchan, + chan->config.delay = cfg->delay; + + if (cfg->coalesc <= XILINX_DMA_DMACR_FRAME_COUNT_MAX) { ++ dmacr &= ~XILINX_DMA_DMACR_FRAME_COUNT_MASK; + dmacr |= cfg->coalesc << XILINX_DMA_DMACR_FRAME_COUNT_SHIFT; + chan->config.coalesc = cfg->coalesc; + } + + if (cfg->delay <= XILINX_DMA_DMACR_DELAY_MAX) { ++ dmacr &= ~XILINX_DMA_DMACR_DELAY_MASK; + dmacr |= cfg->delay << XILINX_DMA_DMACR_DELAY_SHIFT; + chan->config.delay = cfg->delay; + } +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +index f823d4baf044..cf582cc46d53 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +@@ -203,7 +203,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) + struct amdgpu_ring *ring = to_amdgpu_ring(sched_job->sched); + struct dma_fence *fence = NULL, *finished; + struct amdgpu_job *job; +- int r; ++ int r = 0; + + job = to_amdgpu_job(sched_job); + finished = &job->base.s_fence->finished; +@@ -228,6 +228,8 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) + job->fence = dma_fence_get(fence); + + amdgpu_job_free_resources(job); ++ ++ fence = r ? 
ERR_PTR(r) : fence; + return fence; + } + +diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_ddc.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_ddc.c +index 8def0d9fa0ff..46c9cb47a96e 100644 +--- a/drivers/gpu/drm/amd/display/dc/core/dc_link_ddc.c ++++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_ddc.c +@@ -433,6 +433,7 @@ void dal_ddc_service_i2c_query_dp_dual_mode_adaptor( + enum display_dongle_type *dongle = &sink_cap->dongle_type; + uint8_t type2_dongle_buf[DP_ADAPTOR_TYPE2_SIZE]; + bool is_type2_dongle = false; ++ int retry_count = 2; + struct dp_hdmi_dongle_signature_data *dongle_signature; + + /* Assume we have no valid DP passive dongle connected */ +@@ -445,13 +446,24 @@ void dal_ddc_service_i2c_query_dp_dual_mode_adaptor( + DP_HDMI_DONGLE_ADDRESS, + type2_dongle_buf, + sizeof(type2_dongle_buf))) { +- *dongle = DISPLAY_DONGLE_DP_DVI_DONGLE; +- sink_cap->max_hdmi_pixel_clock = DP_ADAPTOR_DVI_MAX_TMDS_CLK; ++ /* Passive HDMI dongles can sometimes fail here without retrying*/ ++ while (retry_count > 0) { ++ if (i2c_read(ddc, ++ DP_HDMI_DONGLE_ADDRESS, ++ type2_dongle_buf, ++ sizeof(type2_dongle_buf))) ++ break; ++ retry_count--; ++ } ++ if (retry_count == 0) { ++ *dongle = DISPLAY_DONGLE_DP_DVI_DONGLE; ++ sink_cap->max_hdmi_pixel_clock = DP_ADAPTOR_DVI_MAX_TMDS_CLK; + +- CONN_DATA_DETECT(ddc->link, type2_dongle_buf, sizeof(type2_dongle_buf), +- "DP-DVI passive dongle %dMhz: ", +- DP_ADAPTOR_DVI_MAX_TMDS_CLK / 1000); +- return; ++ CONN_DATA_DETECT(ddc->link, type2_dongle_buf, sizeof(type2_dongle_buf), ++ "DP-DVI passive dongle %dMhz: ", ++ DP_ADAPTOR_DVI_MAX_TMDS_CLK / 1000); ++ return; ++ } + } + + /* Check if Type 2 dongle.*/ +diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c +index 95478db9998b..e4b9eb1f6b60 100644 +--- a/drivers/gpu/drm/i915/i915_cmd_parser.c ++++ b/drivers/gpu/drm/i915/i915_cmd_parser.c +@@ -51,13 +51,11 @@ + * granting userspace undue privileges. There are three categories of privilege. + * + * First, commands which are explicitly defined as privileged or which should +- * only be used by the kernel driver. The parser generally rejects such +- * commands, though it may allow some from the drm master process. ++ * only be used by the kernel driver. The parser rejects such commands + * + * Second, commands which access registers. To support correct/enhanced + * userspace functionality, particularly certain OpenGL extensions, the parser +- * provides a whitelist of registers which userspace may safely access (for both +- * normal and drm master processes). ++ * provides a whitelist of registers which userspace may safely access + * + * Third, commands which access privileged memory (i.e. GGTT, HWS page, etc). + * The parser always rejects such commands. +@@ -82,9 +80,9 @@ + * in the per-engine command tables. + * + * Other command table entries map fairly directly to high level categories +- * mentioned above: rejected, master-only, register whitelist. The parser +- * implements a number of checks, including the privileged memory checks, via a +- * general bitmasking mechanism. ++ * mentioned above: rejected, register whitelist. The parser implements a number ++ * of checks, including the privileged memory checks, via a general bitmasking ++ * mechanism. 
+ */ + + /* +@@ -102,8 +100,6 @@ struct drm_i915_cmd_descriptor { + * CMD_DESC_REJECT: The command is never allowed + * CMD_DESC_REGISTER: The command should be checked against the + * register whitelist for the appropriate ring +- * CMD_DESC_MASTER: The command is allowed if the submitting process +- * is the DRM master + */ + u32 flags; + #define CMD_DESC_FIXED (1<<0) +@@ -111,7 +107,6 @@ struct drm_i915_cmd_descriptor { + #define CMD_DESC_REJECT (1<<2) + #define CMD_DESC_REGISTER (1<<3) + #define CMD_DESC_BITMASK (1<<4) +-#define CMD_DESC_MASTER (1<<5) + + /* + * The command's unique identification bits and the bitmask to get them. +@@ -192,7 +187,7 @@ struct drm_i915_cmd_table { + #define CMD(op, opm, f, lm, fl, ...) \ + { \ + .flags = (fl) | ((f) ? CMD_DESC_FIXED : 0), \ +- .cmd = { (op), ~0u << (opm) }, \ ++ .cmd = { (op & ~0u << (opm)), ~0u << (opm) }, \ + .length = { (lm) }, \ + __VA_ARGS__ \ + } +@@ -207,14 +202,13 @@ struct drm_i915_cmd_table { + #define R CMD_DESC_REJECT + #define W CMD_DESC_REGISTER + #define B CMD_DESC_BITMASK +-#define M CMD_DESC_MASTER + + /* Command Mask Fixed Len Action + ---------------------------------------------------------- */ +-static const struct drm_i915_cmd_descriptor common_cmds[] = { ++static const struct drm_i915_cmd_descriptor gen7_common_cmds[] = { + CMD( MI_NOOP, SMI, F, 1, S ), + CMD( MI_USER_INTERRUPT, SMI, F, 1, R ), +- CMD( MI_WAIT_FOR_EVENT, SMI, F, 1, M ), ++ CMD( MI_WAIT_FOR_EVENT, SMI, F, 1, R ), + CMD( MI_ARB_CHECK, SMI, F, 1, S ), + CMD( MI_REPORT_HEAD, SMI, F, 1, S ), + CMD( MI_SUSPEND_FLUSH, SMI, F, 1, S ), +@@ -244,7 +238,7 @@ static const struct drm_i915_cmd_descriptor common_cmds[] = { + CMD( MI_BATCH_BUFFER_START, SMI, !F, 0xFF, S ), + }; + +-static const struct drm_i915_cmd_descriptor render_cmds[] = { ++static const struct drm_i915_cmd_descriptor gen7_render_cmds[] = { + CMD( MI_FLUSH, SMI, F, 1, S ), + CMD( MI_ARB_ON_OFF, SMI, F, 1, R ), + CMD( MI_PREDICATE, SMI, F, 1, S ), +@@ -311,7 +305,7 @@ static const struct drm_i915_cmd_descriptor hsw_render_cmds[] = { + CMD( MI_URB_ATOMIC_ALLOC, SMI, F, 1, S ), + CMD( MI_SET_APPID, SMI, F, 1, S ), + CMD( MI_RS_CONTEXT, SMI, F, 1, S ), +- CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, M ), ++ CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, R ), + CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, R ), + CMD( MI_LOAD_REGISTER_REG, SMI, !F, 0xFF, W, + .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 1 } ), +@@ -328,7 +322,7 @@ static const struct drm_i915_cmd_descriptor hsw_render_cmds[] = { + CMD( GFX_OP_3DSTATE_BINDING_TABLE_EDIT_PS, S3D, !F, 0x1FF, S ), + }; + +-static const struct drm_i915_cmd_descriptor video_cmds[] = { ++static const struct drm_i915_cmd_descriptor gen7_video_cmds[] = { + CMD( MI_ARB_ON_OFF, SMI, F, 1, R ), + CMD( MI_SET_APPID, SMI, F, 1, S ), + CMD( MI_STORE_DWORD_IMM, SMI, !F, 0xFF, B, +@@ -372,7 +366,7 @@ static const struct drm_i915_cmd_descriptor video_cmds[] = { + CMD( MFX_WAIT, SMFX, F, 1, S ), + }; + +-static const struct drm_i915_cmd_descriptor vecs_cmds[] = { ++static const struct drm_i915_cmd_descriptor gen7_vecs_cmds[] = { + CMD( MI_ARB_ON_OFF, SMI, F, 1, R ), + CMD( MI_SET_APPID, SMI, F, 1, S ), + CMD( MI_STORE_DWORD_IMM, SMI, !F, 0xFF, B, +@@ -410,7 +404,7 @@ static const struct drm_i915_cmd_descriptor vecs_cmds[] = { + }}, ), + }; + +-static const struct drm_i915_cmd_descriptor blt_cmds[] = { ++static const struct drm_i915_cmd_descriptor gen7_blt_cmds[] = { + CMD( MI_DISPLAY_FLIP, SMI, !F, 0xFF, R ), + CMD( MI_STORE_DWORD_IMM, SMI, !F, 0x3FF, B, + .bits = 
{{ +@@ -444,10 +438,64 @@ static const struct drm_i915_cmd_descriptor blt_cmds[] = { + }; + + static const struct drm_i915_cmd_descriptor hsw_blt_cmds[] = { +- CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, M ), ++ CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, R ), + CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, R ), + }; + ++/* ++ * For Gen9 we can still rely on the h/w to enforce cmd security, and only ++ * need to re-enforce the register access checks. We therefore only need to ++ * teach the cmdparser how to find the end of each command, and identify ++ * register accesses. The table doesn't need to reject any commands, and so ++ * the only commands listed here are: ++ * 1) Those that touch registers ++ * 2) Those that do not have the default 8-bit length ++ * ++ * Note that the default MI length mask chosen for this table is 0xFF, not ++ * the 0x3F used on older devices. This is because the vast majority of MI ++ * cmds on Gen9 use a standard 8-bit Length field. ++ * All the Gen9 blitter instructions are standard 0xFF length mask, and ++ * none allow access to non-general registers, so in fact no BLT cmds are ++ * included in the table at all. ++ * ++ */ ++static const struct drm_i915_cmd_descriptor gen9_blt_cmds[] = { ++ CMD( MI_NOOP, SMI, F, 1, S ), ++ CMD( MI_USER_INTERRUPT, SMI, F, 1, S ), ++ CMD( MI_WAIT_FOR_EVENT, SMI, F, 1, S ), ++ CMD( MI_FLUSH, SMI, F, 1, S ), ++ CMD( MI_ARB_CHECK, SMI, F, 1, S ), ++ CMD( MI_REPORT_HEAD, SMI, F, 1, S ), ++ CMD( MI_ARB_ON_OFF, SMI, F, 1, S ), ++ CMD( MI_SUSPEND_FLUSH, SMI, F, 1, S ), ++ CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, S ), ++ CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, S ), ++ CMD( MI_STORE_DWORD_IMM, SMI, !F, 0x3FF, S ), ++ CMD( MI_LOAD_REGISTER_IMM(1), SMI, !F, 0xFF, W, ++ .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 2 } ), ++ CMD( MI_UPDATE_GTT, SMI, !F, 0x3FF, S ), ++ CMD( MI_STORE_REGISTER_MEM_GEN8, SMI, F, 4, W, ++ .reg = { .offset = 1, .mask = 0x007FFFFC } ), ++ CMD( MI_FLUSH_DW, SMI, !F, 0x3F, S ), ++ CMD( MI_LOAD_REGISTER_MEM_GEN8, SMI, F, 4, W, ++ .reg = { .offset = 1, .mask = 0x007FFFFC } ), ++ CMD( MI_LOAD_REGISTER_REG, SMI, !F, 0xFF, W, ++ .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 1 } ), ++ ++ /* ++ * We allow BB_START but apply further checks. We just sanitize the ++ * basic fields here. 
++ */ ++#define MI_BB_START_OPERAND_MASK GENMASK(SMI-1, 0) ++#define MI_BB_START_OPERAND_EXPECT (MI_BATCH_PPGTT_HSW | 1) ++ CMD( MI_BATCH_BUFFER_START_GEN8, SMI, !F, 0xFF, B, ++ .bits = {{ ++ .offset = 0, ++ .mask = MI_BB_START_OPERAND_MASK, ++ .expected = MI_BB_START_OPERAND_EXPECT, ++ }}, ), ++}; ++ + static const struct drm_i915_cmd_descriptor noop_desc = + CMD(MI_NOOP, SMI, F, 1, S); + +@@ -461,40 +509,44 @@ static const struct drm_i915_cmd_descriptor noop_desc = + #undef R + #undef W + #undef B +-#undef M + +-static const struct drm_i915_cmd_table gen7_render_cmds[] = { +- { common_cmds, ARRAY_SIZE(common_cmds) }, +- { render_cmds, ARRAY_SIZE(render_cmds) }, ++static const struct drm_i915_cmd_table gen7_render_cmd_table[] = { ++ { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, ++ { gen7_render_cmds, ARRAY_SIZE(gen7_render_cmds) }, + }; + +-static const struct drm_i915_cmd_table hsw_render_ring_cmds[] = { +- { common_cmds, ARRAY_SIZE(common_cmds) }, +- { render_cmds, ARRAY_SIZE(render_cmds) }, ++static const struct drm_i915_cmd_table hsw_render_ring_cmd_table[] = { ++ { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, ++ { gen7_render_cmds, ARRAY_SIZE(gen7_render_cmds) }, + { hsw_render_cmds, ARRAY_SIZE(hsw_render_cmds) }, + }; + +-static const struct drm_i915_cmd_table gen7_video_cmds[] = { +- { common_cmds, ARRAY_SIZE(common_cmds) }, +- { video_cmds, ARRAY_SIZE(video_cmds) }, ++static const struct drm_i915_cmd_table gen7_video_cmd_table[] = { ++ { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, ++ { gen7_video_cmds, ARRAY_SIZE(gen7_video_cmds) }, + }; + +-static const struct drm_i915_cmd_table hsw_vebox_cmds[] = { +- { common_cmds, ARRAY_SIZE(common_cmds) }, +- { vecs_cmds, ARRAY_SIZE(vecs_cmds) }, ++static const struct drm_i915_cmd_table hsw_vebox_cmd_table[] = { ++ { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, ++ { gen7_vecs_cmds, ARRAY_SIZE(gen7_vecs_cmds) }, + }; + +-static const struct drm_i915_cmd_table gen7_blt_cmds[] = { +- { common_cmds, ARRAY_SIZE(common_cmds) }, +- { blt_cmds, ARRAY_SIZE(blt_cmds) }, ++static const struct drm_i915_cmd_table gen7_blt_cmd_table[] = { ++ { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, ++ { gen7_blt_cmds, ARRAY_SIZE(gen7_blt_cmds) }, + }; + +-static const struct drm_i915_cmd_table hsw_blt_ring_cmds[] = { +- { common_cmds, ARRAY_SIZE(common_cmds) }, +- { blt_cmds, ARRAY_SIZE(blt_cmds) }, ++static const struct drm_i915_cmd_table hsw_blt_ring_cmd_table[] = { ++ { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) }, ++ { gen7_blt_cmds, ARRAY_SIZE(gen7_blt_cmds) }, + { hsw_blt_cmds, ARRAY_SIZE(hsw_blt_cmds) }, + }; + ++static const struct drm_i915_cmd_table gen9_blt_cmd_table[] = { ++ { gen9_blt_cmds, ARRAY_SIZE(gen9_blt_cmds) }, ++}; ++ ++ + /* + * Register whitelists, sorted by increasing register offset. 
+ */ +@@ -610,17 +662,27 @@ static const struct drm_i915_reg_descriptor gen7_blt_regs[] = { + REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE), + }; + +-static const struct drm_i915_reg_descriptor ivb_master_regs[] = { +- REG32(FORCEWAKE_MT), +- REG32(DERRMR), +- REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_A)), +- REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_B)), +- REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_C)), +-}; +- +-static const struct drm_i915_reg_descriptor hsw_master_regs[] = { +- REG32(FORCEWAKE_MT), +- REG32(DERRMR), ++static const struct drm_i915_reg_descriptor gen9_blt_regs[] = { ++ REG64_IDX(RING_TIMESTAMP, RENDER_RING_BASE), ++ REG64_IDX(RING_TIMESTAMP, BSD_RING_BASE), ++ REG32(BCS_SWCTRL), ++ REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE), ++ REG64_IDX(BCS_GPR, 0), ++ REG64_IDX(BCS_GPR, 1), ++ REG64_IDX(BCS_GPR, 2), ++ REG64_IDX(BCS_GPR, 3), ++ REG64_IDX(BCS_GPR, 4), ++ REG64_IDX(BCS_GPR, 5), ++ REG64_IDX(BCS_GPR, 6), ++ REG64_IDX(BCS_GPR, 7), ++ REG64_IDX(BCS_GPR, 8), ++ REG64_IDX(BCS_GPR, 9), ++ REG64_IDX(BCS_GPR, 10), ++ REG64_IDX(BCS_GPR, 11), ++ REG64_IDX(BCS_GPR, 12), ++ REG64_IDX(BCS_GPR, 13), ++ REG64_IDX(BCS_GPR, 14), ++ REG64_IDX(BCS_GPR, 15), + }; + + #undef REG64 +@@ -629,28 +691,27 @@ static const struct drm_i915_reg_descriptor hsw_master_regs[] = { + struct drm_i915_reg_table { + const struct drm_i915_reg_descriptor *regs; + int num_regs; +- bool master; + }; + + static const struct drm_i915_reg_table ivb_render_reg_tables[] = { +- { gen7_render_regs, ARRAY_SIZE(gen7_render_regs), false }, +- { ivb_master_regs, ARRAY_SIZE(ivb_master_regs), true }, ++ { gen7_render_regs, ARRAY_SIZE(gen7_render_regs) }, + }; + + static const struct drm_i915_reg_table ivb_blt_reg_tables[] = { +- { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs), false }, +- { ivb_master_regs, ARRAY_SIZE(ivb_master_regs), true }, ++ { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs) }, + }; + + static const struct drm_i915_reg_table hsw_render_reg_tables[] = { +- { gen7_render_regs, ARRAY_SIZE(gen7_render_regs), false }, +- { hsw_render_regs, ARRAY_SIZE(hsw_render_regs), false }, +- { hsw_master_regs, ARRAY_SIZE(hsw_master_regs), true }, ++ { gen7_render_regs, ARRAY_SIZE(gen7_render_regs) }, ++ { hsw_render_regs, ARRAY_SIZE(hsw_render_regs) }, + }; + + static const struct drm_i915_reg_table hsw_blt_reg_tables[] = { +- { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs), false }, +- { hsw_master_regs, ARRAY_SIZE(hsw_master_regs), true }, ++ { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs) }, ++}; ++ ++static const struct drm_i915_reg_table gen9_blt_reg_tables[] = { ++ { gen9_blt_regs, ARRAY_SIZE(gen9_blt_regs) }, + }; + + static u32 gen7_render_get_cmd_length_mask(u32 cmd_header) +@@ -708,6 +769,17 @@ static u32 gen7_blt_get_cmd_length_mask(u32 cmd_header) + return 0; + } + ++static u32 gen9_blt_get_cmd_length_mask(u32 cmd_header) ++{ ++ u32 client = cmd_header >> INSTR_CLIENT_SHIFT; ++ ++ if (client == INSTR_MI_CLIENT || client == INSTR_BC_CLIENT) ++ return 0xFF; ++ ++ DRM_DEBUG_DRIVER("CMD: Abnormal blt cmd length! 
0x%08X\n", cmd_header); ++ return 0; ++} ++ + static bool validate_cmds_sorted(const struct intel_engine_cs *engine, + const struct drm_i915_cmd_table *cmd_tables, + int cmd_table_count) +@@ -865,18 +937,19 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) + int cmd_table_count; + int ret; + +- if (!IS_GEN7(engine->i915)) ++ if (!IS_GEN7(engine->i915) && !(IS_GEN9(engine->i915) && ++ engine->id == BCS)) + return; + + switch (engine->id) { + case RCS: + if (IS_HASWELL(engine->i915)) { +- cmd_tables = hsw_render_ring_cmds; ++ cmd_tables = hsw_render_ring_cmd_table; + cmd_table_count = +- ARRAY_SIZE(hsw_render_ring_cmds); ++ ARRAY_SIZE(hsw_render_ring_cmd_table); + } else { +- cmd_tables = gen7_render_cmds; +- cmd_table_count = ARRAY_SIZE(gen7_render_cmds); ++ cmd_tables = gen7_render_cmd_table; ++ cmd_table_count = ARRAY_SIZE(gen7_render_cmd_table); + } + + if (IS_HASWELL(engine->i915)) { +@@ -886,36 +959,46 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) + engine->reg_tables = ivb_render_reg_tables; + engine->reg_table_count = ARRAY_SIZE(ivb_render_reg_tables); + } +- + engine->get_cmd_length_mask = gen7_render_get_cmd_length_mask; + break; + case VCS: +- cmd_tables = gen7_video_cmds; +- cmd_table_count = ARRAY_SIZE(gen7_video_cmds); ++ cmd_tables = gen7_video_cmd_table; ++ cmd_table_count = ARRAY_SIZE(gen7_video_cmd_table); + engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask; + break; + case BCS: +- if (IS_HASWELL(engine->i915)) { +- cmd_tables = hsw_blt_ring_cmds; +- cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds); ++ engine->get_cmd_length_mask = gen7_blt_get_cmd_length_mask; ++ if (IS_GEN9(engine->i915)) { ++ cmd_tables = gen9_blt_cmd_table; ++ cmd_table_count = ARRAY_SIZE(gen9_blt_cmd_table); ++ engine->get_cmd_length_mask = ++ gen9_blt_get_cmd_length_mask; ++ ++ /* BCS Engine unsafe without parser */ ++ engine->flags |= I915_ENGINE_REQUIRES_CMD_PARSER; ++ } else if (IS_HASWELL(engine->i915)) { ++ cmd_tables = hsw_blt_ring_cmd_table; ++ cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmd_table); + } else { +- cmd_tables = gen7_blt_cmds; +- cmd_table_count = ARRAY_SIZE(gen7_blt_cmds); ++ cmd_tables = gen7_blt_cmd_table; ++ cmd_table_count = ARRAY_SIZE(gen7_blt_cmd_table); + } + +- if (IS_HASWELL(engine->i915)) { ++ if (IS_GEN9(engine->i915)) { ++ engine->reg_tables = gen9_blt_reg_tables; ++ engine->reg_table_count = ++ ARRAY_SIZE(gen9_blt_reg_tables); ++ } else if (IS_HASWELL(engine->i915)) { + engine->reg_tables = hsw_blt_reg_tables; + engine->reg_table_count = ARRAY_SIZE(hsw_blt_reg_tables); + } else { + engine->reg_tables = ivb_blt_reg_tables; + engine->reg_table_count = ARRAY_SIZE(ivb_blt_reg_tables); + } +- +- engine->get_cmd_length_mask = gen7_blt_get_cmd_length_mask; + break; + case VECS: +- cmd_tables = hsw_vebox_cmds; +- cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds); ++ cmd_tables = hsw_vebox_cmd_table; ++ cmd_table_count = ARRAY_SIZE(hsw_vebox_cmd_table); + /* VECS can use the same length_mask function as VCS */ + engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask; + break; +@@ -941,7 +1024,7 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) + return; + } + +- engine->flags |= I915_ENGINE_NEEDS_CMD_PARSER; ++ engine->flags |= I915_ENGINE_USING_CMD_PARSER; + } + + /** +@@ -953,7 +1036,7 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine) + */ + void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine) + { +- if (!intel_engine_needs_cmd_parser(engine)) ++ if 
(!intel_engine_using_cmd_parser(engine)) + return; + + fini_hash_table(engine); +@@ -1027,22 +1110,16 @@ __find_reg(const struct drm_i915_reg_descriptor *table, int count, u32 addr) + } + + static const struct drm_i915_reg_descriptor * +-find_reg(const struct intel_engine_cs *engine, bool is_master, u32 addr) ++find_reg(const struct intel_engine_cs *engine, u32 addr) + { + const struct drm_i915_reg_table *table = engine->reg_tables; ++ const struct drm_i915_reg_descriptor *reg = NULL; + int count = engine->reg_table_count; + +- for (; count > 0; ++table, --count) { +- if (!table->master || is_master) { +- const struct drm_i915_reg_descriptor *reg; ++ for (; !reg && (count > 0); ++table, --count) ++ reg = __find_reg(table->regs, table->num_regs, addr); + +- reg = __find_reg(table->regs, table->num_regs, addr); +- if (reg != NULL) +- return reg; +- } +- } +- +- return NULL; ++ return reg; + } + + /* Returns a vmap'd pointer to dst_obj, which the caller must unmap */ +@@ -1127,8 +1204,7 @@ unpin_src: + + static bool check_cmd(const struct intel_engine_cs *engine, + const struct drm_i915_cmd_descriptor *desc, +- const u32 *cmd, u32 length, +- const bool is_master) ++ const u32 *cmd, u32 length) + { + if (desc->flags & CMD_DESC_SKIP) + return true; +@@ -1138,12 +1214,6 @@ static bool check_cmd(const struct intel_engine_cs *engine, + return false; + } + +- if ((desc->flags & CMD_DESC_MASTER) && !is_master) { +- DRM_DEBUG_DRIVER("CMD: Rejected master-only command: 0x%08X\n", +- *cmd); +- return false; +- } +- + if (desc->flags & CMD_DESC_REGISTER) { + /* + * Get the distance between individual register offset +@@ -1157,7 +1227,7 @@ static bool check_cmd(const struct intel_engine_cs *engine, + offset += step) { + const u32 reg_addr = cmd[offset] & desc->reg.mask; + const struct drm_i915_reg_descriptor *reg = +- find_reg(engine, is_master, reg_addr); ++ find_reg(engine, reg_addr); + + if (!reg) { + DRM_DEBUG_DRIVER("CMD: Rejected register 0x%08X in command: 0x%08X (%s)\n", +@@ -1235,16 +1305,112 @@ static bool check_cmd(const struct intel_engine_cs *engine, + return true; + } + ++static int check_bbstart(const struct i915_gem_context *ctx, ++ u32 *cmd, u32 offset, u32 length, ++ u32 batch_len, ++ u64 batch_start, ++ u64 shadow_batch_start) ++{ ++ u64 jump_offset, jump_target; ++ u32 target_cmd_offset, target_cmd_index; ++ ++ /* For igt compatibility on older platforms */ ++ if (CMDPARSER_USES_GGTT(ctx->i915)) { ++ DRM_DEBUG("CMD: Rejecting BB_START for ggtt based submission\n"); ++ return -EACCES; ++ } ++ ++ if (length != 3) { ++ DRM_DEBUG("CMD: Recursive BB_START with bad length(%u)\n", ++ length); ++ return -EINVAL; ++ } ++ ++ jump_target = *(u64*)(cmd+1); ++ jump_offset = jump_target - batch_start; ++ ++ /* ++ * Any underflow of jump_target is guaranteed to be outside the range ++ * of a u32, so >= test catches both too large and too small ++ */ ++ if (jump_offset >= batch_len) { ++ DRM_DEBUG("CMD: BB_START to 0x%llx jumps out of BB\n", ++ jump_target); ++ return -EINVAL; ++ } ++ ++ /* ++ * This cannot overflow a u32 because we already checked jump_offset ++ * is within the BB, and the batch_len is a u32 ++ */ ++ target_cmd_offset = lower_32_bits(jump_offset); ++ target_cmd_index = target_cmd_offset / sizeof(u32); ++ ++ *(u64*)(cmd + 1) = shadow_batch_start + target_cmd_offset; ++ ++ if (target_cmd_index == offset) ++ return 0; ++ ++ if (ctx->jump_whitelist_cmds <= target_cmd_index) { ++ DRM_DEBUG("CMD: Rejecting BB_START - truncated whitelist array\n"); ++ return -EINVAL; ++ } else if 
(!test_bit(target_cmd_index, ctx->jump_whitelist)) { ++ DRM_DEBUG("CMD: BB_START to 0x%llx not a previously executed cmd\n", ++ jump_target); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static void init_whitelist(struct i915_gem_context *ctx, u32 batch_len) ++{ ++ const u32 batch_cmds = DIV_ROUND_UP(batch_len, sizeof(u32)); ++ const u32 exact_size = BITS_TO_LONGS(batch_cmds); ++ u32 next_size = BITS_TO_LONGS(roundup_pow_of_two(batch_cmds)); ++ unsigned long *next_whitelist; ++ ++ if (CMDPARSER_USES_GGTT(ctx->i915)) ++ return; ++ ++ if (batch_cmds <= ctx->jump_whitelist_cmds) { ++ bitmap_zero(ctx->jump_whitelist, batch_cmds); ++ return; ++ } ++ ++again: ++ next_whitelist = kcalloc(next_size, sizeof(long), GFP_KERNEL); ++ if (next_whitelist) { ++ kfree(ctx->jump_whitelist); ++ ctx->jump_whitelist = next_whitelist; ++ ctx->jump_whitelist_cmds = ++ next_size * BITS_PER_BYTE * sizeof(long); ++ return; ++ } ++ ++ if (next_size > exact_size) { ++ next_size = exact_size; ++ goto again; ++ } ++ ++ DRM_DEBUG("CMD: Failed to extend whitelist. BB_START may be disallowed\n"); ++ bitmap_zero(ctx->jump_whitelist, ctx->jump_whitelist_cmds); ++ ++ return; ++} ++ + #define LENGTH_BIAS 2 + + /** + * i915_parse_cmds() - parse a submitted batch buffer for privilege violations ++ * @ctx: the context in which the batch is to execute + * @engine: the engine on which the batch is to execute + * @batch_obj: the batch buffer in question +- * @shadow_batch_obj: copy of the batch buffer in question ++ * @batch_start: Canonical base address of batch + * @batch_start_offset: byte offset in the batch at which execution starts + * @batch_len: length of the commands in batch_obj +- * @is_master: is the submitting process the drm master? ++ * @shadow_batch_obj: copy of the batch buffer in question ++ * @shadow_batch_start: Canonical base address of shadow_batch_obj + * + * Parses the specified batch buffer looking for privilege violations as + * described in the overview. 
+@@ -1252,14 +1418,17 @@ static bool check_cmd(const struct intel_engine_cs *engine, + * Return: non-zero if the parser finds violations or otherwise fails; -EACCES + * if the batch appears legal but should use hardware parsing + */ +-int intel_engine_cmd_parser(struct intel_engine_cs *engine, ++ ++int intel_engine_cmd_parser(struct i915_gem_context *ctx, ++ struct intel_engine_cs *engine, + struct drm_i915_gem_object *batch_obj, +- struct drm_i915_gem_object *shadow_batch_obj, ++ u64 batch_start, + u32 batch_start_offset, + u32 batch_len, +- bool is_master) ++ struct drm_i915_gem_object *shadow_batch_obj, ++ u64 shadow_batch_start) + { +- u32 *cmd, *batch_end; ++ u32 *cmd, *batch_end, offset = 0; + struct drm_i915_cmd_descriptor default_desc = noop_desc; + const struct drm_i915_cmd_descriptor *desc = &default_desc; + bool needs_clflush_after = false; +@@ -1273,6 +1442,8 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, + return PTR_ERR(cmd); + } + ++ init_whitelist(ctx, batch_len); ++ + /* + * We use the batch length as size because the shadow object is as + * large or larger and copy_batch() will write MI_NOPs to the extra +@@ -1282,31 +1453,15 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, + do { + u32 length; + +- if (*cmd == MI_BATCH_BUFFER_END) { +- if (needs_clflush_after) { +- void *ptr = page_mask_bits(shadow_batch_obj->mm.mapping); +- drm_clflush_virt_range(ptr, +- (void *)(cmd + 1) - ptr); +- } ++ if (*cmd == MI_BATCH_BUFFER_END) + break; +- } + + desc = find_cmd(engine, *cmd, desc, &default_desc); + if (!desc) { + DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n", + *cmd); + ret = -EINVAL; +- break; +- } +- +- /* +- * If the batch buffer contains a chained batch, return an +- * error that tells the caller to abort and dispatch the +- * workload as a non-secure batch. +- */ +- if (desc->cmd.value == MI_BATCH_BUFFER_START) { +- ret = -EACCES; +- break; ++ goto err; + } + + if (desc->flags & CMD_DESC_FIXED) +@@ -1320,22 +1475,43 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, + length, + batch_end - cmd); + ret = -EINVAL; +- break; ++ goto err; + } + +- if (!check_cmd(engine, desc, cmd, length, is_master)) { ++ if (!check_cmd(engine, desc, cmd, length)) { + ret = -EACCES; ++ goto err; ++ } ++ ++ if (desc->cmd.value == MI_BATCH_BUFFER_START) { ++ ret = check_bbstart(ctx, cmd, offset, length, ++ batch_len, batch_start, ++ shadow_batch_start); ++ ++ if (ret) ++ goto err; + break; + } + ++ if (ctx->jump_whitelist_cmds > offset) ++ set_bit(offset, ctx->jump_whitelist); ++ + cmd += length; ++ offset += length; + if (cmd >= batch_end) { + DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n"); + ret = -EINVAL; +- break; ++ goto err; + } + } while (1); + ++ if (needs_clflush_after) { ++ void *ptr = page_mask_bits(shadow_batch_obj->mm.mapping); ++ ++ drm_clflush_virt_range(ptr, (void *)(cmd + 1) - ptr); ++ } ++ ++err: + i915_gem_object_unpin_map(shadow_batch_obj); + return ret; + } +@@ -1357,7 +1533,7 @@ int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv) + + /* If the command parser is not enabled, report 0 - unsupported */ + for_each_engine(engine, dev_priv, id) { +- if (intel_engine_needs_cmd_parser(engine)) { ++ if (intel_engine_using_cmd_parser(engine)) { + active = true; + break; + } +@@ -1382,6 +1558,7 @@ int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv) + * the parser enabled. + * 9. 
Don't whitelist or handle oacontrol specially, as ownership + * for oacontrol state is moving to i915-perf. ++ * 10. Support for Gen9 BCS Parsing + */ +- return 9; ++ return 10; + } +diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c +index a4b4ab7b9f8e..b0d76a7a0946 100644 +--- a/drivers/gpu/drm/i915/i915_drv.c ++++ b/drivers/gpu/drm/i915/i915_drv.c +@@ -351,7 +351,7 @@ static int i915_getparam_ioctl(struct drm_device *dev, void *data, + value = HAS_LEGACY_SEMAPHORES(dev_priv); + break; + case I915_PARAM_HAS_SECURE_BATCHES: +- value = capable(CAP_SYS_ADMIN); ++ value = HAS_SECURE_BATCHES(dev_priv) && capable(CAP_SYS_ADMIN); + break; + case I915_PARAM_CMD_PARSER_VERSION: + value = i915_cmd_parser_get_version(dev_priv); +@@ -1627,6 +1627,7 @@ static int i915_drm_suspend_late(struct drm_device *dev, bool hibernation) + i915_gem_suspend_late(dev_priv); + + intel_display_set_init_power(dev_priv, false); ++ i915_rc6_ctx_wa_suspend(dev_priv); + intel_uncore_suspend(dev_priv); + + /* +@@ -1853,6 +1854,8 @@ static int i915_drm_resume_early(struct drm_device *dev) + else + intel_display_set_init_power(dev_priv, true); + ++ i915_rc6_ctx_wa_resume(dev_priv); ++ + intel_engines_sanitize(dev_priv); + + enable_rpm_wakeref_asserts(dev_priv); +diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h +index d6c25bea4382..db2e9af49ae6 100644 +--- a/drivers/gpu/drm/i915/i915_drv.h ++++ b/drivers/gpu/drm/i915/i915_drv.h +@@ -801,6 +801,7 @@ struct intel_rps { + + struct intel_rc6 { + bool enabled; ++ bool ctx_corrupted; + u64 prev_hw_residency[4]; + u64 cur_residency[4]; + }; +@@ -2496,6 +2497,12 @@ intel_info(const struct drm_i915_private *dev_priv) + #define IS_GEN9_LP(dev_priv) (IS_GEN9(dev_priv) && IS_LP(dev_priv)) + #define IS_GEN9_BC(dev_priv) (IS_GEN9(dev_priv) && !IS_LP(dev_priv)) + ++/* ++ * The Gen7 cmdparser copies the scanned buffer to the ggtt for execution ++ * All later gens can run the final buffer from the ppgtt ++ */ ++#define CMDPARSER_USES_GGTT(dev_priv) IS_GEN7(dev_priv) ++ + #define ENGINE_MASK(id) BIT(id) + #define RENDER_RING ENGINE_MASK(RCS) + #define BSD_RING ENGINE_MASK(VCS) +@@ -2517,6 +2524,8 @@ intel_info(const struct drm_i915_private *dev_priv) + + #define HAS_LEGACY_SEMAPHORES(dev_priv) IS_GEN7(dev_priv) + ++#define HAS_SECURE_BATCHES(dev_priv) (INTEL_GEN(dev_priv) < 6) ++ + #define HAS_LLC(dev_priv) ((dev_priv)->info.has_llc) + #define HAS_SNOOP(dev_priv) ((dev_priv)->info.has_snoop) + #define HAS_EDRAM(dev_priv) (!!((dev_priv)->edram_cap & EDRAM_ENABLED)) +@@ -2549,10 +2558,12 @@ intel_info(const struct drm_i915_private *dev_priv) + /* Early gen2 have a totally busted CS tlb and require pinned batches. 
*/ + #define HAS_BROKEN_CS_TLB(dev_priv) (IS_I830(dev_priv) || IS_I845G(dev_priv)) + ++#define NEEDS_RC6_CTX_CORRUPTION_WA(dev_priv) \ ++ (IS_BROADWELL(dev_priv) || INTEL_GEN(dev_priv) == 9) ++ + /* WaRsDisableCoarsePowerGating:skl,cnl */ + #define NEEDS_WaRsDisableCoarsePowerGating(dev_priv) \ +- (IS_CANNONLAKE(dev_priv) || \ +- IS_SKL_GT3(dev_priv) || IS_SKL_GT4(dev_priv)) ++ (IS_CANNONLAKE(dev_priv) || INTEL_GEN(dev_priv) == 9) + + #define HAS_GMBUS_IRQ(dev_priv) (INTEL_GEN(dev_priv) >= 4) + #define HAS_GMBUS_BURST_READ(dev_priv) (INTEL_GEN(dev_priv) >= 10 || \ +@@ -2944,6 +2955,14 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj, + u64 alignment, + u64 flags); + ++struct i915_vma * __must_check ++i915_gem_object_pin(struct drm_i915_gem_object *obj, ++ struct i915_address_space *vm, ++ const struct i915_ggtt_view *view, ++ u64 size, ++ u64 alignment, ++ u64 flags); ++ + int i915_gem_object_unbind(struct drm_i915_gem_object *obj); + void i915_gem_release_mmap(struct drm_i915_gem_object *obj); + +@@ -3337,12 +3356,14 @@ const char *i915_cache_level_str(struct drm_i915_private *i915, int type); + int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv); + void intel_engine_init_cmd_parser(struct intel_engine_cs *engine); + void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine); +-int intel_engine_cmd_parser(struct intel_engine_cs *engine, ++int intel_engine_cmd_parser(struct i915_gem_context *cxt, ++ struct intel_engine_cs *engine, + struct drm_i915_gem_object *batch_obj, +- struct drm_i915_gem_object *shadow_batch_obj, ++ u64 user_batch_start, + u32 batch_start_offset, + u32 batch_len, +- bool is_master); ++ struct drm_i915_gem_object *shadow_batch_obj, ++ u64 shadow_batch_start); + + /* i915_perf.c */ + extern void i915_perf_init(struct drm_i915_private *dev_priv); +diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c +index 937287710042..c7d05ac7af3c 100644 +--- a/drivers/gpu/drm/i915/i915_gem.c ++++ b/drivers/gpu/drm/i915/i915_gem.c +@@ -174,6 +174,11 @@ static u32 __i915_gem_park(struct drm_i915_private *i915) + if (INTEL_GEN(i915) >= 6) + gen6_rps_idle(i915); + ++ if (NEEDS_RC6_CTX_CORRUPTION_WA(i915)) { ++ i915_rc6_ctx_wa_check(i915); ++ intel_uncore_forcewake_put(i915, FORCEWAKE_ALL); ++ } ++ + intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ); + + intel_runtime_pm_put(i915); +@@ -220,6 +225,9 @@ void i915_gem_unpark(struct drm_i915_private *i915) + */ + intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ); + ++ if (NEEDS_RC6_CTX_CORRUPTION_WA(i915)) ++ intel_uncore_forcewake_get(i915, FORCEWAKE_ALL); ++ + i915->gt.awake = true; + if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */ + i915->gt.epoch = 1; +@@ -4425,6 +4433,20 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj, + { + struct drm_i915_private *dev_priv = to_i915(obj->base.dev); + struct i915_address_space *vm = &dev_priv->ggtt.vm; ++ ++ return i915_gem_object_pin(obj, vm, view, size, alignment, ++ flags | PIN_GLOBAL); ++} ++ ++struct i915_vma * ++i915_gem_object_pin(struct drm_i915_gem_object *obj, ++ struct i915_address_space *vm, ++ const struct i915_ggtt_view *view, ++ u64 size, ++ u64 alignment, ++ u64 flags) ++{ ++ struct drm_i915_private *dev_priv = to_i915(obj->base.dev); + struct i915_vma *vma; + int ret; + +@@ -4488,7 +4510,7 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj, + return ERR_PTR(ret); + } + +- ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL); ++ ret = i915_vma_pin(vma, size, alignment, flags); + if 
(ret) + return ERR_PTR(ret); + +diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c +index b10770cfccd2..7a0e6dbbad2e 100644 +--- a/drivers/gpu/drm/i915/i915_gem_context.c ++++ b/drivers/gpu/drm/i915/i915_gem_context.c +@@ -124,6 +124,8 @@ static void i915_gem_context_free(struct i915_gem_context *ctx) + + i915_ppgtt_put(ctx->ppgtt); + ++ kfree(ctx->jump_whitelist); ++ + for (n = 0; n < ARRAY_SIZE(ctx->__engine); n++) { + struct intel_context *ce = &ctx->__engine[n]; + +@@ -339,6 +341,9 @@ __create_hw_context(struct drm_i915_private *dev_priv, + else + ctx->ggtt_offset_bias = I915_GTT_PAGE_SIZE; + ++ ctx->jump_whitelist = NULL; ++ ctx->jump_whitelist_cmds = 0; ++ + return ctx; + + err_pid: +diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h +index b116e4942c10..834d3951d8a9 100644 +--- a/drivers/gpu/drm/i915/i915_gem_context.h ++++ b/drivers/gpu/drm/i915/i915_gem_context.h +@@ -183,6 +183,12 @@ struct i915_gem_context { + /** remap_slice: Bitmask of cache lines that need remapping */ + u8 remap_slice; + ++ /** jump_whitelist: Bit array for tracking cmds during cmdparsing */ ++ unsigned long *jump_whitelist; ++ ++ /** jump_whitelist_cmds: No of cmd slots available */ ++ u32 jump_whitelist_cmds; ++ + /** handles_vma: rbtree to look up our context specific obj/vma for + * the user handle. (user handles are per fd, but the binding is + * per vm, which may be one per context or shared with the global GTT) +diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c +index 679bbae52945..f08c54740cbe 100644 +--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c ++++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c +@@ -309,7 +309,9 @@ static inline u64 gen8_noncanonical_addr(u64 address) + + static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) + { +- return intel_engine_needs_cmd_parser(eb->engine) && eb->batch_len; ++ return intel_engine_requires_cmd_parser(eb->engine) || ++ (intel_engine_using_cmd_parser(eb->engine) && ++ eb->args->batch_len); + } + + static int eb_create(struct i915_execbuffer *eb) +@@ -1893,10 +1895,38 @@ static int i915_reset_gen7_sol_offsets(struct i915_request *rq) + return 0; + } + +-static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master) ++static struct i915_vma * ++shadow_batch_pin(struct i915_execbuffer *eb, struct drm_i915_gem_object *obj) ++{ ++ struct drm_i915_private *dev_priv = eb->i915; ++ struct i915_address_space *vm; ++ u64 flags; ++ ++ /* ++ * PPGTT backed shadow buffers must be mapped RO, to prevent ++ * post-scan tampering ++ */ ++ if (CMDPARSER_USES_GGTT(dev_priv)) { ++ flags = PIN_GLOBAL; ++ vm = &dev_priv->ggtt.vm; ++ } else if (eb->vm->has_read_only) { ++ flags = PIN_USER; ++ vm = eb->vm; ++ i915_gem_object_set_readonly(obj); ++ } else { ++ DRM_DEBUG("Cannot prevent post-scan tampering without RO capable vm\n"); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ return i915_gem_object_pin(obj, vm, NULL, 0, 0, flags); ++} ++ ++static struct i915_vma *eb_parse(struct i915_execbuffer *eb) + { + struct drm_i915_gem_object *shadow_batch_obj; + struct i915_vma *vma; ++ u64 batch_start; ++ u64 shadow_batch_start; + int err; + + shadow_batch_obj = i915_gem_batch_pool_get(&eb->engine->batch_pool, +@@ -1904,29 +1934,54 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master) + if (IS_ERR(shadow_batch_obj)) + return ERR_CAST(shadow_batch_obj); + +- err = intel_engine_cmd_parser(eb->engine, ++ vma = 
shadow_batch_pin(eb, shadow_batch_obj); ++ if (IS_ERR(vma)) ++ goto out; ++ ++ batch_start = gen8_canonical_addr(eb->batch->node.start) + ++ eb->batch_start_offset; ++ ++ shadow_batch_start = gen8_canonical_addr(vma->node.start); ++ ++ err = intel_engine_cmd_parser(eb->ctx, ++ eb->engine, + eb->batch->obj, +- shadow_batch_obj, ++ batch_start, + eb->batch_start_offset, + eb->batch_len, +- is_master); ++ shadow_batch_obj, ++ shadow_batch_start); ++ + if (err) { +- if (err == -EACCES) /* unhandled chained batch */ ++ i915_vma_unpin(vma); ++ ++ /* ++ * Unsafe GGTT-backed buffers can still be submitted safely ++ * as non-secure. ++ * For PPGTT backing however, we have no choice but to forcibly ++ * reject unsafe buffers ++ */ ++ if (CMDPARSER_USES_GGTT(eb->i915) && (err == -EACCES)) ++ /* Execute original buffer non-secure */ + vma = NULL; + else + vma = ERR_PTR(err); +- goto out; +- } + +- vma = i915_gem_object_ggtt_pin(shadow_batch_obj, NULL, 0, 0, 0); +- if (IS_ERR(vma)) + goto out; ++ } + + eb->vma[eb->buffer_count] = i915_vma_get(vma); + eb->flags[eb->buffer_count] = + __EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_REF; + vma->exec_flags = &eb->flags[eb->buffer_count]; + eb->buffer_count++; ++ eb->batch_start_offset = 0; ++ eb->batch = vma; ++ ++ /* eb->batch_len unchanged */ ++ ++ if (CMDPARSER_USES_GGTT(eb->i915)) ++ eb->batch_flags |= I915_DISPATCH_SECURE; + + out: + i915_gem_object_unpin_pages(shadow_batch_obj); +@@ -2177,6 +2232,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, + struct drm_i915_gem_exec_object2 *exec, + struct drm_syncobj **fences) + { ++ struct drm_i915_private *i915 = to_i915(dev); + struct i915_execbuffer eb; + struct dma_fence *in_fence = NULL; + struct sync_file *out_fence = NULL; +@@ -2187,7 +2243,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, + BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS & + ~__EXEC_OBJECT_UNKNOWN_FLAGS); + +- eb.i915 = to_i915(dev); ++ eb.i915 = i915; + eb.file = file; + eb.args = args; + if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC)) +@@ -2209,8 +2265,15 @@ i915_gem_do_execbuffer(struct drm_device *dev, + + eb.batch_flags = 0; + if (args->flags & I915_EXEC_SECURE) { ++ if (INTEL_GEN(i915) >= 11) ++ return -ENODEV; ++ ++ /* Return -EPERM to trigger fallback code on old binaries. */ ++ if (!HAS_SECURE_BATCHES(i915)) ++ return -EPERM; ++ + if (!drm_is_current_master(file) || !capable(CAP_SYS_ADMIN)) +- return -EPERM; ++ return -EPERM; + + eb.batch_flags |= I915_DISPATCH_SECURE; + } +@@ -2297,34 +2360,19 @@ i915_gem_do_execbuffer(struct drm_device *dev, + goto err_vma; + } + ++ if (eb.batch_len == 0) ++ eb.batch_len = eb.batch->size - eb.batch_start_offset; ++ + if (eb_use_cmdparser(&eb)) { + struct i915_vma *vma; + +- vma = eb_parse(&eb, drm_is_current_master(file)); ++ vma = eb_parse(&eb); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_vma; + } +- +- if (vma) { +- /* +- * Batch parsed and accepted: +- * +- * Set the DISPATCH_SECURE bit to remove the NON_SECURE +- * bit from MI_BATCH_BUFFER_START commands issued in +- * the dispatch_execbuffer implementations. We +- * specifically don't want that set on batches the +- * command parser has accepted. +- */ +- eb.batch_flags |= I915_DISPATCH_SECURE; +- eb.batch_start_offset = 0; +- eb.batch = vma; +- } + } + +- if (eb.batch_len == 0) +- eb.batch_len = eb.batch->size - eb.batch_start_offset; +- + /* + * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure + * batch" bit. Hence we need to pin secure batches into the global gtt. 
+diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c +index 87411a5aba77..d4c6aa7fbac8 100644 +--- a/drivers/gpu/drm/i915/i915_gem_gtt.c ++++ b/drivers/gpu/drm/i915/i915_gem_gtt.c +@@ -158,7 +158,8 @@ int intel_sanitize_enable_ppgtt(struct drm_i915_private *dev_priv, + if (enable_ppgtt == 0 && INTEL_GEN(dev_priv) < 9) + return 0; + +- if (enable_ppgtt == 1) ++ /* Full PPGTT is required by the Gen9 cmdparser */ ++ if (enable_ppgtt == 1 && INTEL_GEN(dev_priv) != 9) + return 1; + + if (enable_ppgtt == 2 && has_full_ppgtt) +diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h +index 4e070afb2738..a6f4f32dd71c 100644 +--- a/drivers/gpu/drm/i915/i915_reg.h ++++ b/drivers/gpu/drm/i915/i915_reg.h +@@ -387,6 +387,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) + #define ECOCHK_PPGTT_WT_HSW (0x2 << 3) + #define ECOCHK_PPGTT_WB_HSW (0x3 << 3) + ++#define GEN8_RC6_CTX_INFO _MMIO(0x8504) ++ + #define GAC_ECO_BITS _MMIO(0x14090) + #define ECOBITS_SNB_BIT (1 << 13) + #define ECOBITS_PPGTT_CACHE64B (3 << 8) +@@ -471,6 +473,10 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) + */ + #define BCS_SWCTRL _MMIO(0x22200) + ++/* There are 16 GPR registers */ ++#define BCS_GPR(n) _MMIO(0x22600 + (n) * 8) ++#define BCS_GPR_UDW(n) _MMIO(0x22600 + (n) * 8 + 4) ++ + #define GPGPU_THREADS_DISPATCHED _MMIO(0x2290) + #define GPGPU_THREADS_DISPATCHED_UDW _MMIO(0x2290 + 4) + #define HS_INVOCATION_COUNT _MMIO(0x2300) +@@ -7005,6 +7011,10 @@ enum { + #define SKL_CSR_DC5_DC6_COUNT _MMIO(0x8002C) + #define BXT_CSR_DC3_DC5_COUNT _MMIO(0x80038) + ++/* Display Internal Timeout Register */ ++#define RM_TIMEOUT _MMIO(0x42060) ++#define MMIO_TIMEOUT_US(us) ((us) << 0) ++ + /* interrupts */ + #define DE_MASTER_IRQ_CONTROL (1 << 31) + #define DE_SPRITEB_FLIP_DONE (1 << 29) +diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h +index 50d56498de77..b1154d803564 100644 +--- a/drivers/gpu/drm/i915/intel_drv.h ++++ b/drivers/gpu/drm/i915/intel_drv.h +@@ -2064,6 +2064,9 @@ void intel_sanitize_gt_powersave(struct drm_i915_private *dev_priv); + void intel_enable_gt_powersave(struct drm_i915_private *dev_priv); + void intel_disable_gt_powersave(struct drm_i915_private *dev_priv); + void intel_suspend_gt_powersave(struct drm_i915_private *dev_priv); ++bool i915_rc6_ctx_wa_check(struct drm_i915_private *i915); ++void i915_rc6_ctx_wa_suspend(struct drm_i915_private *i915); ++void i915_rc6_ctx_wa_resume(struct drm_i915_private *i915); + void gen6_rps_busy(struct drm_i915_private *dev_priv); + void gen6_rps_reset_ei(struct drm_i915_private *dev_priv); + void gen6_rps_idle(struct drm_i915_private *dev_priv); +diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c +index 425df814de75..8d731eb1de69 100644 +--- a/drivers/gpu/drm/i915/intel_pm.c ++++ b/drivers/gpu/drm/i915/intel_pm.c +@@ -114,6 +114,14 @@ static void bxt_init_clock_gating(struct drm_i915_private *dev_priv) + */ + I915_WRITE(GEN9_CLKGATE_DIS_0, I915_READ(GEN9_CLKGATE_DIS_0) | + PWM1_GATING_DIS | PWM2_GATING_DIS); ++ ++ /* ++ * Lower the display internal timeout. ++ * This is needed to avoid any hard hangs when DSI port PLL ++ * is off and a MMIO access is attempted by any privilege ++ * application, using batch buffers or any other means. 
++ */ ++ I915_WRITE(RM_TIMEOUT, MMIO_TIMEOUT_US(950)); + } + + static void glk_init_clock_gating(struct drm_i915_private *dev_priv) +@@ -8188,6 +8196,95 @@ static void intel_init_emon(struct drm_i915_private *dev_priv) + dev_priv->ips.corr = (lcfuse & LCFUSE_HIV_MASK); + } + ++static bool i915_rc6_ctx_corrupted(struct drm_i915_private *dev_priv) ++{ ++ return !I915_READ(GEN8_RC6_CTX_INFO); ++} ++ ++static void i915_rc6_ctx_wa_init(struct drm_i915_private *i915) ++{ ++ if (!NEEDS_RC6_CTX_CORRUPTION_WA(i915)) ++ return; ++ ++ if (i915_rc6_ctx_corrupted(i915)) { ++ DRM_INFO("RC6 context corrupted, disabling runtime power management\n"); ++ i915->gt_pm.rc6.ctx_corrupted = true; ++ intel_runtime_pm_get(i915); ++ } ++} ++ ++static void i915_rc6_ctx_wa_cleanup(struct drm_i915_private *i915) ++{ ++ if (i915->gt_pm.rc6.ctx_corrupted) { ++ intel_runtime_pm_put(i915); ++ i915->gt_pm.rc6.ctx_corrupted = false; ++ } ++} ++ ++/** ++ * i915_rc6_ctx_wa_suspend - system suspend sequence for the RC6 CTX WA ++ * @i915: i915 device ++ * ++ * Perform any steps needed to clean up the RC6 CTX WA before system suspend. ++ */ ++void i915_rc6_ctx_wa_suspend(struct drm_i915_private *i915) ++{ ++ if (i915->gt_pm.rc6.ctx_corrupted) ++ intel_runtime_pm_put(i915); ++} ++ ++/** ++ * i915_rc6_ctx_wa_resume - system resume sequence for the RC6 CTX WA ++ * @i915: i915 device ++ * ++ * Perform any steps needed to re-init the RC6 CTX WA after system resume. ++ */ ++void i915_rc6_ctx_wa_resume(struct drm_i915_private *i915) ++{ ++ if (!i915->gt_pm.rc6.ctx_corrupted) ++ return; ++ ++ if (i915_rc6_ctx_corrupted(i915)) { ++ intel_runtime_pm_get(i915); ++ return; ++ } ++ ++ DRM_INFO("RC6 context restored, re-enabling runtime power management\n"); ++ i915->gt_pm.rc6.ctx_corrupted = false; ++} ++ ++static void intel_disable_rc6(struct drm_i915_private *dev_priv); ++ ++/** ++ * i915_rc6_ctx_wa_check - check for a new RC6 CTX corruption ++ * @i915: i915 device ++ * ++ * Check if an RC6 CTX corruption has happened since the last check and if so ++ * disable RC6 and runtime power management. ++ * ++ * Return false if no context corruption has happened since the last call of ++ * this function, true otherwise. 
++*/ ++bool i915_rc6_ctx_wa_check(struct drm_i915_private *i915) ++{ ++ if (!NEEDS_RC6_CTX_CORRUPTION_WA(i915)) ++ return false; ++ ++ if (i915->gt_pm.rc6.ctx_corrupted) ++ return false; ++ ++ if (!i915_rc6_ctx_corrupted(i915)) ++ return false; ++ ++ DRM_NOTE("RC6 context corruption, disabling runtime power management\n"); ++ ++ intel_disable_rc6(i915); ++ i915->gt_pm.rc6.ctx_corrupted = true; ++ intel_runtime_pm_get_noresume(i915); ++ ++ return true; ++} ++ + void intel_init_gt_powersave(struct drm_i915_private *dev_priv) + { + struct intel_rps *rps = &dev_priv->gt_pm.rps; +@@ -8203,6 +8300,8 @@ void intel_init_gt_powersave(struct drm_i915_private *dev_priv) + + mutex_lock(&dev_priv->pcu_lock); + ++ i915_rc6_ctx_wa_init(dev_priv); ++ + /* Initialize RPS limits (for userspace) */ + if (IS_CHERRYVIEW(dev_priv)) + cherryview_init_gt_powersave(dev_priv); +@@ -8249,6 +8348,8 @@ void intel_cleanup_gt_powersave(struct drm_i915_private *dev_priv) + if (IS_VALLEYVIEW(dev_priv)) + valleyview_cleanup_gt_powersave(dev_priv); + ++ i915_rc6_ctx_wa_cleanup(dev_priv); ++ + if (!HAS_RC6(dev_priv)) + intel_runtime_pm_put(dev_priv); + } +@@ -8293,7 +8394,7 @@ static inline void intel_disable_llc_pstate(struct drm_i915_private *i915) + i915->gt_pm.llc_pstate.enabled = false; + } + +-static void intel_disable_rc6(struct drm_i915_private *dev_priv) ++static void __intel_disable_rc6(struct drm_i915_private *dev_priv) + { + lockdep_assert_held(&dev_priv->pcu_lock); + +@@ -8312,6 +8413,13 @@ static void intel_disable_rc6(struct drm_i915_private *dev_priv) + dev_priv->gt_pm.rc6.enabled = false; + } + ++static void intel_disable_rc6(struct drm_i915_private *dev_priv) ++{ ++ mutex_lock(&dev_priv->pcu_lock); ++ __intel_disable_rc6(dev_priv); ++ mutex_unlock(&dev_priv->pcu_lock); ++} ++ + static void intel_disable_rps(struct drm_i915_private *dev_priv) + { + lockdep_assert_held(&dev_priv->pcu_lock); +@@ -8337,7 +8445,7 @@ void intel_disable_gt_powersave(struct drm_i915_private *dev_priv) + { + mutex_lock(&dev_priv->pcu_lock); + +- intel_disable_rc6(dev_priv); ++ __intel_disable_rc6(dev_priv); + intel_disable_rps(dev_priv); + if (HAS_LLC(dev_priv)) + intel_disable_llc_pstate(dev_priv); +@@ -8364,6 +8472,9 @@ static void intel_enable_rc6(struct drm_i915_private *dev_priv) + if (dev_priv->gt_pm.rc6.enabled) + return; + ++ if (dev_priv->gt_pm.rc6.ctx_corrupted) ++ return; ++ + if (IS_CHERRYVIEW(dev_priv)) + cherryview_enable_rc6(dev_priv); + else if (IS_VALLEYVIEW(dev_priv)) +diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h +index f5ffa6d31e82..eaf1a161bc96 100644 +--- a/drivers/gpu/drm/i915/intel_ringbuffer.h ++++ b/drivers/gpu/drm/i915/intel_ringbuffer.h +@@ -584,9 +584,10 @@ struct intel_engine_cs { + + struct intel_engine_hangcheck hangcheck; + +-#define I915_ENGINE_NEEDS_CMD_PARSER BIT(0) +-#define I915_ENGINE_SUPPORTS_STATS BIT(1) +-#define I915_ENGINE_HAS_PREEMPTION BIT(2) ++#define I915_ENGINE_USING_CMD_PARSER BIT(0) ++#define I915_ENGINE_SUPPORTS_STATS BIT(1) ++#define I915_ENGINE_HAS_PREEMPTION BIT(2) ++#define I915_ENGINE_REQUIRES_CMD_PARSER BIT(3) + unsigned int flags; + + /* +@@ -647,9 +648,15 @@ struct intel_engine_cs { + }; + + static inline bool +-intel_engine_needs_cmd_parser(const struct intel_engine_cs *engine) ++intel_engine_using_cmd_parser(const struct intel_engine_cs *engine) + { +- return engine->flags & I915_ENGINE_NEEDS_CMD_PARSER; ++ return engine->flags & I915_ENGINE_USING_CMD_PARSER; ++} ++ ++static inline bool 
++intel_engine_requires_cmd_parser(const struct intel_engine_cs *engine) ++{ ++ return engine->flags & I915_ENGINE_REQUIRES_CMD_PARSER; + } + + static inline bool +diff --git a/drivers/gpu/drm/radeon/si_dpm.c b/drivers/gpu/drm/radeon/si_dpm.c +index 0a785ef0ab66..db2d8b84e137 100644 +--- a/drivers/gpu/drm/radeon/si_dpm.c ++++ b/drivers/gpu/drm/radeon/si_dpm.c +@@ -1956,6 +1956,7 @@ static void si_initialize_powertune_defaults(struct radeon_device *rdev) + case 0x682C: + si_pi->cac_weights = cac_weights_cape_verde_pro; + si_pi->dte_data = dte_data_sun_xt; ++ update_dte_from_pl2 = true; + break; + case 0x6825: + case 0x6827: +diff --git a/drivers/hid/hid-google-hammer.c b/drivers/hid/hid-google-hammer.c +index 6bf4da7ad63a..8cb63ea9977d 100644 +--- a/drivers/hid/hid-google-hammer.c ++++ b/drivers/hid/hid-google-hammer.c +@@ -120,6 +120,10 @@ static int hammer_input_configured(struct hid_device *hdev, + static const struct hid_device_id hammer_devices[] = { + { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, + USB_VENDOR_ID_GOOGLE, USB_DEVICE_ID_GOOGLE_HAMMER) }, ++ { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, ++ USB_VENDOR_ID_GOOGLE, USB_DEVICE_ID_GOOGLE_MAGNEMITE) }, ++ { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, ++ USB_VENDOR_ID_GOOGLE, USB_DEVICE_ID_GOOGLE_MASTERBALL) }, + { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, + USB_VENDOR_ID_GOOGLE, USB_DEVICE_ID_GOOGLE_STAFF) }, + { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, +diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h +index 6b33117ca60e..02c263a4c083 100644 +--- a/drivers/hid/hid-ids.h ++++ b/drivers/hid/hid-ids.h +@@ -466,6 +466,8 @@ + #define USB_DEVICE_ID_GOOGLE_STAFF 0x502b + #define USB_DEVICE_ID_GOOGLE_WAND 0x502d + #define USB_DEVICE_ID_GOOGLE_WHISKERS 0x5030 ++#define USB_DEVICE_ID_GOOGLE_MASTERBALL 0x503c ++#define USB_DEVICE_ID_GOOGLE_MAGNEMITE 0x503d + + #define USB_VENDOR_ID_GOTOP 0x08f2 + #define USB_DEVICE_ID_SUPER_Q2 0x007f +diff --git a/drivers/hid/intel-ish-hid/ishtp/client-buffers.c b/drivers/hid/intel-ish-hid/ishtp/client-buffers.c +index b9b917d2d50d..c41dbb167c91 100644 +--- a/drivers/hid/intel-ish-hid/ishtp/client-buffers.c ++++ b/drivers/hid/intel-ish-hid/ishtp/client-buffers.c +@@ -90,7 +90,7 @@ int ishtp_cl_alloc_tx_ring(struct ishtp_cl *cl) + return 0; + out: + dev_err(&cl->device->dev, "error in allocating Tx pool\n"); +- ishtp_cl_free_rx_ring(cl); ++ ishtp_cl_free_tx_ring(cl); + return -ENOMEM; + } + +diff --git a/drivers/hid/wacom.h b/drivers/hid/wacom.h +index 3c37c3cbf6f1..9c0900c35b23 100644 +--- a/drivers/hid/wacom.h ++++ b/drivers/hid/wacom.h +@@ -205,6 +205,21 @@ static inline void wacom_schedule_work(struct wacom_wac *wacom_wac, + } + } + ++/* ++ * Convert a signed 32-bit integer to an unsigned n-bit integer. Undoes ++ * the normally-helpful work of 'hid_snto32' for fields that use signed ++ * ranges for questionable reasons. ++ */ ++static inline __u32 wacom_s32tou(s32 value, __u8 n) ++{ ++ switch (n) { ++ case 8: return ((__u8)value); ++ case 16: return ((__u16)value); ++ case 32: return ((__u32)value); ++ } ++ return value & (1 << (n - 1)) ? 
value & (~(~0U << n)) : value; ++} ++ + extern const struct hid_device_id wacom_ids[]; + + void wacom_wac_irq(struct wacom_wac *wacom_wac, size_t len); +diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c +index 1df037e7f0b4..77bb46948eea 100644 +--- a/drivers/hid/wacom_wac.c ++++ b/drivers/hid/wacom_wac.c +@@ -2271,7 +2271,7 @@ static void wacom_wac_pen_event(struct hid_device *hdev, struct hid_field *field + case HID_DG_TOOLSERIALNUMBER: + if (value) { + wacom_wac->serial[0] = (wacom_wac->serial[0] & ~0xFFFFFFFFULL); +- wacom_wac->serial[0] |= (__u32)value; ++ wacom_wac->serial[0] |= wacom_s32tou(value, field->report_size); + } + return; + case HID_DG_TWIST: +@@ -2287,15 +2287,17 @@ static void wacom_wac_pen_event(struct hid_device *hdev, struct hid_field *field + return; + case WACOM_HID_WD_SERIALHI: + if (value) { ++ __u32 raw_value = wacom_s32tou(value, field->report_size); ++ + wacom_wac->serial[0] = (wacom_wac->serial[0] & 0xFFFFFFFF); +- wacom_wac->serial[0] |= ((__u64)value) << 32; ++ wacom_wac->serial[0] |= ((__u64)raw_value) << 32; + /* + * Non-USI EMR devices may contain additional tool type + * information here. See WACOM_HID_WD_TOOLTYPE case for + * more details. + */ + if (value >> 20 == 1) { +- wacom_wac->id[0] |= value & 0xFFFFF; ++ wacom_wac->id[0] |= raw_value & 0xFFFFF; + } + } + return; +@@ -2307,7 +2309,7 @@ static void wacom_wac_pen_event(struct hid_device *hdev, struct hid_field *field + * bitwise OR so the complete value can be built + * up over time :( + */ +- wacom_wac->id[0] |= value; ++ wacom_wac->id[0] |= wacom_s32tou(value, field->report_size); + return; + case WACOM_HID_WD_OFFSETLEFT: + if (features->offset_left && value != features->offset_left) +diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c +index 968319f4e5f1..8421009d3a9d 100644 +--- a/drivers/hwtracing/intel_th/pci.c ++++ b/drivers/hwtracing/intel_th/pci.c +@@ -175,6 +175,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x02a6), + .driver_data = (kernel_ulong_t)&intel_th_2x, + }, ++ { ++ /* Comet Lake PCH */ ++ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x06a6), ++ .driver_data = (kernel_ulong_t)&intel_th_2x, ++ }, + { + /* Ice Lake NNPI */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x45c5), +@@ -185,6 +190,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xa0a6), + .driver_data = (kernel_ulong_t)&intel_th_2x, + }, ++ { ++ /* Jasper Lake PCH */ ++ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x4da6), ++ .driver_data = (kernel_ulong_t)&intel_th_2x, ++ }, + { 0 }, + }; + +diff --git a/drivers/iio/adc/stm32-adc.c b/drivers/iio/adc/stm32-adc.c +index c52d20f7ca2e..0409dcf5b047 100644 +--- a/drivers/iio/adc/stm32-adc.c ++++ b/drivers/iio/adc/stm32-adc.c +@@ -1340,7 +1340,7 @@ static int stm32_adc_dma_start(struct iio_dev *indio_dev) + cookie = dmaengine_submit(desc); + ret = dma_submit_error(cookie); + if (ret) { +- dmaengine_terminate_all(adc->dma_chan); ++ dmaengine_terminate_sync(adc->dma_chan); + return ret; + } + +@@ -1413,7 +1413,7 @@ static int stm32_adc_buffer_predisable(struct iio_dev *indio_dev) + dev_err(&indio_dev->dev, "predisable failed\n"); + + if (adc->dma_chan) +- dmaengine_terminate_all(adc->dma_chan); ++ dmaengine_terminate_sync(adc->dma_chan); + + if (stm32_adc_set_trig(indio_dev, NULL)) + dev_err(&indio_dev->dev, "Can't clear trigger\n"); +diff --git a/drivers/iio/imu/adis16480.c b/drivers/iio/imu/adis16480.c +index a27fe208f3ae..d3af4f28cbbb 100644 +--- 
a/drivers/iio/imu/adis16480.c ++++ b/drivers/iio/imu/adis16480.c +@@ -270,8 +270,11 @@ static int adis16480_set_freq(struct iio_dev *indio_dev, int val, int val2) + struct adis16480 *st = iio_priv(indio_dev); + unsigned int t; + ++ if (val < 0 || val2 < 0) ++ return -EINVAL; ++ + t = val * 1000 + val2 / 1000; +- if (t <= 0) ++ if (t == 0) + return -EINVAL; + + t = 2460000 / t; +diff --git a/drivers/iio/imu/inv_mpu6050/Kconfig b/drivers/iio/imu/inv_mpu6050/Kconfig +index 5483b2ea754d..d2fe9dbddda7 100644 +--- a/drivers/iio/imu/inv_mpu6050/Kconfig ++++ b/drivers/iio/imu/inv_mpu6050/Kconfig +@@ -13,8 +13,8 @@ config INV_MPU6050_I2C + select INV_MPU6050_IIO + select REGMAP_I2C + help +- This driver supports the Invensense MPU6050/6500/9150 and ICM20608 +- motion tracking devices over I2C. ++ This driver supports the Invensense MPU6050/6500/9150 and ++ ICM20608/20602 motion tracking devices over I2C. + This driver can be built as a module. The module will be called + inv-mpu6050-i2c. + +@@ -24,7 +24,7 @@ config INV_MPU6050_SPI + select INV_MPU6050_IIO + select REGMAP_SPI + help +- This driver supports the Invensense MPU6050/6500/9150 and ICM20608 +- motion tracking devices over SPI. ++ This driver supports the Invensense MPU6050/6500/9150 and ++ ICM20608/20602 motion tracking devices over SPI. + This driver can be built as a module. The module will be called + inv-mpu6050-spi. +diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c +index d80ef468508a..baba8e5459d0 100644 +--- a/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c ++++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c +@@ -37,6 +37,29 @@ static const int gyro_scale_6050[] = {133090, 266181, 532362, 1064724}; + */ + static const int accel_scale[] = {598, 1196, 2392, 4785}; + ++static const struct inv_mpu6050_reg_map reg_set_icm20602 = { ++ .sample_rate_div = INV_MPU6050_REG_SAMPLE_RATE_DIV, ++ .lpf = INV_MPU6050_REG_CONFIG, ++ .accel_lpf = INV_MPU6500_REG_ACCEL_CONFIG_2, ++ .user_ctrl = INV_MPU6050_REG_USER_CTRL, ++ .fifo_en = INV_MPU6050_REG_FIFO_EN, ++ .gyro_config = INV_MPU6050_REG_GYRO_CONFIG, ++ .accl_config = INV_MPU6050_REG_ACCEL_CONFIG, ++ .fifo_count_h = INV_MPU6050_REG_FIFO_COUNT_H, ++ .fifo_r_w = INV_MPU6050_REG_FIFO_R_W, ++ .raw_gyro = INV_MPU6050_REG_RAW_GYRO, ++ .raw_accl = INV_MPU6050_REG_RAW_ACCEL, ++ .temperature = INV_MPU6050_REG_TEMPERATURE, ++ .int_enable = INV_MPU6050_REG_INT_ENABLE, ++ .int_status = INV_MPU6050_REG_INT_STATUS, ++ .pwr_mgmt_1 = INV_MPU6050_REG_PWR_MGMT_1, ++ .pwr_mgmt_2 = INV_MPU6050_REG_PWR_MGMT_2, ++ .int_pin_cfg = INV_MPU6050_REG_INT_PIN_CFG, ++ .accl_offset = INV_MPU6500_REG_ACCEL_OFFSET, ++ .gyro_offset = INV_MPU6050_REG_GYRO_OFFSET, ++ .i2c_if = INV_ICM20602_REG_I2C_IF, ++}; ++ + static const struct inv_mpu6050_reg_map reg_set_6500 = { + .sample_rate_div = INV_MPU6050_REG_SAMPLE_RATE_DIV, + .lpf = INV_MPU6050_REG_CONFIG, +@@ -57,6 +80,7 @@ static const struct inv_mpu6050_reg_map reg_set_6500 = { + .int_pin_cfg = INV_MPU6050_REG_INT_PIN_CFG, + .accl_offset = INV_MPU6500_REG_ACCEL_OFFSET, + .gyro_offset = INV_MPU6050_REG_GYRO_OFFSET, ++ .i2c_if = 0, + }; + + static const struct inv_mpu6050_reg_map reg_set_6050 = { +@@ -77,6 +101,7 @@ static const struct inv_mpu6050_reg_map reg_set_6050 = { + .int_pin_cfg = INV_MPU6050_REG_INT_PIN_CFG, + .accl_offset = INV_MPU6050_REG_ACCEL_OFFSET, + .gyro_offset = INV_MPU6050_REG_GYRO_OFFSET, ++ .i2c_if = 0, + }; + + static const struct inv_mpu6050_chip_config chip_config_6050 = { +@@ -96,48 +121,63 @@ static const struct 
inv_mpu6050_hw hw_info[] = { + .name = "MPU6050", + .reg = ®_set_6050, + .config = &chip_config_6050, ++ .fifo_size = 1024, + }, + { + .whoami = INV_MPU6500_WHOAMI_VALUE, + .name = "MPU6500", + .reg = ®_set_6500, + .config = &chip_config_6050, ++ .fifo_size = 512, + }, + { + .whoami = INV_MPU6515_WHOAMI_VALUE, + .name = "MPU6515", + .reg = ®_set_6500, + .config = &chip_config_6050, ++ .fifo_size = 512, + }, + { + .whoami = INV_MPU6000_WHOAMI_VALUE, + .name = "MPU6000", + .reg = ®_set_6050, + .config = &chip_config_6050, ++ .fifo_size = 1024, + }, + { + .whoami = INV_MPU9150_WHOAMI_VALUE, + .name = "MPU9150", + .reg = ®_set_6050, + .config = &chip_config_6050, ++ .fifo_size = 1024, + }, + { + .whoami = INV_MPU9250_WHOAMI_VALUE, + .name = "MPU9250", + .reg = ®_set_6500, + .config = &chip_config_6050, ++ .fifo_size = 512, + }, + { + .whoami = INV_MPU9255_WHOAMI_VALUE, + .name = "MPU9255", + .reg = ®_set_6500, + .config = &chip_config_6050, ++ .fifo_size = 512, + }, + { + .whoami = INV_ICM20608_WHOAMI_VALUE, + .name = "ICM20608", + .reg = ®_set_6500, + .config = &chip_config_6050, ++ .fifo_size = 512, ++ }, ++ { ++ .whoami = INV_ICM20602_WHOAMI_VALUE, ++ .name = "ICM20602", ++ .reg = ®_set_icm20602, ++ .config = &chip_config_6050, ++ .fifo_size = 1008, + }, + }; + +@@ -439,7 +479,10 @@ inv_mpu6050_read_raw(struct iio_dev *indio_dev, + return IIO_VAL_INT_PLUS_MICRO; + case IIO_TEMP: + *val = 0; +- *val2 = INV_MPU6050_TEMP_SCALE; ++ if (st->chip_type == INV_ICM20602) ++ *val2 = INV_ICM20602_TEMP_SCALE; ++ else ++ *val2 = INV_MPU6050_TEMP_SCALE; + + return IIO_VAL_INT_PLUS_MICRO; + default: +@@ -448,7 +491,10 @@ inv_mpu6050_read_raw(struct iio_dev *indio_dev, + case IIO_CHAN_INFO_OFFSET: + switch (chan->type) { + case IIO_TEMP: +- *val = INV_MPU6050_TEMP_OFFSET; ++ if (st->chip_type == INV_ICM20602) ++ *val = INV_ICM20602_TEMP_OFFSET; ++ else ++ *val = INV_MPU6050_TEMP_OFFSET; + + return IIO_VAL_INT; + default: +@@ -813,6 +859,32 @@ static const struct iio_chan_spec inv_mpu_channels[] = { + INV_MPU6050_CHAN(IIO_ACCEL, IIO_MOD_Z, INV_MPU6050_SCAN_ACCL_Z), + }; + ++static const struct iio_chan_spec inv_icm20602_channels[] = { ++ IIO_CHAN_SOFT_TIMESTAMP(INV_ICM20602_SCAN_TIMESTAMP), ++ { ++ .type = IIO_TEMP, ++ .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) ++ | BIT(IIO_CHAN_INFO_OFFSET) ++ | BIT(IIO_CHAN_INFO_SCALE), ++ .scan_index = INV_ICM20602_SCAN_TEMP, ++ .scan_type = { ++ .sign = 's', ++ .realbits = 16, ++ .storagebits = 16, ++ .shift = 0, ++ .endianness = IIO_BE, ++ }, ++ }, ++ ++ INV_MPU6050_CHAN(IIO_ANGL_VEL, IIO_MOD_X, INV_ICM20602_SCAN_GYRO_X), ++ INV_MPU6050_CHAN(IIO_ANGL_VEL, IIO_MOD_Y, INV_ICM20602_SCAN_GYRO_Y), ++ INV_MPU6050_CHAN(IIO_ANGL_VEL, IIO_MOD_Z, INV_ICM20602_SCAN_GYRO_Z), ++ ++ INV_MPU6050_CHAN(IIO_ACCEL, IIO_MOD_Y, INV_ICM20602_SCAN_ACCL_Y), ++ INV_MPU6050_CHAN(IIO_ACCEL, IIO_MOD_X, INV_ICM20602_SCAN_ACCL_X), ++ INV_MPU6050_CHAN(IIO_ACCEL, IIO_MOD_Z, INV_ICM20602_SCAN_ACCL_Z), ++}; ++ + /* + * The user can choose any frequency between INV_MPU6050_MIN_FIFO_RATE and + * INV_MPU6050_MAX_FIFO_RATE, but only these frequencies are matched by the +@@ -1013,8 +1085,14 @@ int inv_mpu_core_probe(struct regmap *regmap, int irq, const char *name, + indio_dev->name = name; + else + indio_dev->name = dev_name(dev); +- indio_dev->channels = inv_mpu_channels; +- indio_dev->num_channels = ARRAY_SIZE(inv_mpu_channels); ++ ++ if (chip_type == INV_ICM20602) { ++ indio_dev->channels = inv_icm20602_channels; ++ indio_dev->num_channels = ARRAY_SIZE(inv_icm20602_channels); ++ } else { ++ 
indio_dev->channels = inv_mpu_channels; ++ indio_dev->num_channels = ARRAY_SIZE(inv_mpu_channels); ++ } + + indio_dev->info = &mpu_info; + indio_dev->modes = INDIO_BUFFER_TRIGGERED; +diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_i2c.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_i2c.c +index dd758e3d403d..e46eb4ddea21 100644 +--- a/drivers/iio/imu/inv_mpu6050/inv_mpu_i2c.c ++++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_i2c.c +@@ -127,6 +127,7 @@ static int inv_mpu_probe(struct i2c_client *client, + st = iio_priv(dev_get_drvdata(&client->dev)); + switch (st->chip_type) { + case INV_ICM20608: ++ case INV_ICM20602: + /* no i2c auxiliary bus on the chip */ + break; + default: +@@ -179,6 +180,7 @@ static const struct i2c_device_id inv_mpu_id[] = { + {"mpu9250", INV_MPU9250}, + {"mpu9255", INV_MPU9255}, + {"icm20608", INV_ICM20608}, ++ {"icm20602", INV_ICM20602}, + {} + }; + +@@ -213,6 +215,10 @@ static const struct of_device_id inv_of_match[] = { + .compatible = "invensense,icm20608", + .data = (void *)INV_ICM20608 + }, ++ { ++ .compatible = "invensense,icm20602", ++ .data = (void *)INV_ICM20602 ++ }, + { } + }; + MODULE_DEVICE_TABLE(of, inv_of_match); +diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h b/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h +index e69a59659dbc..6ef872f97c17 100644 +--- a/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h ++++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h +@@ -44,6 +44,7 @@ + * @int_pin_cfg; Controls interrupt pin configuration. + * @accl_offset: Controls the accelerometer calibration offset. + * @gyro_offset: Controls the gyroscope calibration offset. ++ * @i2c_if: Controls the i2c interface + */ + struct inv_mpu6050_reg_map { + u8 sample_rate_div; +@@ -65,6 +66,7 @@ struct inv_mpu6050_reg_map { + u8 int_pin_cfg; + u8 accl_offset; + u8 gyro_offset; ++ u8 i2c_if; + }; + + /*device enum */ +@@ -77,6 +79,7 @@ enum inv_devices { + INV_MPU9250, + INV_MPU9255, + INV_ICM20608, ++ INV_ICM20602, + INV_NUM_PARTS + }; + +@@ -105,12 +108,14 @@ struct inv_mpu6050_chip_config { + * @name: name of the chip. + * @reg: register map of the chip. + * @config: configuration of the chip. ++ * @fifo_size: size of the FIFO in bytes. 
+ */ + struct inv_mpu6050_hw { + u8 whoami; + u8 *name; + const struct inv_mpu6050_reg_map *reg; + const struct inv_mpu6050_chip_config *config; ++ size_t fifo_size; + }; + + /* +@@ -193,12 +198,19 @@ struct inv_mpu6050_state { + #define INV_MPU6050_BIT_PWR_ACCL_STBY 0x38 + #define INV_MPU6050_BIT_PWR_GYRO_STBY 0x07 + ++/* ICM20602 register */ ++#define INV_ICM20602_REG_I2C_IF 0x70 ++#define INV_ICM20602_BIT_I2C_IF_DIS 0x40 ++ + #define INV_MPU6050_REG_FIFO_COUNT_H 0x72 + #define INV_MPU6050_REG_FIFO_R_W 0x74 + + #define INV_MPU6050_BYTES_PER_3AXIS_SENSOR 6 + #define INV_MPU6050_FIFO_COUNT_BYTE 2 + ++/* ICM20602 FIFO samples include temperature readings */ ++#define INV_ICM20602_BYTES_PER_TEMP_SENSOR 2 ++ + /* mpu6500 registers */ + #define INV_MPU6500_REG_ACCEL_CONFIG_2 0x1D + #define INV_MPU6500_REG_ACCEL_OFFSET 0x77 +@@ -220,6 +232,9 @@ struct inv_mpu6050_state { + #define INV_MPU6050_GYRO_CONFIG_FSR_SHIFT 3 + #define INV_MPU6050_ACCL_CONFIG_FSR_SHIFT 3 + ++#define INV_ICM20602_TEMP_OFFSET 8170 ++#define INV_ICM20602_TEMP_SCALE 3060 ++ + /* 6 + 6 round up and plus 8 */ + #define INV_MPU6050_OUTPUT_DATA_SIZE 24 + +@@ -259,8 +274,9 @@ struct inv_mpu6050_state { + #define INV_MPU9255_WHOAMI_VALUE 0x73 + #define INV_MPU6515_WHOAMI_VALUE 0x74 + #define INV_ICM20608_WHOAMI_VALUE 0xAF ++#define INV_ICM20602_WHOAMI_VALUE 0x12 + +-/* scan element definition */ ++/* scan element definition for generic MPU6xxx devices */ + enum inv_mpu6050_scan { + INV_MPU6050_SCAN_ACCL_X, + INV_MPU6050_SCAN_ACCL_Y, +@@ -271,6 +287,18 @@ enum inv_mpu6050_scan { + INV_MPU6050_SCAN_TIMESTAMP, + }; + ++/* scan element definition for ICM20602, which includes temperature */ ++enum inv_icm20602_scan { ++ INV_ICM20602_SCAN_ACCL_X, ++ INV_ICM20602_SCAN_ACCL_Y, ++ INV_ICM20602_SCAN_ACCL_Z, ++ INV_ICM20602_SCAN_TEMP, ++ INV_ICM20602_SCAN_GYRO_X, ++ INV_ICM20602_SCAN_GYRO_Y, ++ INV_ICM20602_SCAN_GYRO_Z, ++ INV_ICM20602_SCAN_TIMESTAMP, ++}; ++ + enum inv_mpu6050_filter_e { + INV_MPU6050_FILTER_256HZ_NOLPF2 = 0, + INV_MPU6050_FILTER_188HZ, +diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c +index 548e042f7b5b..0e54f2d54bd7 100644 +--- a/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c ++++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c +@@ -188,9 +188,6 @@ irqreturn_t inv_mpu6050_read_fifo(int irq, void *p) + "failed to ack interrupt\n"); + goto flush_fifo; + } +- /* handle fifo overflow by reseting fifo */ +- if (int_status & INV_MPU6050_BIT_FIFO_OVERFLOW_INT) +- goto flush_fifo; + if (!(int_status & INV_MPU6050_BIT_RAW_DATA_RDY_INT)) { + dev_warn(regmap_get_device(st->map), + "spurious interrupt with status 0x%x\n", int_status); +@@ -207,6 +204,9 @@ irqreturn_t inv_mpu6050_read_fifo(int irq, void *p) + if (st->chip_config.gyro_fifo_enable) + bytes_per_datum += INV_MPU6050_BYTES_PER_3AXIS_SENSOR; + ++ if (st->chip_type == INV_ICM20602) ++ bytes_per_datum += INV_ICM20602_BYTES_PER_TEMP_SENSOR; ++ + /* + * read fifo_count register to know how many bytes are inside the FIFO + * right now +@@ -216,6 +216,18 @@ irqreturn_t inv_mpu6050_read_fifo(int irq, void *p) + if (result) + goto end_session; + fifo_count = get_unaligned_be16(&data[0]); ++ ++ /* ++ * Handle fifo overflow by resetting fifo. ++ * Reset if there is only 3 data set free remaining to mitigate ++ * possible delay between reading fifo count and fifo data. 
++ */ ++ nb = 3 * bytes_per_datum; ++ if (fifo_count >= st->hw->fifo_size - nb) { ++ dev_warn(regmap_get_device(st->map), "fifo overflow reset\n"); ++ goto flush_fifo; ++ } ++ + /* compute and process all complete datum */ + nb = fifo_count / bytes_per_datum; + inv_mpu6050_update_period(st, pf->timestamp, nb); +diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_spi.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_spi.c +index 227f50afff22..a112c3f45f74 100644 +--- a/drivers/iio/imu/inv_mpu6050/inv_mpu_spi.c ++++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_spi.c +@@ -31,9 +31,14 @@ static int inv_mpu_i2c_disable(struct iio_dev *indio_dev) + if (ret) + return ret; + +- st->chip_config.user_ctrl |= INV_MPU6050_BIT_I2C_IF_DIS; +- ret = regmap_write(st->map, st->reg->user_ctrl, +- st->chip_config.user_ctrl); ++ if (st->reg->i2c_if) { ++ ret = regmap_write(st->map, st->reg->i2c_if, ++ INV_ICM20602_BIT_I2C_IF_DIS); ++ } else { ++ st->chip_config.user_ctrl |= INV_MPU6050_BIT_I2C_IF_DIS; ++ ret = regmap_write(st->map, st->reg->user_ctrl, ++ st->chip_config.user_ctrl); ++ } + if (ret) { + inv_mpu6050_set_power_itg(st, false); + return ret; +@@ -81,6 +86,7 @@ static const struct spi_device_id inv_mpu_id[] = { + {"mpu9250", INV_MPU9250}, + {"mpu9255", INV_MPU9255}, + {"icm20608", INV_ICM20608}, ++ {"icm20602", INV_ICM20602}, + {} + }; + +diff --git a/drivers/iio/proximity/srf04.c b/drivers/iio/proximity/srf04.c +index 09c7b9c095b0..0428a0dfcbd4 100644 +--- a/drivers/iio/proximity/srf04.c ++++ b/drivers/iio/proximity/srf04.c +@@ -105,7 +105,7 @@ static int srf04_read(struct srf04_data *data) + udelay(10); + gpiod_set_value(data->gpiod_trig, 0); + +- /* it cannot take more than 20 ms */ ++ /* it should not take more than 20 ms until echo is rising */ + ret = wait_for_completion_killable_timeout(&data->rising, HZ/50); + if (ret < 0) { + mutex_unlock(&data->lock); +@@ -115,7 +115,8 @@ static int srf04_read(struct srf04_data *data) + return -ETIMEDOUT; + } + +- ret = wait_for_completion_killable_timeout(&data->falling, HZ/50); ++ /* it cannot take more than 50 ms until echo is falling */ ++ ret = wait_for_completion_killable_timeout(&data->falling, HZ/20); + if (ret < 0) { + mutex_unlock(&data->lock); + return ret; +@@ -130,19 +131,19 @@ static int srf04_read(struct srf04_data *data) + + dt_ns = ktime_to_ns(ktime_dt); + /* +- * measuring more than 3 meters is beyond the capabilities of +- * the sensor ++ * measuring more than 6,45 meters is beyond the capabilities of ++ * the supported sensors + * ==> filter out invalid results for not measuring echos of + * another us sensor + * + * formula: +- * distance 3 m +- * time = ---------- = --------- = 9404389 ns +- * speed 319 m/s ++ * distance 6,45 * 2 m ++ * time = ---------- = ------------ = 40438871 ns ++ * speed 319 m/s + * + * using a minimum speed at -20 °C of 319 m/s + */ +- if (dt_ns > 9404389) ++ if (dt_ns > 40438871) + return -EIO; + + time_ns = dt_ns; +@@ -154,20 +155,20 @@ static int srf04_read(struct srf04_data *data) + * with Temp in °C + * and speed in m/s + * +- * use 343 m/s as ultrasonic speed at 20 °C here in absence of the ++ * use 343,5 m/s as ultrasonic speed at 20 °C here in absence of the + * temperature + * + * therefore: +- * time 343 +- * distance = ------ * ----- +- * 10^6 2 ++ * time 343,5 time * 106 ++ * distance = ------ * ------- = ------------ ++ * 10^6 2 617176 + * with time in ns + * and distance in mm (one way) + * +- * because we limit to 3 meters the multiplication with 343 just ++ * because we limit to 6,45 meters the multiplication with 
106 just + * fits into 32 bit + */ +- distance_mm = time_ns * 343 / 2000000; ++ distance_mm = time_ns * 106 / 617176; + + return distance_mm; + } +diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h +index 5df8e548cc14..4a14de2d8c71 100644 +--- a/drivers/infiniband/core/uverbs.h ++++ b/drivers/infiniband/core/uverbs.h +@@ -98,7 +98,7 @@ ib_uverbs_init_udata_buf_or_null(struct ib_udata *udata, + + struct ib_uverbs_device { + atomic_t refcount; +- int num_comp_vectors; ++ u32 num_comp_vectors; + struct completion comp; + struct device *dev; + struct ib_device __rcu *ib_dev; +diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c +index 3be6405d9855..a5ff1f0f2073 100644 +--- a/drivers/infiniband/hw/cxgb4/cm.c ++++ b/drivers/infiniband/hw/cxgb4/cm.c +@@ -493,7 +493,6 @@ static int _put_ep_safe(struct c4iw_dev *dev, struct sk_buff *skb) + + ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *))); + release_ep_resources(ep); +- kfree_skb(skb); + return 0; + } + +@@ -504,7 +503,6 @@ static int _put_pass_ep_safe(struct c4iw_dev *dev, struct sk_buff *skb) + ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *))); + c4iw_put_ep(&ep->parent_ep->com); + release_ep_resources(ep); +- kfree_skb(skb); + return 0; + } + +@@ -2380,20 +2378,6 @@ static int accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, + enum chip_type adapter_type = ep->com.dev->rdev.lldi.adapter_type; + + pr_debug("ep %p tid %u\n", ep, ep->hwtid); +- +- skb_get(skb); +- rpl = cplhdr(skb); +- if (!is_t4(adapter_type)) { +- skb_trim(skb, roundup(sizeof(*rpl5), 16)); +- rpl5 = (void *)rpl; +- INIT_TP_WR(rpl5, ep->hwtid); +- } else { +- skb_trim(skb, sizeof(*rpl)); +- INIT_TP_WR(rpl, ep->hwtid); +- } +- OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, +- ep->hwtid)); +- + cxgb_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx, + enable_tcp_timestamps && req->tcpopt.tstamp, + (ep->com.remote_addr.ss_family == AF_INET) ? 
0 : 1); +@@ -2439,6 +2423,20 @@ static int accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, + if (tcph->ece && tcph->cwr) + opt2 |= CCTRL_ECN_V(1); + } ++ ++ skb_get(skb); ++ rpl = cplhdr(skb); ++ if (!is_t4(adapter_type)) { ++ skb_trim(skb, roundup(sizeof(*rpl5), 16)); ++ rpl5 = (void *)rpl; ++ INIT_TP_WR(rpl5, ep->hwtid); ++ } else { ++ skb_trim(skb, sizeof(*rpl)); ++ INIT_TP_WR(rpl, ep->hwtid); ++ } ++ OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, ++ ep->hwtid)); ++ + if (CHELSIO_CHIP_VERSION(adapter_type) > CHELSIO_T4) { + u32 isn = (prandom_u32() & ~7UL) - 1; + opt2 |= T5_OPT_2_VALID_F; +diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +index a442b29e7611..cf878e1b71fc 100644 +--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c ++++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +@@ -4572,9 +4572,9 @@ static void hns_roce_v2_free_eq(struct hns_roce_dev *hr_dev, + return; + } + +- if (eq->buf_list) +- dma_free_coherent(hr_dev->dev, buf_chk_sz, +- eq->buf_list->buf, eq->buf_list->map); ++ dma_free_coherent(hr_dev->dev, buf_chk_sz, eq->buf_list->buf, ++ eq->buf_list->map); ++ kfree(eq->buf_list); + } + + static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev, +diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c +index 77b1f3fd086a..900f85ce0fb0 100644 +--- a/drivers/infiniband/hw/mlx5/qp.c ++++ b/drivers/infiniband/hw/mlx5/qp.c +@@ -2828,10 +2828,12 @@ static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev, + } + + /* Only remove the old rate after new rate was set */ +- if ((old_rl.rate && +- !mlx5_rl_are_equal(&old_rl, &new_rl)) || +- (new_state != MLX5_SQC_STATE_RDY)) ++ if ((old_rl.rate && !mlx5_rl_are_equal(&old_rl, &new_rl)) || ++ (new_state != MLX5_SQC_STATE_RDY)) { + mlx5_rl_remove_rate(dev, &old_rl); ++ if (new_state != MLX5_SQC_STATE_RDY) ++ memset(&new_rl, 0, sizeof(new_rl)); ++ } + + ibqp->rl = new_rl; + sq->state = new_state; +diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c +index a0af6d424aed..d1680d3b5825 100644 +--- a/drivers/infiniband/hw/qedr/main.c ++++ b/drivers/infiniband/hw/qedr/main.c +@@ -77,7 +77,7 @@ static void qedr_get_dev_fw_str(struct ib_device *ibdev, char *str) + struct qedr_dev *qedr = get_qedr_dev(ibdev); + u32 fw_ver = (u32)qedr->attr.fw_ver; + +- snprintf(str, IB_FW_VERSION_NAME_MAX, "%d. %d. %d. 
%d", ++ snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d.%d", + (fw_ver >> 24) & 0xFF, (fw_ver >> 16) & 0xFF, + (fw_ver >> 8) & 0xFF, fw_ver & 0xFF); + } +diff --git a/drivers/iommu/amd_iommu_quirks.c b/drivers/iommu/amd_iommu_quirks.c +index c235f79b7a20..5120ce4fdce3 100644 +--- a/drivers/iommu/amd_iommu_quirks.c ++++ b/drivers/iommu/amd_iommu_quirks.c +@@ -73,6 +73,19 @@ static const struct dmi_system_id ivrs_quirks[] __initconst = { + }, + .driver_data = (void *)&ivrs_ioapic_quirks[DELL_LATITUDE_5495], + }, ++ { ++ /* ++ * Acer Aspire A315-41 requires the very same workaround as ++ * Dell Latitude 5495 ++ */ ++ .callback = ivrs_ioapic_quirk_cb, ++ .ident = "Acer Aspire A315-41", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "Acer"), ++ DMI_MATCH(DMI_PRODUCT_NAME, "Aspire A315-41"), ++ }, ++ .driver_data = (void *)&ivrs_ioapic_quirks[DELL_LATITUDE_5495], ++ }, + { + .callback = ivrs_ioapic_quirk_cb, + .ident = "Lenovo ideapad 330S-15ARR", +diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c +index 2804e2d1ae5e..9b8143dca512 100644 +--- a/drivers/net/bonding/bond_main.c ++++ b/drivers/net/bonding/bond_main.c +@@ -1798,7 +1798,8 @@ err_detach: + slave_disable_netpoll(new_slave); + + err_close: +- slave_dev->priv_flags &= ~IFF_BONDING; ++ if (!netif_is_bond_master(slave_dev)) ++ slave_dev->priv_flags &= ~IFF_BONDING; + dev_close(slave_dev); + + err_restore_mac: +@@ -2004,7 +2005,8 @@ static int __bond_release_one(struct net_device *bond_dev, + else + dev_set_mtu(slave_dev, slave->original_mtu); + +- slave_dev->priv_flags &= ~IFF_BONDING; ++ if (!netif_is_bond_master(slave_dev)) ++ slave_dev->priv_flags &= ~IFF_BONDING; + + bond_free_slave(slave); + +@@ -2074,8 +2076,7 @@ static int bond_miimon_inspect(struct bonding *bond) + ignore_updelay = !rcu_dereference(bond->curr_active_slave); + + bond_for_each_slave_rcu(bond, slave, iter) { +- slave->new_link = BOND_LINK_NOCHANGE; +- slave->link_new_state = slave->link; ++ bond_propose_link_state(slave, BOND_LINK_NOCHANGE); + + link_state = bond_check_dev_link(bond, slave->dev, 0); + +@@ -2111,7 +2112,7 @@ static int bond_miimon_inspect(struct bonding *bond) + } + + if (slave->delay <= 0) { +- slave->new_link = BOND_LINK_DOWN; ++ bond_propose_link_state(slave, BOND_LINK_DOWN); + commit++; + continue; + } +@@ -2150,7 +2151,7 @@ static int bond_miimon_inspect(struct bonding *bond) + slave->delay = 0; + + if (slave->delay <= 0) { +- slave->new_link = BOND_LINK_UP; ++ bond_propose_link_state(slave, BOND_LINK_UP); + commit++; + ignore_updelay = false; + continue; +@@ -2188,7 +2189,7 @@ static void bond_miimon_commit(struct bonding *bond) + struct slave *slave, *primary; + + bond_for_each_slave(bond, slave, iter) { +- switch (slave->new_link) { ++ switch (slave->link_new_state) { + case BOND_LINK_NOCHANGE: + /* For 802.3ad mode, check current slave speed and + * duplex again in case its port was disabled after +@@ -2263,8 +2264,8 @@ static void bond_miimon_commit(struct bonding *bond) + + default: + netdev_err(bond->dev, "invalid new link %d on slave %s\n", +- slave->new_link, slave->dev->name); +- slave->new_link = BOND_LINK_NOCHANGE; ++ slave->link_new_state, slave->dev->name); ++ bond_propose_link_state(slave, BOND_LINK_NOCHANGE); + + continue; + } +@@ -2664,13 +2665,13 @@ static void bond_loadbalance_arp_mon(struct bonding *bond) + bond_for_each_slave_rcu(bond, slave, iter) { + unsigned long trans_start = dev_trans_start(slave->dev); + +- slave->new_link = BOND_LINK_NOCHANGE; ++ bond_propose_link_state(slave, 
BOND_LINK_NOCHANGE); + + if (slave->link != BOND_LINK_UP) { + if (bond_time_in_interval(bond, trans_start, 1) && + bond_time_in_interval(bond, slave->last_rx, 1)) { + +- slave->new_link = BOND_LINK_UP; ++ bond_propose_link_state(slave, BOND_LINK_UP); + slave_state_changed = 1; + + /* primary_slave has no meaning in round-robin +@@ -2697,7 +2698,7 @@ static void bond_loadbalance_arp_mon(struct bonding *bond) + if (!bond_time_in_interval(bond, trans_start, 2) || + !bond_time_in_interval(bond, slave->last_rx, 2)) { + +- slave->new_link = BOND_LINK_DOWN; ++ bond_propose_link_state(slave, BOND_LINK_DOWN); + slave_state_changed = 1; + + if (slave->link_failure_count < UINT_MAX) +@@ -2729,8 +2730,8 @@ static void bond_loadbalance_arp_mon(struct bonding *bond) + goto re_arm; + + bond_for_each_slave(bond, slave, iter) { +- if (slave->new_link != BOND_LINK_NOCHANGE) +- slave->link = slave->new_link; ++ if (slave->link_new_state != BOND_LINK_NOCHANGE) ++ slave->link = slave->link_new_state; + } + + if (slave_state_changed) { +@@ -2753,9 +2754,9 @@ re_arm: + } + + /* Called to inspect slaves for active-backup mode ARP monitor link state +- * changes. Sets new_link in slaves to specify what action should take +- * place for the slave. Returns 0 if no changes are found, >0 if changes +- * to link states must be committed. ++ * changes. Sets proposed link state in slaves to specify what action ++ * should take place for the slave. Returns 0 if no changes are found, >0 ++ * if changes to link states must be committed. + * + * Called with rcu_read_lock held. + */ +@@ -2767,12 +2768,12 @@ static int bond_ab_arp_inspect(struct bonding *bond) + int commit = 0; + + bond_for_each_slave_rcu(bond, slave, iter) { +- slave->new_link = BOND_LINK_NOCHANGE; ++ bond_propose_link_state(slave, BOND_LINK_NOCHANGE); + last_rx = slave_last_rx(bond, slave); + + if (slave->link != BOND_LINK_UP) { + if (bond_time_in_interval(bond, last_rx, 1)) { +- slave->new_link = BOND_LINK_UP; ++ bond_propose_link_state(slave, BOND_LINK_UP); + commit++; + } + continue; +@@ -2800,7 +2801,7 @@ static int bond_ab_arp_inspect(struct bonding *bond) + if (!bond_is_active_slave(slave) && + !rcu_access_pointer(bond->current_arp_slave) && + !bond_time_in_interval(bond, last_rx, 3)) { +- slave->new_link = BOND_LINK_DOWN; ++ bond_propose_link_state(slave, BOND_LINK_DOWN); + commit++; + } + +@@ -2813,7 +2814,7 @@ static int bond_ab_arp_inspect(struct bonding *bond) + if (bond_is_active_slave(slave) && + (!bond_time_in_interval(bond, trans_start, 2) || + !bond_time_in_interval(bond, last_rx, 2))) { +- slave->new_link = BOND_LINK_DOWN; ++ bond_propose_link_state(slave, BOND_LINK_DOWN); + commit++; + } + } +@@ -2833,7 +2834,7 @@ static void bond_ab_arp_commit(struct bonding *bond) + struct slave *slave; + + bond_for_each_slave(bond, slave, iter) { +- switch (slave->new_link) { ++ switch (slave->link_new_state) { + case BOND_LINK_NOCHANGE: + continue; + +@@ -2886,7 +2887,7 @@ static void bond_ab_arp_commit(struct bonding *bond) + + default: + netdev_err(bond->dev, "impossible: new_link %d on slave %s\n", +- slave->new_link, slave->dev->name); ++ slave->link_new_state, slave->dev->name); + continue; + } + +diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c +index 606b7d8ffe13..9b61bfbea6cd 100644 +--- a/drivers/net/can/c_can/c_can.c ++++ b/drivers/net/can/c_can/c_can.c +@@ -97,6 +97,9 @@ + #define BTR_TSEG2_SHIFT 12 + #define BTR_TSEG2_MASK (0x7 << BTR_TSEG2_SHIFT) + ++/* interrupt register */ ++#define INT_STS_PENDING 0x8000 ++ + /* 
brp extension register */ + #define BRP_EXT_BRPE_MASK 0x0f + #define BRP_EXT_BRPE_SHIFT 0 +@@ -1029,10 +1032,16 @@ static int c_can_poll(struct napi_struct *napi, int quota) + u16 curr, last = priv->last_status; + int work_done = 0; + +- priv->last_status = curr = priv->read_reg(priv, C_CAN_STS_REG); +- /* Ack status on C_CAN. D_CAN is self clearing */ +- if (priv->type != BOSCH_D_CAN) +- priv->write_reg(priv, C_CAN_STS_REG, LEC_UNUSED); ++ /* Only read the status register if a status interrupt was pending */ ++ if (atomic_xchg(&priv->sie_pending, 0)) { ++ priv->last_status = curr = priv->read_reg(priv, C_CAN_STS_REG); ++ /* Ack status on C_CAN. D_CAN is self clearing */ ++ if (priv->type != BOSCH_D_CAN) ++ priv->write_reg(priv, C_CAN_STS_REG, LEC_UNUSED); ++ } else { ++ /* no change detected ... */ ++ curr = last; ++ } + + /* handle state changes */ + if ((curr & STATUS_EWARN) && (!(last & STATUS_EWARN))) { +@@ -1083,10 +1092,16 @@ static irqreturn_t c_can_isr(int irq, void *dev_id) + { + struct net_device *dev = (struct net_device *)dev_id; + struct c_can_priv *priv = netdev_priv(dev); ++ int reg_int; + +- if (!priv->read_reg(priv, C_CAN_INT_REG)) ++ reg_int = priv->read_reg(priv, C_CAN_INT_REG); ++ if (!reg_int) + return IRQ_NONE; + ++ /* save for later use */ ++ if (reg_int & INT_STS_PENDING) ++ atomic_set(&priv->sie_pending, 1); ++ + /* disable all interrupts and schedule the NAPI */ + c_can_irq_control(priv, false); + napi_schedule(&priv->napi); +diff --git a/drivers/net/can/c_can/c_can.h b/drivers/net/can/c_can/c_can.h +index 8acdc7fa4792..d5567a7c1c6d 100644 +--- a/drivers/net/can/c_can/c_can.h ++++ b/drivers/net/can/c_can/c_can.h +@@ -198,6 +198,7 @@ struct c_can_priv { + struct net_device *dev; + struct device *device; + atomic_t tx_active; ++ atomic_t sie_pending; + unsigned long tx_dir; + int last_status; + u16 (*read_reg) (const struct c_can_priv *priv, enum reg index); +diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c +index bd127ce3aba2..49b853f53f59 100644 +--- a/drivers/net/can/dev.c ++++ b/drivers/net/can/dev.c +@@ -853,6 +853,7 @@ void of_can_transceiver(struct net_device *dev) + return; + + ret = of_property_read_u32(dn, "max-bitrate", &priv->bitrate_max); ++ of_node_put(dn); + if ((ret && ret != -EINVAL) || (!ret && !priv->bitrate_max)) + netdev_warn(dev, "Invalid value for transceiver max bitrate. 
Ignoring bitrate limit.\n"); + } +diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c +index 6f265d2e647b..581e84d8e2c8 100644 +--- a/drivers/net/can/flexcan.c ++++ b/drivers/net/can/flexcan.c +@@ -1048,6 +1048,7 @@ static int flexcan_chip_start(struct net_device *dev) + reg_mecr = priv->read(&regs->mecr); + reg_mecr &= ~FLEXCAN_MECR_ECRWRDIS; + priv->write(reg_mecr, &regs->mecr); ++ reg_mecr |= FLEXCAN_MECR_ECCDIS; + reg_mecr &= ~(FLEXCAN_MECR_NCEFAFRZ | FLEXCAN_MECR_HANCEI_MSK | + FLEXCAN_MECR_FANCEI_MSK); + priv->write(reg_mecr, &regs->mecr); +diff --git a/drivers/net/can/rx-offload.c b/drivers/net/can/rx-offload.c +index 727691dd08fb..6cf0d0bc1e8d 100644 +--- a/drivers/net/can/rx-offload.c ++++ b/drivers/net/can/rx-offload.c +@@ -216,8 +216,10 @@ int can_rx_offload_queue_sorted(struct can_rx_offload *offload, + unsigned long flags; + + if (skb_queue_len(&offload->skb_queue) > +- offload->skb_queue_len_max) +- return -ENOMEM; ++ offload->skb_queue_len_max) { ++ kfree_skb(skb); ++ return -ENOBUFS; ++ } + + cb = can_rx_offload_get_cb(skb); + cb->timestamp = timestamp; +diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c +index 17c21ad3b95e..3a39f51a9e24 100644 +--- a/drivers/net/can/usb/gs_usb.c ++++ b/drivers/net/can/usb/gs_usb.c +@@ -631,6 +631,7 @@ static int gs_can_open(struct net_device *netdev) + rc); + + usb_unanchor_urb(urb); ++ usb_free_urb(urb); + break; + } + +diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c +index 8d8c2086424d..1b0afeaf1a3c 100644 +--- a/drivers/net/can/usb/mcba_usb.c ++++ b/drivers/net/can/usb/mcba_usb.c +@@ -887,9 +887,8 @@ static void mcba_usb_disconnect(struct usb_interface *intf) + netdev_info(priv->netdev, "device disconnected\n"); + + unregister_candev(priv->netdev); +- free_candev(priv->netdev); +- + mcba_urb_unlink(priv); ++ free_candev(priv->netdev); + } + + static struct usb_driver mcba_usb_driver = { +diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c +index 13238a72a338..61f33c2fb1cd 100644 +--- a/drivers/net/can/usb/peak_usb/pcan_usb.c ++++ b/drivers/net/can/usb/peak_usb/pcan_usb.c +@@ -108,7 +108,7 @@ struct pcan_usb_msg_context { + u8 *end; + u8 rec_cnt; + u8 rec_idx; +- u8 rec_data_idx; ++ u8 rec_ts_idx; + struct net_device *netdev; + struct pcan_usb *pdev; + }; +@@ -555,10 +555,15 @@ static int pcan_usb_decode_status(struct pcan_usb_msg_context *mc, + mc->ptr += PCAN_USB_CMD_ARGS; + + if (status_len & PCAN_USB_STATUSLEN_TIMESTAMP) { +- int err = pcan_usb_decode_ts(mc, !mc->rec_idx); ++ int err = pcan_usb_decode_ts(mc, !mc->rec_ts_idx); + + if (err) + return err; ++ ++ /* Next packet in the buffer will have a timestamp on a single ++ * byte ++ */ ++ mc->rec_ts_idx++; + } + + switch (f) { +@@ -640,10 +645,13 @@ static int pcan_usb_decode_data(struct pcan_usb_msg_context *mc, u8 status_len) + + cf->can_dlc = get_can_dlc(rec_len); + +- /* first data packet timestamp is a word */ +- if (pcan_usb_decode_ts(mc, !mc->rec_data_idx)) ++ /* Only first packet timestamp is a word */ ++ if (pcan_usb_decode_ts(mc, !mc->rec_ts_idx)) + goto decode_failed; + ++ /* Next packet in the buffer will have a timestamp on a single byte */ ++ mc->rec_ts_idx++; ++ + /* read data */ + memset(cf->data, 0x0, sizeof(cf->data)); + if (status_len & PCAN_USB_STATUSLEN_RTR) { +@@ -696,7 +704,6 @@ static int pcan_usb_decode_msg(struct peak_usb_device *dev, u8 *ibuf, u32 lbuf) + /* handle normal can frames here */ + } else { + err = pcan_usb_decode_data(&mc, sl); +- 
mc.rec_data_idx++; + } + } + +diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c +index 43b0fa2b9932..afc8d978124e 100644 +--- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c ++++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c +@@ -758,7 +758,7 @@ static int peak_usb_create_dev(const struct peak_usb_adapter *peak_usb_adapter, + dev = netdev_priv(netdev); + + /* allocate a buffer large enough to send commands */ +- dev->cmd_buf = kmalloc(PCAN_USB_MAX_CMD_LEN, GFP_KERNEL); ++ dev->cmd_buf = kzalloc(PCAN_USB_MAX_CMD_LEN, GFP_KERNEL); + if (!dev->cmd_buf) { + err = -ENOMEM; + goto lbl_free_candev; +diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c +index 27861c417c94..3e4416473607 100644 +--- a/drivers/net/can/usb/usb_8dev.c ++++ b/drivers/net/can/usb/usb_8dev.c +@@ -1007,9 +1007,8 @@ static void usb_8dev_disconnect(struct usb_interface *intf) + netdev_info(priv->netdev, "device disconnected\n"); + + unregister_netdev(priv->netdev); +- free_candev(priv->netdev); +- + unlink_all_urbs(priv); ++ free_candev(priv->netdev); + } + + } +diff --git a/drivers/net/ethernet/arc/emac_rockchip.c b/drivers/net/ethernet/arc/emac_rockchip.c +index 0f6576802607..a1df2ebab07f 100644 +--- a/drivers/net/ethernet/arc/emac_rockchip.c ++++ b/drivers/net/ethernet/arc/emac_rockchip.c +@@ -265,6 +265,9 @@ static int emac_rockchip_remove(struct platform_device *pdev) + if (priv->regulator) + regulator_disable(priv->regulator); + ++ if (priv->soc_data->need_div_macclk) ++ clk_disable_unprepare(priv->macclk); ++ + free_netdev(ndev); + return err; + } +diff --git a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c +index bb43ddb7539e..592fb9e847b9 100644 +--- a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c ++++ b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c +@@ -1495,7 +1495,7 @@ static int octeon_mgmt_probe(struct platform_device *pdev) + netdev->ethtool_ops = &octeon_mgmt_ethtool_ops; + + netdev->min_mtu = 64 - OCTEON_MGMT_RX_HEADROOM; +- netdev->max_mtu = 16383 - OCTEON_MGMT_RX_HEADROOM; ++ netdev->max_mtu = 16383 - OCTEON_MGMT_RX_HEADROOM - VLAN_HLEN; + + mac = of_get_mac_address(pdev->dev.of_node); + +diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c +index 2f8f03e0db81..644ad78d0051 100644 +--- a/drivers/net/ethernet/hisilicon/hip04_eth.c ++++ b/drivers/net/ethernet/hisilicon/hip04_eth.c +@@ -945,7 +945,6 @@ static int hip04_remove(struct platform_device *pdev) + + hip04_free_ring(ndev, d); + unregister_netdev(ndev); +- free_irq(ndev->irq, ndev); + of_node_put(priv->phy_node); + cancel_work_sync(&priv->tx_timeout_task); + free_netdev(ndev); +diff --git a/drivers/net/ethernet/hisilicon/hns/hnae.c b/drivers/net/ethernet/hisilicon/hns/hnae.c +index c7fa97a7e1f4..b758b3e79337 100644 +--- a/drivers/net/ethernet/hisilicon/hns/hnae.c ++++ b/drivers/net/ethernet/hisilicon/hns/hnae.c +@@ -203,7 +203,6 @@ hnae_init_ring(struct hnae_queue *q, struct hnae_ring *ring, int flags) + + ring->q = q; + ring->flags = flags; +- spin_lock_init(&ring->lock); + ring->coal_param = q->handle->coal_param; + assert(!ring->desc && !ring->desc_cb && !ring->desc_dma_addr); + +diff --git a/drivers/net/ethernet/hisilicon/hns/hnae.h b/drivers/net/ethernet/hisilicon/hns/hnae.h +index 08a750fb60c4..c8cbbe5d5549 100644 +--- a/drivers/net/ethernet/hisilicon/hns/hnae.h ++++ b/drivers/net/ethernet/hisilicon/hns/hnae.h +@@ -278,9 +278,6 @@ struct hnae_ring { + /* 
statistic */ + struct ring_stats stats; + +- /* ring lock for poll one */ +- spinlock_t lock; +- + dma_addr_t desc_dma_addr; + u32 buf_size; /* size for hnae_desc->addr, preset by AE */ + u16 desc_num; /* total number of desc */ +diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c +index 1c70f9aa0aa7..7f8cf809e02b 100644 +--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c ++++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c +@@ -947,15 +947,6 @@ static int is_valid_clean_head(struct hnae_ring *ring, int h) + return u > c ? (h > c && h <= u) : (h > c || h <= u); + } + +-/* netif_tx_lock will turn down the performance, set only when necessary */ +-#ifdef CONFIG_NET_POLL_CONTROLLER +-#define NETIF_TX_LOCK(ring) spin_lock(&(ring)->lock) +-#define NETIF_TX_UNLOCK(ring) spin_unlock(&(ring)->lock) +-#else +-#define NETIF_TX_LOCK(ring) +-#define NETIF_TX_UNLOCK(ring) +-#endif +- + /* reclaim all desc in one budget + * return error or number of desc left + */ +@@ -969,21 +960,16 @@ static int hns_nic_tx_poll_one(struct hns_nic_ring_data *ring_data, + int head; + int bytes, pkts; + +- NETIF_TX_LOCK(ring); +- + head = readl_relaxed(ring->io_base + RCB_REG_HEAD); + rmb(); /* make sure head is ready before touch any data */ + +- if (is_ring_empty(ring) || head == ring->next_to_clean) { +- NETIF_TX_UNLOCK(ring); ++ if (is_ring_empty(ring) || head == ring->next_to_clean) + return 0; /* no data to poll */ +- } + + if (!is_valid_clean_head(ring, head)) { + netdev_err(ndev, "wrong head (%d, %d-%d)\n", head, + ring->next_to_use, ring->next_to_clean); + ring->stats.io_err_cnt++; +- NETIF_TX_UNLOCK(ring); + return -EIO; + } + +@@ -998,8 +984,6 @@ static int hns_nic_tx_poll_one(struct hns_nic_ring_data *ring_data, + ring->stats.tx_pkts += pkts; + ring->stats.tx_bytes += bytes; + +- NETIF_TX_UNLOCK(ring); +- + dev_queue = netdev_get_tx_queue(ndev, ring_data->queue_index); + netdev_tx_completed_queue(dev_queue, pkts, bytes); + +@@ -1059,16 +1043,12 @@ static void hns_nic_tx_clr_all_bufs(struct hns_nic_ring_data *ring_data) + int head; + int bytes, pkts; + +- NETIF_TX_LOCK(ring); +- + head = ring->next_to_use; /* ntu :soft setted ring position*/ + bytes = 0; + pkts = 0; + while (head != ring->next_to_clean) + hns_nic_reclaim_one_desc(ring, &bytes, &pkts); + +- NETIF_TX_UNLOCK(ring); +- + dev_queue = netdev_get_tx_queue(ndev, ring_data->queue_index); + netdev_tx_reset_queue(dev_queue); + } +diff --git a/drivers/net/ethernet/intel/e1000/e1000_ethtool.c b/drivers/net/ethernet/intel/e1000/e1000_ethtool.c +index 2569a168334c..903b0a902cb9 100644 +--- a/drivers/net/ethernet/intel/e1000/e1000_ethtool.c ++++ b/drivers/net/ethernet/intel/e1000/e1000_ethtool.c +@@ -607,6 +607,7 @@ static int e1000_set_ringparam(struct net_device *netdev, + for (i = 0; i < adapter->num_rx_queues; i++) + rxdr[i].count = rxdr->count; + ++ err = 0; + if (netif_running(adapter->netdev)) { + /* Try to get new resources before deleting old */ + err = e1000_setup_all_rx_resources(adapter); +@@ -627,14 +628,13 @@ static int e1000_set_ringparam(struct net_device *netdev, + adapter->rx_ring = rxdr; + adapter->tx_ring = txdr; + err = e1000_up(adapter); +- if (err) +- goto err_setup; + } + kfree(tx_old); + kfree(rx_old); + + clear_bit(__E1000_RESETTING, &adapter->flags); +- return 0; ++ return err; ++ + err_setup_tx: + e1000_free_all_rx_resources(adapter); + err_setup_rx: +@@ -646,7 +646,6 @@ err_alloc_rx: + err_alloc_tx: + if (netif_running(adapter->netdev)) + e1000_up(adapter); +-err_setup: + 
clear_bit(__E1000_RESETTING, &adapter->flags); + return err; + } +diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c +index ab76a5f77cd0..36db874f3c92 100644 +--- a/drivers/net/ethernet/intel/igb/igb_main.c ++++ b/drivers/net/ethernet/intel/igb/igb_main.c +@@ -2064,7 +2064,8 @@ static void igb_check_swap_media(struct igb_adapter *adapter) + if ((hw->phy.media_type == e1000_media_type_copper) && + (!(connsw & E1000_CONNSW_AUTOSENSE_EN))) { + swap_now = true; +- } else if (!(connsw & E1000_CONNSW_SERDESD)) { ++ } else if ((hw->phy.media_type != e1000_media_type_copper) && ++ !(connsw & E1000_CONNSW_SERDESD)) { + /* copper signal takes time to appear */ + if (adapter->copper_tries < 4) { + adapter->copper_tries++; +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +index 0b03d65474e9..73dce92c41c4 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +@@ -462,7 +462,10 @@ netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev) + static void mlx5e_dump_error_cqe(struct mlx5e_txqsq *sq, + struct mlx5_err_cqe *err_cqe) + { +- u32 ci = mlx5_cqwq_get_ci(&sq->cq.wq); ++ struct mlx5_cqwq *wq = &sq->cq.wq; ++ u32 ci; ++ ++ ci = mlx5_cqwq_ctr2ix(wq, wq->cc - 1); + + netdev_err(sq->channel->netdev, + "Error cqe on cqn 0x%x, ci 0x%x, sqn 0x%x, syndrome 0x%x, vendor syndrome 0x%x\n", +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +index 8ca1d1949d93..d8d0b6bd5c5a 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +@@ -462,8 +462,10 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) + } + + err = mlx5_vector2eqn(mdev, smp_processor_id(), &eqn, &irqn); +- if (err) ++ if (err) { ++ kvfree(in); + goto err_cqwq; ++ } + + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); + MLX5_SET(cqc, cqc, log_cq_size, ilog2(cq_size)); +diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c +index 732ba21d3369..a29a6a618110 100644 +--- a/drivers/net/ethernet/mscc/ocelot.c ++++ b/drivers/net/ethernet/mscc/ocelot.c +@@ -253,8 +253,15 @@ static int ocelot_vlan_vid_add(struct net_device *dev, u16 vid, bool pvid, + port->pvid = vid; + + /* Untagged egress vlan clasification */ +- if (untagged) ++ if (untagged && port->vid != vid) { ++ if (port->vid) { ++ dev_err(ocelot->dev, ++ "Port already has a native VLAN: %d\n", ++ port->vid); ++ return -EBUSY; ++ } + port->vid = vid; ++ } + + ocelot_vlan_port_apply(ocelot, port); + +@@ -886,7 +893,7 @@ end: + static int ocelot_vlan_rx_add_vid(struct net_device *dev, __be16 proto, + u16 vid) + { +- return ocelot_vlan_vid_add(dev, vid, false, true); ++ return ocelot_vlan_vid_add(dev, vid, false, false); + } + + static int ocelot_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, +@@ -1506,9 +1513,6 @@ static int ocelot_netdevice_port_event(struct net_device *dev, + struct ocelot_port *ocelot_port = netdev_priv(dev); + int err = 0; + +- if (!ocelot_netdevice_dev_check(dev)) +- return 0; +- + switch (event) { + case NETDEV_CHANGEUPPER: + if (netif_is_bridge_master(info->upper_dev)) { +@@ -1545,12 +1549,16 @@ static int ocelot_netdevice_event(struct notifier_block *unused, + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + int ret = 0; + ++ if (!ocelot_netdevice_dev_check(dev)) ++ return 0; ++ + 
if (event == NETDEV_PRECHANGEUPPER && + netif_is_lag_master(info->upper_dev)) { + struct netdev_lag_upper_info *lag_upper_info = info->upper_info; + struct netlink_ext_ack *extack; + +- if (lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_HASH) { ++ if (lag_upper_info && ++ lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_HASH) { + extack = netdev_notifier_info_to_extack(&info->info); + NL_SET_ERR_MSG_MOD(extack, "LAG device using unsupported Tx type"); + +diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c +index f3d9c40c4115..630b13a9c3d5 100644 +--- a/drivers/net/ethernet/qlogic/qede/qede_main.c ++++ b/drivers/net/ethernet/qlogic/qede/qede_main.c +@@ -1170,8 +1170,16 @@ enum qede_remove_mode { + static void __qede_remove(struct pci_dev *pdev, enum qede_remove_mode mode) + { + struct net_device *ndev = pci_get_drvdata(pdev); +- struct qede_dev *edev = netdev_priv(ndev); +- struct qed_dev *cdev = edev->cdev; ++ struct qede_dev *edev; ++ struct qed_dev *cdev; ++ ++ if (!ndev) { ++ dev_info(&pdev->dev, "Device has already been removed\n"); ++ return; ++ } ++ ++ edev = netdev_priv(ndev); ++ cdev = edev->cdev; + + DP_INFO(edev, "Starting qede_remove\n"); + +diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +index 5f4e447c5dce..f66d1255e36a 100644 +--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c ++++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +@@ -66,10 +66,10 @@ static int rmnet_unregister_real_device(struct net_device *real_dev, + if (port->nr_rmnet_devs) + return -EINVAL; + +- kfree(port); +- + netdev_rx_handler_unregister(real_dev); + ++ kfree(port); ++ + /* release reference on real_dev */ + dev_put(real_dev); + +diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c +index d3eae1239045..61a9843346ad 100644 +--- a/drivers/net/fjes/fjes_main.c ++++ b/drivers/net/fjes/fjes_main.c +@@ -1252,8 +1252,17 @@ static int fjes_probe(struct platform_device *plat_dev) + adapter->open_guard = false; + + adapter->txrx_wq = alloc_workqueue(DRV_NAME "/txrx", WQ_MEM_RECLAIM, 0); ++ if (unlikely(!adapter->txrx_wq)) { ++ err = -ENOMEM; ++ goto err_free_netdev; ++ } ++ + adapter->control_wq = alloc_workqueue(DRV_NAME "/control", + WQ_MEM_RECLAIM, 0); ++ if (unlikely(!adapter->control_wq)) { ++ err = -ENOMEM; ++ goto err_free_txrx_wq; ++ } + + INIT_WORK(&adapter->tx_stall_task, fjes_tx_stall_task); + INIT_WORK(&adapter->raise_intr_rxdata_task, +@@ -1270,7 +1279,7 @@ static int fjes_probe(struct platform_device *plat_dev) + hw->hw_res.irq = platform_get_irq(plat_dev, 0); + err = fjes_hw_init(&adapter->hw); + if (err) +- goto err_free_netdev; ++ goto err_free_control_wq; + + /* setup MAC address (02:00:00:00:00:[epid])*/ + netdev->dev_addr[0] = 2; +@@ -1292,6 +1301,10 @@ static int fjes_probe(struct platform_device *plat_dev) + + err_hw_exit: + fjes_hw_exit(&adapter->hw); ++err_free_control_wq: ++ destroy_workqueue(adapter->control_wq); ++err_free_txrx_wq: ++ destroy_workqueue(adapter->txrx_wq); + err_free_netdev: + free_netdev(netdev); + err_out: +diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c +index 6f6c0dbd91fc..b7a71c203aa3 100644 +--- a/drivers/net/hyperv/netvsc_drv.c ++++ b/drivers/net/hyperv/netvsc_drv.c +@@ -993,7 +993,7 @@ static int netvsc_attach(struct net_device *ndev, + if (netif_running(ndev)) { + ret = rndis_filter_open(nvdev); + if (ret) +- return ret; ++ goto err; + + rdev = nvdev->extension; + if 
(!rdev->link_state) +@@ -1001,6 +1001,13 @@ static int netvsc_attach(struct net_device *ndev, + } + + return 0; ++ ++err: ++ netif_device_detach(ndev); ++ ++ rndis_filter_device_remove(hdev, nvdev); ++ ++ return ret; + } + + static int netvsc_set_channels(struct net_device *net, +diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c +index 0dc92d2faa64..05115fb0c97a 100644 +--- a/drivers/net/macsec.c ++++ b/drivers/net/macsec.c +@@ -3008,12 +3008,10 @@ static const struct nla_policy macsec_rtnl_policy[IFLA_MACSEC_MAX + 1] = { + static void macsec_free_netdev(struct net_device *dev) + { + struct macsec_dev *macsec = macsec_priv(dev); +- struct net_device *real_dev = macsec->real_dev; + + free_percpu(macsec->stats); + free_percpu(macsec->secy.tx_sc.stats); + +- dev_put(real_dev); + } + + static void macsec_setup(struct net_device *dev) +@@ -3268,8 +3266,6 @@ static int macsec_newlink(struct net *net, struct net_device *dev, + if (err < 0) + return err; + +- dev_hold(real_dev); +- + macsec->nest_level = dev_get_nest_level(real_dev) + 1; + netdev_lockdep_set_classes(dev); + lockdep_set_class_and_subclass(&dev->addr_list_lock, +diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c +index f53e3e4e25f3..a57d82ef0f81 100644 +--- a/drivers/net/usb/cdc_ncm.c ++++ b/drivers/net/usb/cdc_ncm.c +@@ -578,8 +578,8 @@ static void cdc_ncm_set_dgram_size(struct usbnet *dev, int new_size) + /* read current mtu value from device */ + err = usbnet_read_cmd(dev, USB_CDC_GET_MAX_DATAGRAM_SIZE, + USB_TYPE_CLASS | USB_DIR_IN | USB_RECIP_INTERFACE, +- 0, iface_no, &max_datagram_size, 2); +- if (err < 0) { ++ 0, iface_no, &max_datagram_size, sizeof(max_datagram_size)); ++ if (err < sizeof(max_datagram_size)) { + dev_dbg(&dev->intf->dev, "GET_MAX_DATAGRAM_SIZE failed\n"); + goto out; + } +@@ -590,7 +590,7 @@ static void cdc_ncm_set_dgram_size(struct usbnet *dev, int new_size) + max_datagram_size = cpu_to_le16(ctx->max_datagram_size); + err = usbnet_write_cmd(dev, USB_CDC_SET_MAX_DATAGRAM_SIZE, + USB_TYPE_CLASS | USB_DIR_OUT | USB_RECIP_INTERFACE, +- 0, iface_no, &max_datagram_size, 2); ++ 0, iface_no, &max_datagram_size, sizeof(max_datagram_size)); + if (err < 0) + dev_dbg(&dev->intf->dev, "SET_MAX_DATAGRAM_SIZE failed\n"); + +diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c +index 6f517e673020..9f037c50054d 100644 +--- a/drivers/net/usb/qmi_wwan.c ++++ b/drivers/net/usb/qmi_wwan.c +@@ -1297,6 +1297,7 @@ static const struct usb_device_id products[] = { + {QMI_FIXED_INTF(0x413c, 0x81b6, 8)}, /* Dell Wireless 5811e */ + {QMI_FIXED_INTF(0x413c, 0x81b6, 10)}, /* Dell Wireless 5811e */ + {QMI_FIXED_INTF(0x413c, 0x81d7, 0)}, /* Dell Wireless 5821e */ ++ {QMI_FIXED_INTF(0x413c, 0x81e0, 0)}, /* Dell Wireless 5821e with eSIM support*/ + {QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)}, /* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */ + {QMI_FIXED_INTF(0x03f0, 0x9d1d, 1)}, /* HP lt4120 Snapdragon X5 LTE */ + {QMI_FIXED_INTF(0x22de, 0x9061, 3)}, /* WeTelecom WPD-600N */ +diff --git a/drivers/nfc/fdp/i2c.c b/drivers/nfc/fdp/i2c.c +index d8d70dd830b0..7f143387b9ff 100644 +--- a/drivers/nfc/fdp/i2c.c ++++ b/drivers/nfc/fdp/i2c.c +@@ -267,7 +267,7 @@ static void fdp_nci_i2c_read_device_properties(struct device *dev, + *fw_vsc_cfg, len); + + if (r) { +- devm_kfree(dev, fw_vsc_cfg); ++ devm_kfree(dev, *fw_vsc_cfg); + goto vsc_read_err; + } + } else { +diff --git a/drivers/nfc/st21nfca/core.c b/drivers/nfc/st21nfca/core.c +index e803fdfa9189..f37069b53b20 100644 +--- a/drivers/nfc/st21nfca/core.c ++++ 
b/drivers/nfc/st21nfca/core.c +@@ -719,6 +719,7 @@ static int st21nfca_hci_complete_target_discovered(struct nfc_hci_dev *hdev, + NFC_PROTO_FELICA_MASK; + } else { + kfree_skb(nfcid_skb); ++ nfcid_skb = NULL; + /* P2P in type A */ + r = nfc_hci_get_param(hdev, ST21NFCA_RF_READER_F_GATE, + ST21NFCA_RF_READER_F_NFCID1, +diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c +index 892ef5212232..838ee58d80cd 100644 +--- a/drivers/nvme/host/multipath.c ++++ b/drivers/nvme/host/multipath.c +@@ -575,7 +575,7 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) + goto out; + } + +- error = nvme_read_ana_log(ctrl, true); ++ error = nvme_read_ana_log(ctrl, false); + if (error) + goto out_free_ana_log_buf; + return 0; +diff --git a/drivers/pci/controller/pci-tegra.c b/drivers/pci/controller/pci-tegra.c +index 976eaa9a9f26..58e487352853 100644 +--- a/drivers/pci/controller/pci-tegra.c ++++ b/drivers/pci/controller/pci-tegra.c +@@ -545,12 +545,15 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0bf1, tegra_pcie_fixup_class); + DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e1c, tegra_pcie_fixup_class); + DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e1d, tegra_pcie_fixup_class); + +-/* Tegra PCIE requires relaxed ordering */ ++/* Tegra20 and Tegra30 PCIE requires relaxed ordering */ + static void tegra_pcie_relax_enable(struct pci_dev *dev) + { + pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_RELAX_EN); + } +-DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, tegra_pcie_relax_enable); ++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, 0x0bf0, tegra_pcie_relax_enable); ++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, 0x0bf1, tegra_pcie_relax_enable); ++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, 0x0e1c, tegra_pcie_relax_enable); ++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, 0x0e1d, tegra_pcie_relax_enable); + + static int tegra_pcie_request_resources(struct tegra_pcie *pcie) + { +diff --git a/drivers/pinctrl/intel/pinctrl-cherryview.c b/drivers/pinctrl/intel/pinctrl-cherryview.c +index 227646eb817c..9eab50839581 100644 +--- a/drivers/pinctrl/intel/pinctrl-cherryview.c ++++ b/drivers/pinctrl/intel/pinctrl-cherryview.c +@@ -1595,7 +1595,7 @@ static int chv_gpio_probe(struct chv_pinctrl *pctrl, int irq) + intsel >>= CHV_PADCTRL0_INTSEL_SHIFT; + + if (need_valid_mask && intsel >= community->nirqs) +- clear_bit(i, chip->irq.valid_mask); ++ clear_bit(desc->number, chip->irq.valid_mask); + } + + /* +diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c +index 1ea3438ea67e..89ff2795a8b5 100644 +--- a/drivers/pinctrl/intel/pinctrl-intel.c ++++ b/drivers/pinctrl/intel/pinctrl-intel.c +@@ -49,6 +49,7 @@ + #define PADCFG0_GPIROUTNMI BIT(17) + #define PADCFG0_PMODE_SHIFT 10 + #define PADCFG0_PMODE_MASK (0xf << PADCFG0_PMODE_SHIFT) ++#define PADCFG0_PMODE_GPIO 0 + #define PADCFG0_GPIORXDIS BIT(9) + #define PADCFG0_GPIOTXDIS BIT(8) + #define PADCFG0_GPIORXSTATE BIT(1) +@@ -301,7 +302,7 @@ static void intel_pin_dbg_show(struct pinctrl_dev *pctldev, struct seq_file *s, + cfg1 = readl(intel_get_padcfg(pctrl, pin, PADCFG1)); + + mode = (cfg0 & PADCFG0_PMODE_MASK) >> PADCFG0_PMODE_SHIFT; +- if (!mode) ++ if (mode == PADCFG0_PMODE_GPIO) + seq_puts(s, "GPIO "); + else + seq_printf(s, "mode %d ", mode); +@@ -422,6 +423,11 @@ static void __intel_gpio_set_direction(void __iomem *padcfg0, bool input) + writel(value, padcfg0); + } + ++static int intel_gpio_get_gpio_mode(void __iomem *padcfg0) ++{ ++ return (readl(padcfg0) & 
PADCFG0_PMODE_MASK) >> PADCFG0_PMODE_SHIFT; ++} ++ + static void intel_gpio_set_gpio_mode(void __iomem *padcfg0) + { + u32 value; +@@ -450,7 +456,20 @@ static int intel_gpio_request_enable(struct pinctrl_dev *pctldev, + } + + padcfg0 = intel_get_padcfg(pctrl, pin, PADCFG0); ++ ++ /* ++ * If pin is already configured in GPIO mode, we assume that ++ * firmware provides correct settings. In such case we avoid ++ * potential glitches on the pin. Otherwise, for the pin in ++ * alternative mode, consumer has to supply respective flags. ++ */ ++ if (intel_gpio_get_gpio_mode(padcfg0) == PADCFG0_PMODE_GPIO) { ++ raw_spin_unlock_irqrestore(&pctrl->lock, flags); ++ return 0; ++ } ++ + intel_gpio_set_gpio_mode(padcfg0); ++ + /* Disable TX buffer and enable RX (this will be input) */ + __intel_gpio_set_direction(padcfg0, true); + +diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c +index a6619fd8238c..ae6301c79678 100644 +--- a/drivers/scsi/lpfc/lpfc_nportdisc.c ++++ b/drivers/scsi/lpfc/lpfc_nportdisc.c +@@ -844,9 +844,9 @@ lpfc_disc_set_adisc(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) + + if (!(vport->fc_flag & FC_PT2PT)) { + /* Check config parameter use-adisc or FCP-2 */ +- if ((vport->cfg_use_adisc && (vport->fc_flag & FC_RSCN_MODE)) || ++ if (vport->cfg_use_adisc && ((vport->fc_flag & FC_RSCN_MODE) || + ((ndlp->nlp_fcp_info & NLP_FCP_2_DEVICE) && +- (ndlp->nlp_type & NLP_FCP_TARGET))) { ++ (ndlp->nlp_type & NLP_FCP_TARGET)))) { + spin_lock_irq(shost->host_lock); + ndlp->nlp_flag |= NLP_NPR_ADISC; + spin_unlock_irq(shost->host_lock); +diff --git a/drivers/scsi/qla2xxx/qla_bsg.c b/drivers/scsi/qla2xxx/qla_bsg.c +index 4a9fd8d944d6..85b03a7f473c 100644 +--- a/drivers/scsi/qla2xxx/qla_bsg.c ++++ b/drivers/scsi/qla2xxx/qla_bsg.c +@@ -258,7 +258,7 @@ qla2x00_process_els(struct bsg_job *bsg_job) + srb_t *sp; + const char *type; + int req_sg_cnt, rsp_sg_cnt; +- int rval = (DRIVER_ERROR << 16); ++ int rval = (DID_ERROR << 16); + uint16_t nextlid = 0; + + if (bsg_request->msgcode == FC_BSG_RPT_ELS) { +@@ -433,7 +433,7 @@ qla2x00_process_ct(struct bsg_job *bsg_job) + struct Scsi_Host *host = fc_bsg_to_shost(bsg_job); + scsi_qla_host_t *vha = shost_priv(host); + struct qla_hw_data *ha = vha->hw; +- int rval = (DRIVER_ERROR << 16); ++ int rval = (DID_ERROR << 16); + int req_sg_cnt, rsp_sg_cnt; + uint16_t loop_id; + struct fc_port *fcport; +@@ -1948,7 +1948,7 @@ qlafx00_mgmt_cmd(struct bsg_job *bsg_job) + struct Scsi_Host *host = fc_bsg_to_shost(bsg_job); + scsi_qla_host_t *vha = shost_priv(host); + struct qla_hw_data *ha = vha->hw; +- int rval = (DRIVER_ERROR << 16); ++ int rval = (DID_ERROR << 16); + struct qla_mt_iocb_rqst_fx00 *piocb_rqst; + srb_t *sp; + int req_sg_cnt = 0, rsp_sg_cnt = 0; +diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c +index 84f57f075455..128fcff24f1b 100644 +--- a/drivers/scsi/qla2xxx/qla_mbx.c ++++ b/drivers/scsi/qla2xxx/qla_mbx.c +@@ -684,6 +684,7 @@ qla2x00_execute_fw(scsi_qla_host_t *vha, uint32_t risc_addr) + mcp->mb[2] = LSW(risc_addr); + mcp->mb[3] = 0; + mcp->mb[4] = 0; ++ mcp->mb[11] = 0; + ha->flags.using_lr_setting = 0; + if (IS_QLA25XX(ha) || IS_QLA81XX(ha) || IS_QLA83XX(ha) || + IS_QLA27XX(ha)) { +@@ -727,7 +728,7 @@ qla2x00_execute_fw(scsi_qla_host_t *vha, uint32_t risc_addr) + if (ha->flags.exchoffld_enabled) + mcp->mb[4] |= ENABLE_EXCHANGE_OFFLD; + +- mcp->out_mb |= MBX_4|MBX_3|MBX_2|MBX_1; ++ mcp->out_mb |= MBX_4 | MBX_3 | MBX_2 | MBX_1 | MBX_11; + mcp->in_mb |= MBX_3 | MBX_2 | MBX_1; + } else { + 
mcp->mb[1] = LSW(risc_addr); +diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c +index 856a7ceb9a04..18ee614fe07f 100644 +--- a/drivers/scsi/qla2xxx/qla_os.c ++++ b/drivers/scsi/qla2xxx/qla_os.c +@@ -3496,6 +3496,10 @@ qla2x00_shutdown(struct pci_dev *pdev) + qla2x00_try_to_stop_firmware(vha); + } + ++ /* Disable timer */ ++ if (vha->timer_active) ++ qla2x00_stop_timer(vha); ++ + /* Turn adapter off line */ + vha->flags.online = 0; + +diff --git a/drivers/soundwire/Kconfig b/drivers/soundwire/Kconfig +index 1ba1556f1987..c7708feaa62e 100644 +--- a/drivers/soundwire/Kconfig ++++ b/drivers/soundwire/Kconfig +@@ -4,6 +4,7 @@ + + menuconfig SOUNDWIRE + tristate "SoundWire support" ++ depends on ACPI + help + SoundWire is a 2-Pin interface with data and clock line ratified + by the MIPI Alliance. SoundWire is used for transporting data +diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c +index dcc0ff9f0c22..83576810eee6 100644 +--- a/drivers/soundwire/bus.c ++++ b/drivers/soundwire/bus.c +@@ -805,7 +805,7 @@ static int sdw_handle_port_interrupt(struct sdw_slave *slave, + static int sdw_handle_slave_alerts(struct sdw_slave *slave) + { + struct sdw_slave_intr_status slave_intr; +- u8 clear = 0, bit, port_status[15]; ++ u8 clear = 0, bit, port_status[15] = {0}; + int port_num, stat, ret, count = 0; + unsigned long port; + bool slave_notify = false; +diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c +index 921ad6998dec..1eb72be75fb7 100644 +--- a/drivers/usb/core/config.c ++++ b/drivers/usb/core/config.c +@@ -348,6 +348,11 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum, + + /* Validate the wMaxPacketSize field */ + maxp = usb_endpoint_maxp(&endpoint->desc); ++ if (maxp == 0) { ++ dev_warn(ddev, "config %d interface %d altsetting %d endpoint 0x%X has wMaxPacketSize 0, skipping\n", ++ cfgno, inum, asnum, d->bEndpointAddress); ++ goto skip_to_next_endpoint_or_interface_descriptor; ++ } + + /* Find the highest legal maxpacket size for this endpoint */ + i = 0; /* additional transactions per microframe */ +diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c +index 05b9ccff7447..aca7e7fa5e47 100644 +--- a/drivers/usb/dwc3/core.c ++++ b/drivers/usb/dwc3/core.c +@@ -299,8 +299,7 @@ static void dwc3_frame_length_adjustment(struct dwc3 *dwc) + + reg = dwc3_readl(dwc->regs, DWC3_GFLADJ); + dft = reg & DWC3_GFLADJ_30MHZ_MASK; +- if (!dev_WARN_ONCE(dwc->dev, dft == dwc->fladj, +- "request value same as default, ignoring\n")) { ++ if (dft != dwc->fladj) { + reg &= ~DWC3_GFLADJ_30MHZ_MASK; + reg |= DWC3_GFLADJ_30MHZ_SDBND_SEL | dwc->fladj; + dwc3_writel(dwc->regs, DWC3_GFLADJ, reg); +diff --git a/drivers/usb/dwc3/dwc3-pci.c b/drivers/usb/dwc3/dwc3-pci.c +index 8cced3609e24..b4e42d597211 100644 +--- a/drivers/usb/dwc3/dwc3-pci.c ++++ b/drivers/usb/dwc3/dwc3-pci.c +@@ -256,7 +256,7 @@ static int dwc3_pci_probe(struct pci_dev *pci, const struct pci_device_id *id) + + ret = platform_device_add_properties(dwc->dwc3, p); + if (ret < 0) +- return ret; ++ goto err; + + ret = dwc3_pci_quirks(dwc); + if (ret) +diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c +index 54de73255064..8398c33d08e7 100644 +--- a/drivers/usb/dwc3/gadget.c ++++ b/drivers/usb/dwc3/gadget.c +@@ -698,6 +698,12 @@ static void dwc3_remove_requests(struct dwc3 *dwc, struct dwc3_ep *dep) + + dwc3_gadget_giveback(dep, req, -ESHUTDOWN); + } ++ ++ while (!list_empty(&dep->cancelled_list)) { ++ req = next_request(&dep->cancelled_list); ++ ++ 
dwc3_gadget_giveback(dep, req, -ESHUTDOWN); ++ } + } + + /** +diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c +index dfcabadeed01..33115e19756c 100644 +--- a/drivers/usb/gadget/composite.c ++++ b/drivers/usb/gadget/composite.c +@@ -2156,14 +2156,18 @@ void composite_dev_cleanup(struct usb_composite_dev *cdev) + usb_ep_dequeue(cdev->gadget->ep0, cdev->os_desc_req); + + kfree(cdev->os_desc_req->buf); ++ cdev->os_desc_req->buf = NULL; + usb_ep_free_request(cdev->gadget->ep0, cdev->os_desc_req); ++ cdev->os_desc_req = NULL; + } + if (cdev->req) { + if (cdev->setup_pending) + usb_ep_dequeue(cdev->gadget->ep0, cdev->req); + + kfree(cdev->req->buf); ++ cdev->req->buf = NULL; + usb_ep_free_request(cdev->gadget->ep0, cdev->req); ++ cdev->req = NULL; + } + cdev->next_string_id = 0; + device_remove_file(&cdev->gadget->dev, &dev_attr_suspended); +diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c +index 025129942894..33852c2b29d1 100644 +--- a/drivers/usb/gadget/configfs.c ++++ b/drivers/usb/gadget/configfs.c +@@ -61,6 +61,8 @@ struct gadget_info { + bool use_os_desc; + char b_vendor_code; + char qw_sign[OS_STRING_QW_SIGN_LEN]; ++ spinlock_t spinlock; ++ bool unbind; + }; + + static inline struct gadget_info *to_gadget_info(struct config_item *item) +@@ -1244,6 +1246,7 @@ static int configfs_composite_bind(struct usb_gadget *gadget, + int ret; + + /* the gi->lock is hold by the caller */ ++ gi->unbind = 0; + cdev->gadget = gadget; + set_gadget_data(gadget, cdev); + ret = composite_dev_prepare(composite, cdev); +@@ -1376,31 +1379,128 @@ static void configfs_composite_unbind(struct usb_gadget *gadget) + { + struct usb_composite_dev *cdev; + struct gadget_info *gi; ++ unsigned long flags; + + /* the gi->lock is hold by the caller */ + + cdev = get_gadget_data(gadget); + gi = container_of(cdev, struct gadget_info, cdev); ++ spin_lock_irqsave(&gi->spinlock, flags); ++ gi->unbind = 1; ++ spin_unlock_irqrestore(&gi->spinlock, flags); + + kfree(otg_desc[0]); + otg_desc[0] = NULL; + purge_configs_funcs(gi); + composite_dev_cleanup(cdev); + usb_ep_autoconfig_reset(cdev->gadget); ++ spin_lock_irqsave(&gi->spinlock, flags); + cdev->gadget = NULL; + set_gadget_data(gadget, NULL); ++ spin_unlock_irqrestore(&gi->spinlock, flags); ++} ++ ++static int configfs_composite_setup(struct usb_gadget *gadget, ++ const struct usb_ctrlrequest *ctrl) ++{ ++ struct usb_composite_dev *cdev; ++ struct gadget_info *gi; ++ unsigned long flags; ++ int ret; ++ ++ cdev = get_gadget_data(gadget); ++ if (!cdev) ++ return 0; ++ ++ gi = container_of(cdev, struct gadget_info, cdev); ++ spin_lock_irqsave(&gi->spinlock, flags); ++ cdev = get_gadget_data(gadget); ++ if (!cdev || gi->unbind) { ++ spin_unlock_irqrestore(&gi->spinlock, flags); ++ return 0; ++ } ++ ++ ret = composite_setup(gadget, ctrl); ++ spin_unlock_irqrestore(&gi->spinlock, flags); ++ return ret; ++} ++ ++static void configfs_composite_disconnect(struct usb_gadget *gadget) ++{ ++ struct usb_composite_dev *cdev; ++ struct gadget_info *gi; ++ unsigned long flags; ++ ++ cdev = get_gadget_data(gadget); ++ if (!cdev) ++ return; ++ ++ gi = container_of(cdev, struct gadget_info, cdev); ++ spin_lock_irqsave(&gi->spinlock, flags); ++ cdev = get_gadget_data(gadget); ++ if (!cdev || gi->unbind) { ++ spin_unlock_irqrestore(&gi->spinlock, flags); ++ return; ++ } ++ ++ composite_disconnect(gadget); ++ spin_unlock_irqrestore(&gi->spinlock, flags); ++} ++ ++static void configfs_composite_suspend(struct usb_gadget *gadget) ++{ ++ struct 
usb_composite_dev *cdev; ++ struct gadget_info *gi; ++ unsigned long flags; ++ ++ cdev = get_gadget_data(gadget); ++ if (!cdev) ++ return; ++ ++ gi = container_of(cdev, struct gadget_info, cdev); ++ spin_lock_irqsave(&gi->spinlock, flags); ++ cdev = get_gadget_data(gadget); ++ if (!cdev || gi->unbind) { ++ spin_unlock_irqrestore(&gi->spinlock, flags); ++ return; ++ } ++ ++ composite_suspend(gadget); ++ spin_unlock_irqrestore(&gi->spinlock, flags); ++} ++ ++static void configfs_composite_resume(struct usb_gadget *gadget) ++{ ++ struct usb_composite_dev *cdev; ++ struct gadget_info *gi; ++ unsigned long flags; ++ ++ cdev = get_gadget_data(gadget); ++ if (!cdev) ++ return; ++ ++ gi = container_of(cdev, struct gadget_info, cdev); ++ spin_lock_irqsave(&gi->spinlock, flags); ++ cdev = get_gadget_data(gadget); ++ if (!cdev || gi->unbind) { ++ spin_unlock_irqrestore(&gi->spinlock, flags); ++ return; ++ } ++ ++ composite_resume(gadget); ++ spin_unlock_irqrestore(&gi->spinlock, flags); + } + + static const struct usb_gadget_driver configfs_driver_template = { + .bind = configfs_composite_bind, + .unbind = configfs_composite_unbind, + +- .setup = composite_setup, +- .reset = composite_disconnect, +- .disconnect = composite_disconnect, ++ .setup = configfs_composite_setup, ++ .reset = configfs_composite_disconnect, ++ .disconnect = configfs_composite_disconnect, + +- .suspend = composite_suspend, +- .resume = composite_resume, ++ .suspend = configfs_composite_suspend, ++ .resume = configfs_composite_resume, + + .max_speed = USB_SPEED_SUPER, + .driver = { +diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c +index 8f267be1745d..a4ab23033578 100644 +--- a/drivers/usb/gadget/udc/atmel_usba_udc.c ++++ b/drivers/usb/gadget/udc/atmel_usba_udc.c +@@ -436,9 +436,11 @@ static void submit_request(struct usba_ep *ep, struct usba_request *req) + next_fifo_transaction(ep, req); + if (req->last_transaction) { + usba_ep_writel(ep, CTL_DIS, USBA_TX_PK_RDY); +- usba_ep_writel(ep, CTL_ENB, USBA_TX_COMPLETE); ++ if (ep_is_control(ep)) ++ usba_ep_writel(ep, CTL_ENB, USBA_TX_COMPLETE); + } else { +- usba_ep_writel(ep, CTL_DIS, USBA_TX_COMPLETE); ++ if (ep_is_control(ep)) ++ usba_ep_writel(ep, CTL_DIS, USBA_TX_COMPLETE); + usba_ep_writel(ep, CTL_ENB, USBA_TX_PK_RDY); + } + } +diff --git a/drivers/usb/gadget/udc/fsl_udc_core.c b/drivers/usb/gadget/udc/fsl_udc_core.c +index be59309e848c..d44b26d5b2a2 100644 +--- a/drivers/usb/gadget/udc/fsl_udc_core.c ++++ b/drivers/usb/gadget/udc/fsl_udc_core.c +@@ -2552,7 +2552,7 @@ static int fsl_udc_remove(struct platform_device *pdev) + dma_pool_destroy(udc_controller->td_pool); + free_irq(udc_controller->irq, udc_controller); + iounmap(dr_regs); +- if (pdata->operating_mode == FSL_USB2_DR_DEVICE) ++ if (res && (pdata->operating_mode == FSL_USB2_DR_DEVICE)) + release_mem_region(res->start, resource_size(res)); + + /* free udc --wait for the release() finished */ +diff --git a/drivers/usb/misc/ldusb.c b/drivers/usb/misc/ldusb.c +index 320b06e0724b..67c1b8f5d54d 100644 +--- a/drivers/usb/misc/ldusb.c ++++ b/drivers/usb/misc/ldusb.c +@@ -487,7 +487,7 @@ static ssize_t ld_usb_read(struct file *file, char __user *buffer, size_t count, + } + bytes_to_read = min(count, *actual_buffer); + if (bytes_to_read < *actual_buffer) +- dev_warn(&dev->intf->dev, "Read buffer overflow, %zd bytes dropped\n", ++ dev_warn(&dev->intf->dev, "Read buffer overflow, %zu bytes dropped\n", + *actual_buffer-bytes_to_read); + + /* copy one interrupt_in_buffer from 
ring_buffer into userspace */ +@@ -562,8 +562,9 @@ static ssize_t ld_usb_write(struct file *file, const char __user *buffer, + /* write the data into interrupt_out_buffer from userspace */ + bytes_to_write = min(count, write_buffer_size*dev->interrupt_out_endpoint_size); + if (bytes_to_write < count) +- dev_warn(&dev->intf->dev, "Write buffer overflow, %zd bytes dropped\n", count-bytes_to_write); +- dev_dbg(&dev->intf->dev, "%s: count = %zd, bytes_to_write = %zd\n", ++ dev_warn(&dev->intf->dev, "Write buffer overflow, %zu bytes dropped\n", ++ count - bytes_to_write); ++ dev_dbg(&dev->intf->dev, "%s: count = %zu, bytes_to_write = %zu\n", + __func__, count, bytes_to_write); + + if (copy_from_user(dev->interrupt_out_buffer, buffer, bytes_to_write)) { +diff --git a/drivers/usb/usbip/stub.h b/drivers/usb/usbip/stub.h +index 35618ceb2791..d11270560c24 100644 +--- a/drivers/usb/usbip/stub.h ++++ b/drivers/usb/usbip/stub.h +@@ -52,7 +52,11 @@ struct stub_priv { + unsigned long seqnum; + struct list_head list; + struct stub_device *sdev; +- struct urb *urb; ++ struct urb **urbs; ++ struct scatterlist *sgl; ++ int num_urbs; ++ int completed_urbs; ++ int urb_status; + + int unlinking; + }; +@@ -86,6 +90,7 @@ extern struct usb_device_driver stub_driver; + struct bus_id_priv *get_busid_priv(const char *busid); + void put_busid_priv(struct bus_id_priv *bid); + int del_match_busid(char *busid); ++void stub_free_priv_and_urb(struct stub_priv *priv); + void stub_device_cleanup_urbs(struct stub_device *sdev); + + /* stub_rx.c */ +diff --git a/drivers/usb/usbip/stub_main.c b/drivers/usb/usbip/stub_main.c +index bf8a5feb0ee9..a20bb2d04f4d 100644 +--- a/drivers/usb/usbip/stub_main.c ++++ b/drivers/usb/usbip/stub_main.c +@@ -6,6 +6,7 @@ + #include <linux/string.h> + #include <linux/module.h> + #include <linux/device.h> ++#include <linux/scatterlist.h> + + #include "usbip_common.h" + #include "stub.h" +@@ -283,13 +284,49 @@ static struct stub_priv *stub_priv_pop_from_listhead(struct list_head *listhead) + struct stub_priv *priv, *tmp; + + list_for_each_entry_safe(priv, tmp, listhead, list) { +- list_del(&priv->list); ++ list_del_init(&priv->list); + return priv; + } + + return NULL; + } + ++void stub_free_priv_and_urb(struct stub_priv *priv) ++{ ++ struct urb *urb; ++ int i; ++ ++ for (i = 0; i < priv->num_urbs; i++) { ++ urb = priv->urbs[i]; ++ ++ if (!urb) ++ return; ++ ++ kfree(urb->setup_packet); ++ urb->setup_packet = NULL; ++ ++ ++ if (urb->transfer_buffer && !priv->sgl) { ++ kfree(urb->transfer_buffer); ++ urb->transfer_buffer = NULL; ++ } ++ ++ if (urb->num_sgs) { ++ sgl_free(urb->sg); ++ urb->sg = NULL; ++ urb->num_sgs = 0; ++ } ++ ++ usb_free_urb(urb); ++ } ++ if (!list_empty(&priv->list)) ++ list_del(&priv->list); ++ if (priv->sgl) ++ sgl_free(priv->sgl); ++ kfree(priv->urbs); ++ kmem_cache_free(stub_priv_cache, priv); ++} ++ + static struct stub_priv *stub_priv_pop(struct stub_device *sdev) + { + unsigned long flags; +@@ -316,25 +353,15 @@ done: + void stub_device_cleanup_urbs(struct stub_device *sdev) + { + struct stub_priv *priv; +- struct urb *urb; ++ int i; + + dev_dbg(&sdev->udev->dev, "Stub device cleaning up urbs\n"); + + while ((priv = stub_priv_pop(sdev))) { +- urb = priv->urb; +- dev_dbg(&sdev->udev->dev, "free urb seqnum %lu\n", +- priv->seqnum); +- usb_kill_urb(urb); +- +- kmem_cache_free(stub_priv_cache, priv); ++ for (i = 0; i < priv->num_urbs; i++) ++ usb_kill_urb(priv->urbs[i]); + +- kfree(urb->transfer_buffer); +- urb->transfer_buffer = NULL; +- +- kfree(urb->setup_packet); +- 
urb->setup_packet = NULL; +- +- usb_free_urb(urb); ++ stub_free_priv_and_urb(priv); + } + } + +diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c +index dbfb2f24d71e..75d8756c6d27 100644 +--- a/drivers/usb/usbip/stub_rx.c ++++ b/drivers/usb/usbip/stub_rx.c +@@ -7,6 +7,7 @@ + #include <linux/kthread.h> + #include <linux/usb.h> + #include <linux/usb/hcd.h> ++#include <linux/scatterlist.h> + + #include "usbip_common.h" + #include "stub.h" +@@ -201,7 +202,7 @@ static void tweak_special_requests(struct urb *urb) + static int stub_recv_cmd_unlink(struct stub_device *sdev, + struct usbip_header *pdu) + { +- int ret; ++ int ret, i; + unsigned long flags; + struct stub_priv *priv; + +@@ -246,12 +247,14 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev, + * so a driver in a client host will know the failure + * of the unlink request ? + */ +- ret = usb_unlink_urb(priv->urb); +- if (ret != -EINPROGRESS) +- dev_err(&priv->urb->dev->dev, +- "failed to unlink a urb # %lu, ret %d\n", +- priv->seqnum, ret); +- ++ for (i = priv->completed_urbs; i < priv->num_urbs; i++) { ++ ret = usb_unlink_urb(priv->urbs[i]); ++ if (ret != -EINPROGRESS) ++ dev_err(&priv->urbs[i]->dev->dev, ++ "failed to unlink %d/%d urb of seqnum %lu, ret %d\n", ++ i + 1, priv->num_urbs, ++ priv->seqnum, ret); ++ } + return 0; + } + +@@ -433,14 +436,36 @@ static void masking_bogus_flags(struct urb *urb) + urb->transfer_flags &= allowed; + } + ++static int stub_recv_xbuff(struct usbip_device *ud, struct stub_priv *priv) ++{ ++ int ret; ++ int i; ++ ++ for (i = 0; i < priv->num_urbs; i++) { ++ ret = usbip_recv_xbuff(ud, priv->urbs[i]); ++ if (ret < 0) ++ break; ++ } ++ ++ return ret; ++} ++ + static void stub_recv_cmd_submit(struct stub_device *sdev, + struct usbip_header *pdu) + { +- int ret; + struct stub_priv *priv; + struct usbip_device *ud = &sdev->ud; + struct usb_device *udev = sdev->udev; ++ struct scatterlist *sgl = NULL, *sg; ++ void *buffer = NULL; ++ unsigned long long buf_len; ++ int nents; ++ int num_urbs = 1; + int pipe = get_pipe(sdev, pdu); ++ int use_sg = pdu->u.cmd_submit.transfer_flags & URB_DMA_MAP_SG; ++ int support_sg = 1; ++ int np = 0; ++ int ret, i; + + if (pipe == -1) + return; +@@ -449,76 +474,139 @@ static void stub_recv_cmd_submit(struct stub_device *sdev, + if (!priv) + return; + +- /* setup a urb */ +- if (usb_pipeisoc(pipe)) +- priv->urb = usb_alloc_urb(pdu->u.cmd_submit.number_of_packets, +- GFP_KERNEL); +- else +- priv->urb = usb_alloc_urb(0, GFP_KERNEL); ++ buf_len = (unsigned long long)pdu->u.cmd_submit.transfer_buffer_length; + +- if (!priv->urb) { +- usbip_event_add(ud, SDEV_EVENT_ERROR_MALLOC); +- return; ++ /* allocate urb transfer buffer, if needed */ ++ if (buf_len) { ++ if (use_sg) { ++ sgl = sgl_alloc(buf_len, GFP_KERNEL, &nents); ++ if (!sgl) ++ goto err_malloc; ++ } else { ++ buffer = kzalloc(buf_len, GFP_KERNEL); ++ if (!buffer) ++ goto err_malloc; ++ } + } + +- /* allocate urb transfer buffer, if needed */ +- if (pdu->u.cmd_submit.transfer_buffer_length > 0) { +- priv->urb->transfer_buffer = +- kzalloc(pdu->u.cmd_submit.transfer_buffer_length, +- GFP_KERNEL); +- if (!priv->urb->transfer_buffer) { ++ /* Check if the server's HCD supports SG */ ++ if (use_sg && !udev->bus->sg_tablesize) { ++ /* ++ * If the server's HCD doesn't support SG, break a single SG ++ * request into several URBs and map each SG list entry to ++ * corresponding URB buffer. 
The previously allocated SG ++ * list is stored in priv->sgl (If the server's HCD support SG, ++ * SG list is stored only in urb->sg) and it is used as an ++ * indicator that the server split single SG request into ++ * several URBs. Later, priv->sgl is used by stub_complete() and ++ * stub_send_ret_submit() to reassemble the divied URBs. ++ */ ++ support_sg = 0; ++ num_urbs = nents; ++ priv->completed_urbs = 0; ++ pdu->u.cmd_submit.transfer_flags &= ~URB_DMA_MAP_SG; ++ } ++ ++ /* allocate urb array */ ++ priv->num_urbs = num_urbs; ++ priv->urbs = kmalloc_array(num_urbs, sizeof(*priv->urbs), GFP_KERNEL); ++ if (!priv->urbs) ++ goto err_urbs; ++ ++ /* setup a urb */ ++ if (support_sg) { ++ if (usb_pipeisoc(pipe)) ++ np = pdu->u.cmd_submit.number_of_packets; ++ ++ priv->urbs[0] = usb_alloc_urb(np, GFP_KERNEL); ++ if (!priv->urbs[0]) ++ goto err_urb; ++ ++ if (buf_len) { ++ if (use_sg) { ++ priv->urbs[0]->sg = sgl; ++ priv->urbs[0]->num_sgs = nents; ++ priv->urbs[0]->transfer_buffer = NULL; ++ } else { ++ priv->urbs[0]->transfer_buffer = buffer; ++ } ++ } ++ ++ /* copy urb setup packet */ ++ priv->urbs[0]->setup_packet = kmemdup(&pdu->u.cmd_submit.setup, ++ 8, GFP_KERNEL); ++ if (!priv->urbs[0]->setup_packet) { + usbip_event_add(ud, SDEV_EVENT_ERROR_MALLOC); + return; + } +- } + +- /* copy urb setup packet */ +- priv->urb->setup_packet = kmemdup(&pdu->u.cmd_submit.setup, 8, +- GFP_KERNEL); +- if (!priv->urb->setup_packet) { +- dev_err(&udev->dev, "allocate setup_packet\n"); +- usbip_event_add(ud, SDEV_EVENT_ERROR_MALLOC); +- return; ++ usbip_pack_pdu(pdu, priv->urbs[0], USBIP_CMD_SUBMIT, 0); ++ } else { ++ for_each_sg(sgl, sg, nents, i) { ++ priv->urbs[i] = usb_alloc_urb(0, GFP_KERNEL); ++ /* The URBs which is previously allocated will be freed ++ * in stub_device_cleanup_urbs() if error occurs. ++ */ ++ if (!priv->urbs[i]) ++ goto err_urb; ++ ++ usbip_pack_pdu(pdu, priv->urbs[i], USBIP_CMD_SUBMIT, 0); ++ priv->urbs[i]->transfer_buffer = sg_virt(sg); ++ priv->urbs[i]->transfer_buffer_length = sg->length; ++ } ++ priv->sgl = sgl; + } + +- /* set other members from the base header of pdu */ +- priv->urb->context = (void *) priv; +- priv->urb->dev = udev; +- priv->urb->pipe = pipe; +- priv->urb->complete = stub_complete; ++ for (i = 0; i < num_urbs; i++) { ++ /* set other members from the base header of pdu */ ++ priv->urbs[i]->context = (void *) priv; ++ priv->urbs[i]->dev = udev; ++ priv->urbs[i]->pipe = pipe; ++ priv->urbs[i]->complete = stub_complete; + +- usbip_pack_pdu(pdu, priv->urb, USBIP_CMD_SUBMIT, 0); ++ /* no need to submit an intercepted request, but harmless? */ ++ tweak_special_requests(priv->urbs[i]); + ++ masking_bogus_flags(priv->urbs[i]); ++ } + +- if (usbip_recv_xbuff(ud, priv->urb) < 0) ++ if (stub_recv_xbuff(ud, priv) < 0) + return; + +- if (usbip_recv_iso(ud, priv->urb) < 0) ++ if (usbip_recv_iso(ud, priv->urbs[0]) < 0) + return; + +- /* no need to submit an intercepted request, but harmless? */ +- tweak_special_requests(priv->urb); +- +- masking_bogus_flags(priv->urb); + /* urb is now ready to submit */ +- ret = usb_submit_urb(priv->urb, GFP_KERNEL); +- +- if (ret == 0) +- usbip_dbg_stub_rx("submit urb ok, seqnum %u\n", +- pdu->base.seqnum); +- else { +- dev_err(&udev->dev, "submit_urb error, %d\n", ret); +- usbip_dump_header(pdu); +- usbip_dump_urb(priv->urb); +- +- /* +- * Pessimistic. +- * This connection will be discarded. 
+- */ +- usbip_event_add(ud, SDEV_EVENT_ERROR_SUBMIT); ++ for (i = 0; i < priv->num_urbs; i++) { ++ ret = usb_submit_urb(priv->urbs[i], GFP_KERNEL); ++ ++ if (ret == 0) ++ usbip_dbg_stub_rx("submit urb ok, seqnum %u\n", ++ pdu->base.seqnum); ++ else { ++ dev_err(&udev->dev, "submit_urb error, %d\n", ret); ++ usbip_dump_header(pdu); ++ usbip_dump_urb(priv->urbs[i]); ++ ++ /* ++ * Pessimistic. ++ * This connection will be discarded. ++ */ ++ usbip_event_add(ud, SDEV_EVENT_ERROR_SUBMIT); ++ break; ++ } + } + + usbip_dbg_stub_rx("Leave\n"); ++ return; ++ ++err_urb: ++ kfree(priv->urbs); ++err_urbs: ++ kfree(buffer); ++ sgl_free(sgl); ++err_malloc: ++ usbip_event_add(ud, SDEV_EVENT_ERROR_MALLOC); + } + + /* recv a pdu */ +diff --git a/drivers/usb/usbip/stub_tx.c b/drivers/usb/usbip/stub_tx.c +index f0ec41a50cbc..36010a82b359 100644 +--- a/drivers/usb/usbip/stub_tx.c ++++ b/drivers/usb/usbip/stub_tx.c +@@ -5,25 +5,11 @@ + + #include <linux/kthread.h> + #include <linux/socket.h> ++#include <linux/scatterlist.h> + + #include "usbip_common.h" + #include "stub.h" + +-static void stub_free_priv_and_urb(struct stub_priv *priv) +-{ +- struct urb *urb = priv->urb; +- +- kfree(urb->setup_packet); +- urb->setup_packet = NULL; +- +- kfree(urb->transfer_buffer); +- urb->transfer_buffer = NULL; +- +- list_del(&priv->list); +- kmem_cache_free(stub_priv_cache, priv); +- usb_free_urb(urb); +-} +- + /* be in spin_lock_irqsave(&sdev->priv_lock, flags) */ + void stub_enqueue_ret_unlink(struct stub_device *sdev, __u32 seqnum, + __u32 status) +@@ -85,6 +71,22 @@ void stub_complete(struct urb *urb) + break; + } + ++ /* ++ * If the server breaks single SG request into the several URBs, the ++ * URBs must be reassembled before sending completed URB to the vhci. ++ * Don't wake up the tx thread until all the URBs are completed. ++ */ ++ if (priv->sgl) { ++ priv->completed_urbs++; ++ ++ /* Only save the first error status */ ++ if (urb->status && !priv->urb_status) ++ priv->urb_status = urb->status; ++ ++ if (priv->completed_urbs < priv->num_urbs) ++ return; ++ } ++ + /* link a urb to the queue of tx. 
*/ + spin_lock_irqsave(&sdev->priv_lock, flags); + if (sdev->ud.tcp_socket == NULL) { +@@ -156,18 +158,22 @@ static int stub_send_ret_submit(struct stub_device *sdev) + size_t total_size = 0; + + while ((priv = dequeue_from_priv_tx(sdev)) != NULL) { +- int ret; +- struct urb *urb = priv->urb; ++ struct urb *urb = priv->urbs[0]; + struct usbip_header pdu_header; + struct usbip_iso_packet_descriptor *iso_buffer = NULL; + struct kvec *iov = NULL; ++ struct scatterlist *sg; ++ u32 actual_length = 0; + int iovnum = 0; ++ int ret; ++ int i; + + txsize = 0; + memset(&pdu_header, 0, sizeof(pdu_header)); + memset(&msg, 0, sizeof(msg)); + +- if (urb->actual_length > 0 && !urb->transfer_buffer) { ++ if (urb->actual_length > 0 && !urb->transfer_buffer && ++ !urb->num_sgs) { + dev_err(&sdev->udev->dev, + "urb: actual_length %d transfer_buffer null\n", + urb->actual_length); +@@ -176,6 +182,11 @@ static int stub_send_ret_submit(struct stub_device *sdev) + + if (usb_pipetype(urb->pipe) == PIPE_ISOCHRONOUS) + iovnum = 2 + urb->number_of_packets; ++ else if (usb_pipein(urb->pipe) && urb->actual_length > 0 && ++ urb->num_sgs) ++ iovnum = 1 + urb->num_sgs; ++ else if (usb_pipein(urb->pipe) && priv->sgl) ++ iovnum = 1 + priv->num_urbs; + else + iovnum = 2; + +@@ -192,6 +203,15 @@ static int stub_send_ret_submit(struct stub_device *sdev) + setup_ret_submit_pdu(&pdu_header, urb); + usbip_dbg_stub_tx("setup txdata seqnum: %d\n", + pdu_header.base.seqnum); ++ ++ if (priv->sgl) { ++ for (i = 0; i < priv->num_urbs; i++) ++ actual_length += priv->urbs[i]->actual_length; ++ ++ pdu_header.u.ret_submit.status = priv->urb_status; ++ pdu_header.u.ret_submit.actual_length = actual_length; ++ } ++ + usbip_header_correct_endian(&pdu_header, 1); + + iov[iovnum].iov_base = &pdu_header; +@@ -200,12 +220,47 @@ static int stub_send_ret_submit(struct stub_device *sdev) + txsize += sizeof(pdu_header); + + /* 2. setup transfer buffer */ +- if (usb_pipein(urb->pipe) && ++ if (usb_pipein(urb->pipe) && priv->sgl) { ++ /* If the server split a single SG request into several ++ * URBs because the server's HCD doesn't support SG, ++ * reassemble the split URB buffers into a single ++ * return command. ++ */ ++ for (i = 0; i < priv->num_urbs; i++) { ++ iov[iovnum].iov_base = ++ priv->urbs[i]->transfer_buffer; ++ iov[iovnum].iov_len = ++ priv->urbs[i]->actual_length; ++ iovnum++; ++ } ++ txsize += actual_length; ++ } else if (usb_pipein(urb->pipe) && + usb_pipetype(urb->pipe) != PIPE_ISOCHRONOUS && + urb->actual_length > 0) { +- iov[iovnum].iov_base = urb->transfer_buffer; +- iov[iovnum].iov_len = urb->actual_length; +- iovnum++; ++ if (urb->num_sgs) { ++ unsigned int copy = urb->actual_length; ++ int size; ++ ++ for_each_sg(urb->sg, sg, urb->num_sgs, i) { ++ if (copy == 0) ++ break; ++ ++ if (copy < sg->length) ++ size = copy; ++ else ++ size = sg->length; ++ ++ iov[iovnum].iov_base = sg_virt(sg); ++ iov[iovnum].iov_len = size; ++ ++ iovnum++; ++ copy -= size; ++ } ++ } else { ++ iov[iovnum].iov_base = urb->transfer_buffer; ++ iov[iovnum].iov_len = urb->actual_length; ++ iovnum++; ++ } + txsize += urb->actual_length; + } else if (usb_pipein(urb->pipe) && + usb_pipetype(urb->pipe) == PIPE_ISOCHRONOUS) { +diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c +index 9756752c0681..d88a5b15f073 100644 +--- a/drivers/usb/usbip/usbip_common.c ++++ b/drivers/usb/usbip/usbip_common.c +@@ -680,8 +680,12 @@ EXPORT_SYMBOL_GPL(usbip_pad_iso); + /* some members of urb must be substituted before. 
*/ + int usbip_recv_xbuff(struct usbip_device *ud, struct urb *urb) + { +- int ret; ++ struct scatterlist *sg; ++ int ret = 0; ++ int recv; + int size; ++ int copy; ++ int i; + + if (ud->side == USBIP_STUB || ud->side == USBIP_VUDC) { + /* the direction of urb must be OUT. */ +@@ -701,29 +705,48 @@ int usbip_recv_xbuff(struct usbip_device *ud, struct urb *urb) + if (!(size > 0)) + return 0; + +- if (size > urb->transfer_buffer_length) { ++ if (size > urb->transfer_buffer_length) + /* should not happen, probably malicious packet */ +- if (ud->side == USBIP_STUB) { +- usbip_event_add(ud, SDEV_EVENT_ERROR_TCP); +- return 0; +- } else { +- usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); +- return -EPIPE; +- } +- } ++ goto error; + +- ret = usbip_recv(ud->tcp_socket, urb->transfer_buffer, size); +- if (ret != size) { +- dev_err(&urb->dev->dev, "recv xbuf, %d\n", ret); +- if (ud->side == USBIP_STUB || ud->side == USBIP_VUDC) { +- usbip_event_add(ud, SDEV_EVENT_ERROR_TCP); +- } else { +- usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); +- return -EPIPE; ++ if (urb->num_sgs) { ++ copy = size; ++ for_each_sg(urb->sg, sg, urb->num_sgs, i) { ++ int recv_size; ++ ++ if (copy < sg->length) ++ recv_size = copy; ++ else ++ recv_size = sg->length; ++ ++ recv = usbip_recv(ud->tcp_socket, sg_virt(sg), ++ recv_size); ++ ++ if (recv != recv_size) ++ goto error; ++ ++ copy -= recv; ++ ret += recv; + } ++ ++ if (ret != size) ++ goto error; ++ } else { ++ ret = usbip_recv(ud->tcp_socket, urb->transfer_buffer, size); ++ if (ret != size) ++ goto error; + } + + return ret; ++ ++error: ++ dev_err(&urb->dev->dev, "recv xbuf, %d\n", ret); ++ if (ud->side == USBIP_STUB || ud->side == USBIP_VUDC) ++ usbip_event_add(ud, SDEV_EVENT_ERROR_TCP); ++ else ++ usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); ++ ++ return -EPIPE; + } + EXPORT_SYMBOL_GPL(usbip_recv_xbuff); + +diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c +index 1e592ec94ba4..d5a036bf904b 100644 +--- a/drivers/usb/usbip/vhci_hcd.c ++++ b/drivers/usb/usbip/vhci_hcd.c +@@ -702,8 +702,11 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag + } + vdev = &vhci_hcd->vdev[portnum-1]; + +- /* patch to usb_sg_init() is in 2.5.60 */ +- BUG_ON(!urb->transfer_buffer && urb->transfer_buffer_length); ++ if (!urb->transfer_buffer && !urb->num_sgs && ++ urb->transfer_buffer_length) { ++ dev_dbg(dev, "Null URB transfer buffer\n"); ++ return -EINVAL; ++ } + + spin_lock_irqsave(&vhci->lock, flags); + +@@ -1146,6 +1149,15 @@ static int vhci_setup(struct usb_hcd *hcd) + hcd->speed = HCD_USB3; + hcd->self.root_hub->speed = USB_SPEED_SUPER; + } ++ ++ /* ++ * Support SG. ++ * sg_tablesize is an arbitrary value to alleviate memory pressure ++ * on the host. 
++ */ ++ hcd->self.sg_tablesize = 32; ++ hcd->self.no_sg_constraint = 1; ++ + return 0; + } + +diff --git a/drivers/usb/usbip/vhci_rx.c b/drivers/usb/usbip/vhci_rx.c +index 44cd64518925..33f8972ba842 100644 +--- a/drivers/usb/usbip/vhci_rx.c ++++ b/drivers/usb/usbip/vhci_rx.c +@@ -90,6 +90,9 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev, + if (usbip_dbg_flag_vhci_rx) + usbip_dump_urb(urb); + ++ if (urb->num_sgs) ++ urb->transfer_flags &= ~URB_DMA_MAP_SG; ++ + usbip_dbg_vhci_rx("now giveback urb %u\n", pdu->base.seqnum); + + spin_lock_irqsave(&vhci->lock, flags); +diff --git a/drivers/usb/usbip/vhci_tx.c b/drivers/usb/usbip/vhci_tx.c +index 9aed15a358b7..acac49402c2b 100644 +--- a/drivers/usb/usbip/vhci_tx.c ++++ b/drivers/usb/usbip/vhci_tx.c +@@ -5,6 +5,7 @@ + + #include <linux/kthread.h> + #include <linux/slab.h> ++#include <linux/scatterlist.h> + + #include "usbip_common.h" + #include "vhci.h" +@@ -50,19 +51,23 @@ static struct vhci_priv *dequeue_from_priv_tx(struct vhci_device *vdev) + + static int vhci_send_cmd_submit(struct vhci_device *vdev) + { ++ struct usbip_iso_packet_descriptor *iso_buffer = NULL; + struct vhci_priv *priv = NULL; ++ struct scatterlist *sg; + + struct msghdr msg; +- struct kvec iov[3]; ++ struct kvec *iov; + size_t txsize; + + size_t total_size = 0; ++ int iovnum; ++ int err = -ENOMEM; ++ int i; + + while ((priv = dequeue_from_priv_tx(vdev)) != NULL) { + int ret; + struct urb *urb = priv->urb; + struct usbip_header pdu_header; +- struct usbip_iso_packet_descriptor *iso_buffer = NULL; + + txsize = 0; + memset(&pdu_header, 0, sizeof(pdu_header)); +@@ -72,18 +77,45 @@ static int vhci_send_cmd_submit(struct vhci_device *vdev) + usbip_dbg_vhci_tx("setup txdata urb seqnum %lu\n", + priv->seqnum); + ++ if (urb->num_sgs && usb_pipeout(urb->pipe)) ++ iovnum = 2 + urb->num_sgs; ++ else ++ iovnum = 3; ++ ++ iov = kcalloc(iovnum, sizeof(*iov), GFP_KERNEL); ++ if (!iov) { ++ usbip_event_add(&vdev->ud, SDEV_EVENT_ERROR_MALLOC); ++ return -ENOMEM; ++ } ++ ++ if (urb->num_sgs) ++ urb->transfer_flags |= URB_DMA_MAP_SG; ++ + /* 1. setup usbip_header */ + setup_cmd_submit_pdu(&pdu_header, urb); + usbip_header_correct_endian(&pdu_header, 1); ++ iovnum = 0; + +- iov[0].iov_base = &pdu_header; +- iov[0].iov_len = sizeof(pdu_header); ++ iov[iovnum].iov_base = &pdu_header; ++ iov[iovnum].iov_len = sizeof(pdu_header); + txsize += sizeof(pdu_header); ++ iovnum++; + + /* 2. 
setup transfer buffer */ + if (!usb_pipein(urb->pipe) && urb->transfer_buffer_length > 0) { +- iov[1].iov_base = urb->transfer_buffer; +- iov[1].iov_len = urb->transfer_buffer_length; ++ if (urb->num_sgs && ++ !usb_endpoint_xfer_isoc(&urb->ep->desc)) { ++ for_each_sg(urb->sg, sg, urb->num_sgs, i) { ++ iov[iovnum].iov_base = sg_virt(sg); ++ iov[iovnum].iov_len = sg->length; ++ iovnum++; ++ } ++ } else { ++ iov[iovnum].iov_base = urb->transfer_buffer; ++ iov[iovnum].iov_len = ++ urb->transfer_buffer_length; ++ iovnum++; ++ } + txsize += urb->transfer_buffer_length; + } + +@@ -95,30 +127,43 @@ static int vhci_send_cmd_submit(struct vhci_device *vdev) + if (!iso_buffer) { + usbip_event_add(&vdev->ud, + SDEV_EVENT_ERROR_MALLOC); +- return -1; ++ goto err_iso_buffer; + } + +- iov[2].iov_base = iso_buffer; +- iov[2].iov_len = len; ++ iov[iovnum].iov_base = iso_buffer; ++ iov[iovnum].iov_len = len; ++ iovnum++; + txsize += len; + } + +- ret = kernel_sendmsg(vdev->ud.tcp_socket, &msg, iov, 3, txsize); ++ ret = kernel_sendmsg(vdev->ud.tcp_socket, &msg, iov, iovnum, ++ txsize); + if (ret != txsize) { + pr_err("sendmsg failed!, ret=%d for %zd\n", ret, + txsize); +- kfree(iso_buffer); + usbip_event_add(&vdev->ud, VDEV_EVENT_ERROR_TCP); +- return -1; ++ err = -EPIPE; ++ goto err_tx; + } + ++ kfree(iov); ++ /* This is only for isochronous case */ + kfree(iso_buffer); ++ iso_buffer = NULL; ++ + usbip_dbg_vhci_tx("send txdata\n"); + + total_size += txsize; + } + + return total_size; ++ ++err_tx: ++ kfree(iso_buffer); ++err_iso_buffer: ++ kfree(iov); ++ ++ return err; + } + + static struct vhci_unlink *dequeue_from_unlink_tx(struct vhci_device *vdev) +diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c +index db547af01b59..4c0b220e20ba 100644 +--- a/fs/ceph/caps.c ++++ b/fs/ceph/caps.c +@@ -1053,6 +1053,11 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) + + dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); + ++ /* remove from inode's cap rbtree, and clear auth cap */ ++ rb_erase(&cap->ci_node, &ci->i_caps); ++ if (ci->i_auth_cap == cap) ++ ci->i_auth_cap = NULL; ++ + /* remove from session list */ + spin_lock(&session->s_cap_lock); + if (session->s_cap_iterator == cap) { +@@ -1088,11 +1093,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) + + spin_unlock(&session->s_cap_lock); + +- /* remove from inode list */ +- rb_erase(&cap->ci_node, &ci->i_caps); +- if (ci->i_auth_cap == cap) +- ci->i_auth_cap = NULL; +- + if (removed) + ceph_put_cap(mdsc, cap); + +diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c +index 8196c21d8623..acb70a6a82f0 100644 +--- a/fs/ceph/inode.c ++++ b/fs/ceph/inode.c +@@ -1399,6 +1399,7 @@ retry_lookup: + dout(" final dn %p\n", dn); + } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || + req->r_op == CEPH_MDS_OP_MKSNAP) && ++ test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && + !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { + struct dentry *dn = req->r_dentry; + struct inode *dir = req->r_parent; +diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h +index 437257d1116f..308c682fa4d3 100644 +--- a/fs/cifs/smb2pdu.h ++++ b/fs/cifs/smb2pdu.h +@@ -777,6 +777,7 @@ struct create_durable_handle_reconnect_v2 { + struct create_context ccontext; + __u8 Name[8]; + struct durable_reconnect_context_v2 dcontext; ++ __u8 Pad[4]; + } __packed; + + /* See MS-SMB2 2.2.13.2.5 */ +diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h +index ccc31fa6f1a7..16eb59adf5aa 100644 +--- a/fs/configfs/configfs_internal.h ++++ 
b/fs/configfs/configfs_internal.h +@@ -34,6 +34,15 @@ + #include <linux/list.h> + #include <linux/spinlock.h> + ++struct configfs_fragment { ++ atomic_t frag_count; ++ struct rw_semaphore frag_sem; ++ bool frag_dead; ++}; ++ ++void put_fragment(struct configfs_fragment *); ++struct configfs_fragment *get_fragment(struct configfs_fragment *); ++ + struct configfs_dirent { + atomic_t s_count; + int s_dependent_count; +@@ -48,6 +57,7 @@ struct configfs_dirent { + #ifdef CONFIG_LOCKDEP + int s_depth; + #endif ++ struct configfs_fragment *s_frag; + }; + + #define CONFIGFS_ROOT 0x0001 +@@ -75,8 +85,8 @@ extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct in + extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); + extern int configfs_create_bin_file(struct config_item *, + const struct configfs_bin_attribute *); +-extern int configfs_make_dirent(struct configfs_dirent *, +- struct dentry *, void *, umode_t, int); ++extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *, ++ void *, umode_t, int, struct configfs_fragment *); + extern int configfs_dirent_is_ready(struct configfs_dirent *); + + extern void configfs_hash_and_remove(struct dentry * dir, const char * name); +@@ -151,6 +161,7 @@ static inline void release_configfs_dirent(struct configfs_dirent * sd) + { + if (!(sd->s_type & CONFIGFS_ROOT)) { + kfree(sd->s_iattr); ++ put_fragment(sd->s_frag); + kmem_cache_free(configfs_dir_cachep, sd); + } + } +diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c +index 809c1edffbaf..2cc6b1c49d34 100644 +--- a/fs/configfs/dir.c ++++ b/fs/configfs/dir.c +@@ -164,11 +164,38 @@ configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd) + + #endif /* CONFIG_LOCKDEP */ + ++static struct configfs_fragment *new_fragment(void) ++{ ++ struct configfs_fragment *p; ++ ++ p = kmalloc(sizeof(struct configfs_fragment), GFP_KERNEL); ++ if (p) { ++ atomic_set(&p->frag_count, 1); ++ init_rwsem(&p->frag_sem); ++ p->frag_dead = false; ++ } ++ return p; ++} ++ ++void put_fragment(struct configfs_fragment *frag) ++{ ++ if (frag && atomic_dec_and_test(&frag->frag_count)) ++ kfree(frag); ++} ++ ++struct configfs_fragment *get_fragment(struct configfs_fragment *frag) ++{ ++ if (likely(frag)) ++ atomic_inc(&frag->frag_count); ++ return frag; ++} ++ + /* + * Allocates a new configfs_dirent and links it to the parent configfs_dirent + */ + static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd, +- void *element, int type) ++ void *element, int type, ++ struct configfs_fragment *frag) + { + struct configfs_dirent * sd; + +@@ -188,6 +215,7 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *paren + kmem_cache_free(configfs_dir_cachep, sd); + return ERR_PTR(-ENOENT); + } ++ sd->s_frag = get_fragment(frag); + list_add(&sd->s_sibling, &parent_sd->s_children); + spin_unlock(&configfs_dirent_lock); + +@@ -222,11 +250,11 @@ static int configfs_dirent_exists(struct configfs_dirent *parent_sd, + + int configfs_make_dirent(struct configfs_dirent * parent_sd, + struct dentry * dentry, void * element, +- umode_t mode, int type) ++ umode_t mode, int type, struct configfs_fragment *frag) + { + struct configfs_dirent * sd; + +- sd = configfs_new_dirent(parent_sd, element, type); ++ sd = configfs_new_dirent(parent_sd, element, type, frag); + if (IS_ERR(sd)) + return PTR_ERR(sd); + +@@ -273,7 +301,8 @@ static void init_symlink(struct inode * inode) + * until it is validated by 
configfs_dir_set_ready() + */ + +-static int configfs_create_dir(struct config_item *item, struct dentry *dentry) ++static int configfs_create_dir(struct config_item *item, struct dentry *dentry, ++ struct configfs_fragment *frag) + { + int error; + umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; +@@ -286,7 +315,8 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry) + return error; + + error = configfs_make_dirent(p->d_fsdata, dentry, item, mode, +- CONFIGFS_DIR | CONFIGFS_USET_CREATING); ++ CONFIGFS_DIR | CONFIGFS_USET_CREATING, ++ frag); + if (unlikely(error)) + return error; + +@@ -351,9 +381,10 @@ int configfs_create_link(struct configfs_symlink *sl, + { + int err = 0; + umode_t mode = S_IFLNK | S_IRWXUGO; ++ struct configfs_dirent *p = parent->d_fsdata; + +- err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode, +- CONFIGFS_ITEM_LINK); ++ err = configfs_make_dirent(p, dentry, sl, mode, ++ CONFIGFS_ITEM_LINK, p->s_frag); + if (!err) { + err = configfs_create(dentry, mode, init_symlink); + if (err) { +@@ -612,7 +643,8 @@ static int populate_attrs(struct config_item *item) + + static int configfs_attach_group(struct config_item *parent_item, + struct config_item *item, +- struct dentry *dentry); ++ struct dentry *dentry, ++ struct configfs_fragment *frag); + static void configfs_detach_group(struct config_item *item); + + static void detach_groups(struct config_group *group) +@@ -660,7 +692,8 @@ static void detach_groups(struct config_group *group) + * try using vfs_mkdir. Just a thought. + */ + static int create_default_group(struct config_group *parent_group, +- struct config_group *group) ++ struct config_group *group, ++ struct configfs_fragment *frag) + { + int ret; + struct configfs_dirent *sd; +@@ -676,7 +709,7 @@ static int create_default_group(struct config_group *parent_group, + d_add(child, NULL); + + ret = configfs_attach_group(&parent_group->cg_item, +- &group->cg_item, child); ++ &group->cg_item, child, frag); + if (!ret) { + sd = child->d_fsdata; + sd->s_type |= CONFIGFS_USET_DEFAULT; +@@ -690,13 +723,14 @@ static int create_default_group(struct config_group *parent_group, + return ret; + } + +-static int populate_groups(struct config_group *group) ++static int populate_groups(struct config_group *group, ++ struct configfs_fragment *frag) + { + struct config_group *new_group; + int ret = 0; + + list_for_each_entry(new_group, &group->default_groups, group_entry) { +- ret = create_default_group(group, new_group); ++ ret = create_default_group(group, new_group, frag); + if (ret) { + detach_groups(group); + break; +@@ -810,11 +844,12 @@ static void link_group(struct config_group *parent_group, struct config_group *g + */ + static int configfs_attach_item(struct config_item *parent_item, + struct config_item *item, +- struct dentry *dentry) ++ struct dentry *dentry, ++ struct configfs_fragment *frag) + { + int ret; + +- ret = configfs_create_dir(item, dentry); ++ ret = configfs_create_dir(item, dentry, frag); + if (!ret) { + ret = populate_attrs(item); + if (ret) { +@@ -844,12 +879,13 @@ static void configfs_detach_item(struct config_item *item) + + static int configfs_attach_group(struct config_item *parent_item, + struct config_item *item, +- struct dentry *dentry) ++ struct dentry *dentry, ++ struct configfs_fragment *frag) + { + int ret; + struct configfs_dirent *sd; + +- ret = configfs_attach_item(parent_item, item, dentry); ++ ret = configfs_attach_item(parent_item, item, dentry, frag); + if (!ret) { + sd = dentry->d_fsdata; + 
sd->s_type |= CONFIGFS_USET_DIR; +@@ -865,7 +901,7 @@ static int configfs_attach_group(struct config_item *parent_item, + */ + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); + configfs_adjust_dir_dirent_depth_before_populate(sd); +- ret = populate_groups(to_config_group(item)); ++ ret = populate_groups(to_config_group(item), frag); + if (ret) { + configfs_detach_item(item); + d_inode(dentry)->i_flags |= S_DEAD; +@@ -1260,6 +1296,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode + struct configfs_dirent *sd; + const struct config_item_type *type; + struct module *subsys_owner = NULL, *new_item_owner = NULL; ++ struct configfs_fragment *frag; + char *name; + + sd = dentry->d_parent->d_fsdata; +@@ -1278,6 +1315,12 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode + goto out; + } + ++ frag = new_fragment(); ++ if (!frag) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ + /* Get a working ref for the duration of this function */ + parent_item = configfs_get_config_item(dentry->d_parent); + type = parent_item->ci_type; +@@ -1380,9 +1423,9 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode + spin_unlock(&configfs_dirent_lock); + + if (group) +- ret = configfs_attach_group(parent_item, item, dentry); ++ ret = configfs_attach_group(parent_item, item, dentry, frag); + else +- ret = configfs_attach_item(parent_item, item, dentry); ++ ret = configfs_attach_item(parent_item, item, dentry, frag); + + spin_lock(&configfs_dirent_lock); + sd->s_type &= ~CONFIGFS_USET_IN_MKDIR; +@@ -1419,6 +1462,7 @@ out_put: + * reference. + */ + config_item_put(parent_item); ++ put_fragment(frag); + + out: + return ret; +@@ -1430,6 +1474,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) + struct config_item *item; + struct configfs_subsystem *subsys; + struct configfs_dirent *sd; ++ struct configfs_fragment *frag; + struct module *subsys_owner = NULL, *dead_item_owner = NULL; + int ret; + +@@ -1487,6 +1532,16 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) + } + } while (ret == -EAGAIN); + ++ frag = sd->s_frag; ++ if (down_write_killable(&frag->frag_sem)) { ++ spin_lock(&configfs_dirent_lock); ++ configfs_detach_rollback(dentry); ++ spin_unlock(&configfs_dirent_lock); ++ return -EINTR; ++ } ++ frag->frag_dead = true; ++ up_write(&frag->frag_sem); ++ + /* Get a working ref for the duration of this function */ + item = configfs_get_config_item(dentry); + +@@ -1587,7 +1642,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) + */ + err = -ENOENT; + if (configfs_dirent_is_ready(parent_sd)) { +- file->private_data = configfs_new_dirent(parent_sd, NULL, 0); ++ file->private_data = configfs_new_dirent(parent_sd, NULL, 0, NULL); + if (IS_ERR(file->private_data)) + err = PTR_ERR(file->private_data); + else +@@ -1743,8 +1798,13 @@ int configfs_register_group(struct config_group *parent_group, + { + struct configfs_subsystem *subsys = parent_group->cg_subsys; + struct dentry *parent; ++ struct configfs_fragment *frag; + int ret; + ++ frag = new_fragment(); ++ if (!frag) ++ return -ENOMEM; ++ + mutex_lock(&subsys->su_mutex); + link_group(parent_group, group); + mutex_unlock(&subsys->su_mutex); +@@ -1752,7 +1812,7 @@ int configfs_register_group(struct config_group *parent_group, + parent = parent_group->cg_item.ci_dentry; + + inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); +- ret = create_default_group(parent_group, group); ++ ret = create_default_group(parent_group, 
group, frag); + if (ret) + goto err_out; + +@@ -1760,12 +1820,14 @@ int configfs_register_group(struct config_group *parent_group, + configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata); + spin_unlock(&configfs_dirent_lock); + inode_unlock(d_inode(parent)); ++ put_fragment(frag); + return 0; + err_out: + inode_unlock(d_inode(parent)); + mutex_lock(&subsys->su_mutex); + unlink_group(group); + mutex_unlock(&subsys->su_mutex); ++ put_fragment(frag); + return ret; + } + EXPORT_SYMBOL(configfs_register_group); +@@ -1781,16 +1843,12 @@ void configfs_unregister_group(struct config_group *group) + struct configfs_subsystem *subsys = group->cg_subsys; + struct dentry *dentry = group->cg_item.ci_dentry; + struct dentry *parent = group->cg_item.ci_parent->ci_dentry; ++ struct configfs_dirent *sd = dentry->d_fsdata; ++ struct configfs_fragment *frag = sd->s_frag; + +- mutex_lock(&subsys->su_mutex); +- if (!group->cg_item.ci_parent->ci_group) { +- /* +- * The parent has already been unlinked and detached +- * due to a rmdir. +- */ +- goto unlink_group; +- } +- mutex_unlock(&subsys->su_mutex); ++ down_write(&frag->frag_sem); ++ frag->frag_dead = true; ++ up_write(&frag->frag_sem); + + inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); + spin_lock(&configfs_dirent_lock); +@@ -1806,7 +1864,6 @@ void configfs_unregister_group(struct config_group *group) + dput(dentry); + + mutex_lock(&subsys->su_mutex); +-unlink_group: + unlink_group(group); + mutex_unlock(&subsys->su_mutex); + } +@@ -1863,10 +1920,17 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) + struct dentry *dentry; + struct dentry *root; + struct configfs_dirent *sd; ++ struct configfs_fragment *frag; ++ ++ frag = new_fragment(); ++ if (!frag) ++ return -ENOMEM; + + root = configfs_pin_fs(); +- if (IS_ERR(root)) ++ if (IS_ERR(root)) { ++ put_fragment(frag); + return PTR_ERR(root); ++ } + + if (!group->cg_item.ci_name) + group->cg_item.ci_name = group->cg_item.ci_namebuf; +@@ -1882,7 +1946,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) + d_add(dentry, NULL); + + err = configfs_attach_group(sd->s_element, &group->cg_item, +- dentry); ++ dentry, frag); + if (err) { + BUG_ON(d_inode(dentry)); + d_drop(dentry); +@@ -1900,6 +1964,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) + unlink_group(group); + configfs_release_fs(); + } ++ put_fragment(frag); + + return err; + } +@@ -1909,12 +1974,18 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) + struct config_group *group = &subsys->su_group; + struct dentry *dentry = group->cg_item.ci_dentry; + struct dentry *root = dentry->d_sb->s_root; ++ struct configfs_dirent *sd = dentry->d_fsdata; ++ struct configfs_fragment *frag = sd->s_frag; + + if (dentry->d_parent != root) { + pr_err("Tried to unregister non-subsystem!\n"); + return; + } + ++ down_write(&frag->frag_sem); ++ frag->frag_dead = true; ++ up_write(&frag->frag_sem); ++ + inode_lock_nested(d_inode(root), + I_MUTEX_PARENT); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); +diff --git a/fs/configfs/file.c b/fs/configfs/file.c +index 62580dba3552..bb0a427517e9 100644 +--- a/fs/configfs/file.c ++++ b/fs/configfs/file.c +@@ -53,40 +53,44 @@ struct configfs_buffer { + bool write_in_progress; + char *bin_buffer; + int bin_buffer_size; ++ int cb_max_size; ++ struct config_item *item; ++ struct module *owner; ++ union { ++ struct configfs_attribute *attr; ++ struct configfs_bin_attribute *bin_attr; ++ }; + }; + ++static inline struct configfs_fragment 
*to_frag(struct file *file) ++{ ++ struct configfs_dirent *sd = file->f_path.dentry->d_fsdata; + +-/** +- * fill_read_buffer - allocate and fill buffer from item. +- * @dentry: dentry pointer. +- * @buffer: data buffer for file. +- * +- * Allocate @buffer->page, if it hasn't been already, then call the +- * config_item's show() method to fill the buffer with this attribute's +- * data. +- * This is called only once, on the file's first read. +- */ +-static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer) ++ return sd->s_frag; ++} ++ ++static int fill_read_buffer(struct file *file, struct configfs_buffer *buffer) + { +- struct configfs_attribute * attr = to_attr(dentry); +- struct config_item * item = to_item(dentry->d_parent); +- int ret = 0; +- ssize_t count; ++ struct configfs_fragment *frag = to_frag(file); ++ ssize_t count = -ENOENT; + + if (!buffer->page) + buffer->page = (char *) get_zeroed_page(GFP_KERNEL); + if (!buffer->page) + return -ENOMEM; + +- count = attr->show(item, buffer->page); +- +- BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE); +- if (count >= 0) { +- buffer->needs_read_fill = 0; +- buffer->count = count; +- } else +- ret = count; +- return ret; ++ down_read(&frag->frag_sem); ++ if (!frag->frag_dead) ++ count = buffer->attr->show(buffer->item, buffer->page); ++ up_read(&frag->frag_sem); ++ ++ if (count < 0) ++ return count; ++ if (WARN_ON_ONCE(count > (ssize_t)SIMPLE_ATTR_SIZE)) ++ return -EIO; ++ buffer->needs_read_fill = 0; ++ buffer->count = count; ++ return 0; + } + + /** +@@ -111,12 +115,13 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf + static ssize_t + configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) + { +- struct configfs_buffer * buffer = file->private_data; ++ struct configfs_buffer *buffer = file->private_data; + ssize_t retval = 0; + + mutex_lock(&buffer->mutex); + if (buffer->needs_read_fill) { +- if ((retval = fill_read_buffer(file->f_path.dentry,buffer))) ++ retval = fill_read_buffer(file, buffer); ++ if (retval) + goto out; + } + pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", +@@ -152,10 +157,8 @@ static ssize_t + configfs_read_bin_file(struct file *file, char __user *buf, + size_t count, loff_t *ppos) + { ++ struct configfs_fragment *frag = to_frag(file); + struct configfs_buffer *buffer = file->private_data; +- struct dentry *dentry = file->f_path.dentry; +- struct config_item *item = to_item(dentry->d_parent); +- struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry); + ssize_t retval = 0; + ssize_t len = min_t(size_t, count, PAGE_SIZE); + +@@ -170,14 +173,19 @@ configfs_read_bin_file(struct file *file, char __user *buf, + + if (buffer->needs_read_fill) { + /* perform first read with buf == NULL to get extent */ +- len = bin_attr->read(item, NULL, 0); ++ down_read(&frag->frag_sem); ++ if (!frag->frag_dead) ++ len = buffer->bin_attr->read(buffer->item, NULL, 0); ++ else ++ len = -ENOENT; ++ up_read(&frag->frag_sem); + if (len <= 0) { + retval = len; + goto out; + } + + /* do not exceed the maximum value */ +- if (bin_attr->cb_max_size && len > bin_attr->cb_max_size) { ++ if (buffer->cb_max_size && len > buffer->cb_max_size) { + retval = -EFBIG; + goto out; + } +@@ -190,7 +198,13 @@ configfs_read_bin_file(struct file *file, char __user *buf, + buffer->bin_buffer_size = len; + + /* perform second read to fill buffer */ +- len = bin_attr->read(item, buffer->bin_buffer, len); ++ down_read(&frag->frag_sem); ++ if (!frag->frag_dead) ++ len = 
buffer->bin_attr->read(buffer->item, ++ buffer->bin_buffer, len); ++ else ++ len = -ENOENT; ++ up_read(&frag->frag_sem); + if (len < 0) { + retval = len; + vfree(buffer->bin_buffer); +@@ -240,25 +254,17 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size + return error ? -EFAULT : count; + } + +- +-/** +- * flush_write_buffer - push buffer to config_item. +- * @dentry: dentry to the attribute +- * @buffer: data buffer for file. +- * @count: number of bytes +- * +- * Get the correct pointers for the config_item and the attribute we're +- * dealing with, then call the store() method for the attribute, +- * passing the buffer that we acquired in fill_write_buffer(). +- */ +- + static int +-flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count) ++flush_write_buffer(struct file *file, struct configfs_buffer *buffer, size_t count) + { +- struct configfs_attribute * attr = to_attr(dentry); +- struct config_item * item = to_item(dentry->d_parent); +- +- return attr->store(item, buffer->page, count); ++ struct configfs_fragment *frag = to_frag(file); ++ int res = -ENOENT; ++ ++ down_read(&frag->frag_sem); ++ if (!frag->frag_dead) ++ res = buffer->attr->store(buffer->item, buffer->page, count); ++ up_read(&frag->frag_sem); ++ return res; + } + + +@@ -282,13 +288,13 @@ flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size + static ssize_t + configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) + { +- struct configfs_buffer * buffer = file->private_data; ++ struct configfs_buffer *buffer = file->private_data; + ssize_t len; + + mutex_lock(&buffer->mutex); + len = fill_write_buffer(buffer, buf, count); + if (len > 0) +- len = flush_write_buffer(file->f_path.dentry, buffer, len); ++ len = flush_write_buffer(file, buffer, len); + if (len > 0) + *ppos += len; + mutex_unlock(&buffer->mutex); +@@ -313,8 +319,6 @@ configfs_write_bin_file(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) + { + struct configfs_buffer *buffer = file->private_data; +- struct dentry *dentry = file->f_path.dentry; +- struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry); + void *tbuf = NULL; + ssize_t len; + +@@ -330,8 +334,8 @@ configfs_write_bin_file(struct file *file, const char __user *buf, + /* buffer grows? 
*/ + if (*ppos + count > buffer->bin_buffer_size) { + +- if (bin_attr->cb_max_size && +- *ppos + count > bin_attr->cb_max_size) { ++ if (buffer->cb_max_size && ++ *ppos + count > buffer->cb_max_size) { + len = -EFBIG; + goto out; + } +@@ -363,31 +367,51 @@ out: + return len; + } + +-static int check_perm(struct inode * inode, struct file * file, int type) ++static int __configfs_open_file(struct inode *inode, struct file *file, int type) + { +- struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent); +- struct configfs_attribute * attr = to_attr(file->f_path.dentry); +- struct configfs_bin_attribute *bin_attr = NULL; +- struct configfs_buffer * buffer; +- struct configfs_item_operations * ops = NULL; +- int error = 0; ++ struct dentry *dentry = file->f_path.dentry; ++ struct configfs_fragment *frag = to_frag(file); ++ struct configfs_attribute *attr; ++ struct configfs_buffer *buffer; ++ int error; + +- if (!item || !attr) +- goto Einval; ++ error = -ENOMEM; ++ buffer = kzalloc(sizeof(struct configfs_buffer), GFP_KERNEL); ++ if (!buffer) ++ goto out; + +- if (type & CONFIGFS_ITEM_BIN_ATTR) +- bin_attr = to_bin_attr(file->f_path.dentry); ++ error = -ENOENT; ++ down_read(&frag->frag_sem); ++ if (unlikely(frag->frag_dead)) ++ goto out_free_buffer; + +- /* Grab the module reference for this attribute if we have one */ +- if (!try_module_get(attr->ca_owner)) { +- error = -ENODEV; +- goto Done; ++ error = -EINVAL; ++ buffer->item = to_item(dentry->d_parent); ++ if (!buffer->item) ++ goto out_free_buffer; ++ ++ attr = to_attr(dentry); ++ if (!attr) ++ goto out_put_item; ++ ++ if (type & CONFIGFS_ITEM_BIN_ATTR) { ++ buffer->bin_attr = to_bin_attr(dentry); ++ buffer->cb_max_size = buffer->bin_attr->cb_max_size; ++ } else { ++ buffer->attr = attr; + } + +- if (item->ci_type) +- ops = item->ci_type->ct_item_ops; +- else +- goto Eaccess; ++ buffer->owner = attr->ca_owner; ++ /* Grab the module reference for this attribute if we have one */ ++ error = -ENODEV; ++ if (!try_module_get(buffer->owner)) ++ goto out_put_item; ++ ++ error = -EACCES; ++ if (!buffer->item->ci_type) ++ goto out_put_module; ++ ++ buffer->ops = buffer->item->ci_type->ct_item_ops; + + /* File needs write support. + * The inode's perms must say it's ok, +@@ -395,13 +419,11 @@ static int check_perm(struct inode * inode, struct file * file, int type) + */ + if (file->f_mode & FMODE_WRITE) { + if (!(inode->i_mode & S_IWUGO)) +- goto Eaccess; +- ++ goto out_put_module; + if ((type & CONFIGFS_ITEM_ATTR) && !attr->store) +- goto Eaccess; +- +- if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->write) +- goto Eaccess; ++ goto out_put_module; ++ if ((type & CONFIGFS_ITEM_BIN_ATTR) && !buffer->bin_attr->write) ++ goto out_put_module; + } + + /* File needs read support. +@@ -410,92 +432,72 @@ static int check_perm(struct inode * inode, struct file * file, int type) + */ + if (file->f_mode & FMODE_READ) { + if (!(inode->i_mode & S_IRUGO)) +- goto Eaccess; +- ++ goto out_put_module; + if ((type & CONFIGFS_ITEM_ATTR) && !attr->show) +- goto Eaccess; +- +- if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->read) +- goto Eaccess; ++ goto out_put_module; ++ if ((type & CONFIGFS_ITEM_BIN_ATTR) && !buffer->bin_attr->read) ++ goto out_put_module; + } + +- /* No error? Great, allocate a buffer for the file, and store it +- * it in file->private_data for easy access. 
+- */ +- buffer = kzalloc(sizeof(struct configfs_buffer),GFP_KERNEL); +- if (!buffer) { +- error = -ENOMEM; +- goto Enomem; +- } + mutex_init(&buffer->mutex); + buffer->needs_read_fill = 1; + buffer->read_in_progress = false; + buffer->write_in_progress = false; +- buffer->ops = ops; + file->private_data = buffer; +- goto Done; ++ up_read(&frag->frag_sem); ++ return 0; + +- Einval: +- error = -EINVAL; +- goto Done; +- Eaccess: +- error = -EACCES; +- Enomem: +- module_put(attr->ca_owner); +- Done: +- if (error && item) +- config_item_put(item); ++out_put_module: ++ module_put(buffer->owner); ++out_put_item: ++ config_item_put(buffer->item); ++out_free_buffer: ++ up_read(&frag->frag_sem); ++ kfree(buffer); ++out: + return error; + } + + static int configfs_release(struct inode *inode, struct file *filp) + { +- struct config_item * item = to_item(filp->f_path.dentry->d_parent); +- struct configfs_attribute * attr = to_attr(filp->f_path.dentry); +- struct module * owner = attr->ca_owner; +- struct configfs_buffer * buffer = filp->private_data; +- +- if (item) +- config_item_put(item); +- /* After this point, attr should not be accessed. */ +- module_put(owner); +- +- if (buffer) { +- if (buffer->page) +- free_page((unsigned long)buffer->page); +- mutex_destroy(&buffer->mutex); +- kfree(buffer); +- } ++ struct configfs_buffer *buffer = filp->private_data; ++ ++ module_put(buffer->owner); ++ if (buffer->page) ++ free_page((unsigned long)buffer->page); ++ mutex_destroy(&buffer->mutex); ++ kfree(buffer); + return 0; + } + + static int configfs_open_file(struct inode *inode, struct file *filp) + { +- return check_perm(inode, filp, CONFIGFS_ITEM_ATTR); ++ return __configfs_open_file(inode, filp, CONFIGFS_ITEM_ATTR); + } + + static int configfs_open_bin_file(struct inode *inode, struct file *filp) + { +- return check_perm(inode, filp, CONFIGFS_ITEM_BIN_ATTR); ++ return __configfs_open_file(inode, filp, CONFIGFS_ITEM_BIN_ATTR); + } + +-static int configfs_release_bin_file(struct inode *inode, struct file *filp) ++static int configfs_release_bin_file(struct inode *inode, struct file *file) + { +- struct configfs_buffer *buffer = filp->private_data; +- struct dentry *dentry = filp->f_path.dentry; +- struct config_item *item = to_item(dentry->d_parent); +- struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry); +- ssize_t len = 0; +- int ret; ++ struct configfs_buffer *buffer = file->private_data; + + buffer->read_in_progress = false; + + if (buffer->write_in_progress) { ++ struct configfs_fragment *frag = to_frag(file); + buffer->write_in_progress = false; + +- len = bin_attr->write(item, buffer->bin_buffer, +- buffer->bin_buffer_size); +- ++ down_read(&frag->frag_sem); ++ if (!frag->frag_dead) { ++ /* result of ->release() is ignored */ ++ buffer->bin_attr->write(buffer->item, ++ buffer->bin_buffer, ++ buffer->bin_buffer_size); ++ } ++ up_read(&frag->frag_sem); + /* vfree on NULL is safe */ + vfree(buffer->bin_buffer); + buffer->bin_buffer = NULL; +@@ -503,10 +505,8 @@ static int configfs_release_bin_file(struct inode *inode, struct file *filp) + buffer->needs_read_fill = 1; + } + +- ret = configfs_release(inode, filp); +- if (len < 0) +- return len; +- return ret; ++ configfs_release(inode, file); ++ return 0; + } + + +@@ -541,7 +541,7 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib + + inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL); + error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, +- CONFIGFS_ITEM_ATTR); ++ CONFIGFS_ITEM_ATTR, 
parent_sd->s_frag); + inode_unlock(d_inode(dir)); + + return error; +@@ -563,7 +563,7 @@ int configfs_create_bin_file(struct config_item *item, + + inode_lock_nested(dir->d_inode, I_MUTEX_NORMAL); + error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode, +- CONFIGFS_ITEM_BIN_ATTR); ++ CONFIGFS_ITEM_BIN_ATTR, parent_sd->s_frag); + inode_unlock(dir->d_inode); + + return error; +diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c +index a5c54af861f7..1996643bb654 100644 +--- a/fs/configfs/symlink.c ++++ b/fs/configfs/symlink.c +@@ -157,11 +157,42 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna + !type->ct_item_ops->allow_link) + goto out_put; + ++ /* ++ * This is really sick. What they wanted was a hybrid of ++ * link(2) and symlink(2) - they wanted the target resolved ++ * at syscall time (as link(2) would've done), be a directory ++ * (which link(2) would've refused to do) *AND* be a deep ++ * fucking magic, making the target busy from rmdir POV. ++ * symlink(2) is nothing of that sort, and the locking it ++ * gets matches the normal symlink(2) semantics. Without ++ * attempts to resolve the target (which might very well ++ * not even exist yet) done prior to locking the parent ++ * directory. This perversion, OTOH, needs to resolve ++ * the target, which would lead to obvious deadlocks if ++ * attempted with any directories locked. ++ * ++ * Unfortunately, that garbage is userland ABI and we should've ++ * said "no" back in 2005. Too late now, so we get to ++ * play very ugly games with locking. ++ * ++ * Try *ANYTHING* of that sort in new code, and you will ++ * really regret it. Just ask yourself - what could a BOFH ++ * do to me and do I want to find it out first-hand? ++ * ++ * AV, a thoroughly annoyed bastard. ++ */ ++ inode_unlock(dir); + ret = get_target(symname, &path, &target_item, dentry->d_sb); ++ inode_lock(dir); + if (ret) + goto out_put; + +- ret = type->ct_item_ops->allow_link(parent_item, target_item); ++ if (dentry->d_inode || d_unhashed(dentry)) ++ ret = -EEXIST; ++ else ++ ret = inode_permission(dir, MAY_WRITE | MAY_EXEC); ++ if (!ret) ++ ret = type->ct_item_ops->allow_link(parent_item, target_item); + if (!ret) { + mutex_lock(&configfs_symlink_mutex); + ret = create_link(parent_item, target_item, dentry); +diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c +index 7ee86d8f313d..a89e27367e34 100644 +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -582,10 +582,13 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + spin_unlock(&inode->i_lock); + + /* +- * A dying wb indicates that the memcg-blkcg mapping has changed +- * and a new wb is already serving the memcg. Switch immediately. ++ * A dying wb indicates that either the blkcg associated with the ++ * memcg changed or the associated memcg is dying. In the first ++ * case, a replacement wb should already be available and we should ++ * refresh the wb immediately. In the second case, trying to ++ * refresh will keep failing. 
+ */ +- if (unlikely(wb_dying(wbc->wb))) ++ if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css))) + inode_switch_wbs(inode, wbc->wb_id); + } + +diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c +index 825a8c52165a..c5c3394148f7 100644 +--- a/fs/nfs/delegation.c ++++ b/fs/nfs/delegation.c +@@ -54,6 +54,16 @@ nfs4_is_valid_delegation(const struct nfs_delegation *delegation, + return false; + } + ++struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode) ++{ ++ struct nfs_delegation *delegation; ++ ++ delegation = rcu_dereference(NFS_I(inode)->delegation); ++ if (nfs4_is_valid_delegation(delegation, 0)) ++ return delegation; ++ return NULL; ++} ++ + static int + nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) + { +diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h +index c95477823fa6..dd0f3eed3890 100644 +--- a/fs/nfs/delegation.h ++++ b/fs/nfs/delegation.h +@@ -66,6 +66,7 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, + bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred); + bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode); + ++struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode); + void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); + int nfs4_have_delegation(struct inode *inode, fmode_t flags); + int nfs4_check_delegation(struct inode *inode, fmode_t flags); +diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c +index 75faef7af22d..792f8821b5d6 100644 +--- a/fs/nfs/nfs4proc.c ++++ b/fs/nfs/nfs4proc.c +@@ -1393,8 +1393,6 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode, + return 0; + if ((delegation->type & fmode) != fmode) + return 0; +- if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) +- return 0; + switch (claim) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: +@@ -1751,7 +1749,6 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo + static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) + { + struct nfs4_state *state = opendata->state; +- struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs_delegation *delegation; + int open_mode = opendata->o_arg.open_flags; + fmode_t fmode = opendata->o_arg.fmode; +@@ -1768,7 +1765,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) + } + spin_unlock(&state->owner->so_lock); + rcu_read_lock(); +- delegation = rcu_dereference(nfsi->delegation); ++ delegation = nfs4_get_valid_delegation(state->inode); + if (!can_open_delegated(delegation, fmode, claim)) { + rcu_read_unlock(); + break; +@@ -2293,7 +2290,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) + data->o_arg.open_flags, claim)) + goto out_no_action; + rcu_read_lock(); +- delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); ++ delegation = nfs4_get_valid_delegation(data->state->inode); + if (can_open_delegated(delegation, data->o_arg.fmode, claim)) + goto unlock_no_action; + rcu_read_unlock(); +diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c +index 9fa35cb6f6e0..a847fe52c56e 100644 +--- a/fs/ocfs2/file.c ++++ b/fs/ocfs2/file.c +@@ -2106,54 +2106,90 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) + return 0; + } + +-static int ocfs2_prepare_inode_for_refcount(struct inode *inode, +- struct file *file, +- loff_t pos, size_t count, +- int *meta_level) 
++static int ocfs2_inode_lock_for_extent_tree(struct inode *inode, ++ struct buffer_head **di_bh, ++ int meta_level, ++ int overwrite_io, ++ int write_sem, ++ int wait) + { +- int ret; +- struct buffer_head *di_bh = NULL; +- u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; +- u32 clusters = +- ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; ++ int ret = 0; + +- ret = ocfs2_inode_lock(inode, &di_bh, 1); +- if (ret) { +- mlog_errno(ret); ++ if (wait) ++ ret = ocfs2_inode_lock(inode, NULL, meta_level); ++ else ++ ret = ocfs2_try_inode_lock(inode, ++ overwrite_io ? NULL : di_bh, meta_level); ++ if (ret < 0) + goto out; ++ ++ if (wait) { ++ if (write_sem) ++ down_write(&OCFS2_I(inode)->ip_alloc_sem); ++ else ++ down_read(&OCFS2_I(inode)->ip_alloc_sem); ++ } else { ++ if (write_sem) ++ ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem); ++ else ++ ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem); ++ ++ if (!ret) { ++ ret = -EAGAIN; ++ goto out_unlock; ++ } + } + +- *meta_level = 1; ++ return ret; + +- ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); +- if (ret) +- mlog_errno(ret); ++out_unlock: ++ brelse(*di_bh); ++ ocfs2_inode_unlock(inode, meta_level); + out: +- brelse(di_bh); + return ret; + } + ++static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode, ++ struct buffer_head **di_bh, ++ int meta_level, ++ int write_sem) ++{ ++ if (write_sem) ++ up_write(&OCFS2_I(inode)->ip_alloc_sem); ++ else ++ up_read(&OCFS2_I(inode)->ip_alloc_sem); ++ ++ brelse(*di_bh); ++ *di_bh = NULL; ++ ++ if (meta_level >= 0) ++ ocfs2_inode_unlock(inode, meta_level); ++} ++ + static int ocfs2_prepare_inode_for_write(struct file *file, + loff_t pos, size_t count, int wait) + { + int ret = 0, meta_level = 0, overwrite_io = 0; ++ int write_sem = 0; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = d_inode(dentry); + struct buffer_head *di_bh = NULL; + loff_t end; ++ u32 cpos; ++ u32 clusters; + + /* + * We start with a read level meta lock and only jump to an ex + * if we need to make modifications here. + */ + for(;;) { +- if (wait) +- ret = ocfs2_inode_lock(inode, NULL, meta_level); +- else +- ret = ocfs2_try_inode_lock(inode, +- overwrite_io ? NULL : &di_bh, meta_level); ++ ret = ocfs2_inode_lock_for_extent_tree(inode, ++ &di_bh, ++ meta_level, ++ overwrite_io, ++ write_sem, ++ wait); + if (ret < 0) { +- meta_level = -1; + if (ret != -EAGAIN) + mlog_errno(ret); + goto out; +@@ -2165,15 +2201,8 @@ static int ocfs2_prepare_inode_for_write(struct file *file, + */ + if (!wait && !overwrite_io) { + overwrite_io = 1; +- if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) { +- ret = -EAGAIN; +- goto out_unlock; +- } + + ret = ocfs2_overwrite_io(inode, di_bh, pos, count); +- brelse(di_bh); +- di_bh = NULL; +- up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (ret < 0) { + if (ret != -EAGAIN) + mlog_errno(ret); +@@ -2192,7 +2221,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file, + * set inode->i_size at the end of a write. 
*/ + if (should_remove_suid(dentry)) { + if (meta_level == 0) { +- ocfs2_inode_unlock(inode, meta_level); ++ ocfs2_inode_unlock_for_extent_tree(inode, ++ &di_bh, ++ meta_level, ++ write_sem); + meta_level = 1; + continue; + } +@@ -2208,18 +2240,32 @@ static int ocfs2_prepare_inode_for_write(struct file *file, + + ret = ocfs2_check_range_for_refcount(inode, pos, count); + if (ret == 1) { +- ocfs2_inode_unlock(inode, meta_level); +- meta_level = -1; +- +- ret = ocfs2_prepare_inode_for_refcount(inode, +- file, +- pos, +- count, +- &meta_level); ++ ocfs2_inode_unlock_for_extent_tree(inode, ++ &di_bh, ++ meta_level, ++ write_sem); ++ ret = ocfs2_inode_lock_for_extent_tree(inode, ++ &di_bh, ++ meta_level, ++ overwrite_io, ++ 1, ++ wait); ++ write_sem = 1; ++ if (ret < 0) { ++ if (ret != -EAGAIN) ++ mlog_errno(ret); ++ goto out; ++ } ++ ++ cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; ++ clusters = ++ ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; ++ ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); + } + + if (ret < 0) { +- mlog_errno(ret); ++ if (ret != -EAGAIN) ++ mlog_errno(ret); + goto out_unlock; + } + +@@ -2230,10 +2276,10 @@ out_unlock: + trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, + pos, count, wait); + +- brelse(di_bh); +- +- if (meta_level >= 0) +- ocfs2_inode_unlock(inode, meta_level); ++ ocfs2_inode_unlock_for_extent_tree(inode, ++ &di_bh, ++ meta_level, ++ write_sem); + + out: + return ret; +diff --git a/include/linux/cpu.h b/include/linux/cpu.h +index 006f69f9277b..aab4273810e3 100644 +--- a/include/linux/cpu.h ++++ b/include/linux/cpu.h +@@ -59,6 +59,11 @@ extern ssize_t cpu_show_l1tf(struct device *dev, + struct device_attribute *attr, char *buf); + extern ssize_t cpu_show_mds(struct device *dev, + struct device_attribute *attr, char *buf); ++extern ssize_t cpu_show_tsx_async_abort(struct device *dev, ++ struct device_attribute *attr, ++ char *buf); ++extern ssize_t cpu_show_itlb_multihit(struct device *dev, ++ struct device_attribute *attr, char *buf); + + extern __printf(4, 5) + struct device *cpu_device_create(struct device *parent, void *drvdata, +@@ -193,28 +198,7 @@ static inline int cpuhp_smt_enable(void) { return 0; } + static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; } + #endif + +-/* +- * These are used for a global "mitigations=" cmdline option for toggling +- * optional CPU mitigations. 
+- */ +-enum cpu_mitigations { +- CPU_MITIGATIONS_OFF, +- CPU_MITIGATIONS_AUTO, +- CPU_MITIGATIONS_AUTO_NOSMT, +-}; +- +-extern enum cpu_mitigations cpu_mitigations; +- +-/* mitigations=off */ +-static inline bool cpu_mitigations_off(void) +-{ +- return cpu_mitigations == CPU_MITIGATIONS_OFF; +-} +- +-/* mitigations=auto,nosmt */ +-static inline bool cpu_mitigations_auto_nosmt(void) +-{ +- return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; +-} ++extern bool cpu_mitigations_off(void); ++extern bool cpu_mitigations_auto_nosmt(void); + + #endif /* _LINUX_CPU_H_ */ +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index d42a36e4e6c2..96207939d862 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -141,7 +141,7 @@ static inline bool is_error_page(struct page *page) + + extern struct kmem_cache *kvm_vcpu_cache; + +-extern spinlock_t kvm_lock; ++extern struct mutex kvm_lock; + extern struct list_head vm_list; + + struct kvm_io_range { +@@ -1034,6 +1034,7 @@ enum kvm_stat_kind { + + struct kvm_stat_data { + int offset; ++ int mode; + struct kvm *kvm; + }; + +@@ -1041,6 +1042,7 @@ struct kvm_stats_debugfs_item { + const char *name; + int offset; + enum kvm_stat_kind kind; ++ int mode; + }; + extern struct kvm_stats_debugfs_item debugfs_entries[]; + extern struct dentry *kvm_debugfs_dir; +@@ -1303,4 +1305,10 @@ static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) + } + #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */ + ++typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data); ++ ++int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, ++ uintptr_t data, const char *name, ++ struct task_struct **thread_ptr); ++ + #endif +diff --git a/include/linux/mm.h b/include/linux/mm.h +index bdec425c8e14..45f10f5896b7 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -602,11 +602,6 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) + + extern void kvfree(const void *addr); + +-static inline atomic_t *compound_mapcount_ptr(struct page *page) +-{ +- return &page[1].compound_mapcount; +-} +- + static inline int compound_mapcount(struct page *page) + { + VM_BUG_ON_PAGE(!PageCompound(page), page); +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 5ed8f6292a53..3a9a996af229 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -226,6 +226,11 @@ struct page_frag_cache { + + typedef unsigned long vm_flags_t; + ++static inline atomic_t *compound_mapcount_ptr(struct page *page) ++{ ++ return &page[1].compound_mapcount; ++} ++ + /* + * A region containing a mapping of a non-memory backed file under NOMMU + * conditions. These are held in a global tree and are pinned by the VMAs that +diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h +index 74bee8cecf4c..3f066ce63a63 100644 +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -577,12 +577,28 @@ static inline int PageTransCompound(struct page *page) + * + * Unlike PageTransCompound, this is safe to be called only while + * split_huge_pmd() cannot run from under us, like if protected by the +- * MMU notifier, otherwise it may result in page->_mapcount < 0 false ++ * MMU notifier, otherwise it may result in page->_mapcount check false + * positives. ++ * ++ * We have to treat page cache THP differently since every subpage of it ++ * would get _mapcount inc'ed once it is PMD mapped. 
But, it may be PTE ++ * mapped in the current process so comparing subpage's _mapcount to ++ * compound_mapcount to filter out PTE mapped case. + */ + static inline int PageTransCompoundMap(struct page *page) + { +- return PageTransCompound(page) && atomic_read(&page->_mapcount) < 0; ++ struct page *head; ++ ++ if (!PageTransCompound(page)) ++ return 0; ++ ++ if (PageAnon(page)) ++ return atomic_read(&page->_mapcount) < 0; ++ ++ head = compound_head(page); ++ /* File THP is PMD mapped and not PTE mapped */ ++ return atomic_read(&page->_mapcount) == ++ atomic_read(compound_mapcount_ptr(head)); + } + + /* +diff --git a/include/net/bonding.h b/include/net/bonding.h +index b46d68acf701..8116648873c3 100644 +--- a/include/net/bonding.h ++++ b/include/net/bonding.h +@@ -149,7 +149,6 @@ struct slave { + unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS]; + s8 link; /* one of BOND_LINK_XXXX */ + s8 link_new_state; /* one of BOND_LINK_XXXX */ +- s8 new_link; + u8 backup:1, /* indicates backup slave. Value corresponds with + BOND_STATE_ACTIVE and BOND_STATE_BACKUP */ + inactive:1, /* indicates inactive slave */ +@@ -539,7 +538,7 @@ static inline void bond_propose_link_state(struct slave *slave, int state) + + static inline void bond_commit_link_state(struct slave *slave, bool notify) + { +- if (slave->link == slave->link_new_state) ++ if (slave->link_new_state == BOND_LINK_NOCHANGE) + return; + + slave->link = slave->link_new_state; +diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h +index 0e3c0d83bd99..af0ede9ad4d0 100644 +--- a/include/net/ip_vs.h ++++ b/include/net/ip_vs.h +@@ -885,6 +885,7 @@ struct netns_ipvs { + struct delayed_work defense_work; /* Work handler */ + int drop_rate; + int drop_counter; ++ int old_secure_tcp; + atomic_t dropentry; + /* locks in ctl.c */ + spinlock_t dropentry_lock; /* drop entry handling */ +diff --git a/include/net/neighbour.h b/include/net/neighbour.h +index beeeed126872..c84807c1c5bd 100644 +--- a/include/net/neighbour.h ++++ b/include/net/neighbour.h +@@ -430,8 +430,8 @@ static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) + { + unsigned long now = jiffies; + +- if (neigh->used != now) +- neigh->used = now; ++ if (READ_ONCE(neigh->used) != now) ++ WRITE_ONCE(neigh->used, now); + if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) + return __neigh_event_send(neigh, skb); + return 0; +diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h +index 7685cbda9f28..024636c31adc 100644 +--- a/include/net/netfilter/nf_tables.h ++++ b/include/net/netfilter/nf_tables.h +@@ -793,7 +793,8 @@ struct nft_expr_ops { + */ + struct nft_expr { + const struct nft_expr_ops *ops; +- unsigned char data[]; ++ unsigned char data[] ++ __attribute__((aligned(__alignof__(u64)))); + }; + + static inline void *nft_expr_priv(const struct nft_expr *expr) +diff --git a/include/net/sock.h b/include/net/sock.h +index 05e8faa84717..0252c0d00310 100644 +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -2318,7 +2318,7 @@ static inline ktime_t sock_read_timestamp(struct sock *sk) + + return kt; + #else +- return sk->sk_stamp; ++ return READ_ONCE(sk->sk_stamp); + #endif + } + +@@ -2329,7 +2329,7 @@ static inline void sock_write_timestamp(struct sock *sk, ktime_t kt) + sk->sk_stamp = kt; + write_sequnlock(&sk->sk_stamp_seq); + #else +- sk->sk_stamp = kt; ++ WRITE_ONCE(sk->sk_stamp, kt); + #endif + } + +diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h +index b7d63c3970d1..f3d475024d37 100644 +--- 
a/include/rdma/ib_verbs.h ++++ b/include/rdma/ib_verbs.h +@@ -310,7 +310,7 @@ struct ib_tm_caps { + + struct ib_cq_init_attr { + unsigned int cqe; +- int comp_vector; ++ u32 comp_vector; + u32 flags; + }; + +diff --git a/kernel/cpu.c b/kernel/cpu.c +index d9f855cb9f6f..9bb57ce57d98 100644 +--- a/kernel/cpu.c ++++ b/kernel/cpu.c +@@ -2282,7 +2282,18 @@ void __init boot_cpu_hotplug_init(void) + this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); + } + +-enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; ++/* ++ * These are used for a global "mitigations=" cmdline option for toggling ++ * optional CPU mitigations. ++ */ ++enum cpu_mitigations { ++ CPU_MITIGATIONS_OFF, ++ CPU_MITIGATIONS_AUTO, ++ CPU_MITIGATIONS_AUTO_NOSMT, ++}; ++ ++static enum cpu_mitigations cpu_mitigations __ro_after_init = ++ CPU_MITIGATIONS_AUTO; + + static int __init mitigations_parse_cmdline(char *arg) + { +@@ -2299,3 +2310,17 @@ static int __init mitigations_parse_cmdline(char *arg) + return 0; + } + early_param("mitigations", mitigations_parse_cmdline); ++ ++/* mitigations=off */ ++bool cpu_mitigations_off(void) ++{ ++ return cpu_mitigations == CPU_MITIGATIONS_OFF; ++} ++EXPORT_SYMBOL_GPL(cpu_mitigations_off); ++ ++/* mitigations=auto,nosmt */ ++bool cpu_mitigations_auto_nosmt(void) ++{ ++ return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; ++} ++EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 32d2dac680a7..e5e8f6721872 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -4305,23 +4305,16 @@ static inline u64 sched_cfs_bandwidth_slice(void) + } + + /* +- * Replenish runtime according to assigned quota and update expiration time. +- * We use sched_clock_cpu directly instead of rq->clock to avoid adding +- * additional synchronization around rq->lock. ++ * Replenish runtime according to assigned quota. We use sched_clock_cpu ++ * directly instead of rq->clock to avoid adding additional synchronization ++ * around rq->lock. + * + * requires cfs_b->lock + */ + void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) + { +- u64 now; +- +- if (cfs_b->quota == RUNTIME_INF) +- return; +- +- now = sched_clock_cpu(smp_processor_id()); +- cfs_b->runtime = cfs_b->quota; +- cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); +- cfs_b->expires_seq++; ++ if (cfs_b->quota != RUNTIME_INF) ++ cfs_b->runtime = cfs_b->quota; + } + + static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +@@ -4343,8 +4336,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) + { + struct task_group *tg = cfs_rq->tg; + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); +- u64 amount = 0, min_amount, expires; +- int expires_seq; ++ u64 amount = 0, min_amount; + + /* note: this is a positive sum as runtime_remaining <= 0 */ + min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; +@@ -4361,61 +4353,17 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) + cfs_b->idle = 0; + } + } +- expires_seq = cfs_b->expires_seq; +- expires = cfs_b->runtime_expires; + raw_spin_unlock(&cfs_b->lock); + + cfs_rq->runtime_remaining += amount; +- /* +- * we may have advanced our local expiration to account for allowed +- * spread between our sched_clock and the one on which runtime was +- * issued. 
+- */ +- if (cfs_rq->expires_seq != expires_seq) { +- cfs_rq->expires_seq = expires_seq; +- cfs_rq->runtime_expires = expires; +- } + + return cfs_rq->runtime_remaining > 0; + } + +-/* +- * Note: This depends on the synchronization provided by sched_clock and the +- * fact that rq->clock snapshots this value. +- */ +-static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) +-{ +- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); +- +- /* if the deadline is ahead of our clock, nothing to do */ +- if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) +- return; +- +- if (cfs_rq->runtime_remaining < 0) +- return; +- +- /* +- * If the local deadline has passed we have to consider the +- * possibility that our sched_clock is 'fast' and the global deadline +- * has not truly expired. +- * +- * Fortunately we can check determine whether this the case by checking +- * whether the global deadline(cfs_b->expires_seq) has advanced. +- */ +- if (cfs_rq->expires_seq == cfs_b->expires_seq) { +- /* extend local deadline, drift is bounded above by 2 ticks */ +- cfs_rq->runtime_expires += TICK_NSEC; +- } else { +- /* global deadline is ahead, expiration has passed */ +- cfs_rq->runtime_remaining = 0; +- } +-} +- + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) + { + /* dock delta_exec before expiring quota (as it could span periods) */ + cfs_rq->runtime_remaining -= delta_exec; +- expire_cfs_rq_runtime(cfs_rq); + + if (likely(cfs_rq->runtime_remaining > 0)) + return; +@@ -4600,8 +4548,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) + resched_curr(rq); + } + +-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, +- u64 remaining, u64 expires) ++static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) + { + struct cfs_rq *cfs_rq; + u64 runtime; +@@ -4626,7 +4573,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, + remaining -= runtime; + + cfs_rq->runtime_remaining += runtime; +- cfs_rq->runtime_expires = expires; + + /* we check whether we're throttled above */ + if (cfs_rq->runtime_remaining > 0) +@@ -4651,7 +4597,7 @@ next: + */ + static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) + { +- u64 runtime, runtime_expires; ++ u64 runtime; + int throttled; + + /* no need to continue the timer with no bandwidth constraint */ +@@ -4679,8 +4625,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) + /* account preceding periods in which throttling occurred */ + cfs_b->nr_throttled += overrun; + +- runtime_expires = cfs_b->runtime_expires; +- + /* + * This check is repeated as we are holding onto the new bandwidth while + * we unthrottle. 
This can potentially race with an unthrottled group +@@ -4693,8 +4637,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) + cfs_b->distribute_running = 1; + raw_spin_unlock(&cfs_b->lock); + /* we can't nest cfs_b->lock while distributing bandwidth */ +- runtime = distribute_cfs_runtime(cfs_b, runtime, +- runtime_expires); ++ runtime = distribute_cfs_runtime(cfs_b, runtime); + raw_spin_lock(&cfs_b->lock); + + cfs_b->distribute_running = 0; +@@ -4771,8 +4714,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) + return; + + raw_spin_lock(&cfs_b->lock); +- if (cfs_b->quota != RUNTIME_INF && +- cfs_rq->runtime_expires == cfs_b->runtime_expires) { ++ if (cfs_b->quota != RUNTIME_INF) { + cfs_b->runtime += slack_runtime; + + /* we are under rq->lock, defer unthrottling using a timer */ +@@ -4804,7 +4746,6 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) + static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) + { + u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); +- u64 expires; + + /* confirm we're still not at a refresh boundary */ + raw_spin_lock(&cfs_b->lock); +@@ -4821,7 +4762,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) + if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) + runtime = cfs_b->runtime; + +- expires = cfs_b->runtime_expires; + if (runtime) + cfs_b->distribute_running = 1; + +@@ -4830,11 +4770,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) + if (!runtime) + return; + +- runtime = distribute_cfs_runtime(cfs_b, runtime, expires); ++ runtime = distribute_cfs_runtime(cfs_b, runtime); + + raw_spin_lock(&cfs_b->lock); +- if (expires == cfs_b->runtime_expires) +- cfs_b->runtime -= min(runtime, cfs_b->runtime); ++ cfs_b->runtime -= min(runtime, cfs_b->runtime); + cfs_b->distribute_running = 0; + raw_spin_unlock(&cfs_b->lock); + } +@@ -4980,17 +4919,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) + + void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) + { +- u64 overrun; +- + lockdep_assert_held(&cfs_b->lock); + + if (cfs_b->period_active) + return; + + cfs_b->period_active = 1; +- overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); +- cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period); +- cfs_b->expires_seq++; ++ hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); + hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); + } + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 9a7c3d08b39f..62058fd6dcf6 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -334,8 +334,6 @@ struct cfs_bandwidth { + u64 quota; + u64 runtime; + s64 hierarchical_quota; +- u64 runtime_expires; +- int expires_seq; + + short idle; + short period_active; +@@ -555,8 +553,6 @@ struct cfs_rq { + + #ifdef CONFIG_CFS_BANDWIDTH + int runtime_enabled; +- int expires_seq; +- u64 runtime_expires; + s64 runtime_remaining; + + u64 throttled_clock; +diff --git a/lib/dump_stack.c b/lib/dump_stack.c +index 5cff72f18c4a..33ffbf308853 100644 +--- a/lib/dump_stack.c ++++ b/lib/dump_stack.c +@@ -106,7 +106,12 @@ retry: + was_locked = 1; + } else { + local_irq_restore(flags); +- cpu_relax(); ++ /* ++ * Wait for the lock to release before jumping to ++ * atomic_cmpxchg() in order to mitigate the thundering herd ++ * problem. 
++ */ ++ do { cpu_relax(); } while (atomic_read(&dump_lock) != -1); + goto retry; + } + +diff --git a/mm/filemap.c b/mm/filemap.c +index 287f3fa02e5e..45f1c6d73b5b 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -438,7 +438,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + .range_end = end, + }; + +- if (!mapping_cap_writeback_dirty(mapping)) ++ if (!mapping_cap_writeback_dirty(mapping) || ++ !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + return 0; + + wbc_attach_fdatawrite_inode(&wbc, mapping->host); +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 65da189a433b..e0f7b94a4e9b 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -2224,6 +2224,15 @@ retry: + goto retry; + } + ++ /* ++ * Memcg doesn't have a dedicated reserve for atomic ++ * allocations. But like the global atomic pool, we need to ++ * put the burden of reclaim on regular allocation requests ++ * and let these go through as privileged allocations. ++ */ ++ if (gfp_mask & __GFP_ATOMIC) ++ goto force; ++ + /* + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 2d04bd2e1ced..b34348a41bfe 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1742,6 +1742,14 @@ void __init page_alloc_init_late(void) + /* Block until all are initialised */ + wait_for_completion(&pgdat_init_all_done_comp); + ++ /* ++ * The number of managed pages has changed due to the initialisation ++ * so the pcpu batch and high limits needs to be updated or the limits ++ * will be artificially small. ++ */ ++ for_each_populated_zone(zone) ++ zone_pcp_update(zone); ++ + /* + * We initialized the rest of the deferred pages. Permanently disable + * on-demand struct page initialization. +@@ -8011,7 +8019,6 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) + } + #endif + +-#ifdef CONFIG_MEMORY_HOTPLUG + /* + * The zone indicated has a new number of managed_pages; batch sizes and percpu + * page high values need to be recalulated. +@@ -8025,7 +8032,6 @@ void __meminit zone_pcp_update(struct zone *zone) + per_cpu_ptr(zone->pageset, cpu)); + mutex_unlock(&pcp_batch_high_lock); + } +-#endif + + void zone_pcp_reset(struct zone *zone) + { +diff --git a/mm/vmstat.c b/mm/vmstat.c +index 4a387937f9f5..a2b2ea786c9b 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -1972,7 +1972,7 @@ void __init init_mm_internals(void) + #endif + #ifdef CONFIG_PROC_FS + proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op); +- proc_create_seq("pagetypeinfo", 0444, NULL, &pagetypeinfo_op); ++ proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); + proc_create_seq("vmstat", 0444, NULL, &vmstat_op); + proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op); + #endif +diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c +index 446204ca7406..a8fc4e83cd95 100644 +--- a/net/ipv4/fib_semantics.c ++++ b/net/ipv4/fib_semantics.c +@@ -1421,8 +1421,8 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local) + int ret = 0; + unsigned int hash = fib_laddr_hashfn(local); + struct hlist_head *head = &fib_info_laddrhash[hash]; ++ int tb_id = l3mdev_fib_table(dev) ? 
: RT_TABLE_MAIN; + struct net *net = dev_net(dev); +- int tb_id = l3mdev_fib_table(dev); + struct fib_info *fi; + + if (!fib_info_laddrhash || local == 0) +diff --git a/net/ipv6/route.c b/net/ipv6/route.c +index c88586380134..076c21f6a645 100644 +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -521,6 +521,7 @@ static void rt6_probe(struct fib6_info *rt) + { + struct __rt6_probe_work *work = NULL; + const struct in6_addr *nh_gw; ++ unsigned long last_probe; + struct neighbour *neigh; + struct net_device *dev; + struct inet6_dev *idev; +@@ -539,6 +540,7 @@ static void rt6_probe(struct fib6_info *rt) + nh_gw = &rt->fib6_nh.nh_gw; + dev = rt->fib6_nh.nh_dev; + rcu_read_lock_bh(); ++ last_probe = READ_ONCE(rt->last_probe); + idev = __in6_dev_get(dev); + neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); + if (neigh) { +@@ -554,13 +556,15 @@ static void rt6_probe(struct fib6_info *rt) + __neigh_set_probe_once(neigh); + } + write_unlock(&neigh->lock); +- } else if (time_after(jiffies, rt->last_probe + ++ } else if (time_after(jiffies, last_probe + + idev->cnf.rtr_probe_interval)) { + work = kmalloc(sizeof(*work), GFP_ATOMIC); + } + +- if (work) { +- rt->last_probe = jiffies; ++ if (!work || cmpxchg(&rt->last_probe, ++ last_probe, jiffies) != last_probe) { ++ kfree(work); ++ } else { + INIT_WORK(&work->work, rt6_probe_deferred); + work->target = *nh_gw; + dev_hold(dev); +@@ -3066,6 +3070,9 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, + if (!rt) + goto out; + ++#ifdef CONFIG_IPV6_ROUTER_PREF ++ rt->last_probe = jiffies; ++#endif + if (cfg->fc_flags & RTF_ADDRCONF) + rt->dst_nocount = true; + +diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c +index e2538c578671..1566261b6b5a 100644 +--- a/net/netfilter/ipset/ip_set_core.c ++++ b/net/netfilter/ipset/ip_set_core.c +@@ -1977,8 +1977,9 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) + } + + req_version->version = IPSET_PROTOCOL; +- ret = copy_to_user(user, req_version, +- sizeof(struct ip_set_req_version)); ++ if (copy_to_user(user, req_version, ++ sizeof(struct ip_set_req_version))) ++ ret = -EFAULT; + goto done; + } + case IP_SET_OP_GET_BYNAME: { +@@ -2035,7 +2036,8 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) + } /* end of switch(op) */ + + copy: +- ret = copy_to_user(user, data, copylen); ++ if (copy_to_user(user, data, copylen)) ++ ret = -EFAULT; + + done: + vfree(data); +diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c +index 25560ea742d6..f2c2f72e2fff 100644 +--- a/net/netfilter/ipset/ip_set_hash_ipmac.c ++++ b/net/netfilter/ipset/ip_set_hash_ipmac.c +@@ -212,7 +212,7 @@ hash_ipmac6_kadt(struct ip_set *set, const struct sk_buff *skb, + (skb_mac_header(skb) + ETH_HLEN) > skb->data) + return -EINVAL; + +- if (opt->flags & IPSET_DIM_ONE_SRC) ++ if (opt->flags & IPSET_DIM_TWO_SRC) + ether_addr_copy(e.ether, eth_hdr(skb)->h_source); + else + ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); +diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c +index 7588aeaa605f..80759aadd3e0 100644 +--- a/net/netfilter/ipvs/ip_vs_app.c ++++ b/net/netfilter/ipvs/ip_vs_app.c +@@ -198,21 +198,29 @@ struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app * + + mutex_lock(&__ip_vs_app_mutex); + ++ /* increase the module use count */ ++ if (!ip_vs_use_count_inc()) { ++ err = -ENOENT; ++ goto out_unlock; ++ } ++ + list_for_each_entry(a, &ipvs->app_list, 
a_list) { + if (!strcmp(app->name, a->name)) { + err = -EEXIST; ++ /* decrease the module use count */ ++ ip_vs_use_count_dec(); + goto out_unlock; + } + } + a = kmemdup(app, sizeof(*app), GFP_KERNEL); + if (!a) { + err = -ENOMEM; ++ /* decrease the module use count */ ++ ip_vs_use_count_dec(); + goto out_unlock; + } + INIT_LIST_HEAD(&a->incs_list); + list_add(&a->a_list, &ipvs->app_list); +- /* increase the module use count */ +- ip_vs_use_count_inc(); + + out_unlock: + mutex_unlock(&__ip_vs_app_mutex); +diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c +index 3df94a499126..c339b5e386b7 100644 +--- a/net/netfilter/ipvs/ip_vs_ctl.c ++++ b/net/netfilter/ipvs/ip_vs_ctl.c +@@ -98,7 +98,6 @@ static bool __ip_vs_addr_is_local_v6(struct net *net, + static void update_defense_level(struct netns_ipvs *ipvs) + { + struct sysinfo i; +- static int old_secure_tcp = 0; + int availmem; + int nomem; + int to_change = -1; +@@ -179,35 +178,35 @@ static void update_defense_level(struct netns_ipvs *ipvs) + spin_lock(&ipvs->securetcp_lock); + switch (ipvs->sysctl_secure_tcp) { + case 0: +- if (old_secure_tcp >= 2) ++ if (ipvs->old_secure_tcp >= 2) + to_change = 0; + break; + case 1: + if (nomem) { +- if (old_secure_tcp < 2) ++ if (ipvs->old_secure_tcp < 2) + to_change = 1; + ipvs->sysctl_secure_tcp = 2; + } else { +- if (old_secure_tcp >= 2) ++ if (ipvs->old_secure_tcp >= 2) + to_change = 0; + } + break; + case 2: + if (nomem) { +- if (old_secure_tcp < 2) ++ if (ipvs->old_secure_tcp < 2) + to_change = 1; + } else { +- if (old_secure_tcp >= 2) ++ if (ipvs->old_secure_tcp >= 2) + to_change = 0; + ipvs->sysctl_secure_tcp = 1; + } + break; + case 3: +- if (old_secure_tcp < 2) ++ if (ipvs->old_secure_tcp < 2) + to_change = 1; + break; + } +- old_secure_tcp = ipvs->sysctl_secure_tcp; ++ ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp; + if (to_change >= 0) + ip_vs_protocol_timeout_change(ipvs, + ipvs->sysctl_secure_tcp > 1); +@@ -1204,7 +1203,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, + struct ip_vs_service *svc = NULL; + + /* increase the module use count */ +- ip_vs_use_count_inc(); ++ if (!ip_vs_use_count_inc()) ++ return -ENOPROTOOPT; + + /* Lookup the scheduler by 'u->sched_name' */ + if (strcmp(u->sched_name, "none")) { +@@ -2363,9 +2363,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) + if (copy_from_user(arg, user, len) != 0) + return -EFAULT; + +- /* increase the module use count */ +- ip_vs_use_count_inc(); +- + /* Handle daemons since they have another lock */ + if (cmd == IP_VS_SO_SET_STARTDAEMON || + cmd == IP_VS_SO_SET_STOPDAEMON) { +@@ -2378,13 +2375,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) + ret = -EINVAL; + if (strscpy(cfg.mcast_ifn, dm->mcast_ifn, + sizeof(cfg.mcast_ifn)) <= 0) +- goto out_dec; ++ return ret; + cfg.syncid = dm->syncid; + ret = start_sync_thread(ipvs, &cfg, dm->state); + } else { + ret = stop_sync_thread(ipvs, dm->state); + } +- goto out_dec; ++ return ret; + } + + mutex_lock(&__ip_vs_mutex); +@@ -2479,10 +2476,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) + + out_unlock: + mutex_unlock(&__ip_vs_mutex); +- out_dec: +- /* decrease the module use count */ +- ip_vs_use_count_dec(); +- + return ret; + } + +diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c +index 0df17caa8af6..714e7e05c102 100644 +--- a/net/netfilter/ipvs/ip_vs_pe.c ++++ b/net/netfilter/ipvs/ip_vs_pe.c +@@ -67,7 
+67,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe) + struct ip_vs_pe *tmp; + + /* increase the module use count */ +- ip_vs_use_count_inc(); ++ if (!ip_vs_use_count_inc()) ++ return -ENOENT; + + mutex_lock(&ip_vs_pe_mutex); + /* Make sure that the pe with this name doesn't exist +diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c +index a2ff7d746ebf..3bd0ff36dc41 100644 +--- a/net/netfilter/ipvs/ip_vs_sched.c ++++ b/net/netfilter/ipvs/ip_vs_sched.c +@@ -184,7 +184,8 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) + } + + /* increase the module use count */ +- ip_vs_use_count_inc(); ++ if (!ip_vs_use_count_inc()) ++ return -ENOENT; + + mutex_lock(&ip_vs_sched_mutex); + +diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c +index ecb71062fcb3..5acd99f83166 100644 +--- a/net/netfilter/ipvs/ip_vs_sync.c ++++ b/net/netfilter/ipvs/ip_vs_sync.c +@@ -1762,6 +1762,10 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, + IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n", + sizeof(struct ip_vs_sync_conn_v0)); + ++ /* increase the module use count */ ++ if (!ip_vs_use_count_inc()) ++ return -ENOPROTOOPT; ++ + /* Do not hold one mutex and then to block on another */ + for (;;) { + rtnl_lock(); +@@ -1892,9 +1896,6 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, + mutex_unlock(&ipvs->sync_mutex); + rtnl_unlock(); + +- /* increase the module use count */ +- ip_vs_use_count_inc(); +- + return 0; + + out: +@@ -1924,11 +1925,17 @@ out: + } + kfree(ti); + } ++ ++ /* decrease the module use count */ ++ ip_vs_use_count_dec(); + return result; + + out_early: + mutex_unlock(&ipvs->sync_mutex); + rtnl_unlock(); ++ ++ /* decrease the module use count */ ++ ip_vs_use_count_dec(); + return result; + } + +diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c +index 8ade40512944..70bd730ca059 100644 +--- a/net/netfilter/nf_flow_table_core.c ++++ b/net/netfilter/nf_flow_table_core.c +@@ -187,6 +187,8 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) + { + int err; + ++ flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; ++ + err = rhashtable_insert_fast(&flow_table->rhashtable, + &flow->tuplehash[0].node, + nf_flow_offload_rhash_params); +@@ -203,7 +205,6 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) + return err; + } + +- flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + return 0; + } + EXPORT_SYMBOL_GPL(flow_offload_add); +diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c +index b3662264aa24..30938854bb8d 100644 +--- a/net/nfc/netlink.c ++++ b/net/nfc/netlink.c +@@ -1110,7 +1110,6 @@ static int nfc_genl_llc_set_params(struct sk_buff *skb, struct genl_info *info) + + local = nfc_llcp_find_local(dev); + if (!local) { +- nfc_put_device(dev); + rc = -ENODEV; + goto exit; + } +@@ -1170,7 +1169,6 @@ static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info) + + local = nfc_llcp_find_local(dev); + if (!local) { +- nfc_put_device(dev); + rc = -ENODEV; + goto exit; + } +diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c +index 5a304cfc8423..d2356a284646 100644 +--- a/net/openvswitch/vport-internal_dev.c ++++ b/net/openvswitch/vport-internal_dev.c +@@ -149,7 +149,7 @@ static void do_setup(struct net_device *netdev) + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | + IFF_NO_QUEUE; + netdev->needs_free_netdev = true; 
+- netdev->priv_destructor = internal_dev_destructor; ++ netdev->priv_destructor = NULL; + netdev->ethtool_ops = &internal_dev_ethtool_ops; + netdev->rtnl_link_ops = &internal_dev_link_ops; + +@@ -171,7 +171,6 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) + struct internal_dev *internal_dev; + struct net_device *dev; + int err; +- bool free_vport = true; + + vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms); + if (IS_ERR(vport)) { +@@ -202,10 +201,9 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) + + rtnl_lock(); + err = register_netdevice(vport->dev); +- if (err) { +- free_vport = false; ++ if (err) + goto error_unlock; +- } ++ vport->dev->priv_destructor = internal_dev_destructor; + + dev_set_promiscuity(vport->dev, 1); + rtnl_unlock(); +@@ -219,8 +217,7 @@ error_unlock: + error_free_netdev: + free_netdev(dev); + error_free_vport: +- if (free_vport) +- ovs_vport_free(vport); ++ ovs_vport_free(vport); + error: + return ERR_PTR(err); + } +diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c +index 3c199f752fd3..2a8651aa90c8 100644 +--- a/net/vmw_vsock/virtio_transport_common.c ++++ b/net/vmw_vsock/virtio_transport_common.c +@@ -871,9 +871,11 @@ virtio_transport_recv_connected(struct sock *sk, + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) + vsk->peer_shutdown |= SEND_SHUTDOWN; + if (vsk->peer_shutdown == SHUTDOWN_MASK && +- vsock_stream_has_data(vsk) <= 0) { +- sock_set_flag(sk, SOCK_DONE); +- sk->sk_state = TCP_CLOSING; ++ vsock_stream_has_data(vsk) <= 0 && ++ !sock_flag(sk, SOCK_DONE)) { ++ (void)virtio_transport_reset(vsk, NULL); ++ ++ virtio_transport_do_close(vsk, true); + } + if (le32_to_cpu(pkt->hdr.flags)) + sk->sk_state_change(sk); +diff --git a/sound/core/timer.c b/sound/core/timer.c +index ec74705f003b..86a31e69fc7d 100644 +--- a/sound/core/timer.c ++++ b/sound/core/timer.c +@@ -298,11 +298,11 @@ int snd_timer_open(struct snd_timer_instance **ti, + goto unlock; + } + if (!list_empty(&timer->open_list_head)) { +- timeri = list_entry(timer->open_list_head.next, ++ struct snd_timer_instance *t = ++ list_entry(timer->open_list_head.next, + struct snd_timer_instance, open_list); +- if (timeri->flags & SNDRV_TIMER_IFLG_EXCLUSIVE) { ++ if (t->flags & SNDRV_TIMER_IFLG_EXCLUSIVE) { + err = -EBUSY; +- timeri = NULL; + goto unlock; + } + } +diff --git a/sound/firewire/bebob/bebob_focusrite.c b/sound/firewire/bebob/bebob_focusrite.c +index 52b8b61ecddd..62d989edd129 100644 +--- a/sound/firewire/bebob/bebob_focusrite.c ++++ b/sound/firewire/bebob/bebob_focusrite.c +@@ -28,6 +28,8 @@ + #define SAFFIRE_CLOCK_SOURCE_SPDIF 1 + + /* clock sources as returned from register of Saffire Pro 10 and 26 */ ++#define SAFFIREPRO_CLOCK_SOURCE_SELECT_MASK 0x000000ff ++#define SAFFIREPRO_CLOCK_SOURCE_DETECT_MASK 0x0000ff00 + #define SAFFIREPRO_CLOCK_SOURCE_INTERNAL 0 + #define SAFFIREPRO_CLOCK_SOURCE_SKIP 1 /* never used on hardware */ + #define SAFFIREPRO_CLOCK_SOURCE_SPDIF 2 +@@ -190,6 +192,7 @@ saffirepro_both_clk_src_get(struct snd_bebob *bebob, unsigned int *id) + map = saffirepro_clk_maps[1]; + + /* In a case that this driver cannot handle the value of register. 
*/ ++ value &= SAFFIREPRO_CLOCK_SOURCE_SELECT_MASK; + if (value >= SAFFIREPRO_CLOCK_SOURCE_COUNT || map[value] < 0) { + err = -EIO; + goto end; +diff --git a/sound/pci/hda/patch_ca0132.c b/sound/pci/hda/patch_ca0132.c +index 0436789e7cd8..3e978b75be9a 100644 +--- a/sound/pci/hda/patch_ca0132.c ++++ b/sound/pci/hda/patch_ca0132.c +@@ -6769,7 +6769,7 @@ static void hp_callback(struct hda_codec *codec, struct hda_jack_callback *cb) + /* Delay enabling the HP amp, to let the mic-detection + * state machine run. + */ +- cancel_delayed_work_sync(&spec->unsol_hp_work); ++ cancel_delayed_work(&spec->unsol_hp_work); + schedule_delayed_work(&spec->unsol_hp_work, msecs_to_jiffies(500)); + tbl = snd_hda_jack_tbl_get(codec, cb->nid); + if (tbl) +diff --git a/sound/usb/Makefile b/sound/usb/Makefile +index d330f74c90e6..a12fffcbcb20 100644 +--- a/sound/usb/Makefile ++++ b/sound/usb/Makefile +@@ -16,7 +16,8 @@ snd-usb-audio-objs := card.o \ + power.o \ + proc.o \ + quirks.o \ +- stream.o ++ stream.o \ ++ validate.o + + snd-usbmidi-lib-objs := midi.o + +diff --git a/sound/usb/clock.c b/sound/usb/clock.c +index db5e39d67a90..e31349865f20 100644 +--- a/sound/usb/clock.c ++++ b/sound/usb/clock.c +@@ -52,39 +52,37 @@ static void *find_uac_clock_desc(struct usb_host_interface *iface, int id, + static bool validate_clock_source_v2(void *p, int id) + { + struct uac_clock_source_descriptor *cs = p; +- return cs->bLength == sizeof(*cs) && cs->bClockID == id; ++ return cs->bClockID == id; + } + + static bool validate_clock_source_v3(void *p, int id) + { + struct uac3_clock_source_descriptor *cs = p; +- return cs->bLength == sizeof(*cs) && cs->bClockID == id; ++ return cs->bClockID == id; + } + + static bool validate_clock_selector_v2(void *p, int id) + { + struct uac_clock_selector_descriptor *cs = p; +- return cs->bLength >= sizeof(*cs) && cs->bClockID == id && +- cs->bLength == 7 + cs->bNrInPins; ++ return cs->bClockID == id; + } + + static bool validate_clock_selector_v3(void *p, int id) + { + struct uac3_clock_selector_descriptor *cs = p; +- return cs->bLength >= sizeof(*cs) && cs->bClockID == id && +- cs->bLength == 11 + cs->bNrInPins; ++ return cs->bClockID == id; + } + + static bool validate_clock_multiplier_v2(void *p, int id) + { + struct uac_clock_multiplier_descriptor *cs = p; +- return cs->bLength == sizeof(*cs) && cs->bClockID == id; ++ return cs->bClockID == id; + } + + static bool validate_clock_multiplier_v3(void *p, int id) + { + struct uac3_clock_multiplier_descriptor *cs = p; +- return cs->bLength == sizeof(*cs) && cs->bClockID == id; ++ return cs->bClockID == id; + } + + #define DEFINE_FIND_HELPER(name, obj, validator, type) \ +diff --git a/sound/usb/helper.h b/sound/usb/helper.h +index d338bd0e0ca6..f5b4c6647e4d 100644 +--- a/sound/usb/helper.h ++++ b/sound/usb/helper.h +@@ -30,4 +30,8 @@ static inline int snd_usb_ctrl_intf(struct snd_usb_audio *chip) + return get_iface_desc(chip->ctrl_intf)->bInterfaceNumber; + } + ++/* in validate.c */ ++bool snd_usb_validate_audio_desc(void *p, int protocol); ++bool snd_usb_validate_midi_desc(void *p); ++ + #endif /* __USBAUDIO_HELPER_H */ +diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c +index b0c5d4ef6137..bb67131e6437 100644 +--- a/sound/usb/mixer.c ++++ b/sound/usb/mixer.c +@@ -755,13 +755,6 @@ static int uac_mixer_unit_get_channels(struct mixer_build *state, + { + int mu_channels; + +- if (desc->bLength < sizeof(*desc)) +- return -EINVAL; +- if (!desc->bNrInPins) +- return -EINVAL; +- if (desc->bLength < sizeof(*desc) + desc->bNrInPins) +- return 
-EINVAL; +- + switch (state->mixer->protocol) { + case UAC_VERSION_1: + case UAC_VERSION_2: +@@ -780,222 +773,242 @@ static int uac_mixer_unit_get_channels(struct mixer_build *state, + } + + /* +- * parse the source unit recursively until it reaches to a terminal +- * or a branched unit. ++ * Parse Input Terminal Unit + */ + static int __check_input_term(struct mixer_build *state, int id, +- struct usb_audio_term *term) ++ struct usb_audio_term *term); ++ ++static int parse_term_uac1_iterm_unit(struct mixer_build *state, ++ struct usb_audio_term *term, ++ void *p1, int id) + { +- int protocol = state->mixer->protocol; ++ struct uac_input_terminal_descriptor *d = p1; ++ ++ term->type = le16_to_cpu(d->wTerminalType); ++ term->channels = d->bNrChannels; ++ term->chconfig = le16_to_cpu(d->wChannelConfig); ++ term->name = d->iTerminal; ++ return 0; ++} ++ ++static int parse_term_uac2_iterm_unit(struct mixer_build *state, ++ struct usb_audio_term *term, ++ void *p1, int id) ++{ ++ struct uac2_input_terminal_descriptor *d = p1; + int err; +- void *p1; +- unsigned char *hdr; + +- memset(term, 0, sizeof(*term)); +- for (;;) { +- /* a loop in the terminal chain? */ +- if (test_and_set_bit(id, state->termbitmap)) +- return -EINVAL; ++ /* call recursively to verify the referenced clock entity */ ++ err = __check_input_term(state, d->bCSourceID, term); ++ if (err < 0) ++ return err; + +- p1 = find_audio_control_unit(state, id); +- if (!p1) +- break; ++ /* save input term properties after recursion, ++ * to ensure they are not overriden by the recursion calls ++ */ ++ term->id = id; ++ term->type = le16_to_cpu(d->wTerminalType); ++ term->channels = d->bNrChannels; ++ term->chconfig = le32_to_cpu(d->bmChannelConfig); ++ term->name = d->iTerminal; ++ return 0; ++} + +- hdr = p1; +- term->id = id; ++static int parse_term_uac3_iterm_unit(struct mixer_build *state, ++ struct usb_audio_term *term, ++ void *p1, int id) ++{ ++ struct uac3_input_terminal_descriptor *d = p1; ++ int err; + +- if (protocol == UAC_VERSION_1 || protocol == UAC_VERSION_2) { +- switch (hdr[2]) { +- case UAC_INPUT_TERMINAL: +- if (protocol == UAC_VERSION_1) { +- struct uac_input_terminal_descriptor *d = p1; +- +- term->type = le16_to_cpu(d->wTerminalType); +- term->channels = d->bNrChannels; +- term->chconfig = le16_to_cpu(d->wChannelConfig); +- term->name = d->iTerminal; +- } else { /* UAC_VERSION_2 */ +- struct uac2_input_terminal_descriptor *d = p1; +- +- /* call recursively to verify that the +- * referenced clock entity is valid */ +- err = __check_input_term(state, d->bCSourceID, term); +- if (err < 0) +- return err; ++ /* call recursively to verify the referenced clock entity */ ++ err = __check_input_term(state, d->bCSourceID, term); ++ if (err < 0) ++ return err; + +- /* save input term properties after recursion, +- * to ensure they are not overriden by the +- * recursion calls */ +- term->id = id; +- term->type = le16_to_cpu(d->wTerminalType); +- term->channels = d->bNrChannels; +- term->chconfig = le32_to_cpu(d->bmChannelConfig); +- term->name = d->iTerminal; +- } +- return 0; +- case UAC_FEATURE_UNIT: { +- /* the header is the same for v1 and v2 */ +- struct uac_feature_unit_descriptor *d = p1; ++ /* save input term properties after recursion, ++ * to ensure they are not overriden by the recursion calls ++ */ ++ term->id = id; ++ term->type = le16_to_cpu(d->wTerminalType); + +- id = d->bSourceID; +- break; /* continue to parse */ +- } +- case UAC_MIXER_UNIT: { +- struct uac_mixer_unit_descriptor *d = p1; +- +- term->type = 
UAC3_MIXER_UNIT << 16; /* virtual type */ +- term->channels = uac_mixer_unit_bNrChannels(d); +- term->chconfig = uac_mixer_unit_wChannelConfig(d, protocol); +- term->name = uac_mixer_unit_iMixer(d); +- return 0; +- } +- case UAC_SELECTOR_UNIT: +- case UAC2_CLOCK_SELECTOR: { +- struct uac_selector_unit_descriptor *d = p1; +- /* call recursively to retrieve the channel info */ +- err = __check_input_term(state, d->baSourceID[0], term); +- if (err < 0) +- return err; +- term->type = UAC3_SELECTOR_UNIT << 16; /* virtual type */ +- term->id = id; +- term->name = uac_selector_unit_iSelector(d); +- return 0; +- } +- case UAC1_PROCESSING_UNIT: +- /* UAC2_EFFECT_UNIT */ +- if (protocol == UAC_VERSION_1) +- term->type = UAC3_PROCESSING_UNIT << 16; /* virtual type */ +- else /* UAC_VERSION_2 */ +- term->type = UAC3_EFFECT_UNIT << 16; /* virtual type */ +- /* fall through */ +- case UAC1_EXTENSION_UNIT: +- /* UAC2_PROCESSING_UNIT_V2 */ +- if (protocol == UAC_VERSION_1 && !term->type) +- term->type = UAC3_EXTENSION_UNIT << 16; /* virtual type */ +- else if (protocol == UAC_VERSION_2 && !term->type) +- term->type = UAC3_PROCESSING_UNIT << 16; /* virtual type */ +- /* fall through */ +- case UAC2_EXTENSION_UNIT_V2: { +- struct uac_processing_unit_descriptor *d = p1; +- +- if (protocol == UAC_VERSION_2 && +- hdr[2] == UAC2_EFFECT_UNIT) { +- /* UAC2/UAC1 unit IDs overlap here in an +- * uncompatible way. Ignore this unit for now. +- */ +- return 0; +- } ++ err = get_cluster_channels_v3(state, le16_to_cpu(d->wClusterDescrID)); ++ if (err < 0) ++ return err; ++ term->channels = err; + +- if (d->bNrInPins) { +- id = d->baSourceID[0]; +- break; /* continue to parse */ +- } +- if (!term->type) +- term->type = UAC3_EXTENSION_UNIT << 16; /* virtual type */ ++ /* REVISIT: UAC3 IT doesn't have channels cfg */ ++ term->chconfig = 0; + +- term->channels = uac_processing_unit_bNrChannels(d); +- term->chconfig = uac_processing_unit_wChannelConfig(d, protocol); +- term->name = uac_processing_unit_iProcessing(d, protocol); +- return 0; +- } +- case UAC2_CLOCK_SOURCE: { +- struct uac_clock_source_descriptor *d = p1; ++ term->name = le16_to_cpu(d->wTerminalDescrStr); ++ return 0; ++} + +- term->type = UAC3_CLOCK_SOURCE << 16; /* virtual type */ +- term->id = id; +- term->name = d->iClockSource; +- return 0; +- } +- default: +- return -ENODEV; +- } +- } else { /* UAC_VERSION_3 */ +- switch (hdr[2]) { +- case UAC_INPUT_TERMINAL: { +- struct uac3_input_terminal_descriptor *d = p1; +- +- /* call recursively to verify that the +- * referenced clock entity is valid */ +- err = __check_input_term(state, d->bCSourceID, term); +- if (err < 0) +- return err; ++static int parse_term_mixer_unit(struct mixer_build *state, ++ struct usb_audio_term *term, ++ void *p1, int id) ++{ ++ struct uac_mixer_unit_descriptor *d = p1; ++ int protocol = state->mixer->protocol; ++ int err; + +- /* save input term properties after recursion, +- * to ensure they are not overriden by the +- * recursion calls */ +- term->id = id; +- term->type = le16_to_cpu(d->wTerminalType); ++ err = uac_mixer_unit_get_channels(state, d); ++ if (err <= 0) ++ return err; + +- err = get_cluster_channels_v3(state, le16_to_cpu(d->wClusterDescrID)); +- if (err < 0) +- return err; +- term->channels = err; ++ term->type = UAC3_MIXER_UNIT << 16; /* virtual type */ ++ term->channels = err; ++ if (protocol != UAC_VERSION_3) { ++ term->chconfig = uac_mixer_unit_wChannelConfig(d, protocol); ++ term->name = uac_mixer_unit_iMixer(d); ++ } ++ return 0; ++} + +- /* REVISIT: UAC3 IT 
doesn't have channels cfg */ +- term->chconfig = 0; ++static int parse_term_selector_unit(struct mixer_build *state, ++ struct usb_audio_term *term, ++ void *p1, int id) ++{ ++ struct uac_selector_unit_descriptor *d = p1; ++ int err; + +- term->name = le16_to_cpu(d->wTerminalDescrStr); +- return 0; +- } +- case UAC3_FEATURE_UNIT: { +- struct uac3_feature_unit_descriptor *d = p1; ++ /* call recursively to retrieve the channel info */ ++ err = __check_input_term(state, d->baSourceID[0], term); ++ if (err < 0) ++ return err; ++ term->type = UAC3_SELECTOR_UNIT << 16; /* virtual type */ ++ term->id = id; ++ if (state->mixer->protocol != UAC_VERSION_3) ++ term->name = uac_selector_unit_iSelector(d); ++ return 0; ++} + +- id = d->bSourceID; +- break; /* continue to parse */ +- } +- case UAC3_CLOCK_SOURCE: { +- struct uac3_clock_source_descriptor *d = p1; ++static int parse_term_proc_unit(struct mixer_build *state, ++ struct usb_audio_term *term, ++ void *p1, int id, int vtype) ++{ ++ struct uac_processing_unit_descriptor *d = p1; ++ int protocol = state->mixer->protocol; ++ int err; + +- term->type = UAC3_CLOCK_SOURCE << 16; /* virtual type */ +- term->id = id; +- term->name = le16_to_cpu(d->wClockSourceStr); +- return 0; +- } +- case UAC3_MIXER_UNIT: { +- struct uac_mixer_unit_descriptor *d = p1; ++ if (d->bNrInPins) { ++ /* call recursively to retrieve the channel info */ ++ err = __check_input_term(state, d->baSourceID[0], term); ++ if (err < 0) ++ return err; ++ } + +- err = uac_mixer_unit_get_channels(state, d); +- if (err <= 0) +- return err; ++ term->type = vtype << 16; /* virtual type */ ++ term->id = id; + +- term->channels = err; +- term->type = UAC3_MIXER_UNIT << 16; /* virtual type */ ++ if (protocol == UAC_VERSION_3) ++ return 0; + +- return 0; +- } +- case UAC3_SELECTOR_UNIT: +- case UAC3_CLOCK_SELECTOR: { +- struct uac_selector_unit_descriptor *d = p1; +- /* call recursively to retrieve the channel info */ +- err = __check_input_term(state, d->baSourceID[0], term); +- if (err < 0) +- return err; +- term->type = UAC3_SELECTOR_UNIT << 16; /* virtual type */ +- term->id = id; +- term->name = 0; /* TODO: UAC3 Class-specific strings */ ++ if (!term->channels) { ++ term->channels = uac_processing_unit_bNrChannels(d); ++ term->chconfig = uac_processing_unit_wChannelConfig(d, protocol); ++ } ++ term->name = uac_processing_unit_iProcessing(d, protocol); ++ return 0; ++} + +- return 0; +- } +- case UAC3_PROCESSING_UNIT: { +- struct uac_processing_unit_descriptor *d = p1; ++static int parse_term_uac2_clock_source(struct mixer_build *state, ++ struct usb_audio_term *term, ++ void *p1, int id) ++{ ++ struct uac_clock_source_descriptor *d = p1; + +- if (!d->bNrInPins) +- return -EINVAL; ++ term->type = UAC3_CLOCK_SOURCE << 16; /* virtual type */ ++ term->id = id; ++ term->name = d->iClockSource; ++ return 0; ++} + +- /* call recursively to retrieve the channel info */ +- err = __check_input_term(state, d->baSourceID[0], term); +- if (err < 0) +- return err; ++static int parse_term_uac3_clock_source(struct mixer_build *state, ++ struct usb_audio_term *term, ++ void *p1, int id) ++{ ++ struct uac3_clock_source_descriptor *d = p1; ++ ++ term->type = UAC3_CLOCK_SOURCE << 16; /* virtual type */ ++ term->id = id; ++ term->name = le16_to_cpu(d->wClockSourceStr); ++ return 0; ++} + +- term->type = UAC3_PROCESSING_UNIT << 16; /* virtual type */ +- term->id = id; +- term->name = 0; /* TODO: UAC3 Class-specific strings */ ++#define PTYPE(a, b) ((a) << 8 | (b)) + +- return 0; +- } +- default: +- return 
-ENODEV; +- } ++/* ++ * parse the source unit recursively until it reaches to a terminal ++ * or a branched unit. ++ */ ++static int __check_input_term(struct mixer_build *state, int id, ++ struct usb_audio_term *term) ++{ ++ int protocol = state->mixer->protocol; ++ void *p1; ++ unsigned char *hdr; ++ ++ for (;;) { ++ /* a loop in the terminal chain? */ ++ if (test_and_set_bit(id, state->termbitmap)) ++ return -EINVAL; ++ ++ p1 = find_audio_control_unit(state, id); ++ if (!p1) ++ break; ++ if (!snd_usb_validate_audio_desc(p1, protocol)) ++ break; /* bad descriptor */ ++ ++ hdr = p1; ++ term->id = id; ++ ++ switch (PTYPE(protocol, hdr[2])) { ++ case PTYPE(UAC_VERSION_1, UAC_FEATURE_UNIT): ++ case PTYPE(UAC_VERSION_2, UAC_FEATURE_UNIT): ++ case PTYPE(UAC_VERSION_3, UAC3_FEATURE_UNIT): { ++ /* the header is the same for all versions */ ++ struct uac_feature_unit_descriptor *d = p1; ++ ++ id = d->bSourceID; ++ break; /* continue to parse */ ++ } ++ case PTYPE(UAC_VERSION_1, UAC_INPUT_TERMINAL): ++ return parse_term_uac1_iterm_unit(state, term, p1, id); ++ case PTYPE(UAC_VERSION_2, UAC_INPUT_TERMINAL): ++ return parse_term_uac2_iterm_unit(state, term, p1, id); ++ case PTYPE(UAC_VERSION_3, UAC_INPUT_TERMINAL): ++ return parse_term_uac3_iterm_unit(state, term, p1, id); ++ case PTYPE(UAC_VERSION_1, UAC_MIXER_UNIT): ++ case PTYPE(UAC_VERSION_2, UAC_MIXER_UNIT): ++ case PTYPE(UAC_VERSION_3, UAC3_MIXER_UNIT): ++ return parse_term_mixer_unit(state, term, p1, id); ++ case PTYPE(UAC_VERSION_1, UAC_SELECTOR_UNIT): ++ case PTYPE(UAC_VERSION_2, UAC_SELECTOR_UNIT): ++ case PTYPE(UAC_VERSION_2, UAC2_CLOCK_SELECTOR): ++ case PTYPE(UAC_VERSION_3, UAC3_SELECTOR_UNIT): ++ case PTYPE(UAC_VERSION_3, UAC3_CLOCK_SELECTOR): ++ return parse_term_selector_unit(state, term, p1, id); ++ case PTYPE(UAC_VERSION_1, UAC1_PROCESSING_UNIT): ++ case PTYPE(UAC_VERSION_2, UAC2_PROCESSING_UNIT_V2): ++ case PTYPE(UAC_VERSION_3, UAC3_PROCESSING_UNIT): ++ return parse_term_proc_unit(state, term, p1, id, ++ UAC3_PROCESSING_UNIT); ++ case PTYPE(UAC_VERSION_2, UAC2_EFFECT_UNIT): ++ case PTYPE(UAC_VERSION_3, UAC3_EFFECT_UNIT): ++ return parse_term_proc_unit(state, term, p1, id, ++ UAC3_EFFECT_UNIT); ++ case PTYPE(UAC_VERSION_1, UAC1_EXTENSION_UNIT): ++ case PTYPE(UAC_VERSION_2, UAC2_EXTENSION_UNIT_V2): ++ case PTYPE(UAC_VERSION_3, UAC3_EXTENSION_UNIT): ++ return parse_term_proc_unit(state, term, p1, id, ++ UAC3_EXTENSION_UNIT); ++ case PTYPE(UAC_VERSION_2, UAC2_CLOCK_SOURCE): ++ return parse_term_uac2_clock_source(state, term, p1, id); ++ case PTYPE(UAC_VERSION_3, UAC3_CLOCK_SOURCE): ++ return parse_term_uac3_clock_source(state, term, p1, id); ++ default: ++ return -ENODEV; + } + } + return -ENODEV; +@@ -1039,10 +1052,15 @@ static struct usb_feature_control_info audio_feature_info[] = { + { UAC2_FU_PHASE_INVERTER, "Phase Inverter Control", USB_MIXER_BOOLEAN, -1 }, + }; + ++static void usb_mixer_elem_info_free(struct usb_mixer_elem_info *cval) ++{ ++ kfree(cval); ++} ++ + /* private_free callback */ + void snd_usb_mixer_elem_free(struct snd_kcontrol *kctl) + { +- kfree(kctl->private_data); ++ usb_mixer_elem_info_free(kctl->private_data); + kctl->private_data = NULL; + } + +@@ -1565,7 +1583,7 @@ static void __build_feature_ctl(struct usb_mixer_interface *mixer, + + ctl_info = get_feature_control_info(control); + if (!ctl_info) { +- kfree(cval); ++ usb_mixer_elem_info_free(cval); + return; + } + if (mixer->protocol == UAC_VERSION_1) +@@ -1598,7 +1616,7 @@ static void __build_feature_ctl(struct usb_mixer_interface *mixer, + + if (!kctl) { + 
usb_audio_err(mixer->chip, "cannot malloc kcontrol\n"); +- kfree(cval); ++ usb_mixer_elem_info_free(cval); + return; + } + kctl->private_free = snd_usb_mixer_elem_free; +@@ -1768,7 +1786,7 @@ static void build_connector_control(struct usb_mixer_interface *mixer, + kctl = snd_ctl_new1(&usb_connector_ctl_ro, cval); + if (!kctl) { + usb_audio_err(mixer->chip, "cannot malloc kcontrol\n"); +- kfree(cval); ++ usb_mixer_elem_info_free(cval); + return; + } + get_connector_control_name(mixer, term, is_input, kctl->id.name, +@@ -1789,13 +1807,6 @@ static int parse_clock_source_unit(struct mixer_build *state, int unitid, + if (state->mixer->protocol != UAC_VERSION_2) + return -EINVAL; + +- if (hdr->bLength != sizeof(*hdr)) { +- usb_audio_dbg(state->chip, +- "Bogus clock source descriptor length of %d, ignoring.\n", +- hdr->bLength); +- return 0; +- } +- + /* + * The only property of this unit we are interested in is the + * clock source validity. If that isn't readable, just bail out. +@@ -1821,7 +1832,7 @@ static int parse_clock_source_unit(struct mixer_build *state, int unitid, + kctl = snd_ctl_new1(&usb_bool_master_control_ctl_ro, cval); + + if (!kctl) { +- kfree(cval); ++ usb_mixer_elem_info_free(cval); + return -ENOMEM; + } + +@@ -1854,62 +1865,20 @@ static int parse_audio_feature_unit(struct mixer_build *state, int unitid, + __u8 *bmaControls; + + if (state->mixer->protocol == UAC_VERSION_1) { +- if (hdr->bLength < 7) { +- usb_audio_err(state->chip, +- "unit %u: invalid UAC_FEATURE_UNIT descriptor\n", +- unitid); +- return -EINVAL; +- } + csize = hdr->bControlSize; +- if (!csize) { +- usb_audio_dbg(state->chip, +- "unit %u: invalid bControlSize == 0\n", +- unitid); +- return -EINVAL; +- } + channels = (hdr->bLength - 7) / csize - 1; + bmaControls = hdr->bmaControls; +- if (hdr->bLength < 7 + csize) { +- usb_audio_err(state->chip, +- "unit %u: invalid UAC_FEATURE_UNIT descriptor\n", +- unitid); +- return -EINVAL; +- } + } else if (state->mixer->protocol == UAC_VERSION_2) { + struct uac2_feature_unit_descriptor *ftr = _ftr; +- if (hdr->bLength < 6) { +- usb_audio_err(state->chip, +- "unit %u: invalid UAC_FEATURE_UNIT descriptor\n", +- unitid); +- return -EINVAL; +- } + csize = 4; + channels = (hdr->bLength - 6) / 4 - 1; + bmaControls = ftr->bmaControls; +- if (hdr->bLength < 6 + csize) { +- usb_audio_err(state->chip, +- "unit %u: invalid UAC_FEATURE_UNIT descriptor\n", +- unitid); +- return -EINVAL; +- } + } else { /* UAC_VERSION_3 */ + struct uac3_feature_unit_descriptor *ftr = _ftr; + +- if (hdr->bLength < 7) { +- usb_audio_err(state->chip, +- "unit %u: invalid UAC3_FEATURE_UNIT descriptor\n", +- unitid); +- return -EINVAL; +- } + csize = 4; + channels = (ftr->bLength - 7) / 4 - 1; + bmaControls = ftr->bmaControls; +- if (hdr->bLength < 7 + csize) { +- usb_audio_err(state->chip, +- "unit %u: invalid UAC3_FEATURE_UNIT descriptor\n", +- unitid); +- return -EINVAL; +- } + } + + /* parse the source unit */ +@@ -2087,7 +2056,7 @@ static void build_mixer_unit_ctl(struct mixer_build *state, + kctl = snd_ctl_new1(&usb_feature_unit_ctl, cval); + if (!kctl) { + usb_audio_err(state->chip, "cannot malloc kcontrol\n"); +- kfree(cval); ++ usb_mixer_elem_info_free(cval); + return; + } + kctl->private_free = snd_usb_mixer_elem_free; +@@ -2113,15 +2082,11 @@ static int parse_audio_input_terminal(struct mixer_build *state, int unitid, + + if (state->mixer->protocol == UAC_VERSION_2) { + struct uac2_input_terminal_descriptor *d_v2 = raw_desc; +- if (d_v2->bLength < sizeof(*d_v2)) +- return -EINVAL; + control = 
UAC2_TE_CONNECTOR; + term_id = d_v2->bTerminalID; + bmctls = le16_to_cpu(d_v2->bmControls); + } else if (state->mixer->protocol == UAC_VERSION_3) { + struct uac3_input_terminal_descriptor *d_v3 = raw_desc; +- if (d_v3->bLength < sizeof(*d_v3)) +- return -EINVAL; + control = UAC3_TE_INSERTION; + term_id = d_v3->bTerminalID; + bmctls = le32_to_cpu(d_v3->bmControls); +@@ -2383,18 +2348,7 @@ static int build_audio_procunit(struct mixer_build *state, int unitid, + const char *name = extension_unit ? + "Extension Unit" : "Processing Unit"; + +- if (desc->bLength < 13) { +- usb_audio_err(state->chip, "invalid %s descriptor (id %d)\n", name, unitid); +- return -EINVAL; +- } +- + num_ins = desc->bNrInPins; +- if (desc->bLength < 13 + num_ins || +- desc->bLength < num_ins + uac_processing_unit_bControlSize(desc, state->mixer->protocol)) { +- usb_audio_err(state->chip, "invalid %s descriptor (id %d)\n", name, unitid); +- return -EINVAL; +- } +- + for (i = 0; i < num_ins; i++) { + err = parse_audio_unit(state, desc->baSourceID[i]); + if (err < 0) +@@ -2485,7 +2439,7 @@ static int build_audio_procunit(struct mixer_build *state, int unitid, + + kctl = snd_ctl_new1(&mixer_procunit_ctl, cval); + if (!kctl) { +- kfree(cval); ++ usb_mixer_elem_info_free(cval); + return -ENOMEM; + } + kctl->private_free = snd_usb_mixer_elem_free; +@@ -2623,7 +2577,7 @@ static void usb_mixer_selector_elem_free(struct snd_kcontrol *kctl) + if (kctl->private_data) { + struct usb_mixer_elem_info *cval = kctl->private_data; + num_ins = cval->max; +- kfree(cval); ++ usb_mixer_elem_info_free(cval); + kctl->private_data = NULL; + } + if (kctl->private_value) { +@@ -2649,13 +2603,6 @@ static int parse_audio_selector_unit(struct mixer_build *state, int unitid, + const struct usbmix_name_map *map; + char **namelist; + +- if (desc->bLength < 5 || !desc->bNrInPins || +- desc->bLength < 5 + desc->bNrInPins) { +- usb_audio_err(state->chip, +- "invalid SELECTOR UNIT descriptor %d\n", unitid); +- return -EINVAL; +- } +- + for (i = 0; i < desc->bNrInPins; i++) { + err = parse_audio_unit(state, desc->baSourceID[i]); + if (err < 0) +@@ -2695,10 +2642,10 @@ static int parse_audio_selector_unit(struct mixer_build *state, int unitid, + break; + } + +- namelist = kmalloc_array(desc->bNrInPins, sizeof(char *), GFP_KERNEL); ++ namelist = kcalloc(desc->bNrInPins, sizeof(char *), GFP_KERNEL); + if (!namelist) { +- kfree(cval); +- return -ENOMEM; ++ err = -ENOMEM; ++ goto error_cval; + } + #define MAX_ITEM_NAME_LEN 64 + for (i = 0; i < desc->bNrInPins; i++) { +@@ -2706,11 +2653,8 @@ static int parse_audio_selector_unit(struct mixer_build *state, int unitid, + len = 0; + namelist[i] = kmalloc(MAX_ITEM_NAME_LEN, GFP_KERNEL); + if (!namelist[i]) { +- while (i--) +- kfree(namelist[i]); +- kfree(namelist); +- kfree(cval); +- return -ENOMEM; ++ err = -ENOMEM; ++ goto error_name; + } + len = check_mapped_selector_name(state, unitid, i, namelist[i], + MAX_ITEM_NAME_LEN); +@@ -2724,11 +2668,8 @@ static int parse_audio_selector_unit(struct mixer_build *state, int unitid, + kctl = snd_ctl_new1(&mixer_selectunit_ctl, cval); + if (! 
kctl) { + usb_audio_err(state->chip, "cannot malloc kcontrol\n"); +- for (i = 0; i < desc->bNrInPins; i++) +- kfree(namelist[i]); +- kfree(namelist); +- kfree(cval); +- return -ENOMEM; ++ err = -ENOMEM; ++ goto error_name; + } + kctl->private_value = (unsigned long)namelist; + kctl->private_free = usb_mixer_selector_elem_free; +@@ -2774,6 +2715,14 @@ static int parse_audio_selector_unit(struct mixer_build *state, int unitid, + usb_audio_dbg(state->chip, "[%d] SU [%s] items = %d\n", + cval->head.id, kctl->id.name, desc->bNrInPins); + return snd_usb_mixer_add_control(&cval->head, kctl); ++ ++ error_name: ++ for (i = 0; i < desc->bNrInPins; i++) ++ kfree(namelist[i]); ++ kfree(namelist); ++ error_cval: ++ usb_mixer_elem_info_free(cval); ++ return err; + } + + /* +@@ -2794,62 +2743,49 @@ static int parse_audio_unit(struct mixer_build *state, int unitid) + return -EINVAL; + } + +- if (protocol == UAC_VERSION_1 || protocol == UAC_VERSION_2) { +- switch (p1[2]) { +- case UAC_INPUT_TERMINAL: +- return parse_audio_input_terminal(state, unitid, p1); +- case UAC_MIXER_UNIT: +- return parse_audio_mixer_unit(state, unitid, p1); +- case UAC2_CLOCK_SOURCE: +- return parse_clock_source_unit(state, unitid, p1); +- case UAC_SELECTOR_UNIT: +- case UAC2_CLOCK_SELECTOR: +- return parse_audio_selector_unit(state, unitid, p1); +- case UAC_FEATURE_UNIT: +- return parse_audio_feature_unit(state, unitid, p1); +- case UAC1_PROCESSING_UNIT: +- /* UAC2_EFFECT_UNIT has the same value */ +- if (protocol == UAC_VERSION_1) +- return parse_audio_processing_unit(state, unitid, p1); +- else +- return 0; /* FIXME - effect units not implemented yet */ +- case UAC1_EXTENSION_UNIT: +- /* UAC2_PROCESSING_UNIT_V2 has the same value */ +- if (protocol == UAC_VERSION_1) +- return parse_audio_extension_unit(state, unitid, p1); +- else /* UAC_VERSION_2 */ +- return parse_audio_processing_unit(state, unitid, p1); +- case UAC2_EXTENSION_UNIT_V2: +- return parse_audio_extension_unit(state, unitid, p1); +- default: +- usb_audio_err(state->chip, +- "unit %u: unexpected type 0x%02x\n", unitid, p1[2]); +- return -EINVAL; +- } +- } else { /* UAC_VERSION_3 */ +- switch (p1[2]) { +- case UAC_INPUT_TERMINAL: +- return parse_audio_input_terminal(state, unitid, p1); +- case UAC3_MIXER_UNIT: +- return parse_audio_mixer_unit(state, unitid, p1); +- case UAC3_CLOCK_SOURCE: +- return parse_clock_source_unit(state, unitid, p1); +- case UAC3_SELECTOR_UNIT: +- case UAC3_CLOCK_SELECTOR: +- return parse_audio_selector_unit(state, unitid, p1); +- case UAC3_FEATURE_UNIT: +- return parse_audio_feature_unit(state, unitid, p1); +- case UAC3_EFFECT_UNIT: +- return 0; /* FIXME - effect units not implemented yet */ +- case UAC3_PROCESSING_UNIT: +- return parse_audio_processing_unit(state, unitid, p1); +- case UAC3_EXTENSION_UNIT: +- return parse_audio_extension_unit(state, unitid, p1); +- default: +- usb_audio_err(state->chip, +- "unit %u: unexpected type 0x%02x\n", unitid, p1[2]); +- return -EINVAL; +- } ++ if (!snd_usb_validate_audio_desc(p1, protocol)) { ++ usb_audio_dbg(state->chip, "invalid unit %d\n", unitid); ++ return 0; /* skip invalid unit */ ++ } ++ ++ switch (PTYPE(protocol, p1[2])) { ++ case PTYPE(UAC_VERSION_1, UAC_INPUT_TERMINAL): ++ case PTYPE(UAC_VERSION_2, UAC_INPUT_TERMINAL): ++ case PTYPE(UAC_VERSION_3, UAC_INPUT_TERMINAL): ++ return parse_audio_input_terminal(state, unitid, p1); ++ case PTYPE(UAC_VERSION_1, UAC_MIXER_UNIT): ++ case PTYPE(UAC_VERSION_2, UAC_MIXER_UNIT): ++ case PTYPE(UAC_VERSION_3, UAC3_MIXER_UNIT): ++ return 
parse_audio_mixer_unit(state, unitid, p1); ++ case PTYPE(UAC_VERSION_2, UAC2_CLOCK_SOURCE): ++ case PTYPE(UAC_VERSION_3, UAC3_CLOCK_SOURCE): ++ return parse_clock_source_unit(state, unitid, p1); ++ case PTYPE(UAC_VERSION_1, UAC_SELECTOR_UNIT): ++ case PTYPE(UAC_VERSION_2, UAC_SELECTOR_UNIT): ++ case PTYPE(UAC_VERSION_3, UAC3_SELECTOR_UNIT): ++ case PTYPE(UAC_VERSION_2, UAC2_CLOCK_SELECTOR): ++ case PTYPE(UAC_VERSION_3, UAC3_CLOCK_SELECTOR): ++ return parse_audio_selector_unit(state, unitid, p1); ++ case PTYPE(UAC_VERSION_1, UAC_FEATURE_UNIT): ++ case PTYPE(UAC_VERSION_2, UAC_FEATURE_UNIT): ++ case PTYPE(UAC_VERSION_3, UAC3_FEATURE_UNIT): ++ return parse_audio_feature_unit(state, unitid, p1); ++ case PTYPE(UAC_VERSION_1, UAC1_PROCESSING_UNIT): ++ case PTYPE(UAC_VERSION_2, UAC2_PROCESSING_UNIT_V2): ++ case PTYPE(UAC_VERSION_3, UAC3_PROCESSING_UNIT): ++ return parse_audio_processing_unit(state, unitid, p1); ++ case PTYPE(UAC_VERSION_1, UAC1_EXTENSION_UNIT): ++ case PTYPE(UAC_VERSION_2, UAC2_EXTENSION_UNIT_V2): ++ case PTYPE(UAC_VERSION_3, UAC3_EXTENSION_UNIT): ++ return parse_audio_extension_unit(state, unitid, p1); ++ case PTYPE(UAC_VERSION_2, UAC2_EFFECT_UNIT): ++ case PTYPE(UAC_VERSION_3, UAC3_EFFECT_UNIT): ++ return 0; /* FIXME - effect units not implemented yet */ ++ default: ++ usb_audio_err(state->chip, ++ "unit %u: unexpected type 0x%02x\n", ++ unitid, p1[2]); ++ return -EINVAL; + } + } + +@@ -3164,11 +3100,12 @@ static int snd_usb_mixer_controls(struct usb_mixer_interface *mixer) + while ((p = snd_usb_find_csint_desc(mixer->hostif->extra, + mixer->hostif->extralen, + p, UAC_OUTPUT_TERMINAL)) != NULL) { ++ if (!snd_usb_validate_audio_desc(p, mixer->protocol)) ++ continue; /* skip invalid descriptor */ ++ + if (mixer->protocol == UAC_VERSION_1) { + struct uac1_output_terminal_descriptor *desc = p; + +- if (desc->bLength < sizeof(*desc)) +- continue; /* invalid descriptor? */ + /* mark terminal ID as visited */ + set_bit(desc->bTerminalID, state.unitbitmap); + state.oterm.id = desc->bTerminalID; +@@ -3180,8 +3117,6 @@ static int snd_usb_mixer_controls(struct usb_mixer_interface *mixer) + } else if (mixer->protocol == UAC_VERSION_2) { + struct uac2_output_terminal_descriptor *desc = p; + +- if (desc->bLength < sizeof(*desc)) +- continue; /* invalid descriptor? */ + /* mark terminal ID as visited */ + set_bit(desc->bTerminalID, state.unitbitmap); + state.oterm.id = desc->bTerminalID; +@@ -3207,8 +3142,6 @@ static int snd_usb_mixer_controls(struct usb_mixer_interface *mixer) + } else { /* UAC_VERSION_3 */ + struct uac3_output_terminal_descriptor *desc = p; + +- if (desc->bLength < sizeof(*desc)) +- continue; /* invalid descriptor? 
*/ + /* mark terminal ID as visited */ + set_bit(desc->bTerminalID, state.unitbitmap); + state.oterm.id = desc->bTerminalID; +diff --git a/sound/usb/power.c b/sound/usb/power.c +index bd303a1ba1b7..606a2cb23eab 100644 +--- a/sound/usb/power.c ++++ b/sound/usb/power.c +@@ -31,6 +31,8 @@ snd_usb_find_power_domain(struct usb_host_interface *ctrl_iface, + struct uac3_power_domain_descriptor *pd_desc = p; + int i; + ++ if (!snd_usb_validate_audio_desc(p, UAC_VERSION_3)) ++ continue; + for (i = 0; i < pd_desc->bNrEntities; i++) { + if (pd_desc->baEntityID[i] == id) { + pd->pd_id = pd_desc->bPowerDomainID; +diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c +index c102c0377ad9..ea253c97b8b9 100644 +--- a/sound/usb/quirks.c ++++ b/sound/usb/quirks.c +@@ -259,6 +259,9 @@ static int create_yamaha_midi_quirk(struct snd_usb_audio *chip, + NULL, USB_MS_MIDI_OUT_JACK); + if (!injd && !outjd) + return -ENODEV; ++ if (!(injd && snd_usb_validate_midi_desc(injd)) || ++ !(outjd && snd_usb_validate_midi_desc(outjd))) ++ return -ENODEV; + if (injd && (injd->bLength < 5 || + (injd->bJackType != USB_MS_EMBEDDED && + injd->bJackType != USB_MS_EXTERNAL))) +diff --git a/sound/usb/stream.c b/sound/usb/stream.c +index bc582202bd10..9d020bd0de17 100644 +--- a/sound/usb/stream.c ++++ b/sound/usb/stream.c +@@ -637,16 +637,14 @@ static int parse_uac_endpoint_attributes(struct snd_usb_audio *chip, + */ + static void * + snd_usb_find_input_terminal_descriptor(struct usb_host_interface *ctrl_iface, +- int terminal_id, bool uac23) ++ int terminal_id, int protocol) + { + struct uac2_input_terminal_descriptor *term = NULL; +- size_t minlen = uac23 ? sizeof(struct uac2_input_terminal_descriptor) : +- sizeof(struct uac_input_terminal_descriptor); + + while ((term = snd_usb_find_csint_desc(ctrl_iface->extra, + ctrl_iface->extralen, + term, UAC_INPUT_TERMINAL))) { +- if (term->bLength < minlen) ++ if (!snd_usb_validate_audio_desc(term, protocol)) + continue; + if (term->bTerminalID == terminal_id) + return term; +@@ -657,7 +655,7 @@ snd_usb_find_input_terminal_descriptor(struct usb_host_interface *ctrl_iface, + + static void * + snd_usb_find_output_terminal_descriptor(struct usb_host_interface *ctrl_iface, +- int terminal_id) ++ int terminal_id, int protocol) + { + /* OK to use with both UAC2 and UAC3 */ + struct uac2_output_terminal_descriptor *term = NULL; +@@ -665,8 +663,9 @@ snd_usb_find_output_terminal_descriptor(struct usb_host_interface *ctrl_iface, + while ((term = snd_usb_find_csint_desc(ctrl_iface->extra, + ctrl_iface->extralen, + term, UAC_OUTPUT_TERMINAL))) { +- if (term->bLength >= sizeof(*term) && +- term->bTerminalID == terminal_id) ++ if (!snd_usb_validate_audio_desc(term, protocol)) ++ continue; ++ if (term->bTerminalID == terminal_id) + return term; + } + +@@ -741,7 +740,7 @@ snd_usb_get_audioformat_uac12(struct snd_usb_audio *chip, + + iterm = snd_usb_find_input_terminal_descriptor(chip->ctrl_intf, + as->bTerminalLink, +- false); ++ protocol); + if (iterm) { + num_channels = iterm->bNrChannels; + chconfig = le16_to_cpu(iterm->wChannelConfig); +@@ -777,7 +776,7 @@ snd_usb_get_audioformat_uac12(struct snd_usb_audio *chip, + */ + input_term = snd_usb_find_input_terminal_descriptor(chip->ctrl_intf, + as->bTerminalLink, +- true); ++ protocol); + if (input_term) { + clock = input_term->bCSourceID; + if (!chconfig && (num_channels == input_term->bNrChannels)) +@@ -786,7 +785,8 @@ snd_usb_get_audioformat_uac12(struct snd_usb_audio *chip, + } + + output_term = snd_usb_find_output_terminal_descriptor(chip->ctrl_intf, +- 
as->bTerminalLink); ++ as->bTerminalLink, ++ protocol); + if (output_term) { + clock = output_term->bCSourceID; + goto found_clock; +@@ -1012,14 +1012,15 @@ snd_usb_get_audioformat_uac3(struct snd_usb_audio *chip, + */ + input_term = snd_usb_find_input_terminal_descriptor(chip->ctrl_intf, + as->bTerminalLink, +- true); ++ UAC_VERSION_3); + if (input_term) { + clock = input_term->bCSourceID; + goto found_clock; + } + + output_term = snd_usb_find_output_terminal_descriptor(chip->ctrl_intf, +- as->bTerminalLink); ++ as->bTerminalLink, ++ UAC_VERSION_3); + if (output_term) { + clock = output_term->bCSourceID; + goto found_clock; +diff --git a/sound/usb/validate.c b/sound/usb/validate.c +new file mode 100644 +index 000000000000..a5e584b60dcd +--- /dev/null ++++ b/sound/usb/validate.c +@@ -0,0 +1,332 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++// ++// Validation of USB-audio class descriptors ++// ++ ++#include <linux/init.h> ++#include <linux/usb.h> ++#include <linux/usb/audio.h> ++#include <linux/usb/audio-v2.h> ++#include <linux/usb/audio-v3.h> ++#include <linux/usb/midi.h> ++#include "usbaudio.h" ++#include "helper.h" ++ ++struct usb_desc_validator { ++ unsigned char protocol; ++ unsigned char type; ++ bool (*func)(const void *p, const struct usb_desc_validator *v); ++ size_t size; ++}; ++ ++#define UAC_VERSION_ALL (unsigned char)(-1) ++ ++/* UAC1 only */ ++static bool validate_uac1_header(const void *p, ++ const struct usb_desc_validator *v) ++{ ++ const struct uac1_ac_header_descriptor *d = p; ++ ++ return d->bLength >= sizeof(*d) && ++ d->bLength >= sizeof(*d) + d->bInCollection; ++} ++ ++/* for mixer unit; covering all UACs */ ++static bool validate_mixer_unit(const void *p, ++ const struct usb_desc_validator *v) ++{ ++ const struct uac_mixer_unit_descriptor *d = p; ++ size_t len; ++ ++ if (d->bLength < sizeof(*d) || !d->bNrInPins) ++ return false; ++ len = sizeof(*d) + d->bNrInPins; ++ /* We can't determine the bitmap size only from this unit descriptor, ++ * so just check with the remaining length. ++ * The actual bitmap is checked at mixer unit parser. 
++ */ ++ switch (v->protocol) { ++ case UAC_VERSION_1: ++ default: ++ len += 2 + 1; /* wChannelConfig, iChannelNames */ ++ /* bmControls[n*m] */ ++ len += 1; /* iMixer */ ++ break; ++ case UAC_VERSION_2: ++ len += 4 + 1; /* bmChannelConfig, iChannelNames */ ++ /* bmMixerControls[n*m] */ ++ len += 1 + 1; /* bmControls, iMixer */ ++ break; ++ case UAC_VERSION_3: ++ len += 2; /* wClusterDescrID */ ++ /* bmMixerControls[n*m] */ ++ break; ++ } ++ return d->bLength >= len; ++} ++ ++/* both for processing and extension units; covering all UACs */ ++static bool validate_processing_unit(const void *p, ++ const struct usb_desc_validator *v) ++{ ++ const struct uac_processing_unit_descriptor *d = p; ++ const unsigned char *hdr = p; ++ size_t len, m; ++ ++ if (d->bLength < sizeof(*d)) ++ return false; ++ len = sizeof(*d) + d->bNrInPins; ++ if (d->bLength < len) ++ return false; ++ switch (v->protocol) { ++ case UAC_VERSION_1: ++ default: ++ /* bNrChannels, wChannelConfig, iChannelNames, bControlSize */ ++ len += 1 + 2 + 1 + 1; ++ if (d->bLength < len) /* bControlSize */ ++ return false; ++ m = hdr[len]; ++ len += 1 + m + 1; /* bControlSize, bmControls, iProcessing */ ++ break; ++ case UAC_VERSION_2: ++ /* bNrChannels, bmChannelConfig, iChannelNames */ ++ len += 1 + 4 + 1; ++ if (v->type == UAC2_PROCESSING_UNIT_V2) ++ len += 2; /* bmControls -- 2 bytes for PU */ ++ else ++ len += 1; /* bmControls -- 1 byte for EU */ ++ len += 1; /* iProcessing */ ++ break; ++ case UAC_VERSION_3: ++ /* wProcessingDescrStr, bmControls */ ++ len += 2 + 4; ++ break; ++ } ++ if (d->bLength < len) ++ return false; ++ ++ switch (v->protocol) { ++ case UAC_VERSION_1: ++ default: ++ if (v->type == UAC1_EXTENSION_UNIT) ++ return true; /* OK */ ++ switch (d->wProcessType) { ++ case UAC_PROCESS_UP_DOWNMIX: ++ case UAC_PROCESS_DOLBY_PROLOGIC: ++ if (d->bLength < len + 1) /* bNrModes */ ++ return false; ++ m = hdr[len]; ++ len += 1 + m * 2; /* bNrModes, waModes(n) */ ++ break; ++ default: ++ break; ++ } ++ break; ++ case UAC_VERSION_2: ++ if (v->type == UAC2_EXTENSION_UNIT_V2) ++ return true; /* OK */ ++ switch (d->wProcessType) { ++ case UAC2_PROCESS_UP_DOWNMIX: ++ case UAC2_PROCESS_DOLBY_PROLOCIC: /* SiC! 
*/ ++ if (d->bLength < len + 1) /* bNrModes */ ++ return false; ++ m = hdr[len]; ++ len += 1 + m * 4; /* bNrModes, daModes(n) */ ++ break; ++ default: ++ break; ++ } ++ break; ++ case UAC_VERSION_3: ++ if (v->type == UAC3_EXTENSION_UNIT) { ++ len += 2; /* wClusterDescrID */ ++ break; ++ } ++ switch (d->wProcessType) { ++ case UAC3_PROCESS_UP_DOWNMIX: ++ if (d->bLength < len + 1) /* bNrModes */ ++ return false; ++ m = hdr[len]; ++ len += 1 + m * 2; /* bNrModes, waClusterDescrID(n) */ ++ break; ++ case UAC3_PROCESS_MULTI_FUNCTION: ++ len += 2 + 4; /* wClusterDescrID, bmAlgorighms */ ++ break; ++ default: ++ break; ++ } ++ break; ++ } ++ if (d->bLength < len) ++ return false; ++ ++ return true; ++} ++ ++/* both for selector and clock selector units; covering all UACs */ ++static bool validate_selector_unit(const void *p, ++ const struct usb_desc_validator *v) ++{ ++ const struct uac_selector_unit_descriptor *d = p; ++ size_t len; ++ ++ if (d->bLength < sizeof(*d)) ++ return false; ++ len = sizeof(*d) + d->bNrInPins; ++ switch (v->protocol) { ++ case UAC_VERSION_1: ++ default: ++ len += 1; /* iSelector */ ++ break; ++ case UAC_VERSION_2: ++ len += 1 + 1; /* bmControls, iSelector */ ++ break; ++ case UAC_VERSION_3: ++ len += 4 + 2; /* bmControls, wSelectorDescrStr */ ++ break; ++ } ++ return d->bLength >= len; ++} ++ ++static bool validate_uac1_feature_unit(const void *p, ++ const struct usb_desc_validator *v) ++{ ++ const struct uac_feature_unit_descriptor *d = p; ++ ++ if (d->bLength < sizeof(*d) || !d->bControlSize) ++ return false; ++ /* at least bmaControls(0) for master channel + iFeature */ ++ return d->bLength >= sizeof(*d) + d->bControlSize + 1; ++} ++ ++static bool validate_uac2_feature_unit(const void *p, ++ const struct usb_desc_validator *v) ++{ ++ const struct uac2_feature_unit_descriptor *d = p; ++ ++ if (d->bLength < sizeof(*d)) ++ return false; ++ /* at least bmaControls(0) for master channel + iFeature */ ++ return d->bLength >= sizeof(*d) + 4 + 1; ++} ++ ++static bool validate_uac3_feature_unit(const void *p, ++ const struct usb_desc_validator *v) ++{ ++ const struct uac3_feature_unit_descriptor *d = p; ++ ++ if (d->bLength < sizeof(*d)) ++ return false; ++ /* at least bmaControls(0) for master channel + wFeatureDescrStr */ ++ return d->bLength >= sizeof(*d) + 4 + 2; ++} ++ ++static bool validate_midi_out_jack(const void *p, ++ const struct usb_desc_validator *v) ++{ ++ const struct usb_midi_out_jack_descriptor *d = p; ++ ++ return d->bLength >= sizeof(*d) && ++ d->bLength >= sizeof(*d) + d->bNrInputPins * 2; ++} ++ ++#define FIXED(p, t, s) { .protocol = (p), .type = (t), .size = sizeof(s) } ++#define FUNC(p, t, f) { .protocol = (p), .type = (t), .func = (f) } ++ ++static struct usb_desc_validator audio_validators[] = { ++ /* UAC1 */ ++ FUNC(UAC_VERSION_1, UAC_HEADER, validate_uac1_header), ++ FIXED(UAC_VERSION_1, UAC_INPUT_TERMINAL, ++ struct uac_input_terminal_descriptor), ++ FIXED(UAC_VERSION_1, UAC_OUTPUT_TERMINAL, ++ struct uac1_output_terminal_descriptor), ++ FUNC(UAC_VERSION_1, UAC_MIXER_UNIT, validate_mixer_unit), ++ FUNC(UAC_VERSION_1, UAC_SELECTOR_UNIT, validate_selector_unit), ++ FUNC(UAC_VERSION_1, UAC_FEATURE_UNIT, validate_uac1_feature_unit), ++ FUNC(UAC_VERSION_1, UAC1_PROCESSING_UNIT, validate_processing_unit), ++ FUNC(UAC_VERSION_1, UAC1_EXTENSION_UNIT, validate_processing_unit), ++ ++ /* UAC2 */ ++ FIXED(UAC_VERSION_2, UAC_HEADER, struct uac2_ac_header_descriptor), ++ FIXED(UAC_VERSION_2, UAC_INPUT_TERMINAL, ++ struct uac2_input_terminal_descriptor), ++ 
FIXED(UAC_VERSION_2, UAC_OUTPUT_TERMINAL, ++ struct uac2_output_terminal_descriptor), ++ FUNC(UAC_VERSION_2, UAC_MIXER_UNIT, validate_mixer_unit), ++ FUNC(UAC_VERSION_2, UAC_SELECTOR_UNIT, validate_selector_unit), ++ FUNC(UAC_VERSION_2, UAC_FEATURE_UNIT, validate_uac2_feature_unit), ++ /* UAC_VERSION_2, UAC2_EFFECT_UNIT: not implemented yet */ ++ FUNC(UAC_VERSION_2, UAC2_PROCESSING_UNIT_V2, validate_processing_unit), ++ FUNC(UAC_VERSION_2, UAC2_EXTENSION_UNIT_V2, validate_processing_unit), ++ FIXED(UAC_VERSION_2, UAC2_CLOCK_SOURCE, ++ struct uac_clock_source_descriptor), ++ FUNC(UAC_VERSION_2, UAC2_CLOCK_SELECTOR, validate_selector_unit), ++ FIXED(UAC_VERSION_2, UAC2_CLOCK_MULTIPLIER, ++ struct uac_clock_multiplier_descriptor), ++ /* UAC_VERSION_2, UAC2_SAMPLE_RATE_CONVERTER: not implemented yet */ ++ ++ /* UAC3 */ ++ FIXED(UAC_VERSION_2, UAC_HEADER, struct uac3_ac_header_descriptor), ++ FIXED(UAC_VERSION_3, UAC_INPUT_TERMINAL, ++ struct uac3_input_terminal_descriptor), ++ FIXED(UAC_VERSION_3, UAC_OUTPUT_TERMINAL, ++ struct uac3_output_terminal_descriptor), ++ /* UAC_VERSION_3, UAC3_EXTENDED_TERMINAL: not implemented yet */ ++ FUNC(UAC_VERSION_3, UAC3_MIXER_UNIT, validate_mixer_unit), ++ FUNC(UAC_VERSION_3, UAC3_SELECTOR_UNIT, validate_selector_unit), ++ FUNC(UAC_VERSION_3, UAC_FEATURE_UNIT, validate_uac3_feature_unit), ++ /* UAC_VERSION_3, UAC3_EFFECT_UNIT: not implemented yet */ ++ FUNC(UAC_VERSION_3, UAC3_PROCESSING_UNIT, validate_processing_unit), ++ FUNC(UAC_VERSION_3, UAC3_EXTENSION_UNIT, validate_processing_unit), ++ FIXED(UAC_VERSION_3, UAC3_CLOCK_SOURCE, ++ struct uac3_clock_source_descriptor), ++ FUNC(UAC_VERSION_3, UAC3_CLOCK_SELECTOR, validate_selector_unit), ++ FIXED(UAC_VERSION_3, UAC3_CLOCK_MULTIPLIER, ++ struct uac3_clock_multiplier_descriptor), ++ /* UAC_VERSION_3, UAC3_SAMPLE_RATE_CONVERTER: not implemented yet */ ++ /* UAC_VERSION_3, UAC3_CONNECTORS: not implemented yet */ ++ { } /* terminator */ ++}; ++ ++static struct usb_desc_validator midi_validators[] = { ++ FIXED(UAC_VERSION_ALL, USB_MS_HEADER, ++ struct usb_ms_header_descriptor), ++ FIXED(UAC_VERSION_ALL, USB_MS_MIDI_IN_JACK, ++ struct usb_midi_in_jack_descriptor), ++ FUNC(UAC_VERSION_ALL, USB_MS_MIDI_OUT_JACK, ++ validate_midi_out_jack), ++ { } /* terminator */ ++}; ++ ++ ++/* Validate the given unit descriptor, return true if it's OK */ ++static bool validate_desc(unsigned char *hdr, int protocol, ++ const struct usb_desc_validator *v) ++{ ++ if (hdr[1] != USB_DT_CS_INTERFACE) ++ return true; /* don't care */ ++ ++ for (; v->type; v++) { ++ if (v->type == hdr[2] && ++ (v->protocol == UAC_VERSION_ALL || ++ v->protocol == protocol)) { ++ if (v->func) ++ return v->func(hdr, v); ++ /* check for the fixed size */ ++ return hdr[0] >= v->size; ++ } ++ } ++ ++ return true; /* not matching, skip validation */ ++} ++ ++bool snd_usb_validate_audio_desc(void *p, int protocol) ++{ ++ return validate_desc(p, protocol, audio_validators); ++} ++ ++bool snd_usb_validate_midi_desc(void *p) ++{ ++ return validate_desc(p, UAC_VERSION_1, midi_validators); ++} ++ +diff --git a/tools/gpio/Makefile b/tools/gpio/Makefile +index 240eda014b37..f8bc8656a544 100644 +--- a/tools/gpio/Makefile ++++ b/tools/gpio/Makefile +@@ -3,7 +3,11 @@ include ../scripts/Makefile.include + + bindir ?= /usr/bin + +-ifeq ($(srctree),) ++# This will work when gpio is built in tools env. where srctree ++# isn't set and when invoked from selftests build, where srctree ++# is set to ".". 
building_out_of_srctree is undefined for in srctree ++# builds ++ifndef building_out_of_srctree + srctree := $(patsubst %/,%,$(dir $(CURDIR))) + srctree := $(patsubst %/,%,$(dir $(srctree))) + endif +diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c +index e1e94b44d588..918260b65c60 100644 +--- a/tools/perf/util/hist.c ++++ b/tools/perf/util/hist.c +@@ -1508,7 +1508,7 @@ int hists__collapse_resort(struct hists *hists, struct ui_progress *prog) + return 0; + } + +-static int hist_entry__sort(struct hist_entry *a, struct hist_entry *b) ++static int64_t hist_entry__sort(struct hist_entry *a, struct hist_entry *b) + { + struct hists *hists = a->hists; + struct perf_hpp_fmt *fmt; +diff --git a/tools/usb/usbip/libsrc/usbip_device_driver.c b/tools/usb/usbip/libsrc/usbip_device_driver.c +index ec3a0b794f15..67ae6c1557b8 100644 +--- a/tools/usb/usbip/libsrc/usbip_device_driver.c ++++ b/tools/usb/usbip/libsrc/usbip_device_driver.c +@@ -81,7 +81,7 @@ int read_usb_vudc_device(struct udev_device *sdev, struct usbip_usb_device *dev) + FILE *fd = NULL; + struct udev_device *plat; + const char *speed; +- int ret = 0; ++ size_t ret; + + plat = udev_device_get_parent(sdev); + path = udev_device_get_syspath(plat); +@@ -91,8 +91,10 @@ int read_usb_vudc_device(struct udev_device *sdev, struct usbip_usb_device *dev) + if (!fd) + return -1; + ret = fread((char *) &descr, sizeof(descr), 1, fd); +- if (ret < 0) ++ if (ret != 1) { ++ err("Cannot read vudc device descr file: %s", strerror(errno)); + goto err; ++ } + fclose(fd); + + copy_descr_attr(dev, &descr, bDeviceClass); +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index 4a584a575221..7a0d86d52230 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -51,6 +51,7 @@ + #include <linux/slab.h> + #include <linux/sort.h> + #include <linux/bsearch.h> ++#include <linux/kthread.h> + + #include <asm/processor.h> + #include <asm/io.h> +@@ -92,7 +93,7 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); + * kvm->lock --> kvm->slots_lock --> kvm->irq_lock + */ + +-DEFINE_SPINLOCK(kvm_lock); ++DEFINE_MUTEX(kvm_lock); + static DEFINE_RAW_SPINLOCK(kvm_count_lock); + LIST_HEAD(vm_list); + +@@ -616,13 +617,31 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) + + stat_data->kvm = kvm; + stat_data->offset = p->offset; ++ stat_data->mode = p->mode ? p->mode : 0644; + kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; +- debugfs_create_file(p->name, 0644, kvm->debugfs_dentry, ++ debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry, + stat_data, stat_fops_per_vm[p->kind]); + } + return 0; + } + ++/* ++ * Called after the VM is otherwise initialized, but just before adding it to ++ * the vm_list. ++ */ ++int __weak kvm_arch_post_init_vm(struct kvm *kvm) ++{ ++ return 0; ++} ++ ++/* ++ * Called just after removing the VM from the vm_list, but before doing any ++ * other destruction. 
++ */ ++void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) ++{ ++} ++ + static struct kvm *kvm_create_vm(unsigned long type) + { + int r, i; +@@ -677,22 +696,31 @@ static struct kvm *kvm_create_vm(unsigned long type) + rcu_assign_pointer(kvm->buses[i], + kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL)); + if (!kvm->buses[i]) +- goto out_err; ++ goto out_err_no_mmu_notifier; + } + + r = kvm_init_mmu_notifier(kvm); ++ if (r) ++ goto out_err_no_mmu_notifier; ++ ++ r = kvm_arch_post_init_vm(kvm); + if (r) + goto out_err; + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_add(&kvm->vm_list, &vm_list); +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + + preempt_notifier_inc(); + + return kvm; + + out_err: ++#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) ++ if (kvm->mmu_notifier.ops) ++ mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); ++#endif ++out_err_no_mmu_notifier: + cleanup_srcu_struct(&kvm->irq_srcu); + out_err_no_irq_srcu: + cleanup_srcu_struct(&kvm->srcu); +@@ -732,9 +760,11 @@ static void kvm_destroy_vm(struct kvm *kvm) + kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); + kvm_destroy_vm_debugfs(kvm); + kvm_arch_sync_events(kvm); +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_del(&kvm->vm_list); +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); ++ kvm_arch_pre_destroy_vm(kvm); ++ + kvm_free_irq_routing(kvm); + for (i = 0; i < KVM_NR_BUSES; i++) { + struct kvm_io_bus *bus = kvm_get_bus(kvm, i); +@@ -3714,7 +3744,9 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file, + if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) + return -ENOENT; + +- if (simple_attr_open(inode, file, get, set, fmt)) { ++ if (simple_attr_open(inode, file, get, ++ stat_data->mode & S_IWUGO ? 
set : NULL, ++ fmt)) { + kvm_put_kvm(stat_data->kvm); + return -ENOMEM; + } +@@ -3828,13 +3860,13 @@ static int vm_stat_get(void *_offset, u64 *val) + u64 tmp_val; + + *val = 0; +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + stat_tmp.kvm = kvm; + vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); + *val += tmp_val; + } +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + return 0; + } + +@@ -3847,12 +3879,12 @@ static int vm_stat_clear(void *_offset, u64 val) + if (val) + return -EINVAL; + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + stat_tmp.kvm = kvm; + vm_stat_clear_per_vm((void *)&stat_tmp, 0); + } +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + + return 0; + } +@@ -3867,13 +3899,13 @@ static int vcpu_stat_get(void *_offset, u64 *val) + u64 tmp_val; + + *val = 0; +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + stat_tmp.kvm = kvm; + vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); + *val += tmp_val; + } +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + return 0; + } + +@@ -3886,12 +3918,12 @@ static int vcpu_stat_clear(void *_offset, u64 val) + if (val) + return -EINVAL; + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + stat_tmp.kvm = kvm; + vcpu_stat_clear_per_vm((void *)&stat_tmp, 0); + } +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + + return 0; + } +@@ -3912,7 +3944,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) + if (!kvm_dev.this_device || !kvm) + return; + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + if (type == KVM_EVENT_CREATE_VM) { + kvm_createvm_count++; + kvm_active_vms++; +@@ -3921,7 +3953,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) + } + created = kvm_createvm_count; + active = kvm_active_vms; +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + + env = kzalloc(sizeof(*env), GFP_KERNEL); + if (!env) +@@ -3962,7 +3994,8 @@ static void kvm_init_debug(void) + + kvm_debugfs_num_entries = 0; + for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { +- debugfs_create_file(p->name, 0644, kvm_debugfs_dir, ++ int mode = p->mode ? 
p->mode : 0644; ++ debugfs_create_file(p->name, mode, kvm_debugfs_dir, + (void *)(long)p->offset, + stat_fops[p->kind]); + } +@@ -4138,3 +4171,86 @@ void kvm_exit(void) + kvm_vfio_ops_exit(); + } + EXPORT_SYMBOL_GPL(kvm_exit); ++ ++struct kvm_vm_worker_thread_context { ++ struct kvm *kvm; ++ struct task_struct *parent; ++ struct completion init_done; ++ kvm_vm_thread_fn_t thread_fn; ++ uintptr_t data; ++ int err; ++}; ++ ++static int kvm_vm_worker_thread(void *context) ++{ ++ /* ++ * The init_context is allocated on the stack of the parent thread, so ++ * we have to locally copy anything that is needed beyond initialization ++ */ ++ struct kvm_vm_worker_thread_context *init_context = context; ++ struct kvm *kvm = init_context->kvm; ++ kvm_vm_thread_fn_t thread_fn = init_context->thread_fn; ++ uintptr_t data = init_context->data; ++ int err; ++ ++ err = kthread_park(current); ++ /* kthread_park(current) is never supposed to return an error */ ++ WARN_ON(err != 0); ++ if (err) ++ goto init_complete; ++ ++ err = cgroup_attach_task_all(init_context->parent, current); ++ if (err) { ++ kvm_err("%s: cgroup_attach_task_all failed with err %d\n", ++ __func__, err); ++ goto init_complete; ++ } ++ ++ set_user_nice(current, task_nice(init_context->parent)); ++ ++init_complete: ++ init_context->err = err; ++ complete(&init_context->init_done); ++ init_context = NULL; ++ ++ if (err) ++ return err; ++ ++ /* Wait to be woken up by the spawner before proceeding. */ ++ kthread_parkme(); ++ ++ if (!kthread_should_stop()) ++ err = thread_fn(kvm, data); ++ ++ return err; ++} ++ ++int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, ++ uintptr_t data, const char *name, ++ struct task_struct **thread_ptr) ++{ ++ struct kvm_vm_worker_thread_context init_context = {}; ++ struct task_struct *thread; ++ ++ *thread_ptr = NULL; ++ init_context.kvm = kvm; ++ init_context.parent = current; ++ init_context.thread_fn = thread_fn; ++ init_context.data = data; ++ init_completion(&init_context.init_done); ++ ++ thread = kthread_run(kvm_vm_worker_thread, &init_context, ++ "%s-%d", name, task_pid_nr(current)); ++ if (IS_ERR(thread)) ++ return PTR_ERR(thread); ++ ++ /* kthread_run is never supposed to return NULL */ ++ WARN_ON(thread == NULL); ++ ++ wait_for_completion(&init_context.init_done); ++ ++ if (!init_context.err) ++ *thread_ptr = thread; ++ ++ return init_context.err; ++} |
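
The sound/usb/mixer.c changes above collapse the separate UAC1/UAC2 and UAC3 switch statements into one switch keyed by PTYPE(protocol, subtype). The macro body is not visible in these hunks; a minimal standalone sketch of the same dispatch idea, assuming PTYPE() simply folds the protocol version and the descriptor subtype into a single integer key (the constant values below are stand-ins, not the kernel's definitions), could look like this:

/*
 * Sketch of the PTYPE()-style dispatch used by the refactored
 * parse_audio_unit()/__check_input_term(). Everything here is
 * illustrative; the real macro and constants live in sound/usb/.
 */
#include <stdio.h>

#define PTYPE(proto, subtype)	(((proto) << 8) | (subtype))

enum { UAC_VERSION_1 = 0x00, UAC_VERSION_2 = 0x20, UAC_VERSION_3 = 0x30 };
enum { UAC_INPUT_TERMINAL = 0x02, UAC_FEATURE_UNIT = 0x06 };

static const char *describe(int protocol, unsigned char subtype)
{
	/* one switch covers all UAC versions at once */
	switch (PTYPE(protocol, subtype)) {
	case PTYPE(UAC_VERSION_1, UAC_INPUT_TERMINAL):
	case PTYPE(UAC_VERSION_2, UAC_INPUT_TERMINAL):
	case PTYPE(UAC_VERSION_3, UAC_INPUT_TERMINAL):
		return "input terminal";
	case PTYPE(UAC_VERSION_1, UAC_FEATURE_UNIT):
	case PTYPE(UAC_VERSION_2, UAC_FEATURE_UNIT):
		return "feature unit";
	default:
		return "unhandled unit type";
	}
}

int main(void)
{
	printf("%s\n", describe(UAC_VERSION_2, UAC_INPUT_TERMINAL));
	return 0;
}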
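
The new sound/usb/validate.c centralizes descriptor sanity checks that were previously duplicated at each call site: a table maps a descriptor subtype either to a fixed minimum length or to a callback for variable-length layouts, and unknown subtypes pass through unchecked. A self-contained miniature of that pattern, with made-up subtype codes and sizes, is sketched below:

/*
 * Miniature of the table-driven validation pattern from validate.c.
 * Subtypes, sizes, and field offsets here are invented for the example;
 * only the shape of the lookup matches the patch above.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct desc_validator {
	unsigned char type;			/* descriptor subtype to match */
	bool (*func)(const unsigned char *p);	/* custom check, or NULL */
	size_t size;				/* fixed minimum length otherwise */
};

/* variable-length rule: cover the fixed part plus p[3] source entries */
static bool validate_fake_selector(const unsigned char *p)
{
	return p[0] >= 5u && p[0] >= 5u + p[3];
}

static const struct desc_validator validators[] = {
	{ .type = 0x02, .size = 12 },			/* "input terminal" */
	{ .type = 0x05, .func = validate_fake_selector },/* "selector unit" */
	{ 0 }						/* terminator */
};

/* true if the descriptor is OK, or simply not covered by the table */
static bool validate_desc(const unsigned char *desc)
{
	const struct desc_validator *v;

	for (v = validators; v->type; v++) {
		if (v->type != desc[2])
			continue;
		return v->func ? v->func(desc) : desc[0] >= v->size;
	}
	return true;	/* unknown type: leave it to the caller */
}

int main(void)
{
	unsigned char bogus[] = { 0x04, 0x24, 0x02, 0x00 };	/* too short */

	printf("valid: %d\n", validate_desc(bogus));	/* prints 0 */
	return 0;
}

As in validate_desc() above, types the table does not know are reported as valid, so the individual unit parsers remain responsible for anything beyond the basic length checks.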
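
The tools/perf/util/hist.c hunk widens hist_entry__sort() to return int64_t. Returning a 64-bit comparison result through a plain int can make two different keys look equal, or even invert their order, whenever the difference does not fit in 32 bits. A small standalone example of that truncation, using made-up values:

/*
 * Why a comparator that aggregates 64-bit results must not narrow them
 * to int: the low 32 bits of a large difference can be all zero.
 */
#include <inttypes.h>
#include <stdio.h>

static int bad_cmp(uint64_t a, uint64_t b)
{
	return (int)(a - b);	/* keeps only the low 32 bits on common ABIs */
}

static int64_t good_cmp(uint64_t a, uint64_t b)
{
	if (a > b)
		return 1;
	if (a < b)
		return -1;
	return 0;
}

int main(void)
{
	uint64_t a = 0x100000000ULL;	/* 2^32 */
	uint64_t b = 0;

	printf("bad_cmp : %d\n", bad_cmp(a, b));		/* 0: looks equal */
	printf("good_cmp: %" PRId64 "\n", good_cmp(a, b));	/* 1: correct */
	return 0;
}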
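
The usbip hunk fixes read_usb_vudc_device(): fread() returns a size_t item count, so it can never be negative and the old "ret < 0" test could not catch a short read. The corrected pattern, reduced to a standalone example with an illustrative file path and struct:

/*
 * fread() reports how many whole items it read; a short read must be
 * detected by comparing against the requested item count, not against 0.
 * The path and struct below are illustrative only.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>

struct fake_descriptor {
	unsigned char bytes[18];
};

static int read_descriptor(const char *path, struct fake_descriptor *out)
{
	FILE *fp = fopen(path, "rb");
	size_t n;

	if (!fp)
		return -1;
	n = fread(out, sizeof(*out), 1, fp);
	if (n != 1) {	/* short read or error; never a negative value */
		fprintf(stderr, "cannot read %s: %s\n", path, strerror(errno));
		fclose(fp);
		return -1;
	}
	fclose(fp);
	return 0;
}

int main(void)
{
	struct fake_descriptor d;

	return read_descriptor("/nonexistent/dev_desc", &d) ? 1 : 0;
}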