author | Mike Pagano <mpagano@gentoo.org> | 2019-08-04 12:14:44 -0400
committer | Mike Pagano <mpagano@gentoo.org> | 2019-08-04 12:14:44 -0400
commit | e10ac3fedcb0f948cc28973fbe7b54429f65d498 (patch)
tree | 936915d3a0170dc4198b9e62060b448c08ae0e3b
parent | mm/vmalloc: Sync unmappings in __purge_vmap_area_lazy() (diff)
download | linux-patches-e10ac3fedcb0f948cc28973fbe7b54429f65d498.tar.gz linux-patches-e10ac3fedcb0f948cc28973fbe7b54429f65d498.tar.bz2 linux-patches-e10ac3fedcb0f948cc28973fbe7b54429f65d498.zip
Linux patch 4.19.64 (tag: 4.19-64)
Signed-off-by: Mike Pagano <mpagano@gentoo.org>
-rw-r--r-- | 0000_README | 4
-rw-r--r-- | 1063_linux-4.19.64.patch | 2473

2 files changed, 2477 insertions, 0 deletions
diff --git a/0000_README b/0000_README index 4639dffe..391cca5b 100644 --- a/0000_README +++ b/0000_README @@ -295,6 +295,10 @@ Patch: 1062_linux-4.19.63.patch From: https://www.kernel.org Desc: Linux 4.19.63 +Patch: 1063_linux-4.19.64.patch +From: https://www.kernel.org +Desc: Linux 4.19.64 + Patch: 1500_XATTR_USER_PREFIX.patch From: https://bugs.gentoo.org/show_bug.cgi?id=470644 Desc: Support for namespace user.pax.* on tmpfs. diff --git a/1063_linux-4.19.64.patch b/1063_linux-4.19.64.patch new file mode 100644 index 00000000..7fb8fa66 --- /dev/null +++ b/1063_linux-4.19.64.patch @@ -0,0 +1,2473 @@ +diff --git a/Makefile b/Makefile +index 8ad77a93de30..203d9e80a315 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,7 +1,7 @@ + # SPDX-License-Identifier: GPL-2.0 + VERSION = 4 + PATCHLEVEL = 19 +-SUBLEVEL = 63 ++SUBLEVEL = 64 + EXTRAVERSION = + NAME = "People's Front" + +diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h +index 1a037b94eba1..cee28a05ee98 100644 +--- a/arch/arm64/include/asm/compat.h ++++ b/arch/arm64/include/asm/compat.h +@@ -159,6 +159,7 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr) + } + + #define compat_user_stack_pointer() (user_stack_pointer(task_pt_regs(current))) ++#define COMPAT_MINSIGSTKSZ 2048 + + static inline void __user *arch_compat_alloc_user_space(long len) + { +diff --git a/arch/sh/boards/Kconfig b/arch/sh/boards/Kconfig +index 6394b4f0a69b..f42feab25dcf 100644 +--- a/arch/sh/boards/Kconfig ++++ b/arch/sh/boards/Kconfig +@@ -8,27 +8,19 @@ config SH_ALPHA_BOARD + bool + + config SH_DEVICE_TREE +- bool "Board Described by Device Tree" ++ bool + select OF + select OF_EARLY_FLATTREE + select TIMER_OF + select COMMON_CLK + select GENERIC_CALIBRATE_DELAY +- help +- Select Board Described by Device Tree to build a kernel that +- does not hard-code any board-specific knowledge but instead uses +- a device tree blob provided by the boot-loader. You must enable +- drivers for any hardware you want to use separately. At this +- time, only boards based on the open-hardware J-Core processors +- have sufficient driver coverage to use this option; do not +- select it if you are using original SuperH hardware. + + config SH_JCORE_SOC + bool "J-Core SoC" +- depends on SH_DEVICE_TREE && (CPU_SH2 || CPU_J2) ++ select SH_DEVICE_TREE + select CLKSRC_JCORE_PIT + select JCORE_AIC +- default y if CPU_J2 ++ depends on CPU_J2 + help + Select this option to include drivers core components of the + J-Core SoC, including interrupt controllers and timers. +diff --git a/block/blk-core.c b/block/blk-core.c +index 9ca703bcfe3b..4a3e1f417880 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -421,24 +421,25 @@ void blk_sync_queue(struct request_queue *q) + EXPORT_SYMBOL(blk_sync_queue); + + /** +- * blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY ++ * blk_set_pm_only - increment pm_only counter + * @q: request queue pointer +- * +- * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not +- * set and 1 if the flag was already set. 
+ */ +-int blk_set_preempt_only(struct request_queue *q) ++void blk_set_pm_only(struct request_queue *q) + { +- return blk_queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q); ++ atomic_inc(&q->pm_only); + } +-EXPORT_SYMBOL_GPL(blk_set_preempt_only); ++EXPORT_SYMBOL_GPL(blk_set_pm_only); + +-void blk_clear_preempt_only(struct request_queue *q) ++void blk_clear_pm_only(struct request_queue *q) + { +- blk_queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q); +- wake_up_all(&q->mq_freeze_wq); ++ int pm_only; ++ ++ pm_only = atomic_dec_return(&q->pm_only); ++ WARN_ON_ONCE(pm_only < 0); ++ if (pm_only == 0) ++ wake_up_all(&q->mq_freeze_wq); + } +-EXPORT_SYMBOL_GPL(blk_clear_preempt_only); ++EXPORT_SYMBOL_GPL(blk_clear_pm_only); + + /** + * __blk_run_queue_uncond - run a queue whether or not it has been stopped +@@ -916,7 +917,7 @@ EXPORT_SYMBOL(blk_alloc_queue); + */ + int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) + { +- const bool preempt = flags & BLK_MQ_REQ_PREEMPT; ++ const bool pm = flags & BLK_MQ_REQ_PREEMPT; + + while (true) { + bool success = false; +@@ -924,11 +925,11 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) + rcu_read_lock(); + if (percpu_ref_tryget_live(&q->q_usage_counter)) { + /* +- * The code that sets the PREEMPT_ONLY flag is +- * responsible for ensuring that that flag is globally +- * visible before the queue is unfrozen. ++ * The code that increments the pm_only counter is ++ * responsible for ensuring that that counter is ++ * globally visible before the queue is unfrozen. + */ +- if (preempt || !blk_queue_preempt_only(q)) { ++ if (pm || !blk_queue_pm_only(q)) { + success = true; + } else { + percpu_ref_put(&q->q_usage_counter); +@@ -953,7 +954,7 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) + + wait_event(q->mq_freeze_wq, + (atomic_read(&q->mq_freeze_depth) == 0 && +- (preempt || !blk_queue_preempt_only(q))) || ++ (pm || !blk_queue_pm_only(q))) || + blk_queue_dying(q)); + if (blk_queue_dying(q)) + return -ENODEV; +diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c +index cb1e6cf7ac48..a5ea86835fcb 100644 +--- a/block/blk-mq-debugfs.c ++++ b/block/blk-mq-debugfs.c +@@ -102,6 +102,14 @@ static int blk_flags_show(struct seq_file *m, const unsigned long flags, + return 0; + } + ++static int queue_pm_only_show(void *data, struct seq_file *m) ++{ ++ struct request_queue *q = data; ++ ++ seq_printf(m, "%d\n", atomic_read(&q->pm_only)); ++ return 0; ++} ++ + #define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name + static const char *const blk_queue_flag_name[] = { + QUEUE_FLAG_NAME(QUEUED), +@@ -132,7 +140,6 @@ static const char *const blk_queue_flag_name[] = { + QUEUE_FLAG_NAME(REGISTERED), + QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), + QUEUE_FLAG_NAME(QUIESCED), +- QUEUE_FLAG_NAME(PREEMPT_ONLY), + }; + #undef QUEUE_FLAG_NAME + +@@ -209,6 +216,7 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf, + static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { + { "poll_stat", 0400, queue_poll_stat_show }, + { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, ++ { "pm_only", 0600, queue_pm_only_show, NULL }, + { "state", 0600, queue_state_show, queue_state_write }, + { "write_hints", 0600, queue_write_hint_show, queue_write_hint_store }, + { "zone_wlock", 0400, queue_zone_wlock_show, NULL }, +diff --git a/drivers/android/binder.c b/drivers/android/binder.c +index 1e0e438f079f..6e04e7a707a1 100644 +--- a/drivers/android/binder.c ++++ 
b/drivers/android/binder.c +@@ -1960,8 +1960,18 @@ static struct binder_thread *binder_get_txn_from_and_acq_inner( + + static void binder_free_transaction(struct binder_transaction *t) + { +- if (t->buffer) +- t->buffer->transaction = NULL; ++ struct binder_proc *target_proc = t->to_proc; ++ ++ if (target_proc) { ++ binder_inner_proc_lock(target_proc); ++ if (t->buffer) ++ t->buffer->transaction = NULL; ++ binder_inner_proc_unlock(target_proc); ++ } ++ /* ++ * If the transaction has no target_proc, then ++ * t->buffer->transaction has already been cleared. ++ */ + kfree(t); + binder_stats_deleted(BINDER_STAT_TRANSACTION); + } +@@ -3484,10 +3494,12 @@ static int binder_thread_write(struct binder_proc *proc, + buffer->debug_id, + buffer->transaction ? "active" : "finished"); + ++ binder_inner_proc_lock(proc); + if (buffer->transaction) { + buffer->transaction->buffer = NULL; + buffer->transaction = NULL; + } ++ binder_inner_proc_unlock(proc); + if (buffer->async_transaction && buffer->target_node) { + struct binder_node *buf_node; + struct binder_work *w; +diff --git a/drivers/bluetooth/hci_ath.c b/drivers/bluetooth/hci_ath.c +index d568fbd94d6c..20235925344d 100644 +--- a/drivers/bluetooth/hci_ath.c ++++ b/drivers/bluetooth/hci_ath.c +@@ -112,6 +112,9 @@ static int ath_open(struct hci_uart *hu) + + BT_DBG("hu %p", hu); + ++ if (!hci_uart_has_flow_control(hu)) ++ return -EOPNOTSUPP; ++ + ath = kzalloc(sizeof(*ath), GFP_KERNEL); + if (!ath) + return -ENOMEM; +diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c +index 800132369134..aa6b7ed9fdf1 100644 +--- a/drivers/bluetooth/hci_bcm.c ++++ b/drivers/bluetooth/hci_bcm.c +@@ -369,6 +369,9 @@ static int bcm_open(struct hci_uart *hu) + + bt_dev_dbg(hu->hdev, "hu %p", hu); + ++ if (!hci_uart_has_flow_control(hu)) ++ return -EOPNOTSUPP; ++ + bcm = kzalloc(sizeof(*bcm), GFP_KERNEL); + if (!bcm) + return -ENOMEM; +diff --git a/drivers/bluetooth/hci_intel.c b/drivers/bluetooth/hci_intel.c +index 46ace321bf60..e9228520e4c7 100644 +--- a/drivers/bluetooth/hci_intel.c ++++ b/drivers/bluetooth/hci_intel.c +@@ -406,6 +406,9 @@ static int intel_open(struct hci_uart *hu) + + BT_DBG("hu %p", hu); + ++ if (!hci_uart_has_flow_control(hu)) ++ return -EOPNOTSUPP; ++ + intel = kzalloc(sizeof(*intel), GFP_KERNEL); + if (!intel) + return -ENOMEM; +diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c +index c915daf01a89..efeb8137ec67 100644 +--- a/drivers/bluetooth/hci_ldisc.c ++++ b/drivers/bluetooth/hci_ldisc.c +@@ -299,6 +299,19 @@ static int hci_uart_send_frame(struct hci_dev *hdev, struct sk_buff *skb) + return 0; + } + ++/* Check the underlying device or tty has flow control support */ ++bool hci_uart_has_flow_control(struct hci_uart *hu) ++{ ++ /* serdev nodes check if the needed operations are present */ ++ if (hu->serdev) ++ return true; ++ ++ if (hu->tty->driver->ops->tiocmget && hu->tty->driver->ops->tiocmset) ++ return true; ++ ++ return false; ++} ++ + /* Flow control or un-flow control the device */ + void hci_uart_set_flow_control(struct hci_uart *hu, bool enable) + { +diff --git a/drivers/bluetooth/hci_mrvl.c b/drivers/bluetooth/hci_mrvl.c +index ffb00669346f..23791df081ba 100644 +--- a/drivers/bluetooth/hci_mrvl.c ++++ b/drivers/bluetooth/hci_mrvl.c +@@ -66,6 +66,9 @@ static int mrvl_open(struct hci_uart *hu) + + BT_DBG("hu %p", hu); + ++ if (!hci_uart_has_flow_control(hu)) ++ return -EOPNOTSUPP; ++ + mrvl = kzalloc(sizeof(*mrvl), GFP_KERNEL); + if (!mrvl) + return -ENOMEM; +diff --git 
a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c +index 77004c29da08..f96e58de049b 100644 +--- a/drivers/bluetooth/hci_qca.c ++++ b/drivers/bluetooth/hci_qca.c +@@ -450,6 +450,9 @@ static int qca_open(struct hci_uart *hu) + + BT_DBG("hu %p qca_open", hu); + ++ if (!hci_uart_has_flow_control(hu)) ++ return -EOPNOTSUPP; ++ + qca = kzalloc(sizeof(struct qca_data), GFP_KERNEL); + if (!qca) + return -ENOMEM; +diff --git a/drivers/bluetooth/hci_uart.h b/drivers/bluetooth/hci_uart.h +index 00cab2fd7a1b..067a610f1372 100644 +--- a/drivers/bluetooth/hci_uart.h ++++ b/drivers/bluetooth/hci_uart.h +@@ -118,6 +118,7 @@ int hci_uart_tx_wakeup(struct hci_uart *hu); + int hci_uart_init_ready(struct hci_uart *hu); + void hci_uart_init_work(struct work_struct *work); + void hci_uart_set_baudrate(struct hci_uart *hu, unsigned int speed); ++bool hci_uart_has_flow_control(struct hci_uart *hu); + void hci_uart_set_flow_control(struct hci_uart *hu, bool enable); + void hci_uart_set_speeds(struct hci_uart *hu, unsigned int init_speed, + unsigned int oper_speed); +diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c +index c1439019dd12..b9af2419006f 100644 +--- a/drivers/iommu/intel-iommu.c ++++ b/drivers/iommu/intel-iommu.c +@@ -3721,7 +3721,7 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size) + + freelist = domain_unmap(domain, start_pfn, last_pfn); + +- if (intel_iommu_strict) { ++ if (intel_iommu_strict || !has_iova_flush_queue(&domain->iovad)) { + iommu_flush_iotlb_psi(iommu, domain, start_pfn, + nrpages, !freelist, 0); + /* free iova */ +diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c +index 83fe2621effe..60348d707b99 100644 +--- a/drivers/iommu/iova.c ++++ b/drivers/iommu/iova.c +@@ -65,9 +65,14 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule, + } + EXPORT_SYMBOL_GPL(init_iova_domain); + ++bool has_iova_flush_queue(struct iova_domain *iovad) ++{ ++ return !!iovad->fq; ++} ++ + static void free_iova_flush_queue(struct iova_domain *iovad) + { +- if (!iovad->fq) ++ if (!has_iova_flush_queue(iovad)) + return; + + if (timer_pending(&iovad->fq_timer)) +@@ -85,13 +90,14 @@ static void free_iova_flush_queue(struct iova_domain *iovad) + int init_iova_flush_queue(struct iova_domain *iovad, + iova_flush_cb flush_cb, iova_entry_dtor entry_dtor) + { ++ struct iova_fq __percpu *queue; + int cpu; + + atomic64_set(&iovad->fq_flush_start_cnt, 0); + atomic64_set(&iovad->fq_flush_finish_cnt, 0); + +- iovad->fq = alloc_percpu(struct iova_fq); +- if (!iovad->fq) ++ queue = alloc_percpu(struct iova_fq); ++ if (!queue) + return -ENOMEM; + + iovad->flush_cb = flush_cb; +@@ -100,13 +106,17 @@ int init_iova_flush_queue(struct iova_domain *iovad, + for_each_possible_cpu(cpu) { + struct iova_fq *fq; + +- fq = per_cpu_ptr(iovad->fq, cpu); ++ fq = per_cpu_ptr(queue, cpu); + fq->head = 0; + fq->tail = 0; + + spin_lock_init(&fq->lock); + } + ++ smp_wmb(); ++ ++ iovad->fq = queue; ++ + timer_setup(&iovad->fq_timer, fq_flush_timeout, 0); + atomic_set(&iovad->fq_timer_on, 0); + +diff --git a/drivers/isdn/hardware/mISDN/hfcsusb.c b/drivers/isdn/hardware/mISDN/hfcsusb.c +index 6d05946b445e..060dc7fd66c1 100644 +--- a/drivers/isdn/hardware/mISDN/hfcsusb.c ++++ b/drivers/isdn/hardware/mISDN/hfcsusb.c +@@ -1967,6 +1967,9 @@ hfcsusb_probe(struct usb_interface *intf, const struct usb_device_id *id) + + /* get endpoint base */ + idx = ((ep_addr & 0x7f) - 1) * 2; ++ if (idx > 15) ++ return -EIO; ++ + if (ep_addr & 0x80) + idx++; + attr = ep->desc.bmAttributes; 
+diff --git a/drivers/media/radio/radio-raremono.c b/drivers/media/radio/radio-raremono.c +index 9a5079d64c4a..729600c4a056 100644 +--- a/drivers/media/radio/radio-raremono.c ++++ b/drivers/media/radio/radio-raremono.c +@@ -271,6 +271,14 @@ static int vidioc_g_frequency(struct file *file, void *priv, + return 0; + } + ++static void raremono_device_release(struct v4l2_device *v4l2_dev) ++{ ++ struct raremono_device *radio = to_raremono_dev(v4l2_dev); ++ ++ kfree(radio->buffer); ++ kfree(radio); ++} ++ + /* File system interface */ + static const struct v4l2_file_operations usb_raremono_fops = { + .owner = THIS_MODULE, +@@ -295,12 +303,14 @@ static int usb_raremono_probe(struct usb_interface *intf, + struct raremono_device *radio; + int retval = 0; + +- radio = devm_kzalloc(&intf->dev, sizeof(struct raremono_device), GFP_KERNEL); +- if (radio) +- radio->buffer = devm_kmalloc(&intf->dev, BUFFER_LENGTH, GFP_KERNEL); +- +- if (!radio || !radio->buffer) ++ radio = kzalloc(sizeof(*radio), GFP_KERNEL); ++ if (!radio) ++ return -ENOMEM; ++ radio->buffer = kmalloc(BUFFER_LENGTH, GFP_KERNEL); ++ if (!radio->buffer) { ++ kfree(radio); + return -ENOMEM; ++ } + + radio->usbdev = interface_to_usbdev(intf); + radio->intf = intf; +@@ -324,7 +334,8 @@ static int usb_raremono_probe(struct usb_interface *intf, + if (retval != 3 || + (get_unaligned_be16(&radio->buffer[1]) & 0xfff) == 0x0242) { + dev_info(&intf->dev, "this is not Thanko's Raremono.\n"); +- return -ENODEV; ++ retval = -ENODEV; ++ goto free_mem; + } + + dev_info(&intf->dev, "Thanko's Raremono connected: (%04X:%04X)\n", +@@ -333,7 +344,7 @@ static int usb_raremono_probe(struct usb_interface *intf, + retval = v4l2_device_register(&intf->dev, &radio->v4l2_dev); + if (retval < 0) { + dev_err(&intf->dev, "couldn't register v4l2_device\n"); +- return retval; ++ goto free_mem; + } + + mutex_init(&radio->lock); +@@ -345,6 +356,7 @@ static int usb_raremono_probe(struct usb_interface *intf, + radio->vdev.ioctl_ops = &usb_raremono_ioctl_ops; + radio->vdev.lock = &radio->lock; + radio->vdev.release = video_device_release_empty; ++ radio->v4l2_dev.release = raremono_device_release; + + usb_set_intfdata(intf, &radio->v4l2_dev); + +@@ -360,6 +372,10 @@ static int usb_raremono_probe(struct usb_interface *intf, + } + dev_err(&intf->dev, "could not register video device\n"); + v4l2_device_unregister(&radio->v4l2_dev); ++ ++free_mem: ++ kfree(radio->buffer); ++ kfree(radio); + return retval; + } + +diff --git a/drivers/media/usb/au0828/au0828-core.c b/drivers/media/usb/au0828/au0828-core.c +index 257ae0d8cfe2..e3f63299f85c 100644 +--- a/drivers/media/usb/au0828/au0828-core.c ++++ b/drivers/media/usb/au0828/au0828-core.c +@@ -623,6 +623,12 @@ static int au0828_usb_probe(struct usb_interface *interface, + /* Setup */ + au0828_card_setup(dev); + ++ /* ++ * Store the pointer to the au0828_dev so it can be accessed in ++ * au0828_usb_disconnect ++ */ ++ usb_set_intfdata(interface, dev); ++ + /* Analog TV */ + retval = au0828_analog_register(dev, interface); + if (retval) { +@@ -641,12 +647,6 @@ static int au0828_usb_probe(struct usb_interface *interface, + /* Remote controller */ + au0828_rc_register(dev); + +- /* +- * Store the pointer to the au0828_dev so it can be accessed in +- * au0828_usb_disconnect +- */ +- usb_set_intfdata(interface, dev); +- + pr_info("Registered device AU0828 [%s]\n", + dev->board.name == NULL ? 
"Unset" : dev->board.name); + +diff --git a/drivers/media/usb/cpia2/cpia2_usb.c b/drivers/media/usb/cpia2/cpia2_usb.c +index a771e0a52610..f5b04594e209 100644 +--- a/drivers/media/usb/cpia2/cpia2_usb.c ++++ b/drivers/media/usb/cpia2/cpia2_usb.c +@@ -902,7 +902,6 @@ static void cpia2_usb_disconnect(struct usb_interface *intf) + cpia2_unregister_camera(cam); + v4l2_device_disconnect(&cam->v4l2_dev); + mutex_unlock(&cam->v4l2_lock); +- v4l2_device_put(&cam->v4l2_dev); + + if(cam->buffers) { + DBG("Wakeup waiting processes\n"); +@@ -911,6 +910,8 @@ static void cpia2_usb_disconnect(struct usb_interface *intf) + wake_up_interruptible(&cam->wq_stream); + } + ++ v4l2_device_put(&cam->v4l2_dev); ++ + LOG("CPiA2 camera disconnected.\n"); + } + +diff --git a/drivers/media/usb/pvrusb2/pvrusb2-hdw.c b/drivers/media/usb/pvrusb2/pvrusb2-hdw.c +index 673fdca8d2da..fcb201a40920 100644 +--- a/drivers/media/usb/pvrusb2/pvrusb2-hdw.c ++++ b/drivers/media/usb/pvrusb2/pvrusb2-hdw.c +@@ -1680,7 +1680,7 @@ static int pvr2_decoder_enable(struct pvr2_hdw *hdw,int enablefl) + } + if (!hdw->flag_decoder_missed) { + pvr2_trace(PVR2_TRACE_ERROR_LEGS, +- "WARNING: No decoder present"); ++ "***WARNING*** No decoder present"); + hdw->flag_decoder_missed = !0; + trace_stbit("flag_decoder_missed", + hdw->flag_decoder_missed); +@@ -2366,7 +2366,7 @@ struct pvr2_hdw *pvr2_hdw_create(struct usb_interface *intf, + if (hdw_desc->flag_is_experimental) { + pvr2_trace(PVR2_TRACE_INFO, "**********"); + pvr2_trace(PVR2_TRACE_INFO, +- "WARNING: Support for this device (%s) is experimental.", ++ "***WARNING*** Support for this device (%s) is experimental.", + hdw_desc->description); + pvr2_trace(PVR2_TRACE_INFO, + "Important functionality might not be entirely working."); +diff --git a/drivers/media/usb/pvrusb2/pvrusb2-i2c-core.c b/drivers/media/usb/pvrusb2/pvrusb2-i2c-core.c +index f3003ca05f4b..922c06279663 100644 +--- a/drivers/media/usb/pvrusb2/pvrusb2-i2c-core.c ++++ b/drivers/media/usb/pvrusb2/pvrusb2-i2c-core.c +@@ -343,11 +343,11 @@ static int i2c_hack_cx25840(struct pvr2_hdw *hdw, + + if ((ret != 0) || (*rdata == 0x04) || (*rdata == 0x0a)) { + pvr2_trace(PVR2_TRACE_ERROR_LEGS, +- "WARNING: Detected a wedged cx25840 chip; the device will not work."); ++ "***WARNING*** Detected a wedged cx25840 chip; the device will not work."); + pvr2_trace(PVR2_TRACE_ERROR_LEGS, +- "WARNING: Try power cycling the pvrusb2 device."); ++ "***WARNING*** Try power cycling the pvrusb2 device."); + pvr2_trace(PVR2_TRACE_ERROR_LEGS, +- "WARNING: Disabling further access to the device to prevent other foul-ups."); ++ "***WARNING*** Disabling further access to the device to prevent other foul-ups."); + // This blocks all further communication with the part. 
+ hdw->i2c_func[0x44] = NULL; + pvr2_hdw_render_useless(hdw); +diff --git a/drivers/media/usb/pvrusb2/pvrusb2-std.c b/drivers/media/usb/pvrusb2/pvrusb2-std.c +index 6b651f8b54df..37dc299a1ca2 100644 +--- a/drivers/media/usb/pvrusb2/pvrusb2-std.c ++++ b/drivers/media/usb/pvrusb2/pvrusb2-std.c +@@ -353,7 +353,7 @@ struct v4l2_standard *pvr2_std_create_enum(unsigned int *countptr, + bcnt = pvr2_std_id_to_str(buf,sizeof(buf),fmsk); + pvr2_trace( + PVR2_TRACE_ERROR_LEGS, +- "WARNING: Failed to classify the following standard(s): %.*s", ++ "***WARNING*** Failed to classify the following standard(s): %.*s", + bcnt,buf); + } + +diff --git a/drivers/net/wireless/ath/ath10k/usb.c b/drivers/net/wireless/ath/ath10k/usb.c +index d4803ff5a78a..f09a4ad2e9de 100644 +--- a/drivers/net/wireless/ath/ath10k/usb.c ++++ b/drivers/net/wireless/ath/ath10k/usb.c +@@ -1025,7 +1025,7 @@ static int ath10k_usb_probe(struct usb_interface *interface, + } + + /* TODO: remove this once USB support is fully implemented */ +- ath10k_warn(ar, "WARNING: ath10k USB support is incomplete, don't expect anything to work!\n"); ++ ath10k_warn(ar, "Warning: ath10k USB support is incomplete, don't expect anything to work!\n"); + + return 0; + +diff --git a/drivers/pps/pps.c b/drivers/pps/pps.c +index 8febacb8fc54..0951564b6830 100644 +--- a/drivers/pps/pps.c ++++ b/drivers/pps/pps.c +@@ -166,6 +166,14 @@ static long pps_cdev_ioctl(struct file *file, + pps->params.mode |= PPS_CANWAIT; + pps->params.api_version = PPS_API_VERS; + ++ /* ++ * Clear unused fields of pps_kparams to avoid leaking ++ * uninitialized data of the PPS_SETPARAMS caller via ++ * PPS_GETPARAMS ++ */ ++ pps->params.assert_off_tu.flags = 0; ++ pps->params.clear_off_tu.flags = 0; ++ + spin_unlock_irq(&pps->lock); + + break; +diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c +index 32652b2c5e7c..75b926e70076 100644 +--- a/drivers/scsi/scsi_lib.c ++++ b/drivers/scsi/scsi_lib.c +@@ -3059,11 +3059,14 @@ scsi_device_quiesce(struct scsi_device *sdev) + */ + WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current); + +- blk_set_preempt_only(q); ++ if (sdev->quiesced_by == current) ++ return 0; ++ ++ blk_set_pm_only(q); + + blk_mq_freeze_queue(q); + /* +- * Ensure that the effect of blk_set_preempt_only() will be visible ++ * Ensure that the effect of blk_set_pm_only() will be visible + * for percpu_ref_tryget() callers that occur after the queue + * unfreeze even if the queue was already frozen before this function + * was called. See also https://lwn.net/Articles/573497/. 
+@@ -3076,7 +3079,7 @@ scsi_device_quiesce(struct scsi_device *sdev) + if (err == 0) + sdev->quiesced_by = current; + else +- blk_clear_preempt_only(q); ++ blk_clear_pm_only(q); + mutex_unlock(&sdev->state_mutex); + + return err; +@@ -3099,8 +3102,10 @@ void scsi_device_resume(struct scsi_device *sdev) + * device deleted during suspend) + */ + mutex_lock(&sdev->state_mutex); +- sdev->quiesced_by = NULL; +- blk_clear_preempt_only(sdev->request_queue); ++ if (sdev->quiesced_by) { ++ sdev->quiesced_by = NULL; ++ blk_clear_pm_only(sdev->request_queue); ++ } + if (sdev->sdev_state == SDEV_QUIESCE) + scsi_device_set_state(sdev, SDEV_RUNNING); + mutex_unlock(&sdev->state_mutex); +diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c +index 03614ef64ca4..3f68edde0f03 100644 +--- a/drivers/usb/dwc2/gadget.c ++++ b/drivers/usb/dwc2/gadget.c +@@ -3125,6 +3125,7 @@ void dwc2_hsotg_disconnect(struct dwc2_hsotg *hsotg) + hsotg->connected = 0; + hsotg->test_mode = 0; + ++ /* all endpoints should be shutdown */ + for (ep = 0; ep < hsotg->num_of_eps; ep++) { + if (hsotg->eps_in[ep]) + kill_all_requests(hsotg, hsotg->eps_in[ep], +@@ -3175,6 +3176,7 @@ static void dwc2_hsotg_irq_fifoempty(struct dwc2_hsotg *hsotg, bool periodic) + GINTSTS_PTXFEMP | \ + GINTSTS_RXFLVL) + ++static int dwc2_hsotg_ep_disable(struct usb_ep *ep); + /** + * dwc2_hsotg_core_init - issue softreset to the core + * @hsotg: The device state +@@ -3189,13 +3191,23 @@ void dwc2_hsotg_core_init_disconnected(struct dwc2_hsotg *hsotg, + u32 val; + u32 usbcfg; + u32 dcfg = 0; ++ int ep; + + /* Kill any ep0 requests as controller will be reinitialized */ + kill_all_requests(hsotg, hsotg->eps_out[0], -ECONNRESET); + +- if (!is_usb_reset) ++ if (!is_usb_reset) { + if (dwc2_core_reset(hsotg, true)) + return; ++ } else { ++ /* all endpoints should be shutdown */ ++ for (ep = 1; ep < hsotg->num_of_eps; ep++) { ++ if (hsotg->eps_in[ep]) ++ dwc2_hsotg_ep_disable(&hsotg->eps_in[ep]->ep); ++ if (hsotg->eps_out[ep]) ++ dwc2_hsotg_ep_disable(&hsotg->eps_out[ep]->ep); ++ } ++ } + + /* + * we must now enable ep0 ready for host detection and then +@@ -3993,7 +4005,6 @@ static int dwc2_hsotg_ep_disable(struct usb_ep *ep) + struct dwc2_hsotg *hsotg = hs_ep->parent; + int dir_in = hs_ep->dir_in; + int index = hs_ep->index; +- unsigned long flags; + u32 epctrl_reg; + u32 ctrl; + +@@ -4011,8 +4022,6 @@ static int dwc2_hsotg_ep_disable(struct usb_ep *ep) + + epctrl_reg = dir_in ? DIEPCTL(index) : DOEPCTL(index); + +- spin_lock_irqsave(&hsotg->lock, flags); +- + ctrl = dwc2_readl(hsotg, epctrl_reg); + + if (ctrl & DXEPCTL_EPENA) +@@ -4035,10 +4044,22 @@ static int dwc2_hsotg_ep_disable(struct usb_ep *ep) + hs_ep->fifo_index = 0; + hs_ep->fifo_size = 0; + +- spin_unlock_irqrestore(&hsotg->lock, flags); + return 0; + } + ++static int dwc2_hsotg_ep_disable_lock(struct usb_ep *ep) ++{ ++ struct dwc2_hsotg_ep *hs_ep = our_ep(ep); ++ struct dwc2_hsotg *hsotg = hs_ep->parent; ++ unsigned long flags; ++ int ret; ++ ++ spin_lock_irqsave(&hsotg->lock, flags); ++ ret = dwc2_hsotg_ep_disable(ep); ++ spin_unlock_irqrestore(&hsotg->lock, flags); ++ return ret; ++} ++ + /** + * on_list - check request is on the given endpoint + * @ep: The endpoint to check. 
+@@ -4186,7 +4207,7 @@ static int dwc2_hsotg_ep_sethalt_lock(struct usb_ep *ep, int value) + + static const struct usb_ep_ops dwc2_hsotg_ep_ops = { + .enable = dwc2_hsotg_ep_enable, +- .disable = dwc2_hsotg_ep_disable, ++ .disable = dwc2_hsotg_ep_disable_lock, + .alloc_request = dwc2_hsotg_ep_alloc_request, + .free_request = dwc2_hsotg_ep_free_request, + .queue = dwc2_hsotg_ep_queue_lock, +@@ -4326,9 +4347,9 @@ static int dwc2_hsotg_udc_stop(struct usb_gadget *gadget) + /* all endpoints should be shutdown */ + for (ep = 1; ep < hsotg->num_of_eps; ep++) { + if (hsotg->eps_in[ep]) +- dwc2_hsotg_ep_disable(&hsotg->eps_in[ep]->ep); ++ dwc2_hsotg_ep_disable_lock(&hsotg->eps_in[ep]->ep); + if (hsotg->eps_out[ep]) +- dwc2_hsotg_ep_disable(&hsotg->eps_out[ep]->ep); ++ dwc2_hsotg_ep_disable_lock(&hsotg->eps_out[ep]->ep); + } + + spin_lock_irqsave(&hsotg->lock, flags); +@@ -4776,9 +4797,9 @@ int dwc2_hsotg_suspend(struct dwc2_hsotg *hsotg) + + for (ep = 0; ep < hsotg->num_of_eps; ep++) { + if (hsotg->eps_in[ep]) +- dwc2_hsotg_ep_disable(&hsotg->eps_in[ep]->ep); ++ dwc2_hsotg_ep_disable_lock(&hsotg->eps_in[ep]->ep); + if (hsotg->eps_out[ep]) +- dwc2_hsotg_ep_disable(&hsotg->eps_out[ep]->ep); ++ dwc2_hsotg_ep_disable_lock(&hsotg->eps_out[ep]->ep); + } + } + +diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c +index ae704658b528..124356dc39e1 100644 +--- a/drivers/vhost/net.c ++++ b/drivers/vhost/net.c +@@ -497,12 +497,6 @@ static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter, + return iov_iter_count(iter); + } + +-static bool vhost_exceeds_weight(int pkts, int total_len) +-{ +- return total_len >= VHOST_NET_WEIGHT || +- pkts >= VHOST_NET_PKT_WEIGHT; +-} +- + static int get_tx_bufs(struct vhost_net *net, + struct vhost_net_virtqueue *nvq, + struct msghdr *msg, +@@ -557,7 +551,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) + int err; + int sent_pkts = 0; + +- for (;;) { ++ do { + bool busyloop_intr = false; + + head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, +@@ -598,11 +592,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) + err, len); + if (++nvq->done_idx >= VHOST_NET_BATCH) + vhost_net_signal_used(nvq); +- if (vhost_exceeds_weight(++sent_pkts, total_len)) { +- vhost_poll_queue(&vq->poll); +- break; +- } +- } ++ } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); + + vhost_net_signal_used(nvq); + } +@@ -626,7 +616,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) + bool zcopy_used; + int sent_pkts = 0; + +- for (;;) { ++ do { + bool busyloop_intr; + + /* Release DMAs done buffers first */ +@@ -701,11 +691,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) + else + vhost_zerocopy_signal_used(net, vq); + vhost_net_tx_packet(net); +- if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) { +- vhost_poll_queue(&vq->poll); +- break; +- } +- } ++ } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); + } + + /* Expects to be always run from workqueue - which acts as +@@ -941,8 +927,11 @@ static void handle_rx(struct vhost_net *net) + vq->log : NULL; + mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); + +- while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk, +- &busyloop_intr))) { ++ do { ++ sock_len = vhost_net_rx_peek_head_len(net, sock->sk, ++ &busyloop_intr); ++ if (!sock_len) ++ break; + sock_len += sock_hlen; + vhost_len = sock_len + vhost_hlen; + headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx, +@@ 
-1027,14 +1016,11 @@ static void handle_rx(struct vhost_net *net) + vhost_log_write(vq, vq_log, log, vhost_len, + vq->iov, in); + total_len += vhost_len; +- if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) { +- vhost_poll_queue(&vq->poll); +- goto out; +- } +- } ++ } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len))); ++ + if (unlikely(busyloop_intr)) + vhost_poll_queue(&vq->poll); +- else ++ else if (!sock_len) + vhost_net_enable_vq(net, vq); + out: + vhost_net_signal_used(nvq); +@@ -1115,7 +1101,8 @@ static int vhost_net_open(struct inode *inode, struct file *f) + vhost_net_buf_init(&n->vqs[i].rxq); + } + vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX, +- UIO_MAXIOV + VHOST_NET_BATCH); ++ UIO_MAXIOV + VHOST_NET_BATCH, ++ VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT); + + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev); + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev); +diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c +index 0cfa925be4ec..5e298d9287f1 100644 +--- a/drivers/vhost/scsi.c ++++ b/drivers/vhost/scsi.c +@@ -57,6 +57,12 @@ + #define VHOST_SCSI_PREALLOC_UPAGES 2048 + #define VHOST_SCSI_PREALLOC_PROT_SGLS 2048 + ++/* Max number of requests before requeueing the job. ++ * Using this limit prevents one virtqueue from starving others with ++ * request. ++ */ ++#define VHOST_SCSI_WEIGHT 256 ++ + struct vhost_scsi_inflight { + /* Wait for the flush operation to finish */ + struct completion comp; +@@ -811,7 +817,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) + u64 tag; + u32 exp_data_len, data_direction; + unsigned int out = 0, in = 0; +- int head, ret, prot_bytes; ++ int head, ret, prot_bytes, c = 0; + size_t req_size, rsp_size = sizeof(struct virtio_scsi_cmd_resp); + size_t out_size, in_size; + u16 lun; +@@ -830,7 +836,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) + + vhost_disable_notify(&vs->dev, vq); + +- for (;;) { ++ do { + head = vhost_get_vq_desc(vq, vq->iov, + ARRAY_SIZE(vq->iov), &out, &in, + NULL, NULL); +@@ -1045,7 +1051,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) + */ + INIT_WORK(&cmd->work, vhost_scsi_submission_work); + queue_work(vhost_scsi_workqueue, &cmd->work); +- } ++ } while (likely(!vhost_exceeds_weight(vq, ++c, 0))); + out: + mutex_unlock(&vq->mutex); + } +@@ -1398,7 +1404,8 @@ static int vhost_scsi_open(struct inode *inode, struct file *f) + vqs[i] = &vs->vqs[i].vq; + vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick; + } +- vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, UIO_MAXIOV); ++ vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, UIO_MAXIOV, ++ VHOST_SCSI_WEIGHT, 0); + + vhost_scsi_init_inflight(vs, NULL); + +diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c +index c163bc15976a..0752f8dc47b1 100644 +--- a/drivers/vhost/vhost.c ++++ b/drivers/vhost/vhost.c +@@ -413,8 +413,24 @@ static void vhost_dev_free_iovecs(struct vhost_dev *dev) + vhost_vq_free_iovecs(dev->vqs[i]); + } + ++bool vhost_exceeds_weight(struct vhost_virtqueue *vq, ++ int pkts, int total_len) ++{ ++ struct vhost_dev *dev = vq->dev; ++ ++ if ((dev->byte_weight && total_len >= dev->byte_weight) || ++ pkts >= dev->weight) { ++ vhost_poll_queue(&vq->poll); ++ return true; ++ } ++ ++ return false; ++} ++EXPORT_SYMBOL_GPL(vhost_exceeds_weight); ++ + void vhost_dev_init(struct vhost_dev *dev, +- struct vhost_virtqueue **vqs, int nvqs, int iov_limit) ++ struct vhost_virtqueue **vqs, int nvqs, ++ int iov_limit, int weight, 
int byte_weight) + { + struct vhost_virtqueue *vq; + int i; +@@ -428,6 +444,8 @@ void vhost_dev_init(struct vhost_dev *dev, + dev->mm = NULL; + dev->worker = NULL; + dev->iov_limit = iov_limit; ++ dev->weight = weight; ++ dev->byte_weight = byte_weight; + init_llist_head(&dev->work_list); + init_waitqueue_head(&dev->wait); + INIT_LIST_HEAD(&dev->read_list); +diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h +index 9490e7ddb340..27a78a9b8cc7 100644 +--- a/drivers/vhost/vhost.h ++++ b/drivers/vhost/vhost.h +@@ -171,10 +171,13 @@ struct vhost_dev { + struct list_head pending_list; + wait_queue_head_t wait; + int iov_limit; ++ int weight; ++ int byte_weight; + }; + ++bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len); + void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, +- int nvqs, int iov_limit); ++ int nvqs, int iov_limit, int weight, int byte_weight); + long vhost_dev_set_owner(struct vhost_dev *dev); + bool vhost_dev_has_owner(struct vhost_dev *dev); + long vhost_dev_check_owner(struct vhost_dev *); +diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c +index e440f87ae1d6..bab495d73195 100644 +--- a/drivers/vhost/vsock.c ++++ b/drivers/vhost/vsock.c +@@ -21,6 +21,14 @@ + #include "vhost.h" + + #define VHOST_VSOCK_DEFAULT_HOST_CID 2 ++/* Max number of bytes transferred before requeueing the job. ++ * Using this limit prevents one virtqueue from starving others. */ ++#define VHOST_VSOCK_WEIGHT 0x80000 ++/* Max number of packets transferred before requeueing the job. ++ * Using this limit prevents one virtqueue from starving others with ++ * small pkts. ++ */ ++#define VHOST_VSOCK_PKT_WEIGHT 256 + + enum { + VHOST_VSOCK_FEATURES = VHOST_FEATURES, +@@ -78,6 +86,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, + struct vhost_virtqueue *vq) + { + struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX]; ++ int pkts = 0, total_len = 0; + bool added = false; + bool restart_tx = false; + +@@ -89,7 +98,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, + /* Avoid further vmexits, we're already processing the virtqueue */ + vhost_disable_notify(&vsock->dev, vq); + +- for (;;) { ++ do { + struct virtio_vsock_pkt *pkt; + struct iov_iter iov_iter; + unsigned out, in; +@@ -174,8 +183,9 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, + */ + virtio_transport_deliver_tap_pkt(pkt); + ++ total_len += pkt->len; + virtio_transport_free_pkt(pkt); +- } ++ } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); + if (added) + vhost_signal(&vsock->dev, vq); + +@@ -350,7 +360,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) + struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, + dev); + struct virtio_vsock_pkt *pkt; +- int head; ++ int head, pkts = 0, total_len = 0; + unsigned int out, in; + bool added = false; + +@@ -360,7 +370,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) + goto out; + + vhost_disable_notify(&vsock->dev, vq); +- for (;;) { ++ do { + u32 len; + + if (!vhost_vsock_more_replies(vsock)) { +@@ -401,9 +411,11 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) + else + virtio_transport_free_pkt(pkt); + +- vhost_add_used(vq, head, sizeof(pkt->hdr) + len); ++ len += sizeof(pkt->hdr); ++ vhost_add_used(vq, head, len); ++ total_len += len; + added = true; +- } ++ } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); + + no_more_replies: + if (added) +@@ -531,7 +543,9 @@ static int vhost_vsock_dev_open(struct inode *inode, 
struct file *file) + vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick; + vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick; + +- vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs), UIO_MAXIOV); ++ vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs), ++ UIO_MAXIOV, VHOST_VSOCK_PKT_WEIGHT, ++ VHOST_VSOCK_WEIGHT); + + file->private_data = vsock; + spin_lock_init(&vsock->send_pkt_list_lock); +diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c +index c7542e8dd096..a11fa0b6b34d 100644 +--- a/fs/ceph/caps.c ++++ b/fs/ceph/caps.c +@@ -1237,20 +1237,23 @@ static int send_cap_msg(struct cap_msg_args *arg) + } + + /* +- * Queue cap releases when an inode is dropped from our cache. Since +- * inode is about to be destroyed, there is no need for i_ceph_lock. ++ * Queue cap releases when an inode is dropped from our cache. + */ + void ceph_queue_caps_release(struct inode *inode) + { + struct ceph_inode_info *ci = ceph_inode(inode); + struct rb_node *p; + ++ /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU) ++ * may call __ceph_caps_issued_mask() on a freeing inode. */ ++ spin_lock(&ci->i_ceph_lock); + p = rb_first(&ci->i_caps); + while (p) { + struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); + p = rb_next(p); + __ceph_remove_cap(cap, true); + } ++ spin_unlock(&ci->i_ceph_lock); + } + + /* +diff --git a/fs/exec.c b/fs/exec.c +index 433b1257694a..561ea64829ec 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -1826,7 +1826,7 @@ static int __do_execve_file(int fd, struct filename *filename, + membarrier_execve(current); + rseq_execve(current); + acct_update_integrals(current); +- task_numa_free(current); ++ task_numa_free(current, false); + free_bprm(bprm); + kfree(pathbuf); + if (filename) +diff --git a/fs/nfs/client.c b/fs/nfs/client.c +index c092661147b3..0a2b59c1ecb3 100644 +--- a/fs/nfs/client.c ++++ b/fs/nfs/client.c +@@ -416,10 +416,10 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) + clp = nfs_match_client(cl_init); + if (clp) { + spin_unlock(&nn->nfs_client_lock); +- if (IS_ERR(clp)) +- return clp; + if (new) + new->rpc_ops->free_client(new); ++ if (IS_ERR(clp)) ++ return clp; + return nfs_found_client(cl_init, clp); + } + if (new) { +diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c +index 8bfaa658b2c1..71b2e390becf 100644 +--- a/fs/nfs/dir.c ++++ b/fs/nfs/dir.c +@@ -1072,6 +1072,100 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, + return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU); + } + ++static int ++nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, ++ struct inode *inode, int error) ++{ ++ switch (error) { ++ case 1: ++ dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", ++ __func__, dentry); ++ return 1; ++ case 0: ++ nfs_mark_for_revalidate(dir); ++ if (inode && S_ISDIR(inode->i_mode)) { ++ /* Purge readdir caches. */ ++ nfs_zap_caches(inode); ++ /* ++ * We can't d_drop the root of a disconnected tree: ++ * its d_hash is on the s_anon list and d_drop() would hide ++ * it from shrink_dcache_for_unmount(), leading to busy ++ * inodes on unmount and further oopses. 
++ */ ++ if (IS_ROOT(dentry)) ++ return 1; ++ } ++ dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", ++ __func__, dentry); ++ return 0; ++ } ++ dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n", ++ __func__, dentry, error); ++ return error; ++} ++ ++static int ++nfs_lookup_revalidate_negative(struct inode *dir, struct dentry *dentry, ++ unsigned int flags) ++{ ++ int ret = 1; ++ if (nfs_neg_need_reval(dir, dentry, flags)) { ++ if (flags & LOOKUP_RCU) ++ return -ECHILD; ++ ret = 0; ++ } ++ return nfs_lookup_revalidate_done(dir, dentry, NULL, ret); ++} ++ ++static int ++nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry, ++ struct inode *inode) ++{ ++ nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); ++ return nfs_lookup_revalidate_done(dir, dentry, inode, 1); ++} ++ ++static int ++nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry, ++ struct inode *inode) ++{ ++ struct nfs_fh *fhandle; ++ struct nfs_fattr *fattr; ++ struct nfs4_label *label; ++ int ret; ++ ++ ret = -ENOMEM; ++ fhandle = nfs_alloc_fhandle(); ++ fattr = nfs_alloc_fattr(); ++ label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); ++ if (fhandle == NULL || fattr == NULL || IS_ERR(label)) ++ goto out; ++ ++ ret = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); ++ if (ret < 0) { ++ if (ret == -ESTALE || ret == -ENOENT) ++ ret = 0; ++ goto out; ++ } ++ ret = 0; ++ if (nfs_compare_fh(NFS_FH(inode), fhandle)) ++ goto out; ++ if (nfs_refresh_inode(inode, fattr) < 0) ++ goto out; ++ ++ nfs_setsecurity(inode, fattr, label); ++ nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); ++ ++ /* set a readdirplus hint that we had a cache miss */ ++ nfs_force_use_readdirplus(dir); ++ ret = 1; ++out: ++ nfs_free_fattr(fattr); ++ nfs_free_fhandle(fhandle); ++ nfs4_label_free(label); ++ return nfs_lookup_revalidate_done(dir, dentry, inode, ret); ++} ++ + /* + * This is called every time the dcache has a lookup hit, + * and we should check whether we can really trust that +@@ -1083,58 +1177,36 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, + * If the parent directory is seen to have changed, we throw out the + * cached dentry and do a new lookup. 
+ */ +-static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) ++static int ++nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, ++ unsigned int flags) + { +- struct inode *dir; + struct inode *inode; +- struct dentry *parent; +- struct nfs_fh *fhandle = NULL; +- struct nfs_fattr *fattr = NULL; +- struct nfs4_label *label = NULL; + int error; + +- if (flags & LOOKUP_RCU) { +- parent = READ_ONCE(dentry->d_parent); +- dir = d_inode_rcu(parent); +- if (!dir) +- return -ECHILD; +- } else { +- parent = dget_parent(dentry); +- dir = d_inode(parent); +- } + nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); + inode = d_inode(dentry); + +- if (!inode) { +- if (nfs_neg_need_reval(dir, dentry, flags)) { +- if (flags & LOOKUP_RCU) +- return -ECHILD; +- goto out_bad; +- } +- goto out_valid; +- } ++ if (!inode) ++ return nfs_lookup_revalidate_negative(dir, dentry, flags); + + if (is_bad_inode(inode)) { +- if (flags & LOOKUP_RCU) +- return -ECHILD; + dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", + __func__, dentry); + goto out_bad; + } + + if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) +- goto out_set_verifier; ++ return nfs_lookup_revalidate_delegated(dir, dentry, inode); + + /* Force a full look up iff the parent directory has changed */ + if (!(flags & (LOOKUP_EXCL | LOOKUP_REVAL)) && + nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) { + error = nfs_lookup_verify_inode(inode, flags); + if (error) { +- if (flags & LOOKUP_RCU) +- return -ECHILD; + if (error == -ESTALE) +- goto out_zap_parent; +- goto out_error; ++ nfs_zap_caches(dir); ++ goto out_bad; + } + nfs_advise_use_readdirplus(dir); + goto out_valid; +@@ -1146,81 +1218,45 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) + if (NFS_STALE(inode)) + goto out_bad; + +- error = -ENOMEM; +- fhandle = nfs_alloc_fhandle(); +- fattr = nfs_alloc_fattr(); +- if (fhandle == NULL || fattr == NULL) +- goto out_error; +- +- label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); +- if (IS_ERR(label)) +- goto out_error; +- + trace_nfs_lookup_revalidate_enter(dir, dentry, flags); +- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); ++ error = nfs_lookup_revalidate_dentry(dir, dentry, inode); + trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error); +- if (error == -ESTALE || error == -ENOENT) +- goto out_bad; +- if (error) +- goto out_error; +- if (nfs_compare_fh(NFS_FH(inode), fhandle)) +- goto out_bad; +- if ((error = nfs_refresh_inode(inode, fattr)) != 0) +- goto out_bad; +- +- nfs_setsecurity(inode, fattr, label); +- +- nfs_free_fattr(fattr); +- nfs_free_fhandle(fhandle); +- nfs4_label_free(label); ++ return error; ++out_valid: ++ return nfs_lookup_revalidate_done(dir, dentry, inode, 1); ++out_bad: ++ if (flags & LOOKUP_RCU) ++ return -ECHILD; ++ return nfs_lookup_revalidate_done(dir, dentry, inode, 0); ++} + +- /* set a readdirplus hint that we had a cache miss */ +- nfs_force_use_readdirplus(dir); ++static int ++__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags, ++ int (*reval)(struct inode *, struct dentry *, unsigned int)) ++{ ++ struct dentry *parent; ++ struct inode *dir; ++ int ret; + +-out_set_verifier: +- nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); +- out_valid: + if (flags & LOOKUP_RCU) { ++ parent = READ_ONCE(dentry->d_parent); ++ dir = d_inode_rcu(parent); ++ if (!dir) ++ return -ECHILD; ++ ret = reval(dir, dentry, flags); + if (parent != READ_ONCE(dentry->d_parent)) + return -ECHILD; +- } else ++ } else { ++ 
parent = dget_parent(dentry); ++ ret = reval(d_inode(parent), dentry, flags); + dput(parent); +- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", +- __func__, dentry); +- return 1; +-out_zap_parent: +- nfs_zap_caches(dir); +- out_bad: +- WARN_ON(flags & LOOKUP_RCU); +- nfs_free_fattr(fattr); +- nfs_free_fhandle(fhandle); +- nfs4_label_free(label); +- nfs_mark_for_revalidate(dir); +- if (inode && S_ISDIR(inode->i_mode)) { +- /* Purge readdir caches. */ +- nfs_zap_caches(inode); +- /* +- * We can't d_drop the root of a disconnected tree: +- * its d_hash is on the s_anon list and d_drop() would hide +- * it from shrink_dcache_for_unmount(), leading to busy +- * inodes on unmount and further oopses. +- */ +- if (IS_ROOT(dentry)) +- goto out_valid; + } +- dput(parent); +- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", +- __func__, dentry); +- return 0; +-out_error: +- WARN_ON(flags & LOOKUP_RCU); +- nfs_free_fattr(fattr); +- nfs_free_fhandle(fhandle); +- nfs4_label_free(label); +- dput(parent); +- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n", +- __func__, dentry, error); +- return error; ++ return ret; ++} ++ ++static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) ++{ ++ return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate); + } + + /* +@@ -1579,62 +1615,55 @@ no_open: + } + EXPORT_SYMBOL_GPL(nfs_atomic_open); + +-static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) ++static int ++nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, ++ unsigned int flags) + { + struct inode *inode; +- int ret = 0; + + if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) +- goto no_open; ++ goto full_reval; + if (d_mountpoint(dentry)) +- goto no_open; +- if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1) +- goto no_open; ++ goto full_reval; + + inode = d_inode(dentry); + + /* We can't create new files in nfs_open_revalidate(), so we + * optimize away revalidation of negative dentries. 
+ */ +- if (inode == NULL) { +- struct dentry *parent; +- struct inode *dir; +- +- if (flags & LOOKUP_RCU) { +- parent = READ_ONCE(dentry->d_parent); +- dir = d_inode_rcu(parent); +- if (!dir) +- return -ECHILD; +- } else { +- parent = dget_parent(dentry); +- dir = d_inode(parent); +- } +- if (!nfs_neg_need_reval(dir, dentry, flags)) +- ret = 1; +- else if (flags & LOOKUP_RCU) +- ret = -ECHILD; +- if (!(flags & LOOKUP_RCU)) +- dput(parent); +- else if (parent != READ_ONCE(dentry->d_parent)) +- return -ECHILD; +- goto out; +- } ++ if (inode == NULL) ++ goto full_reval; ++ ++ if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) ++ return nfs_lookup_revalidate_delegated(dir, dentry, inode); + + /* NFS only supports OPEN on regular files */ + if (!S_ISREG(inode->i_mode)) +- goto no_open; ++ goto full_reval; ++ + /* We cannot do exclusive creation on a positive dentry */ +- if (flags & LOOKUP_EXCL) +- goto no_open; ++ if (flags & (LOOKUP_EXCL | LOOKUP_REVAL)) ++ goto reval_dentry; ++ ++ /* Check if the directory changed */ ++ if (!nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) ++ goto reval_dentry; + + /* Let f_op->open() actually open (and revalidate) the file */ +- ret = 1; ++ return 1; ++reval_dentry: ++ if (flags & LOOKUP_RCU) ++ return -ECHILD; ++ return nfs_lookup_revalidate_dentry(dir, dentry, inode);; + +-out: +- return ret; ++full_reval: ++ return nfs_do_lookup_revalidate(dir, dentry, flags); ++} + +-no_open: +- return nfs_lookup_revalidate(dentry, flags); ++static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) ++{ ++ return __nfs_lookup_revalidate(dentry, flags, ++ nfs4_do_lookup_revalidate); + } + + #endif /* CONFIG_NFSV4 */ +diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c +index 1de855e0ae61..904e08bbb289 100644 +--- a/fs/nfs/nfs4proc.c ++++ b/fs/nfs/nfs4proc.c +@@ -1355,12 +1355,20 @@ static bool nfs4_mode_match_open_stateid(struct nfs4_state *state, + return false; + } + +-static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode) ++static int can_open_cached(struct nfs4_state *state, fmode_t mode, ++ int open_mode, enum open_claim_type4 claim) + { + int ret = 0; + + if (open_mode & (O_EXCL|O_TRUNC)) + goto out; ++ switch (claim) { ++ case NFS4_OPEN_CLAIM_NULL: ++ case NFS4_OPEN_CLAIM_FH: ++ goto out; ++ default: ++ break; ++ } + switch (mode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_READ: + ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 +@@ -1753,7 +1761,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) + + for (;;) { + spin_lock(&state->owner->so_lock); +- if (can_open_cached(state, fmode, open_mode)) { ++ if (can_open_cached(state, fmode, open_mode, claim)) { + update_open_stateflags(state, fmode); + spin_unlock(&state->owner->so_lock); + goto out_return_state; +@@ -2282,7 +2290,8 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) + if (data->state != NULL) { + struct nfs_delegation *delegation; + +- if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags)) ++ if (can_open_cached(data->state, data->o_arg.fmode, ++ data->o_arg.open_flags, claim)) + goto out_no_action; + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); +diff --git a/fs/proc/base.c b/fs/proc/base.c +index a7fbda72afeb..3b9b726b1a6c 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -205,12 +205,53 @@ static int proc_root_link(struct dentry *dentry, struct path *path) + return result; + } + ++/* ++ * If the user used setproctitle(), 
we just get the string from ++ * user space at arg_start, and limit it to a maximum of one page. ++ */ ++static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf, ++ size_t count, unsigned long pos, ++ unsigned long arg_start) ++{ ++ char *page; ++ int ret, got; ++ ++ if (pos >= PAGE_SIZE) ++ return 0; ++ ++ page = (char *)__get_free_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ ret = 0; ++ got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON); ++ if (got > 0) { ++ int len = strnlen(page, got); ++ ++ /* Include the NUL character if it was found */ ++ if (len < got) ++ len++; ++ ++ if (len > pos) { ++ len -= pos; ++ if (len > count) ++ len = count; ++ len -= copy_to_user(buf, page+pos, len); ++ if (!len) ++ len = -EFAULT; ++ ret = len; ++ } ++ } ++ free_page((unsigned long)page); ++ return ret; ++} ++ + static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, + size_t count, loff_t *ppos) + { + unsigned long arg_start, arg_end, env_start, env_end; + unsigned long pos, len; +- char *page; ++ char *page, c; + + /* Check if process spawned far enough to have cmdline. */ + if (!mm->env_end) +@@ -227,28 +268,42 @@ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, + return 0; + + /* +- * We have traditionally allowed the user to re-write +- * the argument strings and overflow the end result +- * into the environment section. But only do that if +- * the environment area is contiguous to the arguments. ++ * We allow setproctitle() to overwrite the argument ++ * strings, and overflow past the original end. But ++ * only when it overflows into the environment area. + */ +- if (env_start != arg_end || env_start >= env_end) ++ if (env_start != arg_end || env_end < env_start) + env_start = env_end = arg_end; +- +- /* .. and limit it to a maximum of one page of slop */ +- if (env_end >= arg_end + PAGE_SIZE) +- env_end = arg_end + PAGE_SIZE - 1; ++ len = env_end - arg_start; + + /* We're not going to care if "*ppos" has high bits set */ +- pos = arg_start + *ppos; +- +- /* .. but we do check the result is in the proper range */ +- if (pos < arg_start || pos >= env_end) ++ pos = *ppos; ++ if (pos >= len) + return 0; ++ if (count > len - pos) ++ count = len - pos; ++ if (!count) ++ return 0; ++ ++ /* ++ * Magical special case: if the argv[] end byte is not ++ * zero, the user has overwritten it with setproctitle(3). ++ * ++ * Possible future enhancement: do this only once when ++ * pos is 0, and set a flag in the 'struct file'. ++ */ ++ if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c) ++ return get_mm_proctitle(mm, buf, count, pos, arg_start); + +- /* .. and we never go past env_end */ +- if (env_end - pos < count) +- count = env_end - pos; ++ /* ++ * For the non-setproctitle() case we limit things strictly ++ * to the [arg_start, arg_end[ range. ++ */ ++ pos += arg_start; ++ if (pos < arg_start || pos >= arg_end) ++ return 0; ++ if (count > arg_end - pos) ++ count = arg_end - pos; + + page = (char *)__get_free_page(GFP_KERNEL); + if (!page) +@@ -258,48 +313,11 @@ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, + while (count) { + int got; + size_t size = min_t(size_t, PAGE_SIZE, count); +- long offset; + +- /* +- * Are we already starting past the official end? +- * We always include the last byte that is *supposed* +- * to be NUL +- */ +- offset = (pos >= arg_end) ? 
pos - arg_end + 1 : 0; +- +- got = access_remote_vm(mm, pos - offset, page, size + offset, FOLL_ANON); +- if (got <= offset) ++ got = access_remote_vm(mm, pos, page, size, FOLL_ANON); ++ if (got <= 0) + break; +- got -= offset; +- +- /* Don't walk past a NUL character once you hit arg_end */ +- if (pos + got >= arg_end) { +- int n = 0; +- +- /* +- * If we started before 'arg_end' but ended up +- * at or after it, we start the NUL character +- * check at arg_end-1 (where we expect the normal +- * EOF to be). +- * +- * NOTE! This is smaller than 'got', because +- * pos + got >= arg_end +- */ +- if (pos < arg_end) +- n = arg_end - pos - 1; +- +- /* Cut off at first NUL after 'n' */ +- got = n + strnlen(page+n, offset+got-n); +- if (got < offset) +- break; +- got -= offset; +- +- /* Include the NUL if it existed */ +- if (got < size) +- got++; +- } +- +- got -= copy_to_user(buf, page+offset, got); ++ got -= copy_to_user(buf, page, got); + if (unlikely(!got)) { + if (!len) + len = -EFAULT; +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 6980014357d4..d51e10f50e75 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -504,6 +504,12 @@ struct request_queue { + * various queue flags, see QUEUE_* below + */ + unsigned long queue_flags; ++ /* ++ * Number of contexts that have called blk_set_pm_only(). If this ++ * counter is above zero then only RQF_PM and RQF_PREEMPT requests are ++ * processed. ++ */ ++ atomic_t pm_only; + + /* + * ida allocated id for this queue. Used to index queues from +@@ -698,7 +704,6 @@ struct request_queue { + #define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */ + #define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ + #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ +-#define QUEUE_FLAG_PREEMPT_ONLY 29 /* only process REQ_PREEMPT requests */ + + #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ + (1 << QUEUE_FLAG_SAME_COMP) | \ +@@ -736,12 +741,11 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q); + ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ + REQ_FAILFAST_DRIVER)) + #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) +-#define blk_queue_preempt_only(q) \ +- test_bit(QUEUE_FLAG_PREEMPT_ONLY, &(q)->queue_flags) ++#define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) + #define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags) + +-extern int blk_set_preempt_only(struct request_queue *q); +-extern void blk_clear_preempt_only(struct request_queue *q); ++extern void blk_set_pm_only(struct request_queue *q); ++extern void blk_clear_pm_only(struct request_queue *q); + + static inline int queue_in_flight(struct request_queue *q) + { +diff --git a/include/linux/iova.h b/include/linux/iova.h +index 928442dda565..84fbe73d2ec0 100644 +--- a/include/linux/iova.h ++++ b/include/linux/iova.h +@@ -156,6 +156,7 @@ struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, + void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to); + void init_iova_domain(struct iova_domain *iovad, unsigned long granule, + unsigned long start_pfn); ++bool has_iova_flush_queue(struct iova_domain *iovad); + int init_iova_flush_queue(struct iova_domain *iovad, + iova_flush_cb flush_cb, iova_entry_dtor entry_dtor); + struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); +@@ -236,6 +237,11 @@ static inline void init_iova_domain(struct iova_domain *iovad, + { + } + ++static 
inline bool has_iova_flush_queue(struct iova_domain *iovad) ++{ ++ return false; ++} ++ + static inline int init_iova_flush_queue(struct iova_domain *iovad, + iova_flush_cb flush_cb, + iova_entry_dtor entry_dtor) +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 5dc024e28397..20f5ba262cc0 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1023,7 +1023,15 @@ struct task_struct { + u64 last_sum_exec_runtime; + struct callback_head numa_work; + +- struct numa_group *numa_group; ++ /* ++ * This pointer is only modified for current in syscall and ++ * pagefault context (and for tasks being destroyed), so it can be read ++ * from any of the following contexts: ++ * - RCU read-side critical section ++ * - current->numa_group from everywhere ++ * - task's runqueue locked, task not running ++ */ ++ struct numa_group __rcu *numa_group; + + /* + * numa_faults is an array split into four regions: +diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h +index e7dd04a84ba8..3988762efe15 100644 +--- a/include/linux/sched/numa_balancing.h ++++ b/include/linux/sched/numa_balancing.h +@@ -19,7 +19,7 @@ + extern void task_numa_fault(int last_node, int node, int pages, int flags); + extern pid_t task_numa_group_id(struct task_struct *p); + extern void set_numabalancing_state(bool enabled); +-extern void task_numa_free(struct task_struct *p); ++extern void task_numa_free(struct task_struct *p, bool final); + extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page, + int src_nid, int dst_cpu); + #else +@@ -34,7 +34,7 @@ static inline pid_t task_numa_group_id(struct task_struct *p) + static inline void set_numabalancing_state(bool enabled) + { + } +-static inline void task_numa_free(struct task_struct *p) ++static inline void task_numa_free(struct task_struct *p, bool final) + { + } + static inline bool should_numa_migrate_memory(struct task_struct *p, +diff --git a/kernel/fork.c b/kernel/fork.c +index 69874db3fba8..e76ce81c9c75 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -679,7 +679,7 @@ void __put_task_struct(struct task_struct *tsk) + WARN_ON(tsk == current); + + cgroup_free(tsk); +- task_numa_free(tsk); ++ task_numa_free(tsk, true); + security_task_free(tsk); + exit_creds(tsk); + delayacct_tsk_free(tsk); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 4a433608ba74..75f322603d44 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1053,6 +1053,21 @@ struct numa_group { + unsigned long faults[0]; + }; + ++/* ++ * For functions that can be called in multiple contexts that permit reading ++ * ->numa_group (see struct task_struct for locking rules). ++ */ ++static struct numa_group *deref_task_numa_group(struct task_struct *p) ++{ ++ return rcu_dereference_check(p->numa_group, p == current || ++ (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu))); ++} ++ ++static struct numa_group *deref_curr_numa_group(struct task_struct *p) ++{ ++ return rcu_dereference_protected(p->numa_group, p == current); ++} ++ + static inline unsigned long group_faults_priv(struct numa_group *ng); + static inline unsigned long group_faults_shared(struct numa_group *ng); + +@@ -1096,10 +1111,12 @@ static unsigned int task_scan_start(struct task_struct *p) + { + unsigned long smin = task_scan_min(p); + unsigned long period = smin; ++ struct numa_group *ng; + + /* Scale the maximum scan period with the amount of shared memory. 
*/ +- if (p->numa_group) { +- struct numa_group *ng = p->numa_group; ++ rcu_read_lock(); ++ ng = rcu_dereference(p->numa_group); ++ if (ng) { + unsigned long shared = group_faults_shared(ng); + unsigned long private = group_faults_priv(ng); + +@@ -1107,6 +1124,7 @@ static unsigned int task_scan_start(struct task_struct *p) + period *= shared + 1; + period /= private + shared + 1; + } ++ rcu_read_unlock(); + + return max(smin, period); + } +@@ -1115,13 +1133,14 @@ static unsigned int task_scan_max(struct task_struct *p) + { + unsigned long smin = task_scan_min(p); + unsigned long smax; ++ struct numa_group *ng; + + /* Watch for min being lower than max due to floor calculations */ + smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); + + /* Scale the maximum scan period with the amount of shared memory. */ +- if (p->numa_group) { +- struct numa_group *ng = p->numa_group; ++ ng = deref_curr_numa_group(p); ++ if (ng) { + unsigned long shared = group_faults_shared(ng); + unsigned long private = group_faults_priv(ng); + unsigned long period = smax; +@@ -1153,7 +1172,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) + p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_work.next = &p->numa_work; + p->numa_faults = NULL; +- p->numa_group = NULL; ++ RCU_INIT_POINTER(p->numa_group, NULL); + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; + +@@ -1200,7 +1219,16 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p) + + pid_t task_numa_group_id(struct task_struct *p) + { +- return p->numa_group ? p->numa_group->gid : 0; ++ struct numa_group *ng; ++ pid_t gid = 0; ++ ++ rcu_read_lock(); ++ ng = rcu_dereference(p->numa_group); ++ if (ng) ++ gid = ng->gid; ++ rcu_read_unlock(); ++ ++ return gid; + } + + /* +@@ -1225,11 +1253,13 @@ static inline unsigned long task_faults(struct task_struct *p, int nid) + + static inline unsigned long group_faults(struct task_struct *p, int nid) + { +- if (!p->numa_group) ++ struct numa_group *ng = deref_task_numa_group(p); ++ ++ if (!ng) + return 0; + +- return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] + +- p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)]; ++ return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] + ++ ng->faults[task_faults_idx(NUMA_MEM, nid, 1)]; + } + + static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) +@@ -1367,12 +1397,13 @@ static inline unsigned long task_weight(struct task_struct *p, int nid, + static inline unsigned long group_weight(struct task_struct *p, int nid, + int dist) + { ++ struct numa_group *ng = deref_task_numa_group(p); + unsigned long faults, total_faults; + +- if (!p->numa_group) ++ if (!ng) + return 0; + +- total_faults = p->numa_group->total_faults; ++ total_faults = ng->total_faults; + + if (!total_faults) + return 0; +@@ -1386,7 +1417,7 @@ static inline unsigned long group_weight(struct task_struct *p, int nid, + bool should_numa_migrate_memory(struct task_struct *p, struct page * page, + int src_nid, int dst_cpu) + { +- struct numa_group *ng = p->numa_group; ++ struct numa_group *ng = deref_curr_numa_group(p); + int dst_nid = cpu_to_node(dst_cpu); + int last_cpupid, this_cpupid; + +@@ -1592,13 +1623,14 @@ static bool load_too_imbalanced(long src_load, long dst_load, + static void task_numa_compare(struct task_numa_env *env, + long taskimp, long groupimp, bool maymove) + { ++ struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p); + struct rq *dst_rq = 
cpu_rq(env->dst_cpu); ++ long imp = p_ng ? groupimp : taskimp; + struct task_struct *cur; + long src_load, dst_load; +- long load; +- long imp = env->p->numa_group ? groupimp : taskimp; +- long moveimp = imp; + int dist = env->dist; ++ long moveimp = imp; ++ long load; + + if (READ_ONCE(dst_rq->numa_migrate_on)) + return; +@@ -1637,21 +1669,22 @@ static void task_numa_compare(struct task_numa_env *env, + * If dst and source tasks are in the same NUMA group, or not + * in any group then look only at task weights. + */ +- if (cur->numa_group == env->p->numa_group) { ++ cur_ng = rcu_dereference(cur->numa_group); ++ if (cur_ng == p_ng) { + imp = taskimp + task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); + /* + * Add some hysteresis to prevent swapping the + * tasks within a group over tiny differences. + */ +- if (cur->numa_group) ++ if (cur_ng) + imp -= imp / 16; + } else { + /* + * Compare the group weights. If a task is all by itself + * (not part of a group), use the task weight instead. + */ +- if (cur->numa_group && env->p->numa_group) ++ if (cur_ng && p_ng) + imp += group_weight(cur, env->src_nid, dist) - + group_weight(cur, env->dst_nid, dist); + else +@@ -1749,11 +1782,12 @@ static int task_numa_migrate(struct task_struct *p) + .best_imp = 0, + .best_cpu = -1, + }; ++ unsigned long taskweight, groupweight; + struct sched_domain *sd; ++ long taskimp, groupimp; ++ struct numa_group *ng; + struct rq *best_rq; +- unsigned long taskweight, groupweight; + int nid, ret, dist; +- long taskimp, groupimp; + + /* + * Pick the lowest SD_NUMA domain, as that would have the smallest +@@ -1799,7 +1833,8 @@ static int task_numa_migrate(struct task_struct *p) + * multiple NUMA nodes; in order to better consolidate the group, + * we need to check other locations. + */ +- if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { ++ ng = deref_curr_numa_group(p); ++ if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) { + for_each_online_node(nid) { + if (nid == env.src_nid || nid == p->numa_preferred_nid) + continue; +@@ -1832,7 +1867,7 @@ static int task_numa_migrate(struct task_struct *p) + * A task that migrated to a second choice node will be better off + * trying for a better one later. Do not set the preferred node here. + */ +- if (p->numa_group) { ++ if (ng) { + if (env.best_cpu == -1) + nid = env.src_nid; + else +@@ -2127,6 +2162,7 @@ static void task_numa_placement(struct task_struct *p) + unsigned long total_faults; + u64 runtime, period; + spinlock_t *group_lock = NULL; ++ struct numa_group *ng; + + /* + * The p->mm->numa_scan_seq field gets updated without +@@ -2144,8 +2180,9 @@ static void task_numa_placement(struct task_struct *p) + runtime = numa_get_avg_runtime(p, &period); + + /* If the task is part of a group prevent parallel updates to group stats */ +- if (p->numa_group) { +- group_lock = &p->numa_group->lock; ++ ng = deref_curr_numa_group(p); ++ if (ng) { ++ group_lock = &ng->lock; + spin_lock_irq(group_lock); + } + +@@ -2186,7 +2223,7 @@ static void task_numa_placement(struct task_struct *p) + p->numa_faults[cpu_idx] += f_diff; + faults += p->numa_faults[mem_idx]; + p->total_numa_faults += diff; +- if (p->numa_group) { ++ if (ng) { + /* + * safe because we can only change our own group + * +@@ -2194,14 +2231,14 @@ static void task_numa_placement(struct task_struct *p) + * nid and priv in a specific region because it + * is at the beginning of the numa_faults array. 
+ */
+- p->numa_group->faults[mem_idx] += diff;
+- p->numa_group->faults_cpu[mem_idx] += f_diff;
+- p->numa_group->total_faults += diff;
+- group_faults += p->numa_group->faults[mem_idx];
++ ng->faults[mem_idx] += diff;
++ ng->faults_cpu[mem_idx] += f_diff;
++ ng->total_faults += diff;
++ group_faults += ng->faults[mem_idx];
+ }
+ }
+
+- if (!p->numa_group) {
++ if (!ng) {
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_nid = nid;
+@@ -2212,8 +2249,8 @@ static void task_numa_placement(struct task_struct *p)
+ }
+ }
+
+- if (p->numa_group) {
+- numa_group_count_active_nodes(p->numa_group);
++ if (ng) {
++ numa_group_count_active_nodes(ng);
+ spin_unlock_irq(group_lock);
+ max_nid = preferred_group_nid(p, max_nid);
+ }
+
+@@ -2247,7 +2284,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
+ int cpu = cpupid_to_cpu(cpupid);
+ int i;
+
+- if (unlikely(!p->numa_group)) {
++ if (unlikely(!deref_curr_numa_group(p))) {
+ unsigned int size = sizeof(struct numa_group) +
+ 4*nr_node_ids*sizeof(unsigned long);
+
+@@ -2283,7 +2320,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
+ if (!grp)
+ goto no_join;
+
+- my_grp = p->numa_group;
++ my_grp = deref_curr_numa_group(p);
+ if (grp == my_grp)
+ goto no_join;
+
+@@ -2345,13 +2382,24 @@ no_join:
+ return;
+ }
+
+-void task_numa_free(struct task_struct *p)
++/*
++ * Get rid of NUMA statistics associated with a task (either current or dead).
++ * If @final is set, the task is dead and has reached refcount zero, so we can
++ * safely free all relevant data structures. Otherwise, there might be
++ * concurrent reads from places like load balancing and procfs, and we should
++ * reset the data back to default state without freeing ->numa_faults.
++ */
++void task_numa_free(struct task_struct *p, bool final)
+ {
+- struct numa_group *grp = p->numa_group;
+- void *numa_faults = p->numa_faults;
++ /* safe: p either is current or is being freed by current */
++ struct numa_group *grp = rcu_dereference_raw(p->numa_group);
++ unsigned long *numa_faults = p->numa_faults;
+ unsigned long flags;
+ int i;
+
++ if (!numa_faults)
++ return;
++
+ if (grp) {
+ spin_lock_irqsave(&grp->lock, flags);
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+@@ -2364,8 +2412,14 @@ void task_numa_free(struct task_struct *p)
+ put_numa_group(grp);
+ }
+
+- p->numa_faults = NULL;
+- kfree(numa_faults);
++ if (final) {
++ p->numa_faults = NULL;
++ kfree(numa_faults);
++ } else {
++ p->total_numa_faults = 0;
++ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
++ numa_faults[i] = 0;
++ }
+ }
+
+ /*
+@@ -2418,7 +2472,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
+ * actively using should be counted as local. This allows the
+ * scan rate to slow down when a workload has settled down.
+ */ +- ng = p->numa_group; ++ ng = deref_curr_numa_group(p); + if (!priv && !local && ng && ng->active_nodes > 1 && + numa_is_active_node(cpu_node, ng) && + numa_is_active_node(mem_node, ng)) +@@ -10218,18 +10272,22 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) + { + int node; + unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0; ++ struct numa_group *ng; + ++ rcu_read_lock(); ++ ng = rcu_dereference(p->numa_group); + for_each_online_node(node) { + if (p->numa_faults) { + tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; + tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)]; + } +- if (p->numa_group) { +- gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)], +- gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)]; ++ if (ng) { ++ gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)], ++ gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; + } + print_numa_stats(m, node, tsf, tpf, gsf, gpf); + } ++ rcu_read_unlock(); + } + #endif /* CONFIG_NUMA_BALANCING */ + #endif /* CONFIG_SCHED_DEBUG */ +diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c +index c248e0dccbe1..67ef9d853d90 100644 +--- a/net/ipv4/ip_tunnel_core.c ++++ b/net/ipv4/ip_tunnel_core.c +@@ -89,9 +89,12 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, + __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); + + err = ip_local_out(net, sk, skb); +- if (unlikely(net_xmit_eval(err))) +- pkt_len = 0; +- iptunnel_xmit_stats(dev, pkt_len); ++ ++ if (dev) { ++ if (unlikely(net_xmit_eval(err))) ++ pkt_len = 0; ++ iptunnel_xmit_stats(dev, pkt_len); ++ } + } + EXPORT_SYMBOL_GPL(iptunnel_xmit); + +diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c +index ab27a2872935..2e30bf197583 100644 +--- a/net/vmw_vsock/af_vsock.c ++++ b/net/vmw_vsock/af_vsock.c +@@ -281,7 +281,8 @@ EXPORT_SYMBOL_GPL(vsock_insert_connected); + void vsock_remove_bound(struct vsock_sock *vsk) + { + spin_lock_bh(&vsock_table_lock); +- __vsock_remove_bound(vsk); ++ if (__vsock_in_bound_table(vsk)) ++ __vsock_remove_bound(vsk); + spin_unlock_bh(&vsock_table_lock); + } + EXPORT_SYMBOL_GPL(vsock_remove_bound); +@@ -289,7 +290,8 @@ EXPORT_SYMBOL_GPL(vsock_remove_bound); + void vsock_remove_connected(struct vsock_sock *vsk) + { + spin_lock_bh(&vsock_table_lock); +- __vsock_remove_connected(vsk); ++ if (__vsock_in_connected_table(vsk)) ++ __vsock_remove_connected(vsk); + spin_unlock_bh(&vsock_table_lock); + } + EXPORT_SYMBOL_GPL(vsock_remove_connected); +@@ -325,35 +327,10 @@ struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, + } + EXPORT_SYMBOL_GPL(vsock_find_connected_socket); + +-static bool vsock_in_bound_table(struct vsock_sock *vsk) +-{ +- bool ret; +- +- spin_lock_bh(&vsock_table_lock); +- ret = __vsock_in_bound_table(vsk); +- spin_unlock_bh(&vsock_table_lock); +- +- return ret; +-} +- +-static bool vsock_in_connected_table(struct vsock_sock *vsk) +-{ +- bool ret; +- +- spin_lock_bh(&vsock_table_lock); +- ret = __vsock_in_connected_table(vsk); +- spin_unlock_bh(&vsock_table_lock); +- +- return ret; +-} +- + void vsock_remove_sock(struct vsock_sock *vsk) + { +- if (vsock_in_bound_table(vsk)) +- vsock_remove_bound(vsk); +- +- if (vsock_in_connected_table(vsk)) +- vsock_remove_connected(vsk); ++ vsock_remove_bound(vsk); ++ vsock_remove_connected(vsk); + } + EXPORT_SYMBOL_GPL(vsock_remove_sock); + +@@ -484,8 +461,7 @@ static void vsock_pending_work(struct work_struct *work) + * incoming packets can't find this socket, and to reduce the 
reference + * count. + */ +- if (vsock_in_connected_table(vsk)) +- vsock_remove_connected(vsk); ++ vsock_remove_connected(vsk); + + sk->sk_state = TCP_CLOSE; + +diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c +index b131561a9469..9c7da811d130 100644 +--- a/net/vmw_vsock/hyperv_transport.c ++++ b/net/vmw_vsock/hyperv_transport.c +@@ -35,6 +35,9 @@ + /* The MTU is 16KB per the host side's design */ + #define HVS_MTU_SIZE (1024 * 16) + ++/* How long to wait for graceful shutdown of a connection */ ++#define HVS_CLOSE_TIMEOUT (8 * HZ) ++ + struct vmpipe_proto_header { + u32 pkt_type; + u32 data_size; +@@ -290,19 +293,32 @@ static void hvs_channel_cb(void *ctx) + sk->sk_write_space(sk); + } + +-static void hvs_close_connection(struct vmbus_channel *chan) ++static void hvs_do_close_lock_held(struct vsock_sock *vsk, ++ bool cancel_timeout) + { +- struct sock *sk = get_per_channel_state(chan); +- struct vsock_sock *vsk = vsock_sk(sk); +- +- lock_sock(sk); ++ struct sock *sk = sk_vsock(vsk); + +- sk->sk_state = TCP_CLOSE; + sock_set_flag(sk, SOCK_DONE); +- vsk->peer_shutdown |= SEND_SHUTDOWN | RCV_SHUTDOWN; +- ++ vsk->peer_shutdown = SHUTDOWN_MASK; ++ if (vsock_stream_has_data(vsk) <= 0) ++ sk->sk_state = TCP_CLOSING; + sk->sk_state_change(sk); ++ if (vsk->close_work_scheduled && ++ (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) { ++ vsk->close_work_scheduled = false; ++ vsock_remove_sock(vsk); + ++ /* Release the reference taken while scheduling the timeout */ ++ sock_put(sk); ++ } ++} ++ ++static void hvs_close_connection(struct vmbus_channel *chan) ++{ ++ struct sock *sk = get_per_channel_state(chan); ++ ++ lock_sock(sk); ++ hvs_do_close_lock_held(vsock_sk(sk), true); + release_sock(sk); + } + +@@ -445,50 +461,80 @@ static int hvs_connect(struct vsock_sock *vsk) + return vmbus_send_tl_connect_request(&h->vm_srv_id, &h->host_srv_id); + } + ++static void hvs_shutdown_lock_held(struct hvsock *hvs, int mode) ++{ ++ struct vmpipe_proto_header hdr; ++ ++ if (hvs->fin_sent || !hvs->chan) ++ return; ++ ++ /* It can't fail: see hvs_channel_writable_bytes(). */ ++ (void)hvs_send_data(hvs->chan, (struct hvs_send_buf *)&hdr, 0); ++ hvs->fin_sent = true; ++} ++ + static int hvs_shutdown(struct vsock_sock *vsk, int mode) + { + struct sock *sk = sk_vsock(vsk); +- struct vmpipe_proto_header hdr; +- struct hvs_send_buf *send_buf; +- struct hvsock *hvs; + + if (!(mode & SEND_SHUTDOWN)) + return 0; + + lock_sock(sk); ++ hvs_shutdown_lock_held(vsk->trans, mode); ++ release_sock(sk); ++ return 0; ++} + +- hvs = vsk->trans; +- if (hvs->fin_sent) +- goto out; +- +- send_buf = (struct hvs_send_buf *)&hdr; ++static void hvs_close_timeout(struct work_struct *work) ++{ ++ struct vsock_sock *vsk = ++ container_of(work, struct vsock_sock, close_work.work); ++ struct sock *sk = sk_vsock(vsk); + +- /* It can't fail: see hvs_channel_writable_bytes(). 
*/
+- (void)hvs_send_data(hvs->chan, send_buf, 0);
++ sock_hold(sk);
++ lock_sock(sk);
++ if (!sock_flag(sk, SOCK_DONE))
++ hvs_do_close_lock_held(vsk, false);
+
+- hvs->fin_sent = true;
+-out:
++ vsk->close_work_scheduled = false;
+ release_sock(sk);
+- return 0;
++ sock_put(sk);
+ }
+
+-static void hvs_release(struct vsock_sock *vsk)
++/* Returns true if it is safe to remove the socket; false otherwise */
++static bool hvs_close_lock_held(struct vsock_sock *vsk)
+ {
+ struct sock *sk = sk_vsock(vsk);
+- struct hvsock *hvs = vsk->trans;
+- struct vmbus_channel *chan;
+
+- lock_sock(sk);
++ if (!(sk->sk_state == TCP_ESTABLISHED ||
++ sk->sk_state == TCP_CLOSING))
++ return true;
+
+- sk->sk_state = TCP_CLOSING;
+- vsock_remove_sock(vsk);
++ if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK)
++ hvs_shutdown_lock_held(vsk->trans, SHUTDOWN_MASK);
+
+- release_sock(sk);
++ if (sock_flag(sk, SOCK_DONE))
++ return true;
+
+- chan = hvs->chan;
+- if (chan)
+- hvs_shutdown(vsk, RCV_SHUTDOWN | SEND_SHUTDOWN);
++ /* This reference will be dropped by the delayed close routine */
++ sock_hold(sk);
++ INIT_DELAYED_WORK(&vsk->close_work, hvs_close_timeout);
++ vsk->close_work_scheduled = true;
++ schedule_delayed_work(&vsk->close_work, HVS_CLOSE_TIMEOUT);
++ return false;
++}
+
++static void hvs_release(struct vsock_sock *vsk)
++{
++ struct sock *sk = sk_vsock(vsk);
++ bool remove_sock;
++
++ lock_sock(sk);
++ remove_sock = hvs_close_lock_held(vsk);
++ release_sock(sk);
++ if (remove_sock)
++ vsock_remove_sock(vsk);
+ }
+
+ static void hvs_destruct(struct vsock_sock *vsk)