diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 0335285f3918..49935d5bb5c6 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -230,6 +230,14 @@ tcp_base_mss - INTEGER
Path MTU discovery (MTU probing). If MTU probing is enabled,
this is the initial MSS used by the connection.
+tcp_min_snd_mss - INTEGER
+ TCP SYN and SYNACK messages usually advertise an ADVMSS option,
+ as described in RFC 1122 and RFC 6691.
+ If this ADVMSS option is smaller than tcp_min_snd_mss,
+ it is silently capped to tcp_min_snd_mss.
+
+ Default : 48 (at least 8 bytes of payload per segment)
+
tcp_congestion_control - STRING
Set the congestion control algorithm to be used for new
connections. The algorithm "reno" is always available, but
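Once this change is applied, the new knob shows up as net.ipv4.tcp_min_snd_mss, i.e. /proc/sys/net/ipv4/tcp_min_snd_mss (see the ctl_table entry further down). A minimal userspace sketch of reading and raising it, assuming that path and a target value inside the permitted 48..65535 range; read_min_snd_mss() and write_min_snd_mss() are illustrative helpers, not part of the patch, and writing requires sufficient privileges (typically root).

/* Sketch only: adjust net.ipv4.tcp_min_snd_mss through procfs. */
#include <stdio.h>

#define MIN_SND_MSS_PATH "/proc/sys/net/ipv4/tcp_min_snd_mss"

static int read_min_snd_mss(void)
{
	FILE *f = fopen(MIN_SND_MSS_PATH, "r");
	int val = -1;

	if (f) {
		if (fscanf(f, "%d", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

static int write_min_snd_mss(int val)
{
	FILE *f = fopen(MIN_SND_MSS_PATH, "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", val);
	return fclose(f);
}

int main(void)
{
	printf("tcp_min_snd_mss: %d\n", read_min_snd_mss());
	/* 536 is only an example; any value in 48..65535 is accepted. */
	if (write_min_snd_mss(536) != 0)
		perror(MIN_SND_MSS_PATH);
	return 0;
}
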
diff --git a/Makefile b/Makefile
index 584af2e57735..f34cb9225255 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
VERSION = 4
PATCHLEVEL = 9
-SUBLEVEL = 181
+SUBLEVEL = 182
EXTRAVERSION =
NAME = Roaring Lionus
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index d0c3615f9050..7f517458c64f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -433,4 +433,7 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp)
tp->saved_syn = NULL;
}
+int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount,
+ int shiftlen);
+
#endif /* _LINUX_TCP_H */
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 7adf4386ac8f..bf619a67ec03 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -94,6 +94,7 @@ struct netns_ipv4 {
#endif
int sysctl_tcp_mtu_probing;
int sysctl_tcp_base_mss;
+ int sysctl_tcp_min_snd_mss;
int sysctl_tcp_probe_threshold;
u32 sysctl_tcp_probe_interval;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index fed2a78fb8cb..d7047de952f0 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -53,6 +53,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define MAX_TCP_HEADER (128 + MAX_HEADER)
#define MAX_TCP_OPTION_SPACE 40
+#define TCP_MIN_SND_MSS 48
+#define TCP_MIN_GSO_SIZE (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE)
/*
* Never offer a window over 32767 without using window scaling. Some
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 3442a26d36d9..56e3460d1f9f 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -282,6 +282,7 @@ enum
LINUX_MIB_TCPKEEPALIVE, /* TCPKeepAlive */
LINUX_MIB_TCPMTUPFAIL, /* TCPMTUPFail */
LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */
+ LINUX_MIB_TCPWQUEUETOOBIG, /* TCPWqueueTooBig */
__LINUX_MIB_MAX
};
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index ec48d8eafc7e..8b221398534b 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -306,6 +306,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
+ SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 85713adf2770..e202babb14d6 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -35,6 +35,8 @@ static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
static int tcp_adv_win_scale_min = -31;
static int tcp_adv_win_scale_max = 31;
+static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS;
+static int tcp_min_snd_mss_max = 65535;
static int ip_ttl_min = 1;
static int ip_ttl_max = 255;
static int tcp_syn_retries_min = 1;
@@ -838,6 +840,15 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "tcp_min_snd_mss",
+ .data = &init_net.ipv4.sysctl_tcp_min_snd_mss,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_min_snd_mss_min,
+ .extra2 = &tcp_min_snd_mss_max,
+ },
{
.procname = "tcp_probe_threshold",
.data = &init_net.ipv4.sysctl_tcp_probe_threshold,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2ededb32b754..ee2822a411f9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3307,6 +3307,7 @@ void __init tcp_init(void)
unsigned long limit;
unsigned int i;
+ BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
FIELD_SIZEOF(struct sk_buff, cb));
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e238539c3497..e2e58bc42ba4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1320,7 +1320,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
TCP_SKB_CB(skb)->seq += shifted;
tcp_skb_pcount_add(prev, pcount);
- BUG_ON(tcp_skb_pcount(skb) < pcount);
+ WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
tcp_skb_pcount_add(skb, -pcount);
/* When we're adding to gso_segs == 1, gso_size will be zero,
@@ -1387,6 +1387,21 @@ static int skb_can_shift(const struct sk_buff *skb)
return !skb_headlen(skb) && skb_is_nonlinear(skb);
}
+int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from,
+ int pcount, int shiftlen)
+{
+ /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)
+ * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
+ * to make sure not storing more than 65535 * 8 bytes per skb,
+ * even if current MSS is bigger.
+ */
+ if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
+ return 0;
+ if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
+ return 0;
+ return skb_shift(to, from, shiftlen);
+}
+
/* Try collapsing SACK blocks spanning across multiple skbs to a single
* skb.
*/
@@ -1398,6 +1413,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *prev;
int mss;
+ int next_pcount;
int pcount = 0;
int len;
int in_sack;
@@ -1495,7 +1511,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
goto fallback;
- if (!skb_shift(prev, skb, len))
+ if (!tcp_skb_shift(prev, skb, pcount, len))
goto fallback;
if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
goto out;
@@ -1514,11 +1530,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
goto out;
len = skb->len;
- if (skb_shift(prev, skb, len)) {
- pcount += tcp_skb_pcount(skb);
- tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
+ next_pcount = tcp_skb_pcount(skb);
+ if (tcp_skb_shift(prev, skb, next_pcount, len)) {
+ pcount += next_pcount;
+ tcp_shifted_skb(sk, skb, state, next_pcount, len, mss, 0);
}
-
out:
state->fack_count += pcount;
return prev;
@@ -2837,9 +2853,9 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
(tcp_fackets_out(tp) > tp->reordering));
- if (WARN_ON(!tp->packets_out && tp->sacked_out))
+ if (!tp->packets_out && tp->sacked_out)
tp->sacked_out = 0;
- if (WARN_ON(!tp->sacked_out && tp->fackets_out))
+ if (!tp->sacked_out && tp->fackets_out)
tp->fackets_out = 0;
/* Now state machine starts.
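The tcp_skb_shift() helper added above bounds how much data the SACK collapsing path may merge into one skb: TCP_SKB_CB(skb)->tcp_gso_segs is a 16-bit counter, and with the minimum forced payload of TCP_MIN_GSO_SIZE (48 - 40 = 8 bytes) a single skb never legitimately needs to describe more than 65535 * 8 bytes. A standalone sketch of the same bound check, with to_len and to_pcount standing in for the destination skb's length and segment count; illustration only, not the kernel helper.

/* Illustration of the limits enforced by tcp_skb_shift() above. */
#include <stdbool.h>
#include <stddef.h>

#define MAX_TCP_OPTION_SPACE	40
#define TCP_MIN_SND_MSS		48
#define TCP_MIN_GSO_SIZE	(TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE)	/* 8 */

/* Return true if shifting 'shiftlen' bytes / 'pcount' segments onto a
 * destination that already holds 'to_len' bytes in 'to_pcount' segments
 * keeps both within what a 16-bit gso_segs field can describe. */
static bool shift_within_limits(size_t to_len, unsigned int to_pcount,
				unsigned int pcount, size_t shiftlen)
{
	if (to_len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE)
		return false;
	if (to_pcount + pcount > 65535)
		return false;
	return true;
}
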
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 82c1064ff4aa..848f2c1da8a5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2456,6 +2456,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_ecn_fallback = 1;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
+ net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6f35cdd5f2f0..d8c6b833f0ce 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1185,6 +1185,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
if (nsize < 0)
nsize = 0;
+ if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
+ return -ENOMEM;
+ }
+
if (skb_unclone(skb, gfp))
return -ENOMEM;
@@ -1355,8 +1360,7 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
mss_now -= icsk->icsk_ext_hdr_len;
/* Then reserve room for full set of TCP options and 8 bytes of data */
- if (mss_now < 48)
- mss_now = 48;
+ mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
return mss_now;
}
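Two hardening steps appear above: tcp_fragment() now refuses to split an skb once sk_wmem_queued exceeds twice sk_sndbuf, bumping the new TCPWqueueTooBig counter and returning -ENOMEM, and __tcp_mtu_to_mss() replaces its hard-coded floor of 48 with the per-netns sysctl so the minimum sending MSS can be raised by the administrator. A minimal sketch of that floor, assuming the default tcp_min_snd_mss of 48; clamp_snd_mss() is an illustrative name, not a kernel function.

/* Illustration of the MSS floor applied in __tcp_mtu_to_mss() above.
 * With the default floor of 48 and up to 40 bytes of TCP options, every
 * segment still carries at least 8 bytes of payload, however small an
 * MSS the peer advertises. */
static int clamp_snd_mss(int mss_now, int tcp_min_snd_mss)
{
	return mss_now > tcp_min_snd_mss ? mss_now : tcp_min_snd_mss;
}
/* e.g. clamp_snd_mss(17, 48) == 48, clamp_snd_mss(1460, 48) == 1460 */
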
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 69523389f067..d9e364c4863a 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -140,6 +140,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
mss = max(mss, 68 - tp->tcp_header_len);
+ mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
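Finally, the MTU-probing fallback in the retransmit timer applies the same floor, so blackhole detection can no longer push the effective sending MSS below tcp_min_snd_mss. A sketch of the resulting clamping chain, with search_low_mss standing in for tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low); mtu_probe_mss() is an illustrative name only.

/* Illustration of the MSS clamping order in tcp_mtu_probing() above. */
static int mtu_probe_mss(int search_low_mss, int tcp_base_mss,
			 int tcp_header_len, int tcp_min_snd_mss)
{
	int mss = search_low_mss >> 1;		/* halve the probed MSS */

	if (mss > tcp_base_mss)			/* cap at tcp_base_mss */
		mss = tcp_base_mss;
	if (mss < 68 - tcp_header_len)		/* keep pre-existing lower bound */
		mss = 68 - tcp_header_len;
	if (mss < tcp_min_snd_mss)		/* new floor added by this patch */
		mss = tcp_min_snd_mss;
	return mss;
}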