master
/*-
 * Copyright (c) 2016-2020 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
25
26#ifndef _NETINET_TCP_BBR_H_
27#define _NETINET_TCP_BBR_H_
28
/* Initial RTO: 1 second, expressed in microseconds. */
#define	BBR_INITIAL_RTO		1000000

/*
 * Send map flags (see r_flags in struct bbr_sendmap).
 */
#define	BBR_ACKED		0x0001	/* The remote endpoint acked this */
#define	BBR_WAS_RENEGED		0x0002	/* The peer reneged the ack */
#define	BBR_RXT_CLEARED		0x0004	/* ACK cleared by the RXT timer */
#define	BBR_OVERMAX		0x0008	/* More retransmits than we can fit */
#define	BBR_SACK_PASSED		0x0010	/* A sack was done above this block */
#define	BBR_WAS_SACKPASS	0x0020	/* We retransmitted due to SACK pass */
#define	BBR_HAS_FIN		0x0040	/* Segment is sent with fin */
#define	BBR_TLP			0x0080	/* Segment sent as tail-loss-probe */
#define	BBR_HAS_SYN		0x0100	/* Segment has the syn */
/* This segment is lost and totaled into bbr->rc_ctl.rc_lost */
#define	BBR_MARKED_LOST		0x0200
/* The peer collapsed the rwnd on the segment */
#define	BBR_RWND_COLLAPSED	0x0400

/* Number of slots in bbr_sendmap.r_tim_lastsent[]. */
#define	BBR_NUM_OF_RETRANS	7
47
/*
 * Values for the socket option that selects which header overheads are
 * included when pacing.
 *
 * NOTE(review): BBR_INCL_TCP_OH (0x03) equals
 * BBR_INCL_ENET_OH | BBR_INCL_IP_OH, so it is not an independent bit.
 * If these are meant to be OR-able flags, 0x04 would be expected; the
 * value is left untouched here because it is visible to callers.
 */
#define	BBR_INCL_ENET_OH	0x01
#define	BBR_INCL_IP_OH		0x02
#define	BBR_INCL_TCP_OH		0x03
52
53/*
54 * With the addition of both measurement algorithms
55 * I had to move over the size of a
56 * cache line (unfortunately). For now there is
57 * no way around this. We may be able to cut back
58 * at some point I hope.
59 */
60struct bbr_sendmap {
61 TAILQ_ENTRY(bbr_sendmap) r_next; /* seq number arrayed next */
62 TAILQ_ENTRY(bbr_sendmap) r_tnext; /* Time of tmit based next */
63 uint32_t r_start; /* Sequence number of the segment */
64 uint32_t r_end; /* End seq, this is 1 beyond actually */
65
66 uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */
67 uint32_t r_delivered; /* Delivered amount at send */
68
69 uint32_t r_del_time; /* The time of the last delivery update */
70 uint8_t r_rtr_cnt:4, /* Retran count, index this -1 to get time
71 * sent */
72 r_rtt_not_allowed:1, /* No rtt measurement allowed */
73 r_is_drain:1, /* In a draining cycle */
74 r_app_limited:1,/* We went app limited */
75 r_ts_valid:1; /* Timestamp field is valid (r_del_ack_ts) */
76 uint8_t r_dupack; /* Dup ack count */
77 uint8_t r_in_tmap:1, /* Flag to see if its in the r_tnext array */
78 r_is_smallmap:1,/* Was logged as a small-map send-map item */
79 r_is_gain:1, /* Was in gain cycle */
80 r_bbr_state:5; /* The BBR state at send */
81 uint8_t r_limit_type; /* is this entry counted against a limit? */
82
83 uint16_t r_flags; /* Flags as defined above */
84 uint16_t r_spare16;
85 uint32_t r_del_ack_ts; /* At send what timestamp of peer was (if r_ts_valid set) */
86 /****************Cache line*****************/
87 uint32_t r_tim_lastsent[BBR_NUM_OF_RETRANS];
88 /*
89 * Question, should we instead just grab the sending b/w
90 * from the filter with the gain and store it in a
91 * uint64_t instead?
92 */
93 uint32_t r_first_sent_time; /* Time of first pkt in flight sent */
94 uint32_t r_pacing_delay; /* pacing delay of this send */
95 uint32_t r_flight_at_send; /* flight at the time of the send */
96#ifdef _KERNEL
97} __aligned(CACHE_LINE_SIZE);
98#else
99};
100#endif
/* Limit type value (presumably for bbr_sendmap.r_limit_type; confirm). */
#define	BBR_LIMIT_TYPE_SPLIT	1

/* Tail queue head type for lists of struct bbr_sendmap. */
TAILQ_HEAD(bbr_head, bbr_sendmap);

#define	BBR_SEGMENT_TIME_SIZE	1500	/* How many bytes in time_between */

#define	BBR_MIN_SEG		1460	/* MSS size */
#define	BBR_MAX_GAIN_VALUE	0xffff

#define	BBR_TIMER_FUDGE		1500	/* 1.5ms in microseconds */
111
/*
 * BW twiddle secret codes.
 */
/* We enter recovery and set using b/w */
#define	BBR_RED_BW_CONGSIG		0
/* We are calculating the loss rate */
#define	BBR_RED_BW_RATECAL		1
/* We are dropping the lower b/w with cDR */
#define	BBR_RED_BW_USELRBW		2
/* We have set our highloss value at exit from probe-rtt */
#define	BBR_RED_BW_SETHIGHLOSS		3
/* We have decided to clear the reduction early */
#define	BBR_RED_BW_PE_CLREARLY		4
/* We are clearing it on schedule, delayed */
#define	BBR_RED_BW_PE_CLAFDEL		5
/* Recover exits; save high if needed and clear to start measuring */
#define	BBR_RED_BW_REC_ENDCLL		6
/* Set pkt epoch judged that we do not get out of jail early */
#define	BBR_RED_BW_PE_NOEARLY_OUT	7

/* For calculating a rate */
#define	BBR_CALC_BW			1
#define	BBR_CALC_LOSS			2

#define	BBR_RTT_BY_TIMESTAMP		0
#define	BBR_RTT_BY_EXACTMATCH		1
#define	BBR_RTT_BY_EARLIER_RET		2
#define	BBR_RTT_BY_THIS_RETRAN		3
#define	BBR_RTT_BY_SOME_RETRAN		4
#define	BBR_RTT_BY_TSMATCHING		5
137
/* Markers to track where we enter persists from */
#define	BBR_PERSISTS_FROM_1	1
#define	BBR_PERSISTS_FROM_2	2
#define	BBR_PERSISTS_FROM_3	3
#define	BBR_PERSISTS_FROM_4	4
#define	BBR_PERSISTS_FROM_5	5

/* Magic cookies to ask for the RTT */
#define	BBR_RTT_PROP		0
#define	BBR_RTT_RACK		1
#define	BBR_RTT_PKTRTT		2
#define	BBR_SRTT		3

#define	BBR_SACKED		0
#define	BBR_CUM_ACKED		1

/* Threshold in useconds where we consider we need a higher min cwnd */
#define	BBR_HIGH_SPEED		1000
#define	BBR_HIGHSPEED_NUM_MSS	12

/*
 * What is the maximum times we are willing to reduce b/w in RTX's.
 * Setting this has a multiplicative effect, e.g. if we are reducing
 * by 20% then setting it to 3 means you will have reduced the b/w
 * estimate by > 60% before you stop.
 */
#define	MAX_REDUCE_RXT		3
/*
 * We use the rate sample structure to assist in single sack/ack rate
 * and rtt calculation.  In the future we will expand this in BBR to
 * do forward rate sample b/w estimation.
 */
/* Values for bbr_rtt_sample.rs_flags */
#define	BBR_RS_RTT_EMPTY	0x00000001	/* Nothing yet stored in RTT's */
#define	BBR_RS_BW_EMPTY		0x00000002	/* Nothing yet stored in cDR */
#define	BBR_RS_RTT_VALID	0x00000004	/* We have at least one valid RTT */
#define	BBR_RS_BW_VAILD		0x00000008	/* We have a valid cDR */
#define	BBR_RS_EMPTY		(BBR_RS_RTT_EMPTY|BBR_RS_BW_EMPTY)

struct bbr_rtt_sample {
	uint32_t rs_flags;		/* BBR_RS_* flags */
	uint32_t rs_rtt_lowest;
	uint32_t rs_rtt_lowest_sendtime;
	uint32_t rs_rtt_low_seq_start;

	uint32_t rs_rtt_highest;
	uint32_t rs_rtt_cnt;

	uint64_t rs_rtt_tot;
	uint32_t cur_rtt;
	uint32_t cur_rtt_bytecnt;

	uint32_t cur_rtt_rsmcnt;
	uint32_t
		rc_crtt_set:1,
		avail_bits:31;		/* Remaining bits of this word */
	uint64_t rs_cDR;
};
194
/* RTT shrink reasons */
#define	BBR_RTTS_INIT			0
#define	BBR_RTTS_NEWRTT			1
#define	BBR_RTTS_RTTPROBE		2
#define	BBR_RTTS_WASIDLE		3
#define	BBR_RTTS_PERSIST		4
#define	BBR_RTTS_REACHTAR		5
#define	BBR_RTTS_ENTERPROBE		6
#define	BBR_RTTS_SHRINK_PG		7
#define	BBR_RTTS_SHRINK_PG_FINAL	8
#define	BBR_RTTS_NEW_TARGET		9
#define	BBR_RTTS_LEAVE_DRAIN		10
#define	BBR_RTTS_RESETS_VALUES		11

#define	BBR_NUM_RATES			5

/* Rate flags */
#define	BBR_RT_FLAG_FREE	0x00	/* Is on the free list */
#define	BBR_RT_FLAG_INUSE	0x01	/* Has been allocated */
#define	BBR_RT_FLAG_READY	0x02	/* Ready to initiate a measurement */
#define	BBR_RT_FLAG_CAPPED_PRE	0x04	/* Ready to cap if we send the next segment */
#define	BBR_RT_FLAG_CAPPED	0x08	/* Measurement is capped */
#define	BBR_RT_FLAG_PASTFA	0x10	/* Past the first ack */
#define	BBR_RT_FLAG_LIMITED	0x20	/* Saw application/cwnd or rwnd limited period */
#define	BBR_RT_SEEN_A_ACK	0x40	/* An ack has been saved */
#define	BBR_RT_PREV_RTT_SET	0x80	/* There was a RTT set in */
/* There was a RTT send time set that can be used; no snd_limits */
#define	BBR_RT_PREV_SEND_TIME	0x100
#define	BBR_RT_SET_GRADIENT	0x200
#define	BBR_RT_TS_VALID		0x400
226
/*
 * One BBR log record.  The "UU" and "xx" tags on individual fields are
 * carried over from the original author; their meaning is not spelled
 * out in this header.
 */
struct bbr_log {
	union {
		struct bbr_sendmap *rsm;	/* For alloc/free */
		uint64_t sb_acc;		/* For out/ack or t-o */
	};
	struct tcpcb *tp;
	uint32_t t_flags;
	uint32_t th_seq;
	uint32_t th_ack;
	uint32_t snd_una;
	uint32_t snd_nxt;
	uint32_t snd_max;
	uint32_t snd_cwnd;
	uint32_t snd_wnd;
	uint32_t rc_lost;
	uint32_t target_cwnd;		/* UU */
	uint32_t inflight;		/* UU */
	uint32_t applimited;		/* UU */

	/* Things for BBR */
	uint32_t delivered;		/* UU */
	uint64_t cur_del_rate;		/* UU */
	uint64_t delRate;		/* UU */
	uint64_t rttProp;		/* UU */
	uint64_t lt_bw;			/* UU */
	uint32_t timeStamp;
	uint32_t time;
	uint32_t slot;			/* UU */
	uint32_t delayed_by;
	uint32_t exp_del;
	uint32_t pkts_out;
	uint32_t new_win;
	uint32_t hptsi_gain;		/* UU */
	uint32_t cwnd_gain;		/* UU */
	uint32_t epoch;			/* UU */
	uint32_t lt_epoch;		/* UU */

	/* Sack fun */
	uint32_t blk_start[4];		/* xx */
	uint32_t blk_end[4];
	uint32_t len;			/* Timeout T3=1, TLP=2, RACK=3 */
	uint8_t type;
	uint8_t n_sackblks;
	uint8_t applied;		/* UU */
	uint8_t inhpts;			/* UU */
	uint8_t __spare;		/* UU */
	uint8_t use_lt_bw;		/* UU */
};
273
274struct bbr_log_sysctl_out {
275 uint32_t bbr_log_at;
276 uint32_t bbr_log_max;
277 struct bbr_log entries[0];
278};
279
/*
 * Magic numbers for logging timeout events if the logging is enabled.
 */
#define	BBR_TO_FRM_TMR			1
#define	BBR_TO_FRM_TLP			2
#define	BBR_TO_FRM_RACK			3
#define	BBR_TO_FRM_KEEP			4
#define	BBR_TO_FRM_PERSIST		5
#define	BBR_TO_FRM_DELACK		6

#define	BBR_SEES_STRETCH_ACK		1
#define	BBR_SEES_COMPRESSED_ACKS	2
293
/*
 * As we get each SACK we wade through the
 * rc_map and mark off what is acked.
 * We also increment rc_sacked as well.
 *
 * We also pay attention to missing entries
 * based on the time and possibly mark them
 * for retransmit. If we do and we are not already
 * in recovery we enter recovery. In doing
 * so we clear prr_delivered/holes_rxt and prr_sent_dur_rec.
 * We also setup rc_next/rc_snd_nxt/rc_send_end so
 * we will know where to send from. When not in
 * recovery rc_next will be NULL and rc_snd_nxt should
 * equal snd_max.
 *
 * Whenever we retransmit from recovery we increment
 * rc_holes_rxt as we retran a block and mark it as retransmitted
 * with the time it was sent. During non-recovery sending we
 * add to our map and note the time down of any send expanding
 * the rc_map at the tail and moving rc_snd_nxt up with snd_max.
 *
 * In recovery during SACK/ACK processing if a chunk has
 * been retransmitted and it is now acked, we decrement rc_holes_rxt.
 * When we retransmit from the scoreboard we use
 * rc_next and rc_snd_nxt/rc_send_end to help us
 * find what needs to be retran.
 *
 * To calculate pipe we simply take (snd_max - snd_una) + rc_holes_rxt
 * This gets us the effect of RFC6675 pipe, counting twice for
 * bytes retransmitted.
 */
325
#define	TT_BBR_FR_TMR			0x2001

/* BBR_UNIT is 1 shifted left by BBR_SCALE bits. */
#define	BBR_SCALE			8
#define	BBR_UNIT			(1 << BBR_SCALE)

/* How many pkt-rtts do we keep delivery rate for */
#define	BBR_NUM_RTTS_FOR_DEL_LIMIT	8
/* How many pkt-rtts do we keep delivery rate for (google) */
#define	BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT	10

#define	BBR_SECONDS_NO_RTT		10	/* 10 seconds with no RTT shrinkage */
#define	BBR_PROBERTT_MAX		200	/* 200ms */
#define	BBR_PROBERTT_NUM_MSS		4
#define	BBR_STARTUP_EPOCHS		3
/*
 * Time conversion helpers.  BBR keeps time in usec's.
 *
 * USECS_IN_SECOND and MS_IN_USEC are not defined in this header; they
 * must be provided by whatever includes it.
 *
 * Every macro argument is parenthesized so that a compound expression
 * such as BBR_TIME_TO_MILLI(a + b) converts the whole sum rather than
 * only its last operand.
 */
#define	USECS_IN_MSEC		1000
#define	BBR_TIME_TO_SECONDS(a)	((a) / USECS_IN_SECOND)
#define	BBR_TIME_TO_MILLI(a)	((a) / MS_IN_USEC)

/* usec -> msec: divide by 1000 and round up */
#define	BBR_TS_TO_MS(t)		(((t) + 999) / MS_IN_USEC)
346
/*
 * Locking for the rack control block.
 * a) Locked by INP_WLOCK
 * b) Locked by the hpts-mutex
 */
#define	BBR_STATE_STARTUP	0x01
#define	BBR_STATE_DRAIN		0x02
#define	BBR_STATE_PROBE_BW	0x03
#define	BBR_STATE_PROBE_RTT	0x04
#define	BBR_STATE_IDLE_EXIT	0x05

/* Substate defines for STATE == PROBE_BW */
#define	BBR_SUB_GAIN		0	/* State 0 where we are 5/4 BBR_UNIT */
#define	BBR_SUB_DRAIN		1	/* State 1 where we are at 3/4 BBR_UNIT */
#define	BBR_SUB_LEVEL1		2	/* State 1 first BBR_UNIT */
#define	BBR_SUB_LEVEL2		3	/* State 2nd BBR_UNIT */
#define	BBR_SUB_LEVEL3		4	/* State 3rd BBR_UNIT */
#define	BBR_SUB_LEVEL4		5	/* State 4th BBR_UNIT */
#define	BBR_SUB_LEVEL5		6	/* State 5th BBR_UNIT */
#define	BBR_SUB_LEVEL6		7	/* State last BBR_UNIT */
#define	BBR_SUBSTATE_COUNT	8

/* Single remaining reduce log */
#define	BBR_REDUCE_AT_FR	5

#define	BBR_BIG_LOG_SIZE	300000
374
/*
 * BBR statistics.  Every member is a uint64_t; the trailing number on
 * each line is that member's position counted in uint64_t units from
 * the start of the structure.
 */
struct bbr_stats {
	uint64_t bbr_badfr;			/* 0 */
	uint64_t bbr_badfr_bytes;		/* 1 */
	uint64_t bbr_saw_oerr;			/* 2 */
	uint64_t bbr_saw_emsgsiz;		/* 3 */
	uint64_t bbr_reorder_seen;		/* 4 */
	uint64_t bbr_tlp_tot;			/* 5 */
	uint64_t bbr_tlp_newdata;		/* 6 */
	uint64_t bbr_offset_recovery;		/* 7 */
	uint64_t bbr_tlp_retran_fail;		/* 8 */
	uint64_t bbr_to_tot;			/* 9 */
	uint64_t bbr_to_arm_rack;		/* 10 */
	uint64_t bbr_enter_probertt;		/* 11 */
	uint64_t bbr_tlp_set;			/* 12 */
	uint64_t bbr_resends_set;		/* 13 */
	uint64_t bbr_force_output;		/* 14 */
	uint64_t bbr_to_arm_tlp;		/* 15 */
	uint64_t bbr_paced_segments;		/* 16 */
	uint64_t bbr_saw_enobuf;		/* 17 */
	uint64_t bbr_to_alloc_failed;		/* 18 */
	uint64_t bbr_to_alloc_emerg;		/* 19 */
	uint64_t bbr_sack_proc_all;		/* 20 */
	uint64_t bbr_sack_proc_short;		/* 21 */
	uint64_t bbr_sack_proc_restart;		/* 22 */
	uint64_t bbr_to_alloc;			/* 23 */
	uint64_t bbr_offset_drop;		/* 24 */
	uint64_t bbr_runt_sacks;		/* 25 */
	uint64_t bbr_sack_passed;		/* 26 */
	uint64_t bbr_rlock_left_ret0;		/* 27 */
	uint64_t bbr_rlock_left_ret1;		/* 28 */
	uint64_t bbr_dynamic_rwnd;		/* 29 */
	uint64_t bbr_static_rwnd;		/* 30 */
	uint64_t bbr_sack_blocks;		/* 31 */
	uint64_t bbr_sack_blocks_skip;		/* 32 */
	uint64_t bbr_sack_search_both;		/* 33 */
	uint64_t bbr_sack_search_fwd;		/* 34 */
	uint64_t bbr_sack_search_back;		/* 35 */
	uint64_t bbr_plain_acks;		/* 36 */
	uint64_t bbr_acks_with_sacks;		/* 37 */
	uint64_t bbr_progress_drops;		/* 38 */
	uint64_t bbr_early;			/* 39 */
	uint64_t bbr_reneges_seen;		/* 40 */
	uint64_t bbr_persist_reneg;		/* 41 */
	uint64_t bbr_dropped_af_data;		/* 42 */
	uint64_t bbr_failed_mbuf_aloc;		/* 43 */
	uint64_t bbr_cwnd_limited;		/* 44 */
	uint64_t bbr_rwnd_limited;		/* 45 */
	uint64_t bbr_app_limited;		/* 46 */
	uint64_t bbr_force_timer_start;		/* 47 */
	uint64_t bbr_hpts_min_time;		/* 48 */
	uint64_t bbr_meets_tso_thresh;		/* 49 */
	uint64_t bbr_miss_tso_rwnd;		/* 50 */
	uint64_t bbr_miss_tso_cwnd;		/* 51 */
	uint64_t bbr_miss_tso_app;		/* 52 */
	uint64_t bbr_miss_retran;		/* 53 */
	uint64_t bbr_miss_tlp;			/* 54 */
	uint64_t bbr_miss_unknown;		/* 55 */
	uint64_t bbr_hdwr_rl_add_ok;		/* 56 */
	uint64_t bbr_hdwr_rl_add_fail;		/* 57 */
	uint64_t bbr_hdwr_rl_mod_ok;		/* 58 */
	uint64_t bbr_hdwr_rl_mod_fail;		/* 59 */
	uint64_t bbr_collapsed_win;		/* 60 */
	uint64_t bbr_alloc_limited;		/* 61 */
	uint64_t bbr_alloc_limited_conns;	/* 62 */
	uint64_t bbr_split_limited;		/* 63 */
};
441
/*
 * The structure bbr_opts_stats is a simple way to see how many
 * options are being changed in the stack.  Every member is a uint64_t.
 */
struct bbr_opts_stats {
	uint64_t tcp_bbr_pace_per_sec;
	uint64_t tcp_bbr_pace_del_tar;
	uint64_t tcp_bbr_pace_seg_max;
	uint64_t tcp_bbr_pace_seg_min;
	uint64_t tcp_bbr_pace_cross;
	uint64_t tcp_bbr_drain_inc_extra;
	uint64_t tcp_bbr_unlimited;
	uint64_t tcp_bbr_iwintso;
	uint64_t tcp_bbr_rec_over_hpts;
	uint64_t tcp_bbr_recforce;
	uint64_t tcp_bbr_startup_pg;
	uint64_t tcp_bbr_drain_pg;
	uint64_t tcp_bbr_rwnd_is_app;
	uint64_t tcp_bbr_probe_rtt_int;
	uint64_t tcp_bbr_one_retran;
	uint64_t tcp_bbr_startup_loss_exit;
	uint64_t tcp_bbr_use_lowgain;
	uint64_t tcp_bbr_lowgain_thresh;
	uint64_t tcp_bbr_lowgain_half;
	uint64_t tcp_bbr_lowgain_fd;
	uint64_t tcp_bbr_usedel_rate;
	uint64_t tcp_bbr_min_rto;
	uint64_t tcp_bbr_max_rto;
	uint64_t tcp_rack_pace_max_seg;
	uint64_t tcp_rack_min_to;
	uint64_t tcp_rack_reord_thresh;
	uint64_t tcp_rack_reord_fade;
	uint64_t tcp_rack_tlp_thresh;
	uint64_t tcp_rack_pkt_delay;
	uint64_t tcp_bbr_startup_exit_epoch;
	uint64_t tcp_bbr_ack_comp_alg;
	uint64_t tcp_rack_cheat;
	uint64_t tcp_iwnd_tso;
	uint64_t tcp_utter_max_tso;
	uint64_t tcp_hdwr_pacing;
	uint64_t tcp_extra_state;
	uint64_t tcp_floor_min_tso;

	/* New */
	uint64_t tcp_bbr_algorithm;
	uint64_t tcp_bbr_tslimits;
	uint64_t tcp_bbr_probertt_len;
	uint64_t tcp_bbr_probertt_gain;
	uint64_t tcp_bbr_topaceout;
	uint64_t tcp_use_rackcheat;
	uint64_t tcp_delack;
	uint64_t tcp_maxpeak;
	uint64_t tcp_retran_wtso;
	uint64_t tcp_data_ac;
	uint64_t tcp_ts_raises;
	uint64_t tcp_pacing_oh_tmr;
	uint64_t tcp_pacing_oh;
	uint64_t tcp_policer_det;
};
501
#ifdef _KERNEL
/*
 * Counter arrays backing struct bbr_stats and struct bbr_opts_stats.
 * Each array holds one counter per uint64_t member of its structure,
 * and the ADD/INC macros turn a member name into an array index via
 * offsetof(member) / sizeof(uint64_t).
 */
#define	BBR_STAT_SIZE	(sizeof(struct bbr_stats)/sizeof(uint64_t))
extern counter_u64_t bbr_stat_arry[BBR_STAT_SIZE];
#define	BBR_STAT_ADD(name, amm)						\
	counter_u64_add(bbr_stat_arry[(offsetof(struct bbr_stats, name)/sizeof(uint64_t))], (amm))
#define	BBR_STAT_INC(name)	BBR_STAT_ADD(name, 1)

/*
 * The option counters are indexed by members of struct bbr_opts_stats,
 * so the array is sized from that structure (not struct bbr_stats).
 */
#define	BBR_OPTS_SIZE	(sizeof(struct bbr_opts_stats)/sizeof(uint64_t))
extern counter_u64_t bbr_opts_arry[BBR_OPTS_SIZE];
#define	BBR_OPTS_ADD(name, amm)						\
	counter_u64_add(bbr_opts_arry[(offsetof(struct bbr_opts_stats, name)/sizeof(uint64_t))], (amm))
#define	BBR_OPTS_INC(name)	BBR_OPTS_ADD(name, 1)
#endif
512
#define	BBR_NUM_LOSS_RATES	3
#define	BBR_NUM_BW_RATES	3

/* Recovery RTT classes, lowest to highest. */
#define	BBR_RECOVERY_LOWRTT	1
#define	BBR_RECOVERY_MEDRTT	2
#define	BBR_RECOVERY_HIGHRTT	3
#define	BBR_RECOVERY_EXTREMERTT	4
520
521struct bbr_control {
522 /*******************************/
523 /* Cache line 2 from bbr start */
524 /*******************************/
525 struct bbr_head rc_map; /* List of all segments Lock(a) */
526 struct bbr_head rc_tmap; /* List in transmit order Lock(a) */
527 struct bbr_sendmap *rc_resend; /* something we have been asked to
528 * resend */
529 uint32_t rc_last_delay_val; /* How much we expect to delay Lock(a) */
530 uint32_t rc_bbr_hptsi_gain:16, /* Current hptsi gain Lock(a) */
531 rc_hpts_flags:16; /* flags on whats on the pacer wheel */
532
533 uint32_t rc_delivered; /* BRR delivered amount Lock(a) */
534 uint32_t rc_hptsi_agg_delay; /* How much time are we behind */
535
536 uint32_t rc_flight_at_input;
537 uint32_t rc_lost_bytes; /* Total bytes currently marked lost */
538 /*******************************/
539 /* Cache line 3 from bbr start */
540 /*******************************/
541 struct time_filter rc_delrate;
542 /*******************************/
543 /* Cache line 4 from bbr start */
544 /*******************************/
545 struct bbr_head rc_free; /* List of Free map entries Lock(a) */
546 struct bbr_sendmap *rc_tlp_send; /* something we have been
547 * asked to resend */
548 uint32_t rc_del_time;
549 uint32_t rc_target_at_state; /* Target for a state */
550
551 uint16_t rc_free_cnt; /* Number of free entries on the rc_free list
552 * Lock(a) */
553 uint16_t rc_startup_pg;
554
555 uint32_t cur_rtt; /* Last RTT from ack */
556
557 uint32_t rc_went_idle_time; /* Used for persits to see if its
558 * probe-rtt qualified */
559 uint32_t rc_pace_max_segs:17, /* How much in any single TSO we send Lock(a) */
560 rc_pace_min_segs:15; /* The minimum single segment size before we enter persists */
561
562 uint32_t rc_rtt_shrinks; /* Time of last rtt shrinkage Lock(a) */
563 uint32_t r_app_limited_until;
564 uint32_t rc_timer_exp; /* If a timer ticks of expiry */
565 uint32_t rc_rcv_epoch_start; /* Start time of the Epoch Lock(a) */
566
567 /*******************************/
568 /* Cache line 5 from bbr start */
569 /*******************************/
570
571 uint32_t rc_lost_at_pktepoch; /* what the lost value was at the last
572 * pkt-epoch */
573 uint32_t r_measurement_count; /* count of measurement applied lock(a) */
574
575 uint32_t rc_last_tlp_seq; /* Last tlp sequence Lock(a) */
576 uint16_t rc_reorder_shift; /* Socket option value Lock(a) */
577 uint16_t rc_pkt_delay; /* Socket option value Lock(a) */
578
579 struct bbr_sendmap *rc_sacklast; /* sack remembered place
580 * Lock(a) */
581 struct bbr_sendmap *rc_next; /* remembered place where we next
582 * retransmit at Lock(a) */
583
584 uint32_t rc_sacked; /* Tot sacked on scoreboard Lock(a) */
585 uint32_t rc_holes_rxt; /* Tot retraned from scoreboard Lock(a) */
586
587 uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */
588 uint32_t rc_init_rwnd; /* Initial rwnd when we transitioned */
589 /*- ---
590 * used only initial and close
591 */
592 uint32_t rc_high_rwnd; /* Highest rwnd seen */
593 uint32_t rc_lowest_rtt; /* Smallest RTT we have seen */
594
595 uint32_t rc_last_rtt; /* Last valid measured RTT that ack'd data */
596 uint32_t bbr_cross_over;
597
598 /*******************************/
599 /* Cache line 6 from bbr start */
600 /*******************************/
601 struct sack_filter bbr_sf;
602
603 /*******************************/
604 /* Cache line 7 from bbr start */
605 /*******************************/
606 struct time_filter_small rc_rttprop;
607 uint32_t last_inbound_ts; /* Peers last timestamp */
608
609 uint32_t rc_inc_tcp_oh: 1,
610 rc_inc_ip_oh: 1,
611 rc_inc_enet_oh:1,
612 rc_incr_tmrs:1,
613 restrict_growth:28;
614 uint32_t rc_lt_epoch_use; /* When we started lt-bw use Lock(a) */
615
616 uint32_t rc_recovery_start; /* Time we start recovery Lock(a) */
617 uint32_t rc_lt_del; /* Delivered at lt bw sampling start Lock(a) */
618
619 uint64_t rc_bbr_cur_del_rate; /* Current measured delivery rate
620 * Lock(a) */
621
622 /*******************************/
623 /* Cache line 8 from bbr start */
624 /*******************************/
625 uint32_t rc_cwnd_on_ent; /* On entry to recovery the cwnd
626 * Lock(a) */
627 uint32_t rc_agg_early; /* aggregate amount early */
628
629 uint32_t rc_rcvtime; /* When we last received data Lock(a) */
630 uint32_t rc_pkt_epoch_del; /* seq num that we need for RTT epoch */
631
632 uint32_t rc_pkt_epoch; /* Epoch based on packet RTTs */
633 uint32_t rc_pkt_epoch_time; /* Time we started the pkt epoch */
634
635 uint32_t rc_pkt_epoch_rtt; /* RTT using the packet epoch */
636 uint32_t rc_rtt_epoch; /* Current RTT epoch, it ticks every rttProp
637 * Lock(a) */
638 uint32_t lowest_rtt;
639 uint32_t bbr_smallest_srtt_this_state;
640
641 uint32_t rc_lt_epoch; /* LT epoch start of bw_sampling */
642 uint32_t rc_lost_at_startup;
643
644 uint32_t rc_bbr_state_atflight;
645 uint32_t rc_bbr_last_startup_epoch; /* Last startup epoch where we
646 * increased 20% */
647 uint32_t rc_bbr_enters_probertt; /* Timestamp we entered
648 * probertt Lock(a) */
649 uint32_t rc_lt_time; /* Time of lt sampling start Lock(a) */
650
651 /*******************************/
652 /* Cache line 9 from bbr start */
653 /*******************************/
654 uint64_t rc_lt_bw; /* LT bw calculated Lock(a) */
655 uint64_t rc_bbr_lastbtlbw; /* For startup, what was last btlbw I
656 * saw to check the 20% gain Lock(a) */
657
658 uint32_t rc_bbr_cwnd_gain; /* Current cwnd gain Lock(a) */
659 uint32_t rc_pkt_epoch_loss_rate; /* pkt-epoch loss rate */
660
661 uint32_t rc_saved_cwnd; /* Saved cwnd during Probe-rtt drain Lock(a) */
662 uint32_t substate_pe;
663
664 uint32_t rc_lost; /* Number of bytes lost Lock(a) */
665 uint32_t rc_exta_time_gd; /* How much extra time we got in d/g */
666
667 uint32_t rc_lt_lost; /* Number of lt bytes lost at sampling start
668 * Lock(a) */
669 uint32_t rc_bbr_state_time;
670
671 uint32_t rc_min_to; /* Socket option value Lock(a) */
672 uint32_t rc_initial_hptsi_bw; /* Our initial startup bw Lock(a) */
673
674 uint32_t bbr_lost_at_state; /* Temp counter debug lost value as we
675 * enter a state */
676 /*******************************/
677 /* Cache line 10 from bbr start */
678 /*******************************/
679 uint32_t rc_level_state_extra;
680 uint32_t rc_red_cwnd_pe;
681 const struct tcp_hwrate_limit_table *crte;
682 uint64_t red_bw;
683
684 uint32_t rc_probertt_int;
685 uint32_t rc_probertt_srttchktim; /* Time we last did a srtt
686 * check */
687 uint32_t gain_epoch; /* Epoch we should be out of gain */
688 uint32_t rc_min_rto_ms;
689
690 uint32_t rc_reorder_fade; /* Socket option value Lock(a) */
691 uint32_t last_startup_measure;
692
693 int32_t bbr_hptsi_per_second;
694 int32_t bbr_hptsi_segments_delay_tar;
695
696 int32_t bbr_hptsi_segments_max;
697 uint32_t bbr_rttprobe_gain_val;
698 /*******************************/
699 /* Cache line 11 from bbr start */
700 /*******************************/
701 uint32_t cur_rtt_send_time; /* Time we sent our rtt measured packet */
702 uint32_t bbr_peer_tsratio; /* Our calculated ts ratio to multply */
703 uint32_t bbr_ts_check_tstmp; /* When we filled it the TS that came on the ack */
704 uint32_t bbr_ts_check_our_cts; /* When we filled it the cts of the send */
705 uint32_t rc_tlp_rxt_last_time;
706 uint32_t bbr_smallest_srtt_state2;
707 uint32_t bbr_hdwr_cnt_noset_snt; /* count of hw pacing sends during delay */
708 uint32_t startup_last_srtt;
709 uint32_t rc_ack_hdwr_delay;
710 uint32_t highest_hdwr_delay; /* Largest delay we have seen from hardware */
711 uint32_t non_gain_extra;
712 uint32_t recovery_lr; /* The sum of the loss rate from the pe's during recovery */
713 uint32_t last_in_probertt;
714 uint32_t flightsize_at_drain; /* In draining what was the last marked flight size */
715 uint32_t rc_pe_of_prtt; /* PE we went into probe-rtt */
716 uint32_t ts_in; /* ts that went with the last rtt */
717
718 uint16_t rc_tlp_seg_send_cnt; /* Number of times we have TLP sent
719 * rc_last_tlp_seq Lock(a) */
720 uint16_t rc_drain_pg;
721 uint32_t rc_num_maps_alloced; /* num send map entries allocated */
722 uint32_t rc_num_split_allocs; /* num split map entries allocated */
723 uint16_t rc_num_small_maps_alloced; /* Number of sack blocks
724 * allocated */
725 uint16_t bbr_hptsi_bytes_min;
726
727 uint16_t bbr_hptsi_segments_floor;
728 uint16_t bbr_utter_max;
729 uint16_t bbr_google_discount;
730
731};
732
733struct socket;
734struct tcp_bbr {
735 /* First cache line 0x00 */
736 int32_t(*r_substate) (struct mbuf *, struct tcphdr *,
737 struct socket *, struct tcpcb *, struct tcpopt *,
738 int32_t, int32_t, uint32_t, int32_t, int32_t, uint8_t); /* Lock(a) */
739 struct tcpcb *rc_tp; /* The tcpcb Lock(a) */
740 struct inpcb *rc_inp; /* The inpcb Lock(a) */
741 struct timeval rc_tv;
742 uint32_t rc_pacer_started; /* Time we started the pacer */
743 uint16_t no_pacing_until:8, /* No pacing until N packet epochs */
744 ts_can_raise:1,/* TS b/w calculations can raise the bw higher */
745 skip_gain:1, /* Skip the gain cycle (hardware pacing) */
746 gain_is_limited:1, /* With hardware pacing we are limiting gain */
747 output_error_seen:1,
748 oerror_cnt:4,
749 hw_pacing_set:1; /* long enough has passed for us to start pacing */
750 uint16_t xxx_r_ack_count; /* During recovery count of ack's received
751 * that added data since output */
752 uint16_t bbr_segs_rcvd; /* In Segment count since we sent a ack */
753
754 uint8_t bbr_timer_src:4, /* Used for debugging Lock(a) */
755 bbr_use_rack_cheat:1, /* Use the rack cheat */
756 bbr_init_win_cheat:1, /* Send full IW for TSO */
757 bbr_attempt_hdwr_pace:1,/* Try to do hardware pacing */
758 bbr_hdrw_pacing:1; /* Hardware pacing is available */
759 uint8_t bbr_hdw_pace_ena:1, /* Does the connection allow hardware pacing to be attempted */
760 bbr_prev_in_rec:1, /* We were previously in recovery */
761 pkt_conservation:1,
762 use_policer_detection:1,
763 xxx_bbr_hdw_pace_idx:4; /* If hardware pacing is on, index to slot in pace tbl */
764 uint16_t r_wanted_output:1,
765 rtt_valid:1,
766 rc_timer_first:1,
767 rc_output_starts_timer:1,
768 rc_resends_use_tso:1,
769 rc_all_timers_stopped:1,
770 rc_loss_exit:1,
771 rc_ack_was_delayed:1,
772 rc_lt_is_sampling:1,
773 rc_filled_pipe:1,
774 rc_tlp_new_data:1,
775 rc_hit_state_1:1,
776 rc_ts_valid:1,
777 rc_prtt_set_ts:1,
778 rc_is_pkt_epoch_now:1,
779 rc_has_collapsed:1;
780
781 uint8_t r_state:4, /* Current bbr state Lock(a) */
782 r_agg_early_set:1, /* Did we get called early */
783 r_init_rtt:1,
784 r_use_policer:1, /* For google mode only */
785 r_recovery_bw:1;
786 uint8_t r_timer_override:1, /* pacer override Lock(a) 0/1 */
787 rc_in_persist:1,
788 rc_lt_use_bw:1,
789 rc_allow_data_af_clo:1,
790 rc_tlp_rtx_out:1, /* A TLP is in flight */
791 rc_tlp_in_progress:1, /* a TLP timer is running needed? */
792 rc_use_idle_restart:1; /* Do we restart fast after idle (persist or applim) */
793 uint8_t rc_bbr_state:3, /* What is the major BBR state */
794 rc_bbr_substate:3, /* For probeBW state */
795 r_is_v6:1,
796 rc_past_init_win:1;
797 uint8_t rc_last_options;
798 uint8_t rc_tlp_threshold; /* Socket option value Lock(a) */
799 uint8_t rc_max_rto_sec;
800 uint8_t rc_cwnd_limited:1, /* We are cwnd limited */
801 rc_tmr_stopped:7; /* What timers have been stopped */
802 uint8_t rc_use_google:1,
803 rc_use_ts_limit:1,
804 rc_ts_data_set:1, /* We have filled a set point to determine */
805 rc_ts_clock_set:1, /* We have determined the ts type */
806 rc_ts_cant_be_used:1, /* We determined we can't use ts values */
807 rc_ack_is_cumack:1,
808 rc_no_pacing:1,
809 alloc_limit_reported:1;
810 uint8_t rc_init_win;
811 /* Cache line 2 0x40 */
812 struct bbr_control r_ctl;
813#ifdef _KERNEL
814} __aligned(CACHE_LINE_SIZE);
815#else
816};
817#endif
818
819#endif