master
1/*-
2 *
3 * SPDX-License-Identifier: BSD-3-Clause
4 *
5 * Copyright (c) 2018-2020
6 * Netflix Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 */
30/**
31 * Author: Randall Stewart <rrs@netflix.com>
32 */
33#ifndef __tcp_ratelimit_h__
34#define __tcp_ratelimit_h__
35
36struct m_snd_tag;
37
/*
 * Divisors used when sizing a pacing burst as bw / divisor.  The
 * non-RATELIMIT tcp_get_pacing_burst_size_w_divisor() in this header
 * replaces any divisor below RL_MIN_DIVISOR with RL_DEFAULT_DIVISOR.
 */
38#define RL_MIN_DIVISOR 50
39#define RL_DEFAULT_DIVISOR 1000
40
41/* Flags on an individual rate */
42#define HDWRPACE_INITED 0x0001
43#define HDWRPACE_TAGPRESENT 0x0002
44#define HDWRPACE_IFPDEPARTED 0x0004
/*
 * One hardware pacing rate.  Entries are reached through the rs_rlt
 * array of a struct tcp_rate_set, and ptbl points back at that set
 * (tcp_hw_highest_rate() walks rle->ptbl->rs_rlt[]).
 */
45struct tcp_hwrate_limit_table {
46 const struct tcp_rate_set *ptbl; /* Pointer to parent table */
47 struct m_snd_tag *tag; /* Send tag if needed (chelsio) */
48 long rate; /* Rate we get in Bytes per second (Bps) */
49 long using; /* How many flows are using this hdwr rate. */
50 long rs_num_enobufs; /* NOTE(review): presumably a count of ENOBUFS events at this rate (see tcp_rl_log_enobuf()) - confirm */
51 uint32_t time_between; /* Time-Gap between packets at this rate */
52 uint32_t flags; /* NOTE(review): presumably the HDWRPACE_* bits above - confirm */
53};
54
55/* Rateset flags */
56#define RS_IS_DEFF 0x0001 /* It's a lagg, do a double lookup */
57#define RS_IS_INTF 0x0002 /* It's a plain interface */
58#define RS_NO_PRE 0x0004 /* The interface has set rates */
59#define RS_INT_TBL 0x0010 /*
 * The table is the internal version
 * which has special setup requirements.
 */
63#define RS_IS_DEAD 0x0020 /* The RS is on the dead list */
64#define RS_FUNERAL_SCHD 0x0040 /* Is an epoch call scheduled to bury this guy? */
65#define RS_INTF_NO_SUP 0x0100 /* The interface does not support rate limiting */
66
/*
 * A set of hardware pacing rates.  rs_rlt is the array of individual
 * rates; each entry's ptbl member points back at its tcp_rate_set.
 * Sets are chained together through 'next' (see head_tcp_rate_set).
 *
 * NOTE(review): the per-field notes marked "presumably" are inferred
 * from names only; confirm against the implementation file.
 */
67struct tcp_rate_set {
68 struct sysctl_ctx_list sysctl_ctx; /* presumably the sysctl context for this set's nodes */
69 CK_LIST_ENTRY(tcp_rate_set) next; /* Linkage on the list of rate sets */
70 struct ifnet *rs_ifp; /* presumably the interface this set belongs to */
71 struct tcp_hwrate_limit_table *rs_rlt; /* Array of rates, indexed by rs_highest_valid et al. */
72 uint64_t rs_flows_using; /* presumably the number of flows using this set */
73 uint64_t rs_flow_limit; /* presumably the maximum number of flows allowed */
74 uint32_t rs_if_dunit; /* presumably the interface unit number */
75 int rs_rate_cnt; /* presumably the number of entries in rs_rlt */
76 int rs_min_seg;
77 int rs_highest_valid; /* Index in rs_rlt read by tcp_hw_highest_rate() */
78 int rs_lowest_valid; /* presumably the index of the lowest valid rate */
79 int rs_disable;
80 int rs_flags; /* presumably the RS_* rateset flags */
81 struct epoch_context rs_epoch_ctx; /* presumably used for the deferred epoch call (see RS_FUNERAL_SCHD) */
82};
83
/* Declares struct head_tcp_rate_set, the CK_LIST head type for struct tcp_rate_set entries. */
84CK_LIST_HEAD(head_tcp_rate_set, tcp_rate_set);
85
86/* Request flags */
87#define RS_PACING_EXACT_MATCH 0x0001 /* Need an exact match for rate */
88#define RS_PACING_GT 0x0002 /* Greater than requested */
89#define RS_PACING_GEQ 0x0004 /* Greater than or equal to */
90#define RS_PACING_LT 0x0008 /* Less than requested rate */
91#define RS_PACING_SUB_OK 0x0010 /* If a rate can't be found, get the
 * next best rate (highest or lowest). */
93#ifdef _KERNEL
/* Fallback value, used only when nothing earlier has defined ETHERNET_SEGMENT_SIZE. */
94#ifndef ETHERNET_SEGMENT_SIZE
95#define ETHERNET_SEGMENT_SIZE 1514
96#endif
97#ifdef RATELIMIT
98#define DETAILED_RATELIMIT_SYSCTL 1 /*
 * Undefine this if you don't want
 * detailed rates to appear in
 * net.inet.tcp.rl.
 * With the definition, each rate
 * shows up in your sysctl tree;
 * this can be big.
 */
106uint64_t inline
107tcp_hw_highest_rate(const struct tcp_hwrate_limit_table *rle)
108{
109 return (rle->ptbl->rs_rlt[rle->ptbl->rs_highest_valid].rate);
110}
111
/*
 * Interface-based counterpart of tcp_hw_highest_rate(); implemented
 * outside this header.
 * NOTE(review): presumably returns the highest hardware pacing rate
 * available on ifp for the connection inp - confirm against the
 * implementation.  The non-RATELIMIT stub in this header returns 0.
 */
112uint64_t
113tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp);
114
/*
 * Obtain a hardware rate entry for tp on ifp; implemented outside this
 * header.
 * NOTE(review): presumably flags takes the RS_PACING_* request flags
 * and bytes_per_sec is the desired rate; the use of lower_rate is not
 * visible here - confirm against the implementation.
 * The non-RATELIMIT stub in this header returns NULL and stores
 * EOPNOTSUPP in *error when error is non-NULL.
 */
115const struct tcp_hwrate_limit_table *
116tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
117 uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate);
118
/*
 * Variant of tcp_set_pacing_rate() that also takes an existing rate
 * entry, crte; implemented outside this header.
 * NOTE(review): presumably switches tp from crte to a rate matching
 * bytes_per_sec - confirm against the implementation.
 * The non-RATELIMIT stub in this header returns NULL and stores
 * EOPNOTSUPP in *error when error is non-NULL.
 */
119const struct tcp_hwrate_limit_table *
120tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
121 struct tcpcb *tp, struct ifnet *ifp,
122 uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate);
/*
 * Implemented outside this header.
 * NOTE(review): presumably releases tp's use of the rate entry crte -
 * confirm against the implementation.  The non-RATELIMIT stub in this
 * header does nothing.
 */
123void
124tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte,
125 struct tcpcb *tp);
126
/*
 * Burst-size calculation with a caller-chosen divisor; the RATELIMIT
 * version is implemented outside this header.
 * NOTE(review): how te and err are used by that version is not visible
 * here - confirm against the implementation.  The non-RATELIMIT stub
 * in this header ignores tp, te and err and returns bw / divisor,
 * capped at 0xffff and rounded up to whole segments of segsiz bytes.
 */
127uint32_t
128tcp_get_pacing_burst_size_w_divisor(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
129 const struct tcp_hwrate_limit_table *te, int *err, int divisor);
130
/*
 * Implemented outside this header.
 * NOTE(review): presumably records an ENOBUFS event against the rate
 * entry rte - confirm against the implementation.  The non-RATELIMIT
 * stub in this header does nothing.
 */
131void
132tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte);
133
134#else
/*
 * RATELIMIT is not compiled in, so no hardware rate can be handed out.
 * Stores EOPNOTSUPP through error when the caller supplied it and
 * always returns NULL.  Every other argument is ignored.
 */
static inline const struct tcp_hwrate_limit_table *
tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
{
	if (error != NULL) {
		*error = EOPNOTSUPP;
	}
	return (NULL);
}
143
/*
 * RATELIMIT is not compiled in, so there is no rate to change to.
 * Stores EOPNOTSUPP through error when the caller supplied it and
 * always returns NULL.  Every other argument is ignored.
 */
static inline const struct tcp_hwrate_limit_table *
tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
    struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
{
	if (error != NULL) {
		*error = EOPNOTSUPP;
	}
	return (NULL);
}
153
/*
 * RATELIMIT is not compiled in: there is never a rate to release, so
 * this is a no-op and both arguments are ignored.
 */
static inline void
tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte,
    struct tcpcb *tp)
{
	(void)crte;
	(void)tp;
}
160
/*
 * RATELIMIT is not compiled in: there are no hardware rates, so the
 * highest rate is reported as 0.  rle is ignored.
 */
static inline uint64_t
tcp_hw_highest_rate(const struct tcp_hwrate_limit_table *rle)
{
	(void)rle;
	return (0);
}
166
/*
 * RATELIMIT is not compiled in: no interface offers hardware rates,
 * so the highest rate is reported as 0.  Both arguments are ignored.
 */
static inline uint64_t
tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp)
{
	(void)ifp;
	(void)inp;
	return (0);
}
172
173static inline uint32_t
174tcp_get_pacing_burst_size_w_divisor(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
175 const struct tcp_hwrate_limit_table *te, int *err, int divisor)
176{
177 /*
178 * We use the google formula to calculate the
179 * TSO size. I.E.
180 * bw < 24Meg
181 * tso = 2mss
182 * else
183 * tso = min(bw/(div=1000), 64k)
184 *
185 * Note for these calculations we ignore the
186 * packet overhead (enet hdr, ip hdr and tcp hdr).
187 * We only get the google formula when we have
188 * divisor = 1000, which is the default for now.
189 */
190 uint64_t bytes;
191 uint32_t new_tso, min_tso_segs;
192
193 /* It can't be zero */
194 if ((divisor == 0) ||
195 (divisor < RL_MIN_DIVISOR)) {
196 bytes = bw / RL_DEFAULT_DIVISOR;
197 } else
198 bytes = bw / divisor;
199 /* We can't ever send more than 65k in a TSO */
200 if (bytes > 0xffff) {
201 bytes = 0xffff;
202 }
203 /* Round up */
204 new_tso = (bytes + segsiz - 1) / segsiz;
205 if (can_use_1mss)
206 min_tso_segs = 1;
207 else
208 min_tso_segs = 2;
209 if (new_tso < min_tso_segs)
210 new_tso = min_tso_segs;
211 new_tso *= segsiz;
212 return (new_tso);
213}
214
/*
 * RATELIMIT is not compiled in: nothing is recorded, so this is a
 * no-op and rte is ignored.
 */
static inline void
tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
{
	(void)rte;
}
220
221#endif
222
/*
 * Return the ideal number of bytes to burst out at once for a given
 * bandwidth (bw) and segment size (segsiz), optionally taking a
 * hardware rate limit entry (te) into account.
 *
 * can_use_1mss says whether the transport tolerates a floor of one
 * segment; if not, the result bottoms out at two segments (think
 * delayed ack).
 *
 * This is tcp_get_pacing_burst_size_w_divisor() called with a divisor
 * of 0; the non-RATELIMIT version in this header treats 0 as
 * RL_DEFAULT_DIVISOR.
 */
static inline uint32_t
tcp_get_pacing_burst_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
    const struct tcp_hwrate_limit_table *te, int *err)
{
	uint32_t burst;

	burst = tcp_get_pacing_burst_size_w_divisor(tp, bw, segsiz,
	    can_use_1mss, te, err, 0);
	return (burst);
}
240
241#endif
242#endif