/*	$NetBSD: tcp_vtw.h,v 1.10 2022/12/11 08:09:20 mlelstv Exp $	*/
/*
 * Copyright (c) 2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Coyote Point Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Vestigial time-wait.
 *
 * This implementation uses cache-efficient techniques, which will
 * appear somewhat peculiar.  The main philosophy is to optimise the
 * amount of information available within a cache line.  Cache misses
 * are expensive, so we employ ad-hoc techniques to pull a series of
 * linked-list follows into a cache line.  One cache line, multiple
 * linked-list equivalents.
 *
 * One such ad-hoc technique is fat pointers.  Additional degrees of
 * ad-hocness result from having to hand-tune it for pointer size
 * and for cache line size.
 *
 * The 'fat pointer' approach aggregates, for x86_32, 15 linked-list
 * data structures into one cache line.  The additional 32 bits in the
 * cache line are used for linking fat pointers, and for
 * allocation/bookkeeping.
 *
 * The 15 32-bit tags encode the pointers to the linked-list elements,
 * and also encode the results of a search comparison.
 *
 * First, some more assumptions/restrictions.
 *
 * All the fat pointers are from a contiguous allocation arena.  Thus,
 * we can refer to them by offset from a base, not as full pointers.
 *
 * All the linked-list data elements are also from a contiguous
 * allocation arena, again so that we can refer to them by offset from
 * a base.
 *
 * In order to add a data element to a fat pointer, a key value is
 * computed, based on unique data within the data element.  It is the
 * linear searching of these elements' linked lists, based on that
 * unique data, that is being optimised here.
 *
 * Let's call the function that computes the key k(e), where e is the
 * data element.  In this example, k(e) returns 32 bits.
 *
 * Consider a set E (say of order 15) of data elements.  Let K be
 * the set of the k(e) for e in E.
 *
 * Let O be the set of the offsets from the base of the data elements in E.
 *
 * For each x in K and its matching o in O, let t be x ^ o.  These
 * are the tags (more or less).
 *
 * In order to search all the data elements in E, we compute the
 * search key, and one at a time, XOR the key into the tags.  If any
 * result is a valid data element index, we have a possible match.  If
 * not, there is no match.
 *
 * The no-match cases mean we do not have to dereference the pointer
 * to the data element in question.  We save the cache-miss penalty,
 * and the cache load decreases.  Only in the case of a valid-looking
 * data element index do we have to look closer.
 *
 * Thus, in the absence of false positives, 15 data elements can be
 * searched with one cache line fill, as opposed to 15 cache line
 * fills for the usual implementation.
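 *
 * A small worked example (with invented, purely illustrative numbers):
 * suppose element e sits at offset o = 5 in the arena and k(e) =
 * 0x1234, so the stored tag is t = 0x1234 ^ 5 = 0x1231.  A search for
 * key 0x1234 computes t ^ 0x1234 = 5, a plausible element index, and
 * only that one element is dereferenced and compared for real.  A
 * search for a different key, say 0x5678, computes t ^ 0x5678 =
 * 0x4449, which (for a modestly sized arena) cannot be a valid index,
 * so no dereference is needed.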
 *
 * The vestigial time-waits (vtw_t), the data elements in the above, are
 * searched by faddr, fport, laddr, lport.  The key is a function of
 * these values.
 *
 * We hash these keys into the traditional hash chains to reduce the
 * search time, and use fat pointers to reduce the cache impact of
 * searching.
 *
 * The vtw_t are, per requirement, in a contiguous chunk.  Allocation
 * is done with a clock hand, and all vtw_t within one allocation
 * domain have the same lifetime, so they will always be sorted by
 * age.
 *
 * A vtw_t will be allocated, timestamped, and given a fixed future
 * expiration.  It will be added to a hash bucket implemented with fat
 * pointers, which means that a cache line will be allocated in the
 * hash bucket, placed at the head (more recent in time), and the vtw_t
 * will be added to it.  As more entries are added, the fat-pointer
 * cache line will fill, requiring additional cache lines for fat
 * pointers to be allocated.  These will be added at the head, and the
 * aged entries will hang down, tapeworm-like.  As the vtw_t entries
 * expire, the corresponding slot in the fat pointer will be
 * reclaimed, and eventually the cache line will completely empty and
 * be recycled, if not at the head of the chain.
 *
 * At times, a time-wait timer is restarted.  This corresponds to
 * deleting the current entry and re-adding it.
 *
 * Most of the time, though, entries are simply placed here to die.
 */
#ifndef _NETINET_TCP_VTW_H
#define _NETINET_TCP_VTW_H

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet6/in6.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>

#define	VTW_NCLASS	(1+3)		/* # different classes */

/*
 * fat pointers, MI.
 */
struct fatp_mi;

#if CACHE_LINE_SIZE == 128
typedef uint64_t fatp_word_t;
#else
typedef uint32_t fatp_word_t;
#endif

typedef struct fatp_mi	fatp_t;

/* Supported cacheline sizes: 32 64 128 bytes.  See fatp_key(),
 * fatp_slot_from_key(), fatp_xtra[].
 */
#define	FATP_NTAGS	(CACHE_LINE_SIZE / sizeof(fatp_word_t) - 1)
#define	FATP_NXT_WIDTH	(sizeof(fatp_word_t) * NBBY - FATP_NTAGS)

#define	FATP_MAX	(1 << (FATP_NXT_WIDTH < 31 ? FATP_NXT_WIDTH : 31))

/* Worked example: ULP32 with 64-byte cacheline (32-bit x86):
 * 15 tags per cacheline.  At most 2^17 fat pointers per fatp_ctl_t.
 * The comments on the fatp_mi members, below, correspond to the worked
 * example.
 */
struct fatp_mi {
	fatp_word_t	inuse	: FATP_NTAGS;	/* (1+15)*4 == CL_SIZE */
	fatp_word_t	nxt	: FATP_NXT_WIDTH;/* at most 2^17 fat pointers */
	fatp_word_t	tag[FATP_NTAGS];	/* 15 tags per CL */
};
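
/*
 * Sanity of the worked example above: with a 64-byte cache line and a
 * 32-bit fatp_word_t, FATP_NTAGS == 64/4 - 1 == 15 and FATP_NXT_WIDTH
 * == 32 - 15 == 17, so inuse and nxt pack into a single word and the
 * whole structure is (1 + 15) * 4 == 64 bytes: exactly one cache line,
 * with FATP_MAX == 2^17 fat pointers addressable via nxt.  A
 * compile-time check along the lines of
 *
 *	__CTASSERT(sizeof(fatp_t) == CACHE_LINE_SIZE);
 *
 * (assuming __CTASSERT, as provided by <sys/cdefs.h>) would catch a
 * mis-tuned pointer-size/cache-line combination.
 */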

static __inline int
fatp_ntags(void)
{
	return FATP_NTAGS;
}

static __inline int
fatp_full(fatp_t *fp)
{
	fatp_t full;

	full.inuse = (1U << FATP_NTAGS) - 1U;

	return (fp->inuse == full.inuse);
}
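
/*
 * A minimal sketch of the tag-probe idea described at the top of this
 * file.  This is purely illustrative: the real key/slot handling lives
 * in tcp_vtw.c (fatp_key(), fatp_slot_from_key()), and the helper name
 * and the idx_mask validity test below are assumptions made for the
 * example.  XOR-ing the search key back out of a stored tag recovers a
 * candidate element index; only candidates that fit within the arena's
 * index mask are worth dereferencing.
 */
static __inline int
fatp_probe_sketch(const fatp_t *fp, fatp_word_t key, fatp_word_t idx_mask)
{
	int s;

	for (s = 0; s < fatp_ntags(); ++s) {
		fatp_word_t cand;

		if (!(fp->inuse & ((fatp_word_t)1 << s)))
			continue;		/* empty slot */
		cand = fp->tag[s] ^ key;	/* recover candidate index */
		if ((cand & ~idx_mask) == 0)
			return (int)cand;	/* plausible: caller verifies */
	}
	return -1;				/* certain miss: no dereference */
}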

struct vtw_common;
struct vtw_v4;
struct vtw_v6;
struct vtw_ctl;

/*!\brief common to all vtw
 */
typedef struct vtw_common {
	struct timeval	expire;		/* date of birth+msl */
	uint32_t	key;		/* hash key: full hash */
	uint32_t	port_key;	/* hash key: local port hash */
	uint32_t	rcv_nxt;
	uint32_t	rcv_wnd;
	uint32_t	snd_nxt;
	uint32_t	snd_scale	: 8;	/* window scaling for send win */
	uint32_t	msl_class	: 2;	/* TCP MSL class {0,1,2,3} */
	uint32_t	reuse_port	: 1;
	uint32_t	reuse_addr	: 1;
	uint32_t	v6only		: 1;
	uint32_t	hashed		: 1;	/* reachable via FATP */
	uint32_t	uid;
} vtw_t;

/*!\brief vestigial timewait for IPv4
 */
typedef struct vtw_v4 {
	vtw_t		common;		/* must be first */
	uint16_t	lport;
	uint16_t	fport;
	uint32_t	laddr;
	uint32_t	faddr;
} vtw_v4_t;

/*!\brief vestigial timewait for IPv6
 */
typedef struct vtw_v6 {
	vtw_t		common;		/* must be first */
	uint16_t	lport;
	uint16_t	fport;
	struct in6_addr	laddr;
	struct in6_addr	faddr;
} vtw_v6_t;

struct fatp_ctl;
typedef struct vtw_ctl		vtw_ctl_t;
typedef struct fatp_ctl		fatp_ctl_t;

/*
 * The vestigial time waits are kept in a contiguous chunk.
 * Allocation and free pointers run as clock hands thru this array.
 */
struct vtw_ctl {
	fatp_ctl_t	*fat;		/* collection of fatp to use	*/
	vtw_ctl_t	*ctl;		/* <! controller's controller	*/
	union {
		vtw_t		*v;	/* common			*/
		struct vtw_v4	*v4;	/* IPv4 resources		*/
		struct vtw_v6	*v6;	/* IPv6 resources		*/
	}		base,		/* base of vtw_t array		*/
		/**/	lim,		/* extent of vtw_t array	*/
		/**/	alloc,		/* allocation pointer		*/
		/**/	oldest;		/* ^ to oldest			*/
	uint32_t	nfree;		/* # free			*/
	uint32_t	nalloc;		/* # allocated			*/
	uint32_t	idx_mask;	/* mask capturing all index bits*/
	uint32_t	is_v4	: 1;
	uint32_t	is_v6	: 1;
	uint32_t	idx_bits: 6;
	uint32_t	clidx	: 3;	/* <! class index */
};
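
/*
 * A minimal sketch of the clock-hand allocation described in the
 * comment at the top of this file.  The real allocator lives in
 * tcp_vtw.c; the helper name and the exact bookkeeping below are
 * assumptions made for illustration.  Because the hand only ever moves
 * forward (wrapping at the limit) and every entry in a domain has the
 * same lifetime, the arena stays sorted by age and the oldest hand
 * always points at the next entry to expire.
 */
static __inline vtw_t *
vtw_alloc_sketch(vtw_ctl_t *ctl)
{
	vtw_t *v;

	if (ctl->nfree == 0)
		return NULL;			/* arena exhausted */

	v = ctl->alloc.v;			/* entry under the clock hand */

	/* Advance the hand by one element of the appropriate family. */
	if (ctl->is_v4)
		ctl->alloc.v4++;
	else
		ctl->alloc.v6++;

	if (ctl->alloc.v >= ctl->lim.v)		/* wrap at the arena's end */
		ctl->alloc = ctl->base;

	ctl->nfree--;
	ctl->nalloc++;

	return v;		/* caller timestamps, sets expire, hashes */
}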

/*!\brief Collections of fat pointers.
 */
struct fatp_ctl {
	vtw_ctl_t	*vtw;		/* associated VTWs		*/
	fatp_t		*base;		/* base of fatp_t array		*/
	fatp_t		*lim;		/* extent of fatp_t array	*/
	fatp_t		*free;		/* free list			*/
	uint32_t	mask;		/* hash mask			*/
	uint32_t	nfree;		/* # free			*/
	uint32_t	nalloc;		/* # allocated			*/
	fatp_t		**hash;		/* hash anchors			*/
	fatp_t		**port;		/* port hash anchors		*/
};

/*!\brief stats
 */
struct vtw_stats {
	uint64_t	ins;		/* <! inserts */
	uint64_t	del;		/* <! deleted */
	uint64_t	kill;		/* <! assassination */
	uint64_t	look[2];	/* <! lookup: full hash, port hash */
	uint64_t	hit[2];		/* <! lookups that hit */
	uint64_t	miss[2];	/* <! lookups that miss */
	uint64_t	probe[2];	/* <! hits+miss */
	uint64_t	losing[2];	/* <! misses requiring dereference */
	uint64_t	max_chain[2];	/* <! max fatp chain traversed */
	uint64_t	max_probe[2];	/* <! max probes in any one chain */
	uint64_t	max_loss[2];	/* <! max losing probes in any one
					 * chain
					 */
};

typedef struct vtw_stats	vtw_stats_t;

/*!\brief	follow fatp next 'pointer'
 */
static __inline fatp_t *
fatp_next(fatp_ctl_t *fat, fatp_t *fp)
{
	return fp->nxt ? fat->base + fp->nxt - 1 : 0;
}

/*!\brief determine a collection-relative fat pointer index.
 */
static __inline uint32_t
fatp_index(fatp_ctl_t *fat, fatp_t *fp)
{
	return fp ? 1 + (fp - fat->base) : 0;
}
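
/*
 * A minimal sketch of a whole-chain probe, combining the anchor
 * arrays, fatp_next() and the per-fatp tag probe sketched above.  This
 * is illustrative only (the real lookups are vtw_lookup_v4/v6, backed
 * by tcp_vtw.c); the helper name is an assumption.  A non-negative
 * return is merely a candidate index: the caller still dereferences
 * the vtw_t and compares the 4-tuple for real.
 */
static __inline int
fatp_chain_probe_sketch(fatp_ctl_t *fat, uint32_t key, uint32_t idx_mask)
{
	fatp_t *fp;

	for (fp = fat->hash[key & fat->mask]; fp != NULL;
	     fp = fatp_next(fat, fp)) {
		int idx = fatp_probe_sketch(fp, key, idx_mask);

		if (idx >= 0)
			return idx;	/* possible hit: verify the vtw_t */
	}
	return -1;			/* certain miss: nothing dereferenced */
}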

static __inline uint32_t
v4_tag(uint32_t faddr, uint32_t fport, uint32_t laddr, uint32_t lport)
{
	return (ntohl(faddr)   + ntohs(fport)
		+ ntohl(laddr) + ntohs(lport));
}

static __inline uint32_t
v6_tag(const struct in6_addr *faddr, uint16_t fport,
       const struct in6_addr *laddr, uint16_t lport)
{
#ifdef IN6_HASH
	return IN6_HASH(faddr, fport, laddr, lport);
#else
	return 0;
#endif
}

static __inline uint32_t
v4_port_tag(uint16_t lport)
{
	uint32_t tag = lport ^ (lport << 11);

	tag ^= tag << 3;
	tag += tag >> 5;
	tag ^= tag << 4;
	tag += tag >> 17;
	tag ^= tag << 25;
	tag += tag >> 6;

	return tag;
}

static __inline uint32_t
v6_port_tag(uint16_t lport)
{
	return v4_port_tag(lport);
}
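
/*
 * Example (illustrative only) of how these helpers pair with the
 * anchor arrays in fatp_ctl: the full key drives the hash anchors and
 * the port key drives the port anchors, matching vtw_t::key and
 * vtw_t::port_key.  It assumes, for the sketch, that the same mask
 * sizes both anchor arrays.
 *
 *	uint32_t key      = v4_tag(faddr, fport, laddr, lport);
 *	uint32_t port_key = v4_port_tag(lport);
 *
 *	fatp_t *fp  = fat->hash[key & fat->mask];
 *	fatp_t *pfp = fat->port[port_key & fat->mask];
 */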

struct tcpcb;
struct tcphdr;

int  vtw_add(int, struct tcpcb *);
void vtw_del(vtw_ctl_t *, vtw_t *);
int vtw_lookup_v4(const struct ip *ip, const struct tcphdr *th,
		  uint32_t faddr, uint16_t fport,
		  uint32_t laddr, uint16_t lport);
struct ip6_hdr;
struct in6_addr;

int vtw_lookup_v6(const struct ip6_hdr *ip, const struct tcphdr *th,
		  const struct in6_addr *faddr, uint16_t fport,
		  const struct in6_addr *laddr, uint16_t lport);

typedef struct vestigial_inpcb {
	union {
		struct in_addr	v4;
		struct in6_addr	v6;
	} faddr, laddr;
	uint16_t		fport, lport;
	uint32_t		valid		: 1;
	uint32_t		v4		: 1;
	uint32_t		reuse_addr	: 1;
	uint32_t		reuse_port	: 1;
	uint32_t		v6only		: 1;
	uint32_t		more_tbd	: 1;
	uint32_t		uid;
	uint32_t		rcv_nxt;
	uint32_t		rcv_wnd;
	uint32_t		snd_nxt;
	struct vtw_common	*vtw;
	struct vtw_ctl		*ctl;
} vestigial_inpcb_t;

#ifdef _KERNEL
void vtw_restart(vestigial_inpcb_t*);
int vtw_earlyinit(void);
int sysctl_tcp_vtw_enable(SYSCTLFN_PROTO);
#endif /* _KERNEL */

#ifdef VTW_DEBUG
typedef struct sin_either {
	uint8_t		sin_len;
	uint8_t		sin_family;
	uint16_t	sin_port;
	union {
		struct in_addr	v4;
		struct in6_addr	v6;
	}		sin_addr;
} sin_either_t;

int vtw_debug_add(int af, sin_either_t *, sin_either_t *, int, int);

typedef struct vtw_sysargs {
	uint32_t	op;
	sin_either_t	fa;
	sin_either_t	la;
} vtw_sysargs_t;

#endif /* VTW_DEBUG */

#endif /* _NETINET_TCP_VTW_H */