/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

29#ifndef _G_RAID_H_
30#define _G_RAID_H_
31
32#include <sys/param.h>
33#include <sys/kobj.h>
34#include <sys/bio.h>
35#include <sys/time.h>
36#ifdef _KERNEL
37#include <sys/sysctl.h>
38#endif
39
/*
 * Identification constants of the RAID GEOM class: the class name, a magic
 * string and a version number.  Only the values are defined here; the code
 * that consumes them is outside this header.
 */
#define	G_RAID_CLASS_NAME	"RAID"

#define	G_RAID_MAGIC		"GEOM::RAID"

#define	G_RAID_VERSION		0

/* Forward declarations; both structures are defined later in this header. */
struct g_raid_md_object;
struct g_raid_tr_object;

/*
 * 64-bit device flag bits.  G_RAID_DEVICE_FLAG_MASK is the OR of every flag
 * defined here.
 */
#define	G_RAID_DEVICE_FLAG_NOAUTOSYNC	0x0000000000000001ULL
#define	G_RAID_DEVICE_FLAG_NOFAILSYNC	0x0000000000000002ULL
#define	G_RAID_DEVICE_FLAG_MASK		(G_RAID_DEVICE_FLAG_NOAUTOSYNC | \
					 G_RAID_DEVICE_FLAG_NOFAILSYNC)

54#ifdef _KERNEL
/*
 * Module-wide variables and the GEOM class object.  They are only declared
 * here; the definitions live outside this header.
 * NOTE(review): presumably these are the knobs exported under the
 * kern.geom.raid sysctl tree -- confirm against the defining .c file.
 */
extern u_int g_raid_aggressive_spare;
extern u_int g_raid_debug;		/* Passed to _GEOM_DEBUG by G_RAID_DEBUG*. */
extern int g_raid_enable;
extern int g_raid_read_err_thresh;
extern u_int g_raid_start_timeout;
extern struct g_class g_raid_class;

/*
 * Debug logging wrappers around _GEOM_DEBUG.  All of them use the tag
 * "GEOM_RAID", pass g_raid_debug as the second argument and (lvl) as the
 * third.
 *
 * G_RAID_DEBUG		Plain message; NULL is passed in the bio slot.
 * G_RAID_DEBUG1	Same, with "<sc_name>: " prepended to the format,
 *			where sc_name is taken from (sc).
 * G_RAID_LOGREQ	Message with (bp) passed in the bio slot.
 */
#define	G_RAID_DEBUG(lvl, ...)						\
    _GEOM_DEBUG("GEOM_RAID", g_raid_debug, (lvl), NULL, __VA_ARGS__)
#define	G_RAID_DEBUG1(lvl, sc, fmt, ...)				\
    _GEOM_DEBUG("GEOM_RAID", g_raid_debug, (lvl), NULL, "%s: " fmt,	\
	(sc)->sc_name, ## __VA_ARGS__)
#define	G_RAID_LOGREQ(lvl, bp, ...)					\
    _GEOM_DEBUG("GEOM_RAID", g_raid_debug, (lvl), (bp), __VA_ARGS__)

/*
 * Flags we use to distinguish I/O initiated by the TR layer to maintain
 * the volume's characteristics, fix subdisks, extra copies of data, etc.
 *
 * G_RAID_BIO_FLAG_SYNC		I/O to update an extra copy of the data
 *				for RAID volumes that maintain extra data
 *				and need to rebuild that data.
 * G_RAID_BIO_FLAG_REMAP	I/O done to try to provoke a subdisk into
 *				doing some desirable action such as bad
 *				block remapping after we detect a bad part
 *				of the disk.
 * G_RAID_BIO_FLAG_LOCKED	I/O holds range lock that should be released.
 *
 * and the following meta item:
 * G_RAID_BIO_FLAG_SPECIAL	Any of the I/O flags that need to make it
 *				through the range locking which would
 *				otherwise defer the I/O until after that
 *				range is unlocked.
 */
#define	G_RAID_BIO_FLAG_SYNC		0x01
#define	G_RAID_BIO_FLAG_REMAP		0x02
#define	G_RAID_BIO_FLAG_SPECIAL \
		(G_RAID_BIO_FLAG_SYNC|G_RAID_BIO_FLAG_REMAP)
#define	G_RAID_BIO_FLAG_LOCKED		0x80

/*
 * Descriptor of one locked range.  Descriptors are chained through l_next;
 * struct g_raid_volume keeps the list head in v_locks.
 */
struct g_raid_lock {
	off_t			 l_offset;	/* Offset of the range. */
	off_t			 l_length;	/* Length of the range. */
	void			*l_callback_arg; /* Opaque pointer kept with
						    the lock.  NOTE(review):
						    presumably the argp given
						    to g_raid_lock_range() --
						    confirm. */
	int			 l_pending;	/* NOTE(review): looks like a
						   count of something still
						   outstanding on the range --
						   confirm in the .c file. */
	LIST_ENTRY(g_raid_lock)	 l_next;	/* List linkage. */
};

/*
 * Bits stored in g_raid_event.e_flags.  Each one is a distinct bit, so
 * they can be OR-ed together.
 * NOTE(review): VOLUME/SUBDISK/DISK presumably tell what kind of object
 * e_tgt points at -- confirm against g_raid_event_send().
 */
#define	G_RAID_EVENT_WAIT	0x01
#define	G_RAID_EVENT_VOLUME	0x02
#define	G_RAID_EVENT_SUBDISK	0x04
#define	G_RAID_EVENT_DISK	0x08
#define	G_RAID_EVENT_DONE	0x10
/*
 * One queued event.  Events are linked through e_next; struct g_raid_softc
 * keeps the queue head in sc_events.
 */
struct g_raid_event {
	void			*e_tgt;		/* Target object (untyped). */
	int			 e_event;	/* Event code. */
	int			 e_flags;	/* G_RAID_EVENT_* bits. */
	int			 e_error;	/* Error value for the event. */
	TAILQ_ENTRY(g_raid_event) e_next;	/* Queue linkage. */
};
/* Disk states (values of g_raid_disk.d_state). */
#define	G_RAID_DISK_S_NONE		0x00	/* State is unknown. */
#define	G_RAID_DISK_S_OFFLINE		0x01	/* Missing disk placeholder. */
#define	G_RAID_DISK_S_DISABLED		0x02	/* Disabled. */
#define	G_RAID_DISK_S_FAILED		0x03	/* Failed. */
#define	G_RAID_DISK_S_STALE_FAILED	0x04	/* Old failed. */
#define	G_RAID_DISK_S_SPARE		0x05	/* Hot-spare. */
#define	G_RAID_DISK_S_STALE		0x06	/* Old disk, unused now. */
#define	G_RAID_DISK_S_ACTIVE		0x07	/* Operational. */

/* Disk event codes. */
#define	G_RAID_DISK_E_DISCONNECTED	0x01

126struct g_raid_disk {
127 struct g_raid_softc *d_softc; /* Back-pointer to softc. */
128 struct g_consumer *d_consumer; /* GEOM disk consumer. */
129 void *d_md_data; /* Disk's metadata storage. */
130 int d_candelete; /* BIO_DELETE supported. */
131 uint64_t d_flags; /* Additional flags. */
132 u_int d_state; /* Disk state. */
133 u_int d_load; /* Disk average load. */
134 off_t d_last_offset; /* Last head offset. */
135 int d_read_errs; /* Count of the read errors */
136 TAILQ_HEAD(, g_raid_subdisk) d_subdisks; /* List of subdisks. */
137 TAILQ_ENTRY(g_raid_disk) d_next; /* Next disk in the node. */
138 struct g_kerneldump d_kd; /* Kernel dumping method/args. */
139};
140
/* Subdisk states (values of g_raid_subdisk.sd_state). */
#define	G_RAID_SUBDISK_S_NONE		0x00	/* Absent. */
#define	G_RAID_SUBDISK_S_FAILED		0x01	/* Failed. */
#define	G_RAID_SUBDISK_S_NEW		0x02	/* Blank. */
#define	G_RAID_SUBDISK_S_REBUILD	0x03	/* Blank + rebuild. */
#define	G_RAID_SUBDISK_S_UNINITIALIZED	0x04	/* Disk of the new volume. */
#define	G_RAID_SUBDISK_S_STALE		0x05	/* Dirty. */
#define	G_RAID_SUBDISK_S_RESYNC		0x06	/* Dirty + check/repair. */
#define	G_RAID_SUBDISK_S_ACTIVE		0x07	/* Usable. */

/* Subdisk event codes. */
#define	G_RAID_SUBDISK_E_NEW		0x01	/* A new subdisk has arrived */
#define	G_RAID_SUBDISK_E_FAILED		0x02	/* A subdisk failed, but remains in volume */
#define	G_RAID_SUBDISK_E_DISCONNECTED	0x03	/* A subdisk removed from volume. */
#define	G_RAID_SUBDISK_E_FIRST_TR_PRIVATE 0x80	/* translation private events */

/*
 * G_RAID_SUBDISK_POS(sd): d_last_offset of the disk the subdisk lives on,
 * minus the subdisk's own sd_offset; 0 when sd_disk is NULL.
 * G_RAID_SUBDISK_LOAD(sd): d_load of that disk; 0 when sd_disk is NULL.
 * Both macros evaluate (sd) more than once, so pass an expression without
 * side effects.
 */
#define	G_RAID_SUBDISK_POS(sd)						\
    ((sd)->sd_disk ? ((sd)->sd_disk->d_last_offset - (sd)->sd_offset) : 0)
#define	G_RAID_SUBDISK_TRACK_SIZE	(1 * 1024 * 1024)
#define	G_RAID_SUBDISK_LOAD(sd)						\
    ((sd)->sd_disk ? ((sd)->sd_disk->d_load) : 0)
#define	G_RAID_SUBDISK_LOAD_SCALE	256

/*
 * One subdisk: the piece of a disk (sd_offset, sd_size) that belongs to a
 * volume.  struct g_raid_volume embeds an array of these (v_subdisks), and
 * each one is also linked into its disk's d_subdisks list through sd_next.
 */
struct g_raid_subdisk {
	struct g_raid_softc	*sd_softc;	/* Back-pointer to softc. */
	struct g_raid_disk	*sd_disk;	/* Where this subdisk lives. */
	struct g_raid_volume	*sd_volume;	/* Volume, sd is a part of. */
	off_t			 sd_offset;	/* Offset on the disk. */
	off_t			 sd_size;	/* Size on the disk. */
	u_int			 sd_pos;	/* Position in volume. */
	u_int			 sd_state;	/* Subdisk state. */
	off_t			 sd_rebuild_pos; /* Rebuild position. */
	int			 sd_recovery;	/* Count of recovery reqs. */
	TAILQ_ENTRY(g_raid_subdisk)	 sd_next; /* Next subdisk on disk. */
};

/* Size of the v_subdisks[] array and of the v_name[] buffer of a volume. */
#define	G_RAID_MAX_SUBDISKS	16
#define	G_RAID_MAX_VOLUMENAME	32

/* Volume states (values of g_raid_volume.v_state). */
#define	G_RAID_VOLUME_S_STARTING	0x00
#define	G_RAID_VOLUME_S_BROKEN		0x01
#define	G_RAID_VOLUME_S_DEGRADED	0x02
#define	G_RAID_VOLUME_S_SUBOPTIMAL	0x03
#define	G_RAID_VOLUME_S_OPTIMAL		0x04
#define	G_RAID_VOLUME_S_UNSUPPORTED	0x05
#define	G_RAID_VOLUME_S_STOPPED		0x06

/*
 * True when state (s) is DEGRADED, SUBOPTIMAL or OPTIMAL.  (s) is evaluated
 * up to three times, so it must have no side effects.
 */
#define	G_RAID_VOLUME_S_ALIVE(s)			\
    ((s) == G_RAID_VOLUME_S_DEGRADED ||			\
     (s) == G_RAID_VOLUME_S_SUBOPTIMAL ||		\
     (s) == G_RAID_VOLUME_S_OPTIMAL)

/* Volume event codes. */
#define	G_RAID_VOLUME_E_DOWN		0x00
#define	G_RAID_VOLUME_E_UP		0x01
#define	G_RAID_VOLUME_E_START		0x10
#define	G_RAID_VOLUME_E_STARTMD		0x11

/* RAID level codes (see g_raid_volume.v_raid_level). */
#define	G_RAID_VOLUME_RL_RAID0		0x00
#define	G_RAID_VOLUME_RL_RAID1		0x01
#define	G_RAID_VOLUME_RL_RAID3		0x03
#define	G_RAID_VOLUME_RL_RAID4		0x04
#define	G_RAID_VOLUME_RL_RAID5		0x05
#define	G_RAID_VOLUME_RL_RAID6		0x06
#define	G_RAID_VOLUME_RL_RAIDMDF	0x07
#define	G_RAID_VOLUME_RL_RAID1E		0x11
#define	G_RAID_VOLUME_RL_SINGLE		0x0f
#define	G_RAID_VOLUME_RL_CONCAT		0x1f
#define	G_RAID_VOLUME_RL_RAID5E		0x15
#define	G_RAID_VOLUME_RL_RAID5EE	0x25
#define	G_RAID_VOLUME_RL_RAID5R		0x35
#define	G_RAID_VOLUME_RL_UNKNOWN	0xff

/*
 * RAID level qualifiers (see g_raid_volume.v_raid_level_qualifier).  The
 * same numeric values are reused by different levels, so a qualifier is
 * only meaningful together with the level it belongs to.
 */
#define	G_RAID_VOLUME_RLQ_NONE		0x00
#define	G_RAID_VOLUME_RLQ_R1SM		0x00
#define	G_RAID_VOLUME_RLQ_R1MM		0x01
#define	G_RAID_VOLUME_RLQ_R3P0		0x00
#define	G_RAID_VOLUME_RLQ_R3PN		0x01
#define	G_RAID_VOLUME_RLQ_R4P0		0x00
#define	G_RAID_VOLUME_RLQ_R4PN		0x01
#define	G_RAID_VOLUME_RLQ_R5RA		0x00
#define	G_RAID_VOLUME_RLQ_R5RS		0x01
#define	G_RAID_VOLUME_RLQ_R5LA		0x02
#define	G_RAID_VOLUME_RLQ_R5LS		0x03
#define	G_RAID_VOLUME_RLQ_R6RA		0x00
#define	G_RAID_VOLUME_RLQ_R6RS		0x01
#define	G_RAID_VOLUME_RLQ_R6LA		0x02
#define	G_RAID_VOLUME_RLQ_R6LS		0x03
#define	G_RAID_VOLUME_RLQ_RMDFRA	0x00
#define	G_RAID_VOLUME_RLQ_RMDFRS	0x01
#define	G_RAID_VOLUME_RLQ_RMDFLA	0x02
#define	G_RAID_VOLUME_RLQ_RMDFLS	0x03
#define	G_RAID_VOLUME_RLQ_R1EA		0x00
#define	G_RAID_VOLUME_RLQ_R1EO		0x01
#define	G_RAID_VOLUME_RLQ_R5ERA		0x00
#define	G_RAID_VOLUME_RLQ_R5ERS		0x01
#define	G_RAID_VOLUME_RLQ_R5ELA		0x02
#define	G_RAID_VOLUME_RLQ_R5ELS		0x03
#define	G_RAID_VOLUME_RLQ_R5EERA	0x00
#define	G_RAID_VOLUME_RLQ_R5EERS	0x01
#define	G_RAID_VOLUME_RLQ_R5EELA	0x02
#define	G_RAID_VOLUME_RLQ_R5EELS	0x03
#define	G_RAID_VOLUME_RLQ_R5RRA		0x00
#define	G_RAID_VOLUME_RLQ_R5RRS		0x01
#define	G_RAID_VOLUME_RLQ_R5RLA		0x02
#define	G_RAID_VOLUME_RLQ_R5RLS		0x03
#define	G_RAID_VOLUME_RLQ_UNKNOWN	0xff

246struct g_raid_volume;
247
248struct g_raid_volume {
249 struct g_raid_softc *v_softc; /* Back-pointer to softc. */
250 struct g_provider *v_provider; /* GEOM provider. */
251 struct g_raid_subdisk v_subdisks[G_RAID_MAX_SUBDISKS];
252 /* Subdisks of this volume. */
253 void *v_md_data; /* Volume's metadata storage. */
254 struct g_raid_tr_object *v_tr; /* Transformation object. */
255 char v_name[G_RAID_MAX_VOLUMENAME];
256 /* Volume name. */
257 u_int v_state; /* Volume state. */
258 u_int v_raid_level; /* Array RAID level. */
259 u_int v_raid_level_qualifier; /* RAID level det. */
260 u_int v_disks_count; /* Number of disks in array. */
261 u_int v_mdf_pdisks; /* Number of parity disks
262 in RAIDMDF array. */
263 uint16_t v_mdf_polynomial; /* Polynomial for RAIDMDF. */
264 uint8_t v_mdf_method; /* Generation method for RAIDMDF. */
265 u_int v_strip_size; /* Array strip size. */
266 u_int v_rotate_parity; /* Rotate RAID5R parity
267 after numer of stripes. */
268 u_int v_sectorsize; /* Volume sector size. */
269 off_t v_mediasize; /* Volume media size. */
270 struct bio_queue_head v_inflight; /* In-flight write requests. */
271 struct bio_queue_head v_locked; /* Blocked I/O requests. */
272 LIST_HEAD(, g_raid_lock) v_locks; /* List of locked regions. */
273 int v_pending_lock; /* writes to locked region */
274 int v_dirty; /* Volume is DIRTY. */
275 struct timeval v_last_done; /* Time of the last I/O. */
276 time_t v_last_write; /* Time of the last write. */
277 u_int v_writes; /* Number of active writes. */
278 struct root_hold_token *v_rootmount; /* Root mount delay token. */
279 int v_starting; /* Volume is starting */
280 int v_stopping; /* Volume is stopping */
281 int v_provider_open; /* Number of opens. */
282 int v_global_id; /* Global volume ID (rX). */
283 int v_read_only; /* Volume is read-only. */
284 TAILQ_ENTRY(g_raid_volume) v_next; /* List of volumes entry. */
285 LIST_ENTRY(g_raid_volume) v_global_next; /* Global list entry. */
286};
287
288#define G_RAID_NODE_E_WAKE 0x00
289#define G_RAID_NODE_E_START 0x01
290
291struct g_raid_softc {
292 struct g_raid_md_object *sc_md; /* Metadata object. */
293 struct g_geom *sc_geom; /* GEOM class instance. */
294 uint64_t sc_flags; /* Additional flags. */
295 TAILQ_HEAD(, g_raid_volume) sc_volumes; /* List of volumes. */
296 TAILQ_HEAD(, g_raid_disk) sc_disks; /* List of disks. */
297 struct sx sc_lock; /* Main node lock. */
298 struct proc *sc_worker; /* Worker process. */
299 struct mtx sc_queue_mtx; /* Worker queues lock. */
300 TAILQ_HEAD(, g_raid_event) sc_events; /* Worker events queue. */
301 struct bio_queue_head sc_queue; /* Worker I/O queue. */
302 int sc_stopping; /* Node is stopping */
303};
304#define sc_name sc_geom->name
305
306SYSCTL_DECL(_kern_geom_raid);
307
308/*
309 * KOBJ parent class of metadata processing modules.
310 */
311struct g_raid_md_class {
312 KOBJ_CLASS_FIELDS;
313 int mdc_enable;
314 int mdc_priority;
315 LIST_ENTRY(g_raid_md_class) mdc_list;
316};
317
318/*
319 * KOBJ instance of metadata processing module.
320 */
321struct g_raid_md_object {
322 KOBJ_FIELDS;
323 struct g_raid_md_class *mdo_class;
324 struct g_raid_softc *mdo_softc; /* Back-pointer to softc. */
325};
326
327int g_raid_md_modevent(module_t, int, void *);
328
329#define G_RAID_MD_DECLARE(name, label) \
330 static moduledata_t g_raid_md_##name##_mod = { \
331 "g_raid_md_" __XSTRING(name), \
332 g_raid_md_modevent, \
333 &g_raid_md_##name##_class \
334 }; \
335 DECLARE_MODULE(g_raid_md_##name, g_raid_md_##name##_mod, \
336 SI_SUB_DRIVERS, SI_ORDER_SECOND); \
337 MODULE_DEPEND(g_raid_md_##name, geom_raid, 0, 0, 0); \
338 SYSCTL_NODE(_kern_geom_raid, OID_AUTO, name, \
339 CTLFLAG_RD | CTLFLAG_MPSAFE, \
340 NULL, label " metadata module"); \
341 SYSCTL_INT(_kern_geom_raid_##name, OID_AUTO, enable, \
342 CTLFLAG_RWTUN, &g_raid_md_##name##_class.mdc_enable, 0, \
343 "Enable " label " metadata format taste")
344
345/*
346 * KOBJ parent class of data transformation modules.
347 */
348struct g_raid_tr_class {
349 KOBJ_CLASS_FIELDS;
350 int trc_enable;
351 int trc_priority;
352 int trc_accept_unmapped;
353 LIST_ENTRY(g_raid_tr_class) trc_list;
354};
355
356/*
357 * KOBJ instance of data transformation module.
358 */
359struct g_raid_tr_object {
360 KOBJ_FIELDS;
361 struct g_raid_tr_class *tro_class;
362 struct g_raid_volume *tro_volume; /* Back-pointer to volume. */
363};
364
365int g_raid_tr_modevent(module_t, int, void *);
366
367#define G_RAID_TR_DECLARE(name, label) \
368 static moduledata_t g_raid_tr_##name##_mod = { \
369 "g_raid_tr_" __XSTRING(name), \
370 g_raid_tr_modevent, \
371 &g_raid_tr_##name##_class \
372 }; \
373 DECLARE_MODULE(g_raid_tr_##name, g_raid_tr_##name##_mod, \
374 SI_SUB_DRIVERS, SI_ORDER_FIRST); \
375 MODULE_DEPEND(g_raid_tr_##name, geom_raid, 0, 0, 0); \
376 SYSCTL_NODE(_kern_geom_raid, OID_AUTO, name, \
377 CTLFLAG_RD | CTLFLAG_MPSAFE, \
378 NULL, label " transformation module"); \
379 SYSCTL_INT(_kern_geom_raid_##name, OID_AUTO, enable, \
380 CTLFLAG_RWTUN, &g_raid_tr_##name##_class.trc_enable, 0, \
381 "Enable " label " transformation module taste")
382
/*
 * Conversions between numeric codes and strings: RAID level/qualifier to
 * string and back, and volume, subdisk and disk states to string.  Only
 * the prototypes are given here.
 */
const char *g_raid_volume_level2str(int level, int qual);
int g_raid_volume_str2level(const char *str, int *level, int *qual);
const char *g_raid_volume_state2str(int state);
const char *g_raid_subdisk_state2str(int state);
const char *g_raid_disk_state2str(int state);

/*
 * Creation, start-up and destruction of nodes, volumes and disks.  Only
 * the prototypes are given here; ownership and locking rules have to be
 * taken from the implementation.
 */
struct g_raid_softc *g_raid_create_node(struct g_class *mp,
    const char *name, struct g_raid_md_object *md);
int g_raid_create_node_format(const char *format, struct gctl_req *req,
    struct g_geom **gp);
struct g_raid_volume *g_raid_create_volume(struct g_raid_softc *sc,
    const char *name, int id);
struct g_raid_disk *g_raid_create_disk(struct g_raid_softc *sc);
const char *g_raid_get_diskname(struct g_raid_disk *disk);
void g_raid_get_disk_info(struct g_raid_disk *disk);

int g_raid_start_volume(struct g_raid_volume *vol);

int g_raid_destroy_node(struct g_raid_softc *sc, int worker);
int g_raid_destroy_volume(struct g_raid_volume *vol);
int g_raid_destroy_disk(struct g_raid_disk *disk);

/* I/O completion, dispatch of a bio to a subdisk, and kernel dump to a subdisk. */
void g_raid_iodone(struct bio *bp, int error);
void g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp);
int g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd, void *virtual,
    off_t offset, size_t length);

/* Opening a consumer by provider name for a node, and disposing of one. */
struct g_consumer *g_raid_open_consumer(struct g_raid_softc *sc,
    const char *name);
void g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp);

414void g_raid_report_disk_state(struct g_raid_disk *disk);
415void g_raid_change_disk_state(struct g_raid_disk *disk, int state);
416void g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state);
417void g_raid_change_volume_state(struct g_raid_volume *vol, int state);
418
419void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
420 struct g_raid_subdisk *sd, struct g_raid_disk *disk);
421void g_raid_fail_disk(struct g_raid_softc *sc,
422 struct g_raid_subdisk *sd, struct g_raid_disk *disk);
423
424void g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp);
425int g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
426 void *virtual, vm_offset_t physical, off_t offset, size_t length);
427
/*
 * Counting and lookup helpers.
 * NOTE(review): the "state" arguments presumably select G_RAID_DISK_S_* /
 * G_RAID_SUBDISK_S_* values to match; the meaning of special values (if
 * any) must be confirmed in the implementation.
 */
u_int g_raid_ndisks(struct g_raid_softc *sc, int state);
u_int g_raid_nsubdisks(struct g_raid_volume *vol, int state);
u_int g_raid_nopens(struct g_raid_softc *sc);
struct g_raid_subdisk *g_raid_get_subdisk(struct g_raid_volume *vol,
    int state);
/*
 * Destroy modes.  NOTE(review): presumably the values accepted by the
 * "how" argument of g_raid_destroy() -- confirm in the implementation.
 */
#define	G_RAID_DESTROY_SOFT		0
#define	G_RAID_DESTROY_DELAYED		1
#define	G_RAID_DESTROY_HARD		2
int g_raid_destroy(struct g_raid_softc *sc, int how);
/* Event submission; "flags" takes G_RAID_EVENT_* bits (to be confirmed). */
int g_raid_event_send(void *arg, int event, int flags);
/* Range locking on a volume, expressed as offset and length. */
int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
    struct bio *ignore, void *argp);
int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len);

442g_ctl_req_t g_raid_ctl;
443#endif /* _KERNEL */
444
445#endif /* !_G_RAID_H_ */