1 /* $OpenBSD: rf_raid5.c,v 1.4 2002/12/16 07:01:04 tdeval Exp $ */
2 /* $NetBSD: rf_raid5.c,v 1.4 2000/01/08 22:57:30 oster Exp $ */
3
4 /*
5 * Copyright (c) 1995 Carnegie-Mellon University.
6 * All rights reserved.
7 *
8 * Author: Mark Holland
9 *
10 * Permission to use, copy, modify and distribute this software and
11 * its documentation is hereby granted, provided that both the copyright
12 * notice and this permission notice appear in all copies of the
13 * software, derivative works or modified versions, and any portions
14 * thereof, and that both notices appear in supporting documentation.
15 *
16 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19 *
20 * Carnegie Mellon requests users of this software to return to
21 *
22 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
23 * School of Computer Science
24 * Carnegie Mellon University
25 * Pittsburgh PA 15213-3890
26 *
27 * any improvements or extensions that they make and grant Carnegie the
28 * rights to redistribute these changes.
29 */
30
31 /*****************************************************************************
32 *
33 * rf_raid5.c -- Implements RAID Level 5.
34 *
35 *****************************************************************************/
36
37 #include "rf_types.h"
38 #include "rf_raid.h"
39 #include "rf_raid5.h"
40 #include "rf_dag.h"
41 #include "rf_dagffrd.h"
42 #include "rf_dagffwr.h"
43 #include "rf_dagdegrd.h"
44 #include "rf_dagdegwr.h"
45 #include "rf_dagutils.h"
46 #include "rf_general.h"
47 #include "rf_map.h"
48 #include "rf_utils.h"
49
50 typedef struct RF_Raid5ConfigInfo_s {
51 RF_RowCol_t **stripeIdentifier; /*
52 * Filled in at config time and used
53 * by IdentifyStripe.
54 */
55 } RF_Raid5ConfigInfo_t;
56
57
58 int
59 rf_ConfigureRAID5(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
60 RF_Config_t *cfgPtr)
61 {
62 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
63 RF_Raid5ConfigInfo_t *info;
64 RF_RowCol_t i, j, startdisk;
65
66 /* Create a RAID level 5 configuration structure. */
67 RF_MallocAndAdd(info, sizeof(RF_Raid5ConfigInfo_t),
68 (RF_Raid5ConfigInfo_t *), raidPtr->cleanupList);
69 if (info == NULL)
70 return (ENOMEM);
71 layoutPtr->layoutSpecificInfo = (void *) info;
72
73 RF_ASSERT(raidPtr->numRow == 1);
74
75 /*
76 * The stripe identifier must identify the disks in each stripe, IN
77 * THE ORDER THAT THEY APPEAR IN THE STRIPE.
78 */
79 info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol,
80 raidPtr->numCol, raidPtr->cleanupList);
81 if (info->stripeIdentifier == NULL)
82 return (ENOMEM);
83 startdisk = 0;
84 for (i = 0; i < raidPtr->numCol; i++) {
85 for (j = 0; j < raidPtr->numCol; j++) {
86 info->stripeIdentifier[i][j] = (startdisk + j) %
87 raidPtr->numCol;
88 }
89 if ((--startdisk) < 0)
90 startdisk = raidPtr->numCol - 1;
91 }
92
93 /* Fill in the remaining layout parameters. */
94 layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
95 layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit <<
96 raidPtr->logBytesPerSector;
97 layoutPtr->numDataCol = raidPtr->numCol - 1;
98 layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol *
99 layoutPtr->sectorsPerStripeUnit;
100 layoutPtr->numParityCol = 1;
101 layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
102
103 raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk *
104 layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
105
106 return (0);
107 }
108
109 int
110 rf_GetDefaultNumFloatingReconBuffersRAID5(RF_Raid_t *raidPtr)
111 {
112 return (20);
113 }
114
115 RF_HeadSepLimit_t
116 rf_GetDefaultHeadSepLimitRAID5(RF_Raid_t *raidPtr)
117 {
118 return (10);
119 }
120
121 #if !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(_KERNEL)
122 /* Not currently used. */
123 int
124 rf_ShutdownRAID5(RF_Raid_t *raidPtr)
125 {
126 return (0);
127 }
128 #endif
129
130 void
131 rf_MapSectorRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
132 RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap)
133 {
134 RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
135 *row = 0;
136 *col = (SUID % raidPtr->numCol);
137 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
138 raidPtr->Layout.sectorsPerStripeUnit +
139 (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
140 }
141
142 void
143 rf_MapParityRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
144 RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap)
145 {
146 RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
147
148 *row = 0;
149 *col = raidPtr->Layout.numDataCol -
150 (SUID / raidPtr->Layout.numDataCol) % raidPtr->numCol;
151 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
152 raidPtr->Layout.sectorsPerStripeUnit +
153 (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
154 }
155
156 void
157 rf_IdentifyStripeRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
158 RF_RowCol_t **diskids, RF_RowCol_t *outRow)
159 {
160 RF_StripeNum_t stripeID =
161 rf_RaidAddressToStripeID(&raidPtr->Layout, addr);
162 RF_Raid5ConfigInfo_t *info =
163 (RF_Raid5ConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
164
165 *outRow = 0;
166 *diskids = info->stripeIdentifier[stripeID % raidPtr->numCol];
167 }
168
169 void
170 rf_MapSIDToPSIDRAID5(RF_RaidLayout_t *layoutPtr, RF_StripeNum_t stripeID,
171 RF_StripeNum_t *psID, RF_ReconUnitNum_t *which_ru)
172 {
173 *which_ru = 0;
174 *psID = stripeID;
175 }
176
177
178 /*
179 * Select an algorithm for performing an access. Returns two pointers,
180 * one to a function that will return information about the DAG, and
181 * another to a function that will create the dag.
182 */
183 void
184 rf_RaidFiveDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type,
185 RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc)
186 {
187 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
188 RF_PhysDiskAddr_t *failedPDA = NULL;
189 RF_RowCol_t frow, fcol;
190 RF_RowStatus_t rstat;
191 int prior_recon;
192
193 RF_ASSERT(RF_IO_IS_R_OR_W(type));
194
195 if (asmap->numDataFailed + asmap->numParityFailed > 1) {
196 RF_ERRORMSG("Multiple disks failed in a single group !"
197 " Aborting I/O operation.\n");
198 /* *infoFunc = */ *createFunc = NULL;
199 return;
200 } else
201 if (asmap->numDataFailed + asmap->numParityFailed == 1) {
202
203 /*
204 * If under recon & already reconstructed, redirect
205 * the access to the spare drive and eliminate the
206 * failure indication.
207 */
208 failedPDA = asmap->failedPDAs[0];
209 frow = failedPDA->row;
210 fcol = failedPDA->col;
211 rstat = raidPtr->status[failedPDA->row];
212 prior_recon = (rstat == rf_rs_reconfigured) || (
213 (rstat == rf_rs_reconstructing) ?
214 rf_CheckRUReconstructed(raidPtr
215 ->reconControl[frow]->reconMap,
216 failedPDA->startSector) : 0);
217 if (prior_recon) {
218 RF_RowCol_t or = failedPDA->row;
219 RF_RowCol_t oc = failedPDA->col;
220 RF_SectorNum_t oo = failedPDA->startSector;
221
222 if (layoutPtr->map->flags &
223 RF_DISTRIBUTE_SPARE) {
224 /* Redirect to dist spare space. */
225
226 if (failedPDA == asmap->parityInfo) {
227
228 /* Parity has failed. */
229 (layoutPtr->map->MapParity)
230 (raidPtr,
231 failedPDA->raidAddress,
232 &failedPDA->row,
233 &failedPDA->col,
234 &failedPDA->startSector,
235 RF_REMAP);
236
237 if (asmap->parityInfo->next) {
238 /*
239 * Redir 2nd component,
240 * if any.
241 */
242 RF_PhysDiskAddr_t *p =
243 asmap
244 ->parityInfo->next;
245 RF_SectorNum_t SUoffs =
246 p->startSector %
247 layoutPtr->sectorsPerStripeUnit;
248 p->row = failedPDA->row;
249 p->col = failedPDA->col;
250 /*
251 * Cheating:
252 * startSector is not
253 * really a RAID
254 * address.
255 */
256 p->startSector =
257 rf_RaidAddressOfPrevStripeUnitBoundary(
258 layoutPtr, failedPDA->startSector) +
259 SUoffs;
260 }
261 } else
262 if (asmap->parityInfo->next &&
263 failedPDA ==
264 asmap->parityInfo->next) {
265 /*
266 * Should never happen.
267 */
268 RF_ASSERT(0);
269 } else {
270 /* Data has failed. */
271 (layoutPtr->map
272 ->MapSector) (raidPtr,
273 failedPDA->raidAddress,
274 &failedPDA->row,
275 &failedPDA->col,
276 &failedPDA->startSector,
277 RF_REMAP);
278 }
279
280 } else {
281 /* Redirect to dedicated spare space. */
282
283 failedPDA->row =
284 raidPtr->Disks[frow][fcol].spareRow;
285 failedPDA->col =
286 raidPtr->Disks[frow][fcol].spareCol;
287
288 /*
289 * The parity may have two distinct
290 * components, both of which may need
291 * to be redirected.
292 */
293 if (asmap->parityInfo->next) {
294 if (failedPDA ==
295 asmap->parityInfo) {
296 failedPDA->next->row =
297 failedPDA->row;
298 failedPDA->next->col =
299 failedPDA->col;
300 } else {
301 if (failedPDA ==
302 asmap->parityInfo
303 ->next) {
304 /*
305 * Paranoid:
306 * Should never
307 * occur.
308 */
309 asmap
310 ->parityInfo
311 ->row =
312 failedPDA->row;
313 asmap
314 ->parityInfo
315 ->col =
316 failedPDA->col;
317 }
318 }
319 }
320 }
321
322 RF_ASSERT(failedPDA->col != -1);
323
324 if (rf_dagDebug || rf_mapDebug) {
325 printf("raid%d: Redirected type '%c'"
326 " r %d c %d o %ld -> r %d c %d"
327 " o %ld\n", raidPtr->raidid,
328 type, or, oc, (long) oo,
329 failedPDA->row, failedPDA->col,
330 (long) failedPDA->startSector);
331 }
332 asmap->numDataFailed = asmap->numParityFailed
333 = 0;
334 }
335 }
336 /*
337 * All DAGs begin/end with block/unblock node. Therefore, hdrSucc &
338 * termAnt counts should always be 1. Also, these counts should not be
339 * visible outside DAG creation routines - manipulating the counts
340 * here should be removed.
341 */
342 if (type == RF_IO_TYPE_READ) {
343 if (asmap->numDataFailed == 0)
344 *createFunc = (RF_VoidFuncPtr)
345 rf_CreateFaultFreeReadDAG;
346 else
347 *createFunc = (RF_VoidFuncPtr)
348 rf_CreateRaidFiveDegradedReadDAG;
349 } else {
350 /*
351 * If mirroring, always use large writes. If the access
352 * requires two distinct parity updates, always do a small
353 * write. If the stripe contains a failure but the access
354 * does not, do a small write. The first conditional
355 * (numStripeUnitsAccessed <= numDataCol/2) uses a
356 * less-than-or-equal rather than just a less-than because
357 * when G is 3 or 4, numDataCol/2 is 1, and I want
358 * single-stripe-unit updates to use just one disk.
359 */
360 if ((asmap->numDataFailed + asmap->numParityFailed) == 0) {
361 if (rf_suppressLocksAndLargeWrites ||
362 (((asmap->numStripeUnitsAccessed <=
363 (layoutPtr->numDataCol / 2)) &&
364 (layoutPtr->numDataCol != 1)) ||
365 (asmap->parityInfo->next != NULL) ||
366 rf_CheckStripeForFailures(raidPtr, asmap))) {
367 *createFunc = (RF_VoidFuncPtr)
368 rf_CreateSmallWriteDAG;
369 } else
370 *createFunc = (RF_VoidFuncPtr)
371 rf_CreateLargeWriteDAG;
372 } else {
373 if (asmap->numParityFailed == 1)
374 *createFunc = (RF_VoidFuncPtr)
375 rf_CreateNonRedundantWriteDAG;
376 else
377 if (asmap->numStripeUnitsAccessed != 1 &&
378 failedPDA->numSector !=
379 layoutPtr->sectorsPerStripeUnit)
380 *createFunc = NULL;
381 else
382 *createFunc = (RF_VoidFuncPtr)
383 rf_CreateDegradedWriteDAG;
384 }
385 }
386 }