1 /* $OpenBSD: rf_pqdegdags.c,v 1.5 2002/12/16 07:01:04 tdeval Exp $ */
2 /* $NetBSD: rf_pqdegdags.c,v 1.5 1999/08/15 02:36:40 oster Exp $ */
3
4 /*
5 * Copyright (c) 1995 Carnegie-Mellon University.
6 * All rights reserved.
7 *
8 * Author: Daniel Stodolsky
9 *
10 * Permission to use, copy, modify and distribute this software and
11 * its documentation is hereby granted, provided that both the copyright
12 * notice and this permission notice appear in all copies of the
13 * software, derivative works or modified versions, and any portions
14 * thereof, and that both notices appear in supporting documentation.
15 *
16 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19 *
20 * Carnegie Mellon requests users of this software to return to
21 *
22 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
23 * School of Computer Science
24 * Carnegie Mellon University
25 * Pittsburgh PA 15213-3890
26 *
27 * any improvements or extensions that they make and grant Carnegie the
28 * rights to redistribute these changes.
29 */
30
31 /*
32 * rf_pqdegdags.c
33 * Degraded mode dags for double fault cases.
34 */
35
36
37 #include "rf_archs.h"
38
39 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
40
41 #include "rf_types.h"
42 #include "rf_raid.h"
43 #include "rf_dag.h"
44 #include "rf_dagdegrd.h"
45 #include "rf_dagdegwr.h"
46 #include "rf_dagfuncs.h"
47 #include "rf_dagutils.h"
48 #include "rf_etimer.h"
49 #include "rf_acctrace.h"
50 #include "rf_general.h"
51 #include "rf_pqdegdags.h"
52 #include "rf_pq.h"
53
54 void rf_applyPDA(RF_Raid_t *, RF_PhysDiskAddr_t *, RF_PhysDiskAddr_t *,
55 RF_PhysDiskAddr_t *, void *);
56
57 /*
58 * Two data drives have failed, and we are doing a read that covers one of them.
59 * We may also be reading some of the surviving drives.
60 */
61
62
63 /*****************************************************************************
64 *
65 * Creates a DAG to perform a degraded-mode read of data within one stripe.
66 * This DAG is as follows:
67 *
68 * Hdr
69 * |
70 * Block
71 * / / \ \ \ \
72 * Rud ... Rud Rrd ... Rrd Rp Rq
73 * | \ | \ | \ | \ | \ | \
74 *
75 * | |
76 * Unblock X
77 * \ /
78 * ------ T ------
79 *
 * Each R node is a successor of the Block node.
 * One successor arc from each R node goes to Unblock, and the other to X.
 * There is one Rud for each chunk of surviving user data requested by the
 * user, and one Rrd for each chunk of surviving user data _not_ being read
 * by the user.
 * R = read, ud = user data, rd = recovery (surviving) data, p = P data,
 * q = Q data, X = pq recovery node, T = terminate
87 *
88 * The block & unblock nodes are leftovers from a previous version. They
89 * do nothing, but I haven't deleted them because it would be a tremendous
90 * effort to put them back in.
91 *
 * Note: The target buffer for the XOR node is set to the actual user buffer
 * where the failed data is supposed to end up. This buffer is zero'd by the
 * code here. Thus, if you create a degraded read dag, use it, and then
 * re-use it, you have to be sure to zero the target buffer prior to the
 * re-use.
96 *
97 * Every buffer read is passed to the pq recovery node, whose job it is to
98 * sort out what's needed and what's not.
99 *****************************************************************************/
100
/*
 * Initialise a disk-read node with two successors (the unblock node and the
 * recovery node) and one control predecessor (the block node).
 *
 * The expansion refers to dag_h, allocList, unblockNode, recoveryNode and
 * blockNode, which must be in scope where the macro is used.  _n_ is
 * evaluated several times, so pass an expression without side effects.
 */
#define	INIT_DISK_NODE(_n_, _label_)					\
do {									\
	rf_InitNode((_n_), rf_wait, RF_FALSE, rf_DiskReadFunc,		\
	    rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2, 1, 4, 0,	\
	    dag_h, (_label_), allocList);				\
	(_n_)->succedents[0] = unblockNode;				\
	(_n_)->succedents[1] = recoveryNode;				\
	(_n_)->antecedents[0] = blockNode;				\
	(_n_)->antType[0] = rf_control;					\
} while (0)

/*
 * Fill in the four parameters of a disk node from a physical disk address:
 * the pda itself, its buffer, the parity stripe ID and the priority /
 * reconstruction-unit word.
 *
 * Note that _nd_ is a node object (accessed with '.'), not a pointer, and
 * that parityStripeID and which_ru are taken from the enclosing scope.
 * Both arguments are evaluated more than once.
 */
#define	DISK_NODE_PARAMS(_nd_, _pda_)					\
do {									\
	(_nd_).params[0].p = (_pda_);					\
	(_nd_).params[1].p = (_pda_)->bufPtr;				\
	(_nd_).params[2].v = parityStripeID;				\
	(_nd_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,	\
	    0, 0, which_ru);						\
} while (0)

/* The pda stored in parameter 0 of a disk node (node given by pointer). */
#define	DISK_NODE_PDA(_n_)	((_n_)->params[0].p)
123
124 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
125 {
126 rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
127 "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
128 }
129
130 void
131 rf_applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda,
132 RF_PhysDiskAddr_t *ppda, RF_PhysDiskAddr_t *qpda, void *bp)
133 {
134 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
135 RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
136 RF_SectorCount_t s0len = ppda->numSector, len;
137 RF_SectorNum_t suoffset;
138 unsigned coeff;
139 char *pbuf = ppda->bufPtr;
140 char *qbuf = qpda->bufPtr;
141 char *buf;
142 int delta;
143
144 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
145 len = pda->numSector;
146 /* See if pda intersects a recovery pda. */
147 if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
148 buf = pda->bufPtr;
149 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
150 pda->raidAddress);
151 coeff = (coeff % raidPtr->Layout.numDataCol);
152
153 if (suoffset < s0off) {
154 delta = s0off - suoffset;
155 buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
156 delta);
157 suoffset = s0off;
158 len -= delta;
159 }
160 if (suoffset > s0off) {
161 delta = suoffset - s0off;
162 pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
163 delta);
164 qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
165 delta);
166 }
167 if ((suoffset + len) > (s0len + s0off))
168 len = s0len + s0off - suoffset;
169
170 /* Src, dest, len. */
171 rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);
172
173 /* Dest, src, len, coeff. */
174 rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf,
175 rf_RaidAddressToByte(raidPtr, len), coeff);
176 }
177 }
178
179
180 /*
181 * Recover data in the case of a double failure. There can be two
182 * result buffers, one for each chunk of data trying to be recovered.
183 * The params are pda's that have not been range restricted or otherwise
184 * politely massaged - this should be done here. The last params are the
185 * pdas of P and Q, followed by the raidPtr. The list can look like
186 *
187 * pda, pda, ..., p pda, q pda, raidptr, asm
188 *
189 * or
190 *
191 * pda, pda, ..., p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
192 *
193 * depending on whether two chunks of recovery data were required.
194 *
 * The second condition only arises if there are two failed buffers
 * whose lengths do not add up to a stripe unit.
197 */
198
199 int
200 rf_PQDoubleRecoveryFunc(RF_DagNode_t *node)
201 {
202 int np = node->numParams;
203 RF_AccessStripeMap_t *asmap =
204 (RF_AccessStripeMap_t *) node->params[np - 1].p;
205 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
206 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
207 int d, i;
208 unsigned coeff;
209 RF_RaidAddr_t sosAddr, suoffset;
210 RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
211 int two = 0;
212 RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
213 char *buf;
214 int numDataCol = layoutPtr->numDataCol;
215 RF_Etimer_t timer;
216 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
217
218 RF_ETIMER_START(timer);
219
220 if (asmap->failedPDAs[1] &&
221 (asmap->failedPDAs[1]->numSector +
222 asmap->failedPDAs[0]->numSector < secPerSU)) {
223 RF_ASSERT(0);
224 ppda = node->params[np - 6].p;
225 ppda2 = node->params[np - 5].p;
226 qpda = node->params[np - 4].p;
227 qpda2 = node->params[np - 3].p;
228 d = (np - 6);
229 two = 1;
230 } else {
231 ppda = node->params[np - 4].p;
232 qpda = node->params[np - 3].p;
233 d = (np - 4);
234 }
235
236 for (i = 0; i < d; i++) {
237 pda = node->params[i].p;
238 buf = pda->bufPtr;
239 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
240 len = pda->numSector;
241 coeff = rf_RaidAddressToStripeUnitID(layoutPtr,
242 pda->raidAddress);
243 /* Compute the data unit offset within the column. */
244 coeff = (coeff % raidPtr->Layout.numDataCol);
245 /* See if pda intersects a recovery pda. */
246 rf_applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
247 if (two)
248 rf_applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
249 }
250
251 /*
252 * Ok, we got the parity back to the point where we can recover. We
253 * now need to determine the coeff of the columns that need to be
254 * recovered. We can also only need to recover a single stripe unit.
255 */
256
257 if (asmap->failedPDAs[1] == NULL) { /*
258 * Only a single stripe unit
259 * to recover.
260 */
261 pda = asmap->failedPDAs[0];
262 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
263 asmap->raidAddress);
264 /* Need to determine the column of the other failed disk. */
265 coeff = rf_RaidAddressToStripeUnitID(layoutPtr,
266 pda->raidAddress);
267 /* Compute the data unit offset within the column. */
268 coeff = (coeff % raidPtr->Layout.numDataCol);
269 for (i = 0; i < numDataCol; i++) {
270 npda.raidAddress = sosAddr + (i * secPerSU);
271 (raidPtr->Layout.map->MapSector) (raidPtr,
272 npda.raidAddress, &(npda.row), &(npda.col),
273 &(npda.startSector), 0);
274 /* Skip over dead disks. */
275 if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col]
276 .status))
277 if (i != coeff)
278 break;
279 }
280 RF_ASSERT(i < numDataCol);
281 RF_ASSERT(two == 0);
282 /*
283 * Recover the data. Since we need only to recover one
284 * column, we overwrite the parity with the other one.
285 */
286 if (coeff < i) /* Recovering 'a'. */
287 rf_PQ_recover((unsigned long *) ppda->bufPtr,
288 (unsigned long *) qpda->bufPtr,
289 (unsigned long *) pda->bufPtr,
290 (unsigned long *) ppda->bufPtr,
291 rf_RaidAddressToByte(raidPtr, pda->numSector),
292 coeff, i);
293 else /* Recovering 'b'. */
294 rf_PQ_recover((unsigned long *) ppda->bufPtr,
295 (unsigned long *) qpda->bufPtr,
296 (unsigned long *) ppda->bufPtr,
297 (unsigned long *) pda->bufPtr,
298 rf_RaidAddressToByte(raidPtr, pda->numSector),
299 i, coeff);
300 } else
301 RF_PANIC();
302
303 RF_ETIMER_STOP(timer);
304 RF_ETIMER_EVAL(timer);
305 if (tracerec)
306 tracerec->q_us += RF_ETIMER_VAL_US(timer);
307 rf_GenericWakeupFunc(node, 0);
308 return (0);
309 }
310
311 int
312 rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node)
313 {
314 /*
315 * The situation:
316 *
317 * We are doing a write that hits only one failed data unit. The other
318 * failed data unit is not being overwritten, so we need to generate
319 * it.
320 *
321 * For the moment, we assume all the nonfailed data being written is in
322 * the shadow of the failed data unit. (i.e., either a single data
323 * unit write or the entire failed stripe unit is being overwritten.)
324 *
325 * Recovery strategy: apply the recovery data to the parity and Q.
326 * Use P & Q to recover the second failed data unit in P. Zero fill
327 * Q, then apply the recovered data to P. Then apply the data being
328 * written to the failed drive. Then walk through the surviving drives,
329 * applying new data when it exists, othewise the recovery data.
330 * Quite a mess.
331 *
332 *
333 * The params:
334 *
335 * read pda0, read pda1, ..., read pda (numDataCol-3),
336 * write pda0, ..., write pda (numStripeUnitAccess - numDataFailed),
337 * failed pda, raidPtr, asmap
338 */
339
340 int np = node->numParams;
341 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *)
342 node->params[np - 1].p;
343 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
344 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
345 int i;
346 RF_RaidAddr_t sosAddr;
347 unsigned coeff;
348 RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
349 RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
350 int numDataCol = layoutPtr->numDataCol;
351 RF_Etimer_t timer;
352 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
353
354 RF_ASSERT(node->numResults == 2);
355 RF_ASSERT(asmap->failedPDAs[1] == NULL);
356 RF_ETIMER_START(timer);
357 ppda = node->results[0];
358 qpda = node->results[1];
359 /* apply the recovery data */
360 for (i = 0; i < numDataCol - 2; i++)
361 rf_applyPDA(raidPtr, node->params[i].p, ppda, qpda,
362 node->dagHdr->bp);
363
364 /* Determine the other failed data unit. */
365 pda = asmap->failedPDAs[0];
366 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
367 asmap->raidAddress);
368 /* Need to determine the column of the other failed disk. */
369 coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
370 /* Compute the data unit offset within the column. */
371 coeff = (coeff % raidPtr->Layout.numDataCol);
372 for (i = 0; i < numDataCol; i++) {
373 npda.raidAddress = sosAddr + (i * secPerSU);
374 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress,
375 &(npda.row), &(npda.col), &(npda.startSector), 0);
376 /* Skip over dead disks. */
377 if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
378 if (i != coeff)
379 break;
380 }
381 RF_ASSERT(i < numDataCol);
382 /*
383 * Recover the data. The column we want to recover, we write over the
384 * parity. The column we don't care about, we dump in q.
385 */
386 if (coeff < i) /* Recovering 'a'. */
387 rf_PQ_recover((unsigned long *) ppda->bufPtr,
388 (unsigned long *) qpda->bufPtr,
389 (unsigned long *) ppda->bufPtr,
390 (unsigned long *) qpda->bufPtr,
391 rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
392 else /* Recovering 'b'. */
393 rf_PQ_recover((unsigned long *) ppda->bufPtr,
394 (unsigned long *) qpda->bufPtr,
395 (unsigned long *) qpda->bufPtr,
396 (unsigned long *) ppda->bufPtr,
397 rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
398
399 /* OK. The valid data is in P. Zero fill Q, then inc it into it. */
400 bzero(qpda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector));
401 rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr,
402 rf_RaidAddressToByte(raidPtr, qpda->numSector), i);
403
404 /* Now apply all the write data to the buffer. */
405 /*
406 * Single stripe unit write case: The failed data is the only thing
407 * we are writing.
408 */
409 RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
410 /* Dest, src, len, coeff. */
411 rf_IncQ((unsigned long *) qpda->bufPtr,
412 (unsigned long *) asmap->failedPDAs[0]->bufPtr,
413 rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
414 rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr,
415 rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);
416
417 /* Now apply all the recovery data. */
418 for (i = 0; i < numDataCol - 2; i++)
419 rf_applyPDA(raidPtr, node->params[i].p, ppda, qpda,
420 node->dagHdr->bp);
421
422 RF_ETIMER_STOP(timer);
423 RF_ETIMER_EVAL(timer);
424 if (tracerec)
425 tracerec->q_us += RF_ETIMER_VAL_US(timer);
426
427 rf_GenericWakeupFunc(node, 0);
428 return (0);
429 }
430
/*
 * Large-write DAG for the case where the write completely covers both
 * failed data units.  Not implemented: calling it panics.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	RF_PANIC();
}
435
436
437 /*
438 * Two lost data unit write case.
439 *
440 * There are really two cases here:
441 *
442 * (1) The write completely covers the two lost data units.
443 * In that case, a reconstruct write that doesn't write the
444 * failed data units will do the correct thing. So in this case,
445 * the dag looks like
446 *
447 * Full stripe read of surviving data units (not being overwritten)
448 * Write new data (ignoring failed units)
449 * Compute P&Q
450 * Write P&Q
451 *
452 *
453 * (2) The write does not completely cover both failed data units
454 * (but touches at least one of them). Then we need to do the
455 * equivalent of a reconstruct read to recover the missing data
456 * unit from the other stripe.
457 *
458 * For any data we are writing that is not in the "shadow"
459 * of the failed units, we need to do a four cycle update.
460 * PANIC on this case. For now.
461 *
462 */
463
464 RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
465 {
466 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
467 RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
468 int sum;
469 int nf = asmap->numDataFailed;
470
471 sum = asmap->failedPDAs[0]->numSector;
472 if (nf == 2)
473 sum += asmap->failedPDAs[1]->numSector;
474
475 if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
476 /* Large write case. */
477 rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
478 return;
479 }
480 if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
481 /* Small write case, no user data not in shadow. */
482 rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags,
483 allocList);
484 return;
485 }
486 RF_PANIC();
487 }
488
489 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
490 {
491 rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList,
492 "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
493 }
494
495 #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */