1 /* $OpenBSD: rf_reconstruct.c,v 1.16 2007/06/05 00:38:22 deraadt Exp $ */
2 /* $NetBSD: rf_reconstruct.c,v 1.26 2000/06/04 02:05:13 oster Exp $ */
3
4 /*
5 * Copyright (c) 1995 Carnegie-Mellon University.
6 * All rights reserved.
7 *
8 * Author: Mark Holland
9 *
10 * Permission to use, copy, modify and distribute this software and
11 * its documentation is hereby granted, provided that both the copyright
12 * notice and this permission notice appear in all copies of the
13 * software, derivative works or modified versions, and any portions
14 * thereof, and that both notices appear in supporting documentation.
15 *
16 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19 *
20 * Carnegie Mellon requests users of this software to return to
21 *
22 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
23 * School of Computer Science
24 * Carnegie Mellon University
25 * Pittsburgh PA 15213-3890
26 *
27 * any improvements or extensions that they make and grant Carnegie the
28 * rights to redistribute these changes.
29 */
30
31 /**************************************************************
32 *
33 * rf_reconstruct.c -- Code to perform on-line reconstruction.
34 *
35 **************************************************************/
36
37 #include "rf_types.h"
38 #include <sys/time.h>
39 #include <sys/buf.h>
40 #include <sys/errno.h>
41
42 #include <sys/types.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/proc.h>
46 #include <sys/ioctl.h>
47 #include <sys/fcntl.h>
48 #if __NETBSD__
49 #include <sys/vnode.h>
50 #endif
51
52 #include "rf_raid.h"
53 #include "rf_reconutil.h"
54 #include "rf_revent.h"
55 #include "rf_reconbuffer.h"
56 #include "rf_acctrace.h"
57 #include "rf_etimer.h"
58 #include "rf_dag.h"
59 #include "rf_desc.h"
60 #include "rf_general.h"
61 #include "rf_freelist.h"
62 #include "rf_debugprint.h"
63 #include "rf_driver.h"
64 #include "rf_utils.h"
65 #include "rf_shutdown.h"
66
67 #include "rf_kintf.h"
68
69 /*
70 * Setting these to -1 causes them to be set to their default values if not set
71 * by debug options.
72 */
73
/*
 * Debug-print helpers.  DprintfN(s, a1, ..., aN) calls rf_debug_printf()
 * with the format `s' and N values when rf_reconDebug is non-zero.  Each
 * value is converted to unsigned long and then to void *, and the remaining
 * slots are padded with NULL so that eight value arguments are always passed.
 *
 * Every macro parameter is parenthesized before it is cast: a cast binds
 * tighter than any binary operator, so without the parentheses an argument
 * such as `a < b' or `p + 1' would have only its first operand converted.
 */
#define	Dprintf(s) \
do { \
	if (rf_reconDebug) \
		rf_debug_printf((s), \
		    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); \
} while (0)
#define	Dprintf1(s,a) \
do { \
	if (rf_reconDebug) \
		rf_debug_printf((s), \
		    (void *)((unsigned long)(a)), \
		    NULL, NULL, NULL, NULL, NULL, NULL, NULL); \
} while (0)
#define	Dprintf2(s,a,b) \
do { \
	if (rf_reconDebug) \
		rf_debug_printf((s), \
		    (void *)((unsigned long)(a)), \
		    (void *)((unsigned long)(b)), \
		    NULL, NULL, NULL, NULL, NULL, NULL); \
} while (0)
#define	Dprintf3(s,a,b,c) \
do { \
	if (rf_reconDebug) \
		rf_debug_printf((s), \
		    (void *)((unsigned long)(a)), \
		    (void *)((unsigned long)(b)), \
		    (void *)((unsigned long)(c)), \
		    NULL, NULL, NULL, NULL, NULL); \
} while (0)
#define	Dprintf4(s,a,b,c,d) \
do { \
	if (rf_reconDebug) \
		rf_debug_printf((s), \
		    (void *)((unsigned long)(a)), \
		    (void *)((unsigned long)(b)), \
		    (void *)((unsigned long)(c)), \
		    (void *)((unsigned long)(d)), \
		    NULL, NULL, NULL, NULL); \
} while (0)
#define	Dprintf5(s,a,b,c,d,e) \
do { \
	if (rf_reconDebug) \
		rf_debug_printf((s), \
		    (void *)((unsigned long)(a)), \
		    (void *)((unsigned long)(b)), \
		    (void *)((unsigned long)(c)), \
		    (void *)((unsigned long)(d)), \
		    (void *)((unsigned long)(e)), \
		    NULL, NULL, NULL); \
} while (0)
#define	Dprintf6(s,a,b,c,d,e,f) \
do { \
	if (rf_reconDebug) \
		rf_debug_printf((s), \
		    (void *)((unsigned long)(a)), \
		    (void *)((unsigned long)(b)), \
		    (void *)((unsigned long)(c)), \
		    (void *)((unsigned long)(d)), \
		    (void *)((unsigned long)(e)), \
		    (void *)((unsigned long)(f)), \
		    NULL, NULL); \
} while (0)
#define	Dprintf7(s,a,b,c,d,e,f,g) \
do { \
	if (rf_reconDebug) \
		rf_debug_printf((s), \
		    (void *)((unsigned long)(a)), \
		    (void *)((unsigned long)(b)), \
		    (void *)((unsigned long)(c)), \
		    (void *)((unsigned long)(d)), \
		    (void *)((unsigned long)(e)), \
		    (void *)((unsigned long)(f)), \
		    (void *)((unsigned long)(g)), \
		    NULL); \
} while (0)
150
/*
 * DDprintf1/DDprintf2: same contract as Dprintf1/Dprintf2 -- forward the
 * format `s' and one or two values to rf_debug_printf() when rf_reconDebug
 * is non-zero, padding the unused slots with NULL.
 *
 * The parameters are parenthesized before the cast so that an expression
 * argument is converted as a whole instead of only its first operand.
 */
#define	DDprintf1(s,a) \
do { \
	if (rf_reconDebug) \
		rf_debug_printf((s), \
		    (void *)((unsigned long)(a)), \
		    NULL, NULL, NULL, NULL, NULL, NULL, NULL); \
} while (0)
#define	DDprintf2(s,a,b) \
do { \
	if (rf_reconDebug) \
		rf_debug_printf((s), \
		    (void *)((unsigned long)(a)), \
		    (void *)((unsigned long)(b)), \
		    NULL, NULL, NULL, NULL, NULL, NULL); \
} while (0)
166
167 static RF_FreeList_t *rf_recond_freelist;
168 #define RF_MAX_FREE_RECOND 4
169 #define RF_RECOND_INC 1
170
171 RF_RaidReconDesc_t *rf_AllocRaidReconDesc(RF_Raid_t *,
172 RF_RowCol_t, RF_RowCol_t, RF_RaidDisk_t *, int,
173 RF_RowCol_t, RF_RowCol_t);
174 int rf_ProcessReconEvent(RF_Raid_t *, RF_RowCol_t, RF_ReconEvent_t *);
175 int rf_IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t, RF_RowCol_t);
176 int rf_TryToRead(RF_Raid_t *, RF_RowCol_t, RF_RowCol_t);
177 int rf_ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t,
178 RF_RowCol_t, RF_RowCol_t, RF_SectorNum_t *, RF_SectorNum_t *,
179 RF_RowCol_t *, RF_RowCol_t *, RF_SectorNum_t *);
180 int rf_ReconReadDoneProc(void *, int);
181 int rf_ReconWriteDoneProc(void *, int);
182 void rf_CheckForNewMinHeadSep(RF_Raid_t *, RF_RowCol_t, RF_HeadSepLimit_t);
183 int rf_CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *,
184 RF_RowCol_t, RF_RowCol_t, RF_HeadSepLimit_t, RF_ReconUnitNum_t);
185 void rf_ForceReconReadDoneProc(void *, int);
186 void rf_ShutdownReconstruction(void *);
187
188 /*
189 * These functions are inlined on gcc. If they are used more than
190 * once, it is strongly advised to un-line them.
191 */
192 void rf_FreeReconDesc(RF_RaidReconDesc_t *);
193 int rf_IssueNextWriteRequest(RF_Raid_t *, RF_RowCol_t);
194 int rf_CheckForcedOrBlockedReconstruction(RF_Raid_t *,
195 RF_ReconParityStripeStatus_t *, RF_PerDiskReconCtrl_t *,
196 RF_RowCol_t, RF_RowCol_t, RF_StripeNum_t, RF_ReconUnitNum_t);
197 void rf_SignalReconDone(RF_Raid_t *);
198
199 struct RF_ReconDoneProc_s {
200 void (*proc) (RF_Raid_t *, void *);
201 void *arg;
202 RF_ReconDoneProc_t *next;
203 };
204
205 static RF_FreeList_t *rf_rdp_freelist;
206 #define RF_MAX_FREE_RDP 4
207 #define RF_RDP_INC 1
208
209 void
210 rf_SignalReconDone(RF_Raid_t *raidPtr)
211 {
212 RF_ReconDoneProc_t *p;
213
214 RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex);
215 for (p = raidPtr->recon_done_procs; p; p = p->next) {
216 p->proc(raidPtr, p->arg);
217 }
218 RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex);
219 }
220
221 int
222 rf_RegisterReconDoneProc(RF_Raid_t *raidPtr, void (*proc) (RF_Raid_t *, void *),
223 void *arg, RF_ReconDoneProc_t **handlep)
224 {
225 RF_ReconDoneProc_t *p;
226
227 RF_FREELIST_GET(rf_rdp_freelist, p, next, (RF_ReconDoneProc_t *));
228 if (p == NULL)
229 return (ENOMEM);
230 p->proc = proc;
231 p->arg = arg;
232 RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex);
233 p->next = raidPtr->recon_done_procs;
234 raidPtr->recon_done_procs = p;
235 RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex);
236 if (handlep)
237 *handlep = p;
238 return (0);
239 }
240
/*****************************************************************************
 *
 * rf_ConfigureReconstruction() (defined below, after its shutdown hook
 * rf_ShutdownReconstruction()) sets up the parameters that will be used by
 * the reconstruction process.  Currently there are none, except for those
 * that the layout-specific configuration (e.g. rf_ConfigureDeclustered)
 * routine sets up.
 *
 * In the kernel, we fire off the recon thread.
 *
 *****************************************************************************/
250 void
251 rf_ShutdownReconstruction(void *ignored)
252 {
253 RF_FREELIST_DESTROY(rf_recond_freelist, next, (RF_RaidReconDesc_t *));
254 RF_FREELIST_DESTROY(rf_rdp_freelist, next, (RF_ReconDoneProc_t *));
255 }
256
257 int
258 rf_ConfigureReconstruction(RF_ShutdownList_t **listp)
259 {
260 int rc;
261
262 RF_FREELIST_CREATE(rf_recond_freelist, RF_MAX_FREE_RECOND,
263 RF_RECOND_INC, sizeof(RF_RaidReconDesc_t));
264 if (rf_recond_freelist == NULL)
265 return (ENOMEM);
266 RF_FREELIST_CREATE(rf_rdp_freelist, RF_MAX_FREE_RDP,
267 RF_RDP_INC, sizeof(RF_ReconDoneProc_t));
268 if (rf_rdp_freelist == NULL) {
269 RF_FREELIST_DESTROY(rf_recond_freelist, next,
270 (RF_RaidReconDesc_t *));
271 return (ENOMEM);
272 }
273 rc = rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);
274 if (rc) {
275 RF_ERRORMSG3("Unable to add to shutdown list file %s line %d"
276 " rc=%d.\n", __FILE__, __LINE__, rc);
277 rf_ShutdownReconstruction(NULL);
278 return (rc);
279 }
280 return (0);
281 }
282
283 RF_RaidReconDesc_t *
284 rf_AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col,
285 RF_RaidDisk_t *spareDiskPtr, int numDisksDone, RF_RowCol_t srow,
286 RF_RowCol_t scol)
287 {
288
289 RF_RaidReconDesc_t *reconDesc;
290
291 RF_FREELIST_GET(rf_recond_freelist, reconDesc, next,
292 (RF_RaidReconDesc_t *));
293
294 reconDesc->raidPtr = raidPtr;
295 reconDesc->row = row;
296 reconDesc->col = col;
297 reconDesc->spareDiskPtr = spareDiskPtr;
298 reconDesc->numDisksDone = numDisksDone;
299 reconDesc->srow = srow;
300 reconDesc->scol = scol;
301 reconDesc->state = 0;
302 reconDesc->next = NULL;
303
304 return (reconDesc);
305 }
306
307 void
308 rf_FreeReconDesc(RF_RaidReconDesc_t *reconDesc)
309 {
310 #if RF_RECON_STATS > 0
311 printf("RAIDframe: %qu recon event waits, %qu recon delays.\n",
312 reconDesc->numReconEventWaits, reconDesc->numReconExecDelays);
313 #endif /* RF_RECON_STATS > 0 */
314
315 printf("RAIDframe: %qu max exec ticks.\n",
316 reconDesc->maxReconExecTicks);
317
318 #if (RF_RECON_STATS > 0) || defined(_KERNEL)
319 printf("\n");
320 #endif /* (RF_RECON_STATS > 0) || _KERNEL */
321 RF_FREELIST_FREE(rf_recond_freelist, reconDesc, next);
322 }
323
324
325 /*****************************************************************************
326 *
327 * Primary routine to reconstruct a failed disk. This should be called from
328 * within its own thread. It won't return until reconstruction completes,
329 * fails, or is aborted.
330 *
331 *****************************************************************************/
332 int
333 rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col)
334 {
335 RF_LayoutSW_t *lp;
336 int rc;
337
338 lp = raidPtr->Layout.map;
339 if (lp->SubmitReconBuffer) {
340 /*
341 * The current infrastructure only supports reconstructing one
342 * disk at a time for each array.
343 */
344 RF_LOCK_MUTEX(raidPtr->mutex);
345 while (raidPtr->reconInProgress) {
346 RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
347 }
348 raidPtr->reconInProgress++;
349 RF_UNLOCK_MUTEX(raidPtr->mutex);
350 rc = rf_ReconstructFailedDiskBasic(raidPtr, row, col);
351 RF_LOCK_MUTEX(raidPtr->mutex);
352 raidPtr->reconInProgress--;
353 RF_UNLOCK_MUTEX(raidPtr->mutex);
354 } else {
355 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for"
356 " arch %c.\n", lp->parityConfig);
357 rc = EIO;
358 }
359 RF_SIGNAL_COND(raidPtr->waitForReconCond);
360 wakeup(&raidPtr->waitForReconCond); /*
361 * XXX Methinks this will be
362 * needed at some point... GO
363 */
364 return (rc);
365 }
366
367 int
368 rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t row,
369 RF_RowCol_t col)
370 {
371 RF_ComponentLabel_t c_label;
372 RF_RaidDisk_t *spareDiskPtr = NULL;
373 RF_RaidReconDesc_t *reconDesc;
374 RF_RowCol_t srow, scol;
375 int numDisksDone = 0, rc;
376
377 /* First look for a spare drive onto which to reconstruct the data. */
378 /*
379 * Spare disk descriptors are stored in row 0. This may have to
380 * change eventually.
381 */
382
383 RF_LOCK_MUTEX(raidPtr->mutex);
384 RF_ASSERT(raidPtr->Disks[row][col].status == rf_ds_failed);
385
386 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
387 if (raidPtr->status[row] != rf_rs_degraded) {
388 RF_ERRORMSG2("Unable to reconstruct disk at row %d"
389 " col %d because status not degraded.\n", row, col);
390 RF_UNLOCK_MUTEX(raidPtr->mutex);
391 return (EINVAL);
392 }
393 srow = row;
394 scol = (-1);
395 } else {
396 srow = 0;
397 for (scol = raidPtr->numCol;
398 scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
399 if (raidPtr->Disks[srow][scol].status == rf_ds_spare) {
400 spareDiskPtr = &raidPtr->Disks[srow][scol];
401 spareDiskPtr->status = rf_ds_used_spare;
402 break;
403 }
404 }
405 if (!spareDiskPtr) {
406 RF_ERRORMSG2("Unable to reconstruct disk at row %d"
407 " col %d because no spares are available.\n",
408 row, col);
409 RF_UNLOCK_MUTEX(raidPtr->mutex);
410 return (ENOSPC);
411 }
412 printf("RECON: initiating reconstruction on row %d col %d"
413 " -> spare at row %d col %d.\n", row, col, srow, scol);
414 }
415 RF_UNLOCK_MUTEX(raidPtr->mutex);
416
417 reconDesc = rf_AllocRaidReconDesc((void *) raidPtr, row, col,
418 spareDiskPtr, numDisksDone, srow, scol);
419 raidPtr->reconDesc = (void *) reconDesc;
420 #if RF_RECON_STATS > 0
421 reconDesc->hsStallCount = 0;
422 reconDesc->numReconExecDelays = 0;
423 reconDesc->numReconEventWaits = 0;
424 #endif /* RF_RECON_STATS > 0 */
425 reconDesc->reconExecTimerRunning = 0;
426 reconDesc->reconExecTicks = 0;
427 reconDesc->maxReconExecTicks = 0;
428 rc = rf_ContinueReconstructFailedDisk(reconDesc);
429
430 if (!rc) {
431 /* Fix up the component label. */
432 /* Don't actually need the read here... */
433 raidread_component_label(
434 raidPtr->raid_cinfo[srow][scol].ci_dev,
435 raidPtr->raid_cinfo[srow][scol].ci_vp,
436 &c_label);
437
438 raid_init_component_label(raidPtr, &c_label);
439 c_label.row = row;
440 c_label.column = col;
441 c_label.clean = RF_RAID_DIRTY;
442 c_label.status = rf_ds_optimal;
443
444 /* XXXX MORE NEEDED HERE. */
445
446 raidwrite_component_label(
447 raidPtr->raid_cinfo[srow][scol].ci_dev,
448 raidPtr->raid_cinfo[srow][scol].ci_vp,
449 &c_label);
450
451 }
452 return (rc);
453 }
454
455 /*
456 *
457 * Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL,
458 * and you don't get a spare until the next Monday. With this function
459 * (and hot-swappable drives) you can now put your new disk containing
460 * /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to
461 * rebuild the data "on the spot".
462 *
463 */
464
465 int
466 rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col)
467 {
468 RF_RaidDisk_t *spareDiskPtr = NULL;
469 RF_RaidReconDesc_t *reconDesc;
470 RF_LayoutSW_t *lp;
471 RF_RaidDisk_t *badDisk;
472 RF_ComponentLabel_t c_label;
473 int numDisksDone = 0, rc;
474 struct partinfo dpart;
475 struct vnode *vp;
476 struct vattr va;
477 struct proc *proc;
478 int retcode;
479 int ac;
480
481 lp = raidPtr->Layout.map;
482 if (lp->SubmitReconBuffer) {
483 /*
484 * The current infrastructure only supports reconstructing one
485 * disk at a time for each array.
486 */
487 RF_LOCK_MUTEX(raidPtr->mutex);
488 if ((raidPtr->Disks[row][col].status == rf_ds_optimal) &&
489 (raidPtr->numFailures > 0)) {
490 /* XXX 0 above shouldn't be constant !!! */
491 /*
492 * Some component other than this has failed.
493 * Let's not make things worse than they already
494 * are...
495 */
496 #ifdef RAIDDEBUG
497 printf("RAIDFRAME: Unable to reconstruct to disk at:\n"
498 " Row: %d Col: %d Too many failures.\n",
499 row, col);
500 #endif /* RAIDDEBUG */
501 RF_UNLOCK_MUTEX(raidPtr->mutex);
502 return (EINVAL);
503 }
504 if (raidPtr->Disks[row][col].status == rf_ds_reconstructing) {
505 #ifdef RAIDDEBUG
506 printf("RAIDFRAME: Unable to reconstruct to disk at:\n"
507 " Row: %d Col: %d Reconstruction already"
508 " occurring !\n", row, col);
509 #endif /* RAIDDEBUG */
510
511 RF_UNLOCK_MUTEX(raidPtr->mutex);
512 return (EINVAL);
513 }
514
515
516 if (raidPtr->Disks[row][col].status != rf_ds_failed) {
517 /* "It's gone..." */
518 raidPtr->numFailures++;
519 raidPtr->Disks[row][col].status = rf_ds_failed;
520 raidPtr->status[row] = rf_rs_degraded;
521 rf_update_component_labels(raidPtr,
522 RF_NORMAL_COMPONENT_UPDATE);
523 }
524
525 while (raidPtr->reconInProgress) {
526 RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
527 }
528
529 raidPtr->reconInProgress++;
530
531 /*
532 * First look for a spare drive onto which to reconstruct
533 * the data. Spare disk descriptors are stored in row 0.
534 * This may have to change eventually.
535 */
536
537 /*
538 * Actually, we don't care if it's failed or not...
539 * On a RAID set with correct parity, this function
540 * should be callable on any component without ill effects.
541 */
542 /*
543 * RF_ASSERT(raidPtr->Disks[row][col].status == rf_ds_failed);
544 */
545
546 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
547 RF_ERRORMSG2("Unable to reconstruct to disk at row %d"
548 " col %d: operation not supported for"
549 " RF_DISTRIBUTE_SPARE.\n", row, col);
550
551 raidPtr->reconInProgress--;
552 RF_UNLOCK_MUTEX(raidPtr->mutex);
553 return (EINVAL);
554 }
555
556 /*
557 * XXX Need goop here to see if the disk is alive,
558 * and, if not, make it so...
559 */
560
561 badDisk = &raidPtr->Disks[row][col];
562
563 proc = raidPtr->recon_thread;
564
565 /*
566 * This device may have been opened successfully the
567 * first time. Close it before trying to open it again...
568 */
569
570 if (raidPtr->raid_cinfo[row][col].ci_vp != NULL) {
571 printf("Closing the opened device: %s\n",
572 raidPtr->Disks[row][col].devname);
573 vp = raidPtr->raid_cinfo[row][col].ci_vp;
574 ac = raidPtr->Disks[row][col].auto_configured;
575 rf_close_component(raidPtr, vp, ac);
576 raidPtr->raid_cinfo[row][col].ci_vp = NULL;
577 }
578 /*
579 * Note that this disk was *not* auto_configured (any longer).
580 */
581 raidPtr->Disks[row][col].auto_configured = 0;
582
583 printf("About to (re-)open the device for rebuilding: %s\n",
584 raidPtr->Disks[row][col].devname);
585
586 retcode = raidlookup(raidPtr->Disks[row][col].devname,
587 proc, &vp);
588
589 if (retcode) {
590 printf("raid%d: rebuilding: raidlookup on device: %s"
591 " failed: %d !\n", raidPtr->raidid,
592 raidPtr->Disks[row][col].devname, retcode);
593
594 /*
595 * XXX the component isn't responding properly...
596 * Must still be dead :-(
597 */
598 raidPtr->reconInProgress--;
599 RF_UNLOCK_MUTEX(raidPtr->mutex);
600 return(retcode);
601
602 } else {
603
604 /*
605 * Ok, so we can at least do a lookup...
606 * How about actually getting a vp for it ?
607 */
608
609 if ((retcode =
610 VOP_GETATTR(vp, &va, proc->p_ucred, proc)) != 0) {
611 raidPtr->reconInProgress--;
612 RF_UNLOCK_MUTEX(raidPtr->mutex);
613 return(retcode);
614 }
615 retcode = VOP_IOCTL(vp, DIOCGPART, (caddr_t) & dpart,
616 FREAD, proc->p_ucred, proc);
617 if (retcode) {
618 raidPtr->reconInProgress--;
619 RF_UNLOCK_MUTEX(raidPtr->mutex);
620 return(retcode);
621 }
622 raidPtr->Disks[row][col].blockSize =
623 dpart.disklab->d_secsize;
624
625 raidPtr->Disks[row][col].numBlocks =
626 DL_GETPSIZE(dpart.part) - rf_protectedSectors;
627
628 raidPtr->raid_cinfo[row][col].ci_vp = vp;
629 raidPtr->raid_cinfo[row][col].ci_dev = va.va_rdev;
630
631 raidPtr->Disks[row][col].dev = va.va_rdev;
632
633 /*
634 * We allow the user to specify that only a
635 * fraction of the disks should be used this is
636 * just for debug: it speeds up the parity scan.
637 */
638 raidPtr->Disks[row][col].numBlocks =
639 raidPtr->Disks[row][col].numBlocks *
640 rf_sizePercentage / 100;
641 }
642
643 spareDiskPtr = &raidPtr->Disks[row][col];
644 spareDiskPtr->status = rf_ds_used_spare;
645
646 printf("RECON: Initiating in-place reconstruction on\n");
647 printf(" row %d col %d -> spare at row %d col %d.\n",
648 row, col, row, col);
649
650 RF_UNLOCK_MUTEX(raidPtr->mutex);
651
652 reconDesc = rf_AllocRaidReconDesc((void *) raidPtr, row, col,
653 spareDiskPtr, numDisksDone, row, col);
654 raidPtr->reconDesc = (void *) reconDesc;
655 #if RF_RECON_STATS > 0
656 reconDesc->hsStallCount = 0;
657 reconDesc->numReconExecDelays = 0;
658 reconDesc->numReconEventWaits = 0;
659 #endif /* RF_RECON_STATS > 0 */
660 reconDesc->reconExecTimerRunning = 0;
661 reconDesc->reconExecTicks = 0;
662 reconDesc->maxReconExecTicks = 0;
663 rc = rf_ContinueReconstructFailedDisk(reconDesc);
664
665 RF_LOCK_MUTEX(raidPtr->mutex);
666 raidPtr->reconInProgress--;
667 RF_UNLOCK_MUTEX(raidPtr->mutex);
668
669 } else {
670 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for"
671 " arch %c.\n", lp->parityConfig);
672 rc = EIO;
673 }
674 RF_LOCK_MUTEX(raidPtr->mutex);
675
676 if (!rc) {
677 /*
678 * Need to set these here, as at this point it'll be claiming
679 * that the disk is in rf_ds_spared ! But we know better :-)
680 */
681
682 raidPtr->Disks[row][col].status = rf_ds_optimal;
683 raidPtr->status[row] = rf_rs_optimal;
684
685 /* Fix up the component label. */
686 /* Don't actually need the read here... */
687 raidread_component_label(
688 raidPtr->raid_cinfo[row][col].ci_dev,
689 raidPtr->raid_cinfo[row][col].ci_vp,
690 &c_label);
691
692 raid_init_component_label(raidPtr, &c_label);
693
694 c_label.row = row;
695 c_label.column = col;
696
697 raidwrite_component_label(raidPtr->raid_cinfo[row][col].ci_dev,
698 raidPtr->raid_cinfo[row][col].ci_vp, &c_label);
699
700 }
701 RF_UNLOCK_MUTEX(raidPtr->mutex);
702 RF_SIGNAL_COND(raidPtr->waitForReconCond);
703 wakeup(&raidPtr->waitForReconCond);
704 return (rc);
705 }
706
707
708 int
709 rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc)
710 {
711 RF_Raid_t *raidPtr = reconDesc->raidPtr;
712 RF_RowCol_t row = reconDesc->row;
713 RF_RowCol_t col = reconDesc->col;
714 RF_RowCol_t srow = reconDesc->srow;
715 RF_RowCol_t scol = reconDesc->scol;
716 RF_ReconMap_t *mapPtr;
717
718 RF_ReconEvent_t *event;
719 struct timeval etime, elpsd;
720 unsigned long xor_s, xor_resid_us;
721 int retcode, i, ds;
722
723 switch (reconDesc->state) {
724 case 0:
725 raidPtr->accumXorTimeUs = 0;
726
727 /* Create one trace record per physical disk. */
728 RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol *
729 sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
730
731 /*
732 * Quiesce the array prior to starting recon. This is needed
733 * to assure no nasty interactions with pending user writes.
734 * We need to do this before we change the disk or row status.
735 */
736 reconDesc->state = 1;
737
738 Dprintf("RECON: begin request suspend.\n");
739 retcode = rf_SuspendNewRequestsAndWait(raidPtr);
740 Dprintf("RECON: end request suspend.\n");
741 rf_StartUserStats(raidPtr); /*
742 * Zero out the stats kept on
743 * user accs.
744 */
745 /* Fall through to state 1. */
746 case 1:
747 RF_LOCK_MUTEX(raidPtr->mutex);
748
749 /*
750 * Create the reconstruction control pointer and install it in
751 * the right slot.
752 */
753 raidPtr->reconControl[row] =
754 rf_MakeReconControl(reconDesc, row, col, srow, scol);
755 mapPtr = raidPtr->reconControl[row]->reconMap;
756 raidPtr->status[row] = rf_rs_reconstructing;
757 raidPtr->Disks[row][col].status = rf_ds_reconstructing;
758 raidPtr->Disks[row][col].spareRow = srow;
759 raidPtr->Disks[row][col].spareCol = scol;
760
761 RF_UNLOCK_MUTEX(raidPtr->mutex);
762
763 RF_GETTIME(raidPtr->reconControl[row]->starttime);
764
765 /*
766 * Now start up the actual reconstruction: issue a read for
767 * each surviving disk.
768 */
769
770 reconDesc->numDisksDone = 0;
771 for (i = 0; i < raidPtr->numCol; i++) {
772 if (i != col) {
773 /*
774 * Find and issue the next I/O on the
775 * indicated disk.
776 */
777 if (rf_IssueNextReadRequest(raidPtr, row, i)) {
778 Dprintf2("RECON: done issuing for r%d"
779 " c%d.\n", row, i);
780 reconDesc->numDisksDone++;
781 }
782 }
783 }
784
785 reconDesc->state = 2;
786
787 case 2:
788 Dprintf("RECON: resume requests.\n");
789 rf_ResumeNewRequests(raidPtr);
790
791 reconDesc->state = 3;
792
793 case 3:
794
795 /*
796 * Process reconstruction events until all disks report that
797 * they've completed all work.
798 */
799 mapPtr = raidPtr->reconControl[row]->reconMap;
800
801 while (reconDesc->numDisksDone < raidPtr->numCol - 1) {
802
803 event = rf_GetNextReconEvent(reconDesc, row,
804 (void (*) (void *)) rf_ContinueReconstructFailedDisk,
805 reconDesc);
806 RF_ASSERT(event);
807
808 if (rf_ProcessReconEvent(raidPtr, row, event))
809 reconDesc->numDisksDone++;
810 raidPtr->reconControl[row]->numRUsTotal =
811 mapPtr->totalRUs;
812 raidPtr->reconControl[row]->numRUsComplete =
813 mapPtr->totalRUs -
814 rf_UnitsLeftToReconstruct(mapPtr);
815
816 raidPtr->reconControl[row]->percentComplete =
817 (raidPtr->reconControl[row]->numRUsComplete * 100 /
818 raidPtr->reconControl[row]->numRUsTotal);
819 if (rf_prReconSched) {
820 rf_PrintReconSchedule(
821 raidPtr->reconControl[row]->reconMap,
822 &(raidPtr->reconControl[row]->starttime));
823 }
824 }
825
826 reconDesc->state = 4;
827
828 case 4:
829 mapPtr = raidPtr->reconControl[row]->reconMap;
830 if (rf_reconDebug) {
831 printf("RECON: all reads completed.\n");
832 }
833 /*
834 * At this point all the reads have completed. We now wait
835 * for any pending writes to complete, and then we're done.
836 */
837
838 while (rf_UnitsLeftToReconstruct(
839 raidPtr->reconControl[row]->reconMap) > 0) {
840
841 event = rf_GetNextReconEvent(reconDesc, row,
842 (void (*) (void *)) rf_ContinueReconstructFailedDisk,
843 reconDesc);
844 RF_ASSERT(event);
845
846 /* Ignore return code. */
847 (void) rf_ProcessReconEvent(raidPtr, row, event);
848 raidPtr->reconControl[row]->percentComplete =
849 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 /
850 mapPtr->totalRUs);
851 if (rf_prReconSched) {
852 rf_PrintReconSchedule(
853 raidPtr->reconControl[row]->reconMap,
854 &(raidPtr->reconControl[row]->starttime));
855 }
856 }
857 reconDesc->state = 5;
858
859 case 5:
860 /*
861 * Success: mark the dead disk as reconstructed. We quiesce
862 * the array here to assure no nasty interactions with pending
863 * user accesses, when we free up the psstatus structure as
864 * part of FreeReconControl().
865 */
866
867 reconDesc->state = 6;
868
869 retcode = rf_SuspendNewRequestsAndWait(raidPtr);
870 rf_StopUserStats(raidPtr);
871 rf_PrintUserStats(raidPtr); /*
872 * Print out the stats on user
873 * accs accumulated during
874 * recon.
875 */
876
877 /* Fall through to state 6. */
878 case 6:
879 RF_LOCK_MUTEX(raidPtr->mutex);
880 raidPtr->numFailures--;
881 ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
882 raidPtr->Disks[row][col].status = (ds) ? rf_ds_dist_spared :
883 rf_ds_spared;
884 raidPtr->status[row] = (ds) ? rf_rs_reconfigured :
885 rf_rs_optimal;
886 RF_UNLOCK_MUTEX(raidPtr->mutex);
887 RF_GETTIME(etime);
888 RF_TIMEVAL_DIFF(&(raidPtr->reconControl[row]->starttime),
889 &etime, &elpsd);
890
891 /*
892 * XXX -- Why is state 7 different from state 6 if there is no
893 * return() here ? -- XXX Note that I set elpsd above & use it
894 * below, so if you put a return here you'll have to fix this.
895 * (also, FreeReconControl is called below).
896 */
897
898 case 7:
899
900 rf_ResumeNewRequests(raidPtr);
901
902 printf("Reconstruction of disk at row %d col %d completed.\n",
903 row, col);
904 xor_s = raidPtr->accumXorTimeUs / 1000000;
905 xor_resid_us = raidPtr->accumXorTimeUs % 1000000;
906 printf("Recon time was %d.%06d seconds, accumulated XOR time"
907 " was %ld us (%ld.%06ld).\n", (int) elpsd.tv_sec,
908 (int) elpsd.tv_usec, raidPtr->accumXorTimeUs, xor_s,
909 xor_resid_us);
910 printf(" (start time %d sec %d usec, end time %d sec %d"
911 " usec)\n",
912 (int) raidPtr->reconControl[row]->starttime.tv_sec,
913 (int) raidPtr->reconControl[row]->starttime.tv_usec,
914 (int) etime.tv_sec, (int) etime.tv_usec);
915
916 #if RF_RECON_STATS > 0
917 printf("Total head-sep stall count was %d.\n",
918 (int) reconDesc->hsStallCount);
919 #endif /* RF_RECON_STATS > 0 */
920 rf_FreeReconControl(raidPtr, row);
921 RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol *
922 sizeof(RF_AccTraceEntry_t));
923 rf_FreeReconDesc(reconDesc);
924
925 }
926
927 rf_SignalReconDone(raidPtr);
928 return (0);
929 }
930
931
932 /*****************************************************************************
933 * Do the right thing upon each reconstruction event.
934 * Returns nonzero if and only if there is nothing left unread on the
935 * indicated disk.
936 *****************************************************************************/
937 int
938 rf_ProcessReconEvent(RF_Raid_t *raidPtr, RF_RowCol_t frow,
939 RF_ReconEvent_t *event)
940 {
941 int retcode = 0, submitblocked;
942 RF_ReconBuffer_t *rbuf;
943 RF_SectorCount_t sectorsPerRU;
944
945 Dprintf1("RECON: rf_ProcessReconEvent type %d.\n", event->type);
946
947 switch (event->type) {
948
949 /* A read I/O has completed. */
950 case RF_REVENT_READDONE:
951 rbuf = raidPtr->reconControl[frow]
952 ->perDiskInfo[event->col].rbuf;
953 Dprintf3("RECON: READDONE EVENT: row %d col %d psid %ld.\n",
954 frow, event->col, rbuf->parityStripeID);
955 Dprintf7("RECON: done read psid %ld buf %lx %02x %02x %02x"
956 " %02x %02x.\n", rbuf->parityStripeID, rbuf->buffer,
957 rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
958 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff,
959 rbuf->buffer[4] & 0xff);
960 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
961 submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
962 Dprintf1("RECON: submitblocked=%d.\n", submitblocked);
963 if (!submitblocked)
964 retcode = rf_IssueNextReadRequest(raidPtr, frow,
965 event->col);
966 break;
967
968 /* A write I/O has completed. */
969 case RF_REVENT_WRITEDONE:
970 if (rf_floatingRbufDebug) {
971 rf_CheckFloatingRbufCount(raidPtr, 1);
972 }
973 sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit *
974 raidPtr->Layout.SUsPerRU;
975 rbuf = (RF_ReconBuffer_t *) event->arg;
976 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
977 Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d"
978 " (%d %% complete).\n",
979 rbuf->parityStripeID, rbuf->which_ru,
980 raidPtr->reconControl[frow]->percentComplete);
981 rf_ReconMapUpdate(raidPtr, raidPtr->reconControl[frow]
982 ->reconMap, rbuf->failedDiskSectorOffset,
983 rbuf->failedDiskSectorOffset + sectorsPerRU - 1);
984 rf_RemoveFromActiveReconTable(raidPtr, frow,
985 rbuf->parityStripeID, rbuf->which_ru);
986
987 if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
988 RF_LOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex);
989 raidPtr->numFullReconBuffers--;
990 rf_ReleaseFloatingReconBuffer(raidPtr, frow, rbuf);
991 RF_UNLOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex);
992 } else
993 if (rbuf->type == RF_RBUF_TYPE_FORCED)
994 rf_FreeReconBuffer(rbuf);
995 else
996 RF_ASSERT(0);
997 break;
998
999 /* A buffer-stall condition has been cleared. */
1000 case RF_REVENT_BUFCLEAR:
1001 Dprintf2("RECON: BUFCLEAR EVENT: row %d col %d.\n", frow,
1002 event->col);
1003 submitblocked = rf_SubmitReconBuffer(raidPtr
1004 ->reconControl[frow]->perDiskInfo[event->col].rbuf, 0,
1005 (int) (long) event->arg);
1006 RF_ASSERT(!submitblocked); /*
1007 * We wouldn't have gotten the
1008 * BUFCLEAR event if we
1009 * couldn't submit.
1010 */
1011 retcode = rf_IssueNextReadRequest(raidPtr, frow, event->col);
1012 break;
1013
1014 /* A user-write reconstruction blockage has been cleared. */
1015 case RF_REVENT_BLOCKCLEAR:
1016 DDprintf2("RECON: BLOCKCLEAR EVENT: row %d col %d.\n",
1017 frow, event->col);
1018 retcode = rf_TryToRead(raidPtr, frow, event->col);
1019 break;
1020
1021 /*
1022 * A max-head-separation reconstruction blockage has been
1023 * cleared.
1024 */
1025 case RF_REVENT_HEADSEPCLEAR:
1026 Dprintf2("RECON: HEADSEPCLEAR EVENT: row %d col %d.\n",
1027 frow, event->col);
1028 retcode = rf_TryToRead(raidPtr, frow, event->col);
1029 break;
1030
1031 /* A buffer has become ready to write. */
1032 case RF_REVENT_BUFREADY:
1033 Dprintf2("RECON: BUFREADY EVENT: row %d col %d.\n",
1034 frow, event->col);
1035 retcode = rf_IssueNextWriteRequest(raidPtr, frow);
1036 if (rf_floatingRbufDebug) {
1037 rf_CheckFloatingRbufCount(raidPtr, 1);
1038 }
1039 break;
1040
1041 /*
1042 * We need to skip the current RU entirely because it got
1043 * recon'd while we were waiting for something else to happen.
1044 */
1045 case RF_REVENT_SKIP:
1046 DDprintf2("RECON: SKIP EVENT: row %d col %d.\n",
1047 frow, event->col);
1048 retcode = rf_IssueNextReadRequest(raidPtr, frow, event->col);
1049 break;
1050
1051 /*
1052 * A forced-reconstruction read access has completed. Just
1053 * submit the buffer.
1054 */
1055 case RF_REVENT_FORCEDREADDONE:
1056 rbuf = (RF_ReconBuffer_t *) event->arg;
1057 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
1058 DDprintf2("RECON: FORCEDREADDONE EVENT: row %d col %d.\n",
1059 frow, event->col);
1060 submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
1061 RF_ASSERT(!submitblocked);
1062 break;
1063
1064 default:
1065 RF_PANIC();
1066 }
1067 rf_FreeReconEventDesc(event);
1068 return (retcode);
1069 }
1070
1071 /*****************************************************************************
1072 *
1073 * Find the next thing that's needed on the indicated disk, and issue
1074 * a read request for it. We assume that the reconstruction buffer
1075 * associated with this process is free to receive the data. If
1076 * reconstruction is blocked on the indicated RU, we issue a
1077 * blockage-release request instead of a physical disk read request.
1078 * If the current disk gets too far ahead of the others, we issue a
1079 * head-separation wait request and return.
1080 *
1081 * ctrl->{ru_count, curPSID, diskOffset} and
1082 * rbuf->failedDiskSectorOffset are maintained to point to the unit
1083 * we're currently accessing. Note that this deviates from the
1084 * standard C idiom of having counters point to the next thing to be
1085 * accessed. This allows us to easily retry when we're blocked by
1086 * head separation or reconstruction-blockage events.
1087 *
1088 * Returns nonzero if and only if there is nothing left unread on the
1089 * indicated disk.
1090 *
1091 *****************************************************************************/
1092 int
1093 rf_IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col)
1094 {
1095 RF_PerDiskReconCtrl_t *ctrl =
1096 &raidPtr->reconControl[row]->perDiskInfo[col];
1097 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1098 RF_ReconBuffer_t *rbuf = ctrl->rbuf;
1099 RF_ReconUnitCount_t RUsPerPU =
1100 layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
1101 RF_SectorCount_t sectorsPerRU =
1102 layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
1103 int do_new_check = 0, retcode = 0, status;
1104
1105 /*
1106 * If we are currently the slowest disk, mark that we have to do a new
1107 * check.
1108 */
1109 if (ctrl->headSepCounter <=
1110 raidPtr->reconControl[row]->minHeadSepCounter)
1111 do_new_check = 1;
1112
1113 while (1) {
1114
1115 ctrl->ru_count++;
1116 if (ctrl->ru_count < RUsPerPU) {
1117 ctrl->diskOffset += sectorsPerRU;
1118 rbuf->failedDiskSectorOffset += sectorsPerRU;
1119 } else {
1120 ctrl->curPSID++;
1121 ctrl->ru_count = 0;
1122 /* code left over from when head-sep was based on
1123 * parity stripe id */
1124 if (ctrl->curPSID >=
1125 raidPtr->reconControl[row]->lastPSID) {
1126 rf_CheckForNewMinHeadSep(raidPtr, row,
1127 ++(ctrl->headSepCounter));
1128 return (1); /* Finito ! */
1129 }
1130 /*
1131 * Find the disk offsets of the start of the parity
1132 * stripe on both the current disk and the failed
1133 * disk. Skip this entire parity stripe if either disk
1134 * does not appear in the indicated PS.
1135 */
1136 status = rf_ComputePSDiskOffsets(raidPtr,
1137 ctrl->curPSID, row, col, &ctrl->diskOffset,
1138 &rbuf->failedDiskSectorOffset, &rbuf->spRow,
1139 &rbuf->spCol, &rbuf->spOffset);
1140 if (status) {
1141 ctrl->ru_count = RUsPerPU - 1;
1142 continue;
1143 }
1144 }
1145 rbuf->which_ru = ctrl->ru_count;
1146
1147 /* Skip this RU if it's already been reconstructed. */
1148 if (rf_CheckRUReconstructed(raidPtr->reconControl[row]
1149 ->reconMap, rbuf->failedDiskSectorOffset)) {
1150 Dprintf2("Skipping psid %ld ru %d: already"
1151 " reconstructed.\n", ctrl->curPSID, ctrl->ru_count);
1152 continue;
1153 }
1154 break;
1155 }
1156 ctrl->headSepCounter++;
1157 if (do_new_check) /* Update min if needed. */
1158 rf_CheckForNewMinHeadSep(raidPtr, row, ctrl->headSepCounter);
1159
1160
1161 /*
1162 * At this point, we have definitely decided what to do, and we have
1163 * only to see if we can actually do it now.
1164 */
1165 rbuf->parityStripeID = ctrl->curPSID;
1166 rbuf->which_ru = ctrl->ru_count;
1167 bzero((char *) &raidPtr->recon_tracerecs[col],
1168 sizeof(raidPtr->recon_tracerecs[col]));
1169 raidPtr->recon_tracerecs[col].reconacc = 1;
1170 RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
1171 retcode = rf_TryToRead(raidPtr, row, col);
1172 return (retcode);
1173 }
1174
1175 /*
1176 * Tries to issue the next read on the indicated disk. We may be
1177 * blocked by (a) the heads being too far apart, or (b) recon on the
1178 * indicated RU being blocked due to a write by a user thread. In
1179 * this case, we issue a head-sep or blockage wait request, which will
1180 * cause this same routine to be invoked again later when the blockage
1181 * has cleared.
1182 */
1183
1184 int
1185 rf_TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col)
1186 {
1187 RF_PerDiskReconCtrl_t *ctrl =
1188 &raidPtr->reconControl[row]->perDiskInfo[col];
1189 RF_SectorCount_t sectorsPerRU =
1190 raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
1191 RF_StripeNum_t psid = ctrl->curPSID;
1192 RF_ReconUnitNum_t which_ru = ctrl->ru_count;
1193 RF_DiskQueueData_t *req;
1194 int status, created = 0;
1195 RF_ReconParityStripeStatus_t *pssPtr;
1196
1197 /*
1198 * If the current disk is too far ahead of the others, issue a
1199 * head-separation wait and return.
1200 */
1201 if (rf_CheckHeadSeparation(raidPtr, ctrl, row, col,
1202 ctrl->headSepCounter, which_ru))
1203 return (0);
1204 RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
1205 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]
1206 ->pssTable, psid, which_ru, RF_PSS_CREATE, &created);
1207
1208 /*
1209 * If recon is blocked on the indicated parity stripe, issue a
1210 * block-wait request and return. This also must mark the indicated RU
1211 * in the stripe as under reconstruction if not blocked.
1212 */
1213 status = rf_CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl,
1214 row, col, psid, which_ru);
1215 if (status == RF_PSS_RECON_BLOCKED) {
1216 Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked.\n",
1217 psid, which_ru);
1218 goto out;
1219 } else
1220 if (status == RF_PSS_FORCED_ON_WRITE) {
1221 rf_CauseReconEvent(raidPtr, row, col, NULL,
1222 RF_REVENT_SKIP);
1223 goto out;
1224 }
1225 /*
1226 * Make one last check to be sure that the indicated RU didn't get
1227 * reconstructed while we were waiting for something else to happen.
1228 * This is unfortunate in that it causes us to make this check twice
1229 * in the normal case. Might want to make some attempt to re-work
1230 * this so that we only do this check if we've definitely blocked on
1231 * one of the above checks. When this condition is detected, we may
1232 * have just created a bogus status entry, which we need to delete.
1233 */
1234 if (rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap,
1235 ctrl->rbuf->failedDiskSectorOffset)) {
1236 Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after"
1237 " stall.\n", psid, which_ru);
1238 if (created)
1239 rf_PSStatusDelete(raidPtr,
1240 raidPtr->reconControl[row]->pssTable, pssPtr);
1241 rf_CauseReconEvent(raidPtr, row, col, NULL, RF_REVENT_SKIP);
1242 goto out;
1243 }
1244 /* Found something to read. Issue the I/O. */
1245 Dprintf5("RECON: Read for psid %ld on row %d col %d offset %ld"
1246 " buf %lx.\n", psid, row, col, ctrl->diskOffset,
1247 ctrl->rbuf->buffer);
1248 RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
1249 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
1250 raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
1251 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
1252 RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
1253
1254 /*
1255 * Should be ok to use a NULL proc pointer here, all the bufs we use
1256 * should be in kernel space.
1257 */
1258 req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset,
1259 sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
1260 rf_ReconReadDoneProc, (void *) ctrl, NULL,
1261 &raidPtr->recon_tracerecs[col], (void *) raidPtr, 0, NULL);
1262
1263 RF_ASSERT(req); /* XXX -- Fix this. -- XXX */
1264
1265 ctrl->rbuf->arg = (void *) req;
1266 rf_DiskIOEnqueue(&raidPtr->Queues[row][col], req, RF_IO_RECON_PRIORITY);
1267 pssPtr->issued[col] = 1;
1268
1269 out:
1270 RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
1271 return (0);
1272 }
1273
1274
1275 /*
1276 * Given a parity stripe ID, we want to find out whether both the
1277 * current disk and the failed disk exist in that parity stripe. If
1278 * not, we want to skip this whole PS. If so, we want to find the
1279 * disk offset of the start of the PS on both the current disk and the
1280 * failed disk.
1281 *
1282 * This works by getting a list of disks comprising the indicated
1283 * parity stripe, and searching the list for the current and failed
1284 * disks. Once we've decided they both exist in the parity stripe, we
1285 * need to decide whether each is data or parity, so that we'll know
1286 * which mapping function to call to get the corresponding disk
1287 * offsets.
1288 *
1289 * This is kind of unpleasant, but doing it this way allows the
1290 * reconstruction code to use parity stripe IDs rather than physical
1291 * disks address to march through the failed disk, which greatly
1292 * simplifies a lot of code, as well as eliminating the need for a
1293 * reverse-mapping function. I also think it will execute faster,
1294 * since the calls to the mapping module are kept to a minimum.
1295 *
1296 * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING
1297 * THE STRIPE IN THE CORRECT ORDER.
1298 */
1299
1300 int
1301 rf_ComputePSDiskOffsets(
1302 RF_Raid_t *raidPtr, /* RAID descriptor. */
1303 RF_StripeNum_t psid, /* Parity stripe identifier. */
1304 RF_RowCol_t row, /*
1305 * Row and column of disk to find
1306 * the offsets for.
1307 */
1308 RF_RowCol_t col,
1309 RF_SectorNum_t *outDiskOffset,
1310 RF_SectorNum_t *outFailedDiskSectorOffset,
1311 RF_RowCol_t *spRow, /*
1312 * OUT: Row,col of spare unit for
1313 * failed unit.
1314 */
1315 RF_RowCol_t *spCol,
1316 RF_SectorNum_t *spOffset /*
1317 * OUT: Offset into disk containing
1318 * spare unit.
1319 */
1320 )
1321 {
1322 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1323 RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol;
1324 RF_RaidAddr_t sosRaidAddress; /* start-of-stripe */
1325 RF_RowCol_t *diskids;
1326 u_int i, j, k, i_offset, j_offset;
1327 RF_RowCol_t prow, pcol;
1328 int testcol, testrow;
1329 RF_RowCol_t stripe;
1330 RF_SectorNum_t poffset;
1331 char i_is_parity = 0, j_is_parity = 0;
1332 RF_RowCol_t stripeWidth =
1333 layoutPtr->numDataCol + layoutPtr->numParityCol;
1334
1335 /* Get a listing of the disks comprising that stripe. */
1336 sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid);
1337 (layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids,
1338 &stripe);
1339 RF_ASSERT(diskids);
1340
1341 /*
1342 * Reject this entire parity stripe if it does not contain the
1343 * indicated disk or it does not contain the failed disk.
1344 */
1345 if (row != stripe)
1346 goto skipit;
1347 for (i = 0; i < stripeWidth; i++) {
1348 if (col == diskids[i])
1349 break;
1350 }
1351 if (i == stripeWidth)
1352 goto skipit;
1353 for (j = 0; j < stripeWidth; j++) {
1354 if (fcol == diskids[j])
1355 break;
1356 }
1357 if (j == stripeWidth) {
1358 goto skipit;
1359 }
1360 /* Find out which disk the parity is on. */
1361 (layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &prow, &pcol,
1362 &poffset, RF_DONT_REMAP);
1363
1364 /* Find out if either the current RU or the failed RU is parity. */
1365 /*
1366 * Also, if the parity occurs in this stripe prior to the data and/or
1367 * failed col, we need to decrement i and/or j.
1368 */
1369 for (k = 0; k < stripeWidth; k++)
1370 if (diskids[k] == pcol)
1371 break;
1372 RF_ASSERT(k < stripeWidth);
1373 i_offset = i;
1374 j_offset = j;
1375 if (k < i)
1376 i_offset--;
1377 else
1378 if (k == i) {
1379 i_is_parity = 1;
1380 i_offset = 0;
1381 } /*
1382 * Set offsets to zero to disable multiply
1383 * below.
1384 */
1385 if (k < j)
1386 j_offset--;
1387 else
1388 if (k == j) {
1389 j_is_parity = 1;
1390 j_offset = 0;
1391 }
1392 /*
1393 * At this point, [ij]_is_parity tells us whether the [current,failed]
1394 * disk is parity at the start of this RU, and, if data, "[ij]_offset"
1395 * tells us how far into the stripe the [current,failed] disk is.
1396 */
1397
1398 /*
1399 * Call the mapping routine to get the offset into the current disk,
1400 * repeat for failed disk.
1401 */
1402 if (i_is_parity)
1403 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset *
1404 layoutPtr->sectorsPerStripeUnit, &testrow, &testcol,
1405 outDiskOffset, RF_DONT_REMAP);
1406 else
1407 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset *
1408 layoutPtr->sectorsPerStripeUnit, &testrow, &testcol,
1409 outDiskOffset, RF_DONT_REMAP);
1410
1411 RF_ASSERT(row == testrow && col == testcol);
1412
1413 if (j_is_parity)
1414 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset *
1415 layoutPtr->sectorsPerStripeUnit, &testrow, &testcol,
1416 outFailedDiskSectorOffset, RF_DONT_REMAP);
1417 else
1418 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset *
1419 layoutPtr->sectorsPerStripeUnit, &testrow, &testcol,
1420 outFailedDiskSectorOffset, RF_DONT_REMAP);
1421 RF_ASSERT(row == testrow && fcol == testcol);
1422
1423 /* Now locate the spare unit for the failed unit. */
1424 if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
1425 if (j_is_parity)
1426 layoutPtr->map->MapParity(raidPtr, sosRaidAddress +
1427 j_offset * layoutPtr->sectorsPerStripeUnit, spRow,
1428 spCol, spOffset, RF_REMAP);
1429 else
1430 layoutPtr->map->MapSector(raidPtr, sosRaidAddress +
1431 j_offset * layoutPtr->sectorsPerStripeUnit, spRow,
1432 spCol, spOffset, RF_REMAP);
1433 } else {
1434 *spRow = raidPtr->reconControl[row]->spareRow;
1435 *spCol = raidPtr->reconControl[row]->spareCol;
1436 *spOffset = *outFailedDiskSectorOffset;
1437 }
1438
1439 return (0);
1440
1441 skipit:
1442 Dprintf3("RECON: Skipping psid %ld: nothing needed from r%d c%d.\n",
1443 psid, row, col);
1444 return (1);
1445 }
1446
1447
1448 /*
1449 * This is called when a buffer has become ready to write to the replacement
1450 * disk.
1451 */
1452 int
1453 rf_IssueNextWriteRequest(RF_Raid_t *raidPtr, RF_RowCol_t row)
1454 {
1455 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1456 RF_SectorCount_t sectorsPerRU =
1457 layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
1458 RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol;
1459 RF_ReconBuffer_t *rbuf;
1460 RF_DiskQueueData_t *req;
1461
1462 rbuf = rf_GetFullReconBuffer(raidPtr->reconControl[row]);
1463 RF_ASSERT(rbuf); /*
1464 * There must be one available, or we wouldn't
1465 * have gotten the event that sent us here.
1466 */
1467 RF_ASSERT(rbuf->pssPtr);
1468
1469 rbuf->pssPtr->writeRbuf = rbuf;
1470 rbuf->pssPtr = NULL;
1471
1472 Dprintf7("RECON: New write (r %d c %d offs %d) for psid %ld ru %d"
1473 " (failed disk offset %ld) buf %lx.\n",
1474 rbuf->spRow, rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID,
1475 rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer);
1476 Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x.\n",
1477 rbuf->parityStripeID, rbuf->buffer[0] & 0xff,
1478 rbuf->buffer[1] & 0xff, rbuf->buffer[2] & 0xff,
1479 rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
1480
1481 /*
1482 * Should be ok to use a NULL b_proc here b/c all addrs should be in
1483 * kernel space.
1484 */
1485 req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset,
1486 sectorsPerRU, rbuf->buffer, rbuf->parityStripeID, rbuf->which_ru,
1487 rf_ReconWriteDoneProc, (void *) rbuf, NULL,
1488 &raidPtr->recon_tracerecs[fcol], (void *) raidPtr, 0, NULL);
1489
1490 RF_ASSERT(req); /* XXX -- Fix this. -- XXX */
1491
1492 rbuf->arg = (void *) req;
1493 rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spRow][rbuf->spCol], req,
1494 RF_IO_RECON_PRIORITY);
1495
1496 return (0);
1497 }
1498
1499 /*
1500 * This gets called upon the completion of a reconstruction read
1501 * operation. The arg is a pointer to the per-disk reconstruction
1502 * control structure for the process that just finished a read.
1503 *
1504 * Called at interrupt context in the kernel, so don't do anything
1505 * illegal here.
1506 */
1507 int
1508 rf_ReconReadDoneProc(void *arg, int status)
1509 {
1510 RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg;
1511 RF_Raid_t *raidPtr = ctrl->reconCtrl->reconDesc->raidPtr;
1512
1513 if (status) {
1514 /*
1515 * XXX
1516 */
1517 printf("Recon read failed !\n");
1518 RF_PANIC();
1519 }
1520 RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1521 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1522 raidPtr->recon_tracerecs[ctrl->col].specific.recon.
1523 recon_fetch_to_return_us =
1524 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1525 RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1526
1527 rf_CauseReconEvent(raidPtr, ctrl->row, ctrl->col, NULL,
1528 RF_REVENT_READDONE);
1529 return (0);
1530 }
1531
1532
1533 /*
1534 * This gets called upon the completion of a reconstruction write operation.
1535 * The arg is a pointer to the rbuf that was just written.
1536 *
1537 * Called at interrupt context in the kernel, so don't do anything illegal here.
1538 */
1539 int
1540 rf_ReconWriteDoneProc(void *arg, int status)
1541 {
1542 RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg;
1543
1544 Dprintf2("Reconstruction completed on psid %ld ru %d.\n",
1545 rbuf->parityStripeID, rbuf->which_ru);
1546 if (status) {
1547 /* fprintf(stderr, "Recon write failed !\n"); */
1548 printf("Recon write failed !\n");
1549 RF_PANIC();
1550 }
1551 rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col,
1552 arg, RF_REVENT_WRITEDONE);
1553 return (0);
1554 }
1555
1556
1557 /*
1558 * Computes a new minimum head sep, and wakes up anyone who needs to
1559 * be woken as a result.
1560 */
1561 void
1562 rf_CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_RowCol_t row,
1563 RF_HeadSepLimit_t hsCtr)
1564 {
1565 RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row];
1566 RF_HeadSepLimit_t new_min;
1567 RF_RowCol_t i;
1568 RF_CallbackDesc_t *p;
1569 /* From the definition of a minimum. */
1570 RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter);
1571
1572
1573 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1574
1575 new_min = ~(1L << (8 * sizeof(long) - 1)); /* 0x7FFF....FFF */
1576 for (i = 0; i < raidPtr->numCol; i++)
1577 if (i != reconCtrlPtr->fcol) {
1578 if (reconCtrlPtr->perDiskInfo[i].headSepCounter <
1579 new_min)
1580 new_min =
1581 reconCtrlPtr->perDiskInfo[i].headSepCounter;
1582 }
1583 /* Set the new minimum and wake up anyone who can now run again. */
1584 if (new_min != reconCtrlPtr->minHeadSepCounter) {
1585 reconCtrlPtr->minHeadSepCounter = new_min;
1586 Dprintf1("RECON: new min head pos counter val is %ld.\n",
1587 new_min);
1588 while (reconCtrlPtr->headSepCBList) {
1589 if (reconCtrlPtr->headSepCBList->callbackArg.v >
1590 new_min)
1591 break;
1592 p = reconCtrlPtr->headSepCBList;
1593 reconCtrlPtr->headSepCBList = p->next;
1594 p->next = NULL;
1595 rf_CauseReconEvent(raidPtr, p->row, p->col, NULL,
1596 RF_REVENT_HEADSEPCLEAR);
1597 rf_FreeCallbackDesc(p);
1598 }
1599
1600 }
1601 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1602 }
1603
1604 /*
1605 * Checks to see that the maximum head separation will not be violated
1606 * if we initiate a reconstruction I/O on the indicated disk.
1607 * Limiting the maximum head separation between two disks eliminates
1608 * the nasty buffer-stall conditions that occur when one disk races
1609 * ahead of the others and consumes all of the floating recon buffers.
1610 * This code is complex and unpleasant but it's necessary to avoid
1611 * some very nasty, albeit fairly rare, reconstruction behavior.
1612 *
1613 * Returns non-zero if and only if we have to stop working on the
1614 * indicated disk due to a head-separation delay.
1615 */
1616 int
1617 rf_CheckHeadSeparation(
1618 RF_Raid_t *raidPtr,
1619 RF_PerDiskReconCtrl_t *ctrl,
1620 RF_RowCol_t row,
1621 RF_RowCol_t col,
1622 RF_HeadSepLimit_t hsCtr,
1623 RF_ReconUnitNum_t which_ru
1624 )
1625 {
1626 RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row];
1627 RF_CallbackDesc_t *cb, *p, *pt;
1628 int retval = 0;
1629
1630 /*
1631 * If we're too far ahead of the slowest disk, stop working on this
1632 * disk until the slower ones catch up. We do this by scheduling a
1633 * wakeup callback for the time when the slowest disk has caught up.
1634 * We define "caught up" with 20% hysteresis, i.e. the head separation
1635 * must have fallen to at most 80% of the max allowable head
1636 * separation before we'll wake up.
1637 */
1638 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1639 if ((raidPtr->headSepLimit >= 0) &&
1640 ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) >
1641 raidPtr->headSepLimit)) {
1642 Dprintf6("raid%d: RECON: head sep stall: row %d col %d hsCtr"
1643 " %ld minHSCtr %ld limit %ld.\n",
1644 raidPtr->raidid, row, col, ctrl->headSepCounter,
1645 reconCtrlPtr->minHeadSepCounter, raidPtr->headSepLimit);
1646 cb = rf_AllocCallbackDesc();
1647 /*
1648 * The minHeadSepCounter value we have to get to before we'll
1649 * wake up. Build in 20% hysteresis.
1650 */
1651 cb->callbackArg.v = (ctrl->headSepCounter -
1652 raidPtr->headSepLimit + raidPtr->headSepLimit / 5);
1653 cb->row = row;
1654 cb->col = col;
1655 cb->next = NULL;
1656
1657 /*
1658 * Insert this callback descriptor into the sorted list of
1659 * pending head-sep callbacks.
1660 */
1661 p = reconCtrlPtr->headSepCBList;
1662 if (!p)
1663 reconCtrlPtr->headSepCBList = cb;
1664 else
1665 if (cb->callbackArg.v < p->callbackArg.v) {
1666 cb->next = reconCtrlPtr->headSepCBList;
1667 reconCtrlPtr->headSepCBList = cb;
1668 } else {
1669 for (pt = p, p = p->next;
1670 p && (p->callbackArg.v < cb->callbackArg.v);
1671 pt = p, p = p->next);
1672 cb->next = p;
1673 pt->next = cb;
1674 }
1675 retval = 1;
1676 #if RF_RECON_STATS > 0
1677 ctrl->reconCtrl->reconDesc->hsStallCount++;
1678 #endif /* RF_RECON_STATS > 0 */
1679 }
1680 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1681
1682 return (retval);
1683 }
1684
1685
1686
1687 /*
1688 * Checks to see if reconstruction has been either forced or blocked
1689 * by a user operation. If forced, we skip this RU entirely. Else if
1690 * blocked, put ourselves on the wait list. Else return 0.
1691 *
1692 * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY.
1693 */
1694 int
1695 rf_CheckForcedOrBlockedReconstruction(
1696 RF_Raid_t *raidPtr,
1697 RF_ReconParityStripeStatus_t *pssPtr,
1698 RF_PerDiskReconCtrl_t *ctrl,
1699 RF_RowCol_t row,
1700 RF_RowCol_t col,
1701 RF_StripeNum_t psid,
1702 RF_ReconUnitNum_t which_ru
1703 )
1704 {
1705 RF_CallbackDesc_t *cb;
1706 int retcode = 0;
1707
1708 if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) ||
1709 (pssPtr->flags & RF_PSS_FORCED_ON_WRITE))
1710 retcode = RF_PSS_FORCED_ON_WRITE;
1711 else
1712 if (pssPtr->flags & RF_PSS_RECON_BLOCKED) {
1713 Dprintf4("RECON: row %d col %d blocked at psid %ld"
1714 " ru %d.\n", row, col, psid, which_ru);
1715 cb = rf_AllocCallbackDesc(); /*
1716 * Append ourselves to
1717 * the blockage-wait
1718 * list.
1719 */
1720 cb->row = row;
1721 cb->col = col;
1722 cb->next = pssPtr->blockWaitList;
1723 pssPtr->blockWaitList = cb;
1724 retcode = RF_PSS_RECON_BLOCKED;
1725 }
1726 if (!retcode)
1727 pssPtr->flags |= RF_PSS_UNDER_RECON; /*
1728 * Mark this RU as under
1729 * reconstruction.
1730 */
1731
1732 return (retcode);
1733 }
1734
1735
1736 /*
1737 * If reconstruction is currently ongoing for the indicated stripeID,
1738 * reconstruction is forced to completion and we return non-zero to
1739 * indicate that the caller must wait. If not, then reconstruction is
1740 * blocked on the indicated stripe and the routine returns zero. If
1741 * and only if we return non-zero, we'll cause the cbFunc to get
1742 * invoked with the cbArg when the reconstruction has completed.
1743 */
1744 int
1745 rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
1746 void (*cbFunc) (RF_Raid_t *, void *), void *cbArg)
1747 {
1748 RF_RowCol_t row = asmap->physInfo->row; /*
1749 * Which row of the array
1750 * we're working on.
1751 */
1752 RF_StripeNum_t stripeID = asmap->stripeID; /*
1753 * The stripe ID we're
1754 * forcing recon on.
1755 */
1756 RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit *
1757 raidPtr->Layout.SUsPerRU; /* Num sects in one RU. */
1758 RF_ReconParityStripeStatus_t *pssPtr; /*
1759 * A pointer to the parity
1760 * stripe status structure.
1761 */
1762 RF_StripeNum_t psid; /* Parity stripe id. */
1763 RF_SectorNum_t offset, fd_offset; /*
1764 * Disk offset, failed-disk
1765 * offset.
1766 */
1767 RF_RowCol_t *diskids;
1768 RF_RowCol_t stripe;
1769 RF_ReconUnitNum_t which_ru; /* RU within parity stripe. */
1770 RF_RowCol_t fcol, diskno, i;
1771 RF_ReconBuffer_t *new_rbuf; /* Ptr to newly allocated rbufs. */
1772 RF_DiskQueueData_t *req; /* Disk I/O req to be enqueued. */
1773 RF_CallbackDesc_t *cb;
1774 int created = 0, nPromoted;
1775
1776 psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID,
1777 &which_ru);
1778
1779 RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
1780
1781 pssPtr = rf_LookupRUStatus(raidPtr,
1782 raidPtr->reconControl[row]->pssTable, psid, which_ru,
1783 RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, &created);
1784
1785 /* If recon is not ongoing on this PS, just return. */
1786 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
1787 RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
1788 return (0);
1789 }
1790 /*
1791 * Otherwise, we have to wait for reconstruction to complete on this
1792 * RU.
1793 */
1794 /*
1795 * In order to avoid waiting for a potentially large number of
1796 * low-priority accesses to complete, we force a normal-priority (i.e.
1797 * not low-priority) reconstruction on this RU.
1798 */
1799 if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) &&
1800 !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) {
1801 DDprintf1("Forcing recon on psid %ld.\n", psid);
1802 /* Mark this RU as under forced recon. */
1803 pssPtr->flags |= RF_PSS_FORCED_ON_WRITE;
1804 /* Clear the blockage that we just set. */
1805 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;
1806 fcol = raidPtr->reconControl[row]->fcol;
1807
1808 /*
1809 * Get a listing of the disks comprising the indicated stripe.
1810 */
1811 (raidPtr->Layout.map->IdentifyStripe) (raidPtr,
1812 asmap->raidAddress, &diskids, &stripe);
1813 RF_ASSERT(row == stripe);
1814
1815 /*
1816 * For previously issued reads, elevate them to normal
1817 * priority. If the I/O has already completed, it won't be
1818 * found in the queue, and hence this will be a no-op. For
1819 * unissued reads, allocate buffers and issue new reads. The
1820 * fact that we've set the FORCED bit means that the regular
1821 * recon procs will not re-issue these reqs.
1822 */
1823 for (i = 0; i < raidPtr->Layout.numDataCol +
1824 raidPtr->Layout.numParityCol; i++)
1825 if ((diskno = diskids[i]) != fcol) {
1826 if (pssPtr->issued[diskno]) {
1827 nPromoted = rf_DiskIOPromote(&raidPtr
1828 ->Queues[row][diskno], psid,
1829 which_ru);
1830 if (rf_reconDebug && nPromoted)
1831 printf("raid%d: promoted read"
1832 " from row %d col %d.\n",
1833 raidPtr->raidid, row,
1834 diskno);
1835 } else {
1836 /* Create new buf. */
1837 new_rbuf = rf_MakeReconBuffer(raidPtr,
1838 row, diskno, RF_RBUF_TYPE_FORCED);
1839 /* Find offsets & spare locationp */
1840 rf_ComputePSDiskOffsets(raidPtr, psid,
1841 row, diskno, &offset, &fd_offset,
1842 &new_rbuf->spRow, &new_rbuf->spCol,
1843 &new_rbuf->spOffset);
1844 new_rbuf->parityStripeID = psid;
1845 /* Fill in the buffer. */
1846 new_rbuf->which_ru = which_ru;
1847 new_rbuf->failedDiskSectorOffset =
1848 fd_offset;
1849 new_rbuf->priority =
1850 RF_IO_NORMAL_PRIORITY;
1851
1852 /*
1853 * Use NULL b_proc b/c all addrs
1854 * should be in kernel space.
1855 */
1856 req = rf_CreateDiskQueueData(
1857 RF_IO_TYPE_READ, offset +
1858 which_ru * sectorsPerRU,
1859 sectorsPerRU, new_rbuf->buffer,
1860 psid, which_ru, (int (*)
1861 (void *, int))
1862 rf_ForceReconReadDoneProc,
1863 (void *) new_rbuf, NULL,
1864 NULL, (void *) raidPtr, 0, NULL);
1865
1866 RF_ASSERT(req); /*
1867 * XXX -- Fix this. --
1868 * XXX
1869 */
1870
1871 new_rbuf->arg = req;
1872 /* Enqueue the I/O. */
1873 rf_DiskIOEnqueue(&raidPtr
1874 ->Queues[row][diskno], req,
1875 RF_IO_NORMAL_PRIORITY);
1876 Dprintf3("raid%d: Issued new read req"
1877 " on row %d col %d.\n",
1878 raidPtr->raidid, row, diskno);
1879 }
1880 }
1881 /*
1882 * If the write is sitting in the disk queue, elevate its
1883 * priority.
1884 */
1885 if (rf_DiskIOPromote(&raidPtr->Queues[row][fcol],
1886 psid, which_ru))
1887 printf("raid%d: promoted write to row %d col %d.\n",
1888 raidPtr->raidid, row, fcol);
1889 }
1890 /*
1891 * Install a callback descriptor to be invoked when recon completes on
1892 * this parity stripe.
1893 */
1894 cb = rf_AllocCallbackDesc();
1895 /*
1896 * XXX The following is bogus... These functions don't really match !!!
1897 * GO
1898 */
1899 cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc;
1900 cb->callbackArg.p = (void *) cbArg;
1901 cb->next = pssPtr->procWaitList;
1902 pssPtr->procWaitList = cb;
1903 DDprintf2("raid%d: Waiting for forced recon on psid %ld.\n",
1904 raidPtr->raidid, psid);
1905
1906 RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
1907 return (1);
1908 }
1909
1910
1911 /*
1912 * Called upon the completion of a forced reconstruction read.
1913 * All we do is schedule the FORCEDREADONE event.
1914 * Called at interrupt context in the kernel, so don't do anything illegal here.
1915 */
1916 void
1917 rf_ForceReconReadDoneProc(void *arg, int status)
1918 {
1919 RF_ReconBuffer_t *rbuf = arg;
1920
1921 if (status) {
1922 /* fprintf(stderr, "Forced recon read failed !\n"); */
1923 printf("Forced recon read failed !\n");
1924 RF_PANIC();
1925 }
1926 rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col,
1927 (void *) rbuf, RF_REVENT_FORCEDREADDONE);
1928 }
1929
1930
1931 /* Releases a block on the reconstruction of the indicated stripe. */
1932 int
1933 rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
1934 {
1935 RF_RowCol_t row = asmap->origRow;
1936 RF_StripeNum_t stripeID = asmap->stripeID;
1937 RF_ReconParityStripeStatus_t *pssPtr;
1938 RF_ReconUnitNum_t which_ru;
1939 RF_StripeNum_t psid;
1940 int created = 0;
1941 RF_CallbackDesc_t *cb;
1942
1943 psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID,
1944 &which_ru);
1945 RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
1946 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]
1947 ->pssTable, psid, which_ru, RF_PSS_NONE, &created);
1948
1949 /*
1950 * When recon is forced, the pss desc can get deleted before we get
1951 * back to unblock recon. But, this can _only_ happen when recon is
1952 * forced. It would be good to put some kind of sanity check here, but
1953 * how to decide if recon was just forced or not ?
1954 */
1955 if (!pssPtr) {
1956 /*
1957 * printf("Warning: no pss descriptor upon unblock on psid %ld"
1958 * " RU %d.\n", psid, which_ru);
1959 */
1960 if (rf_reconDebug || rf_pssDebug)
1961 printf("Warning: no pss descriptor upon unblock on"
1962 " psid %ld RU %d.\n", (long) psid, which_ru);
1963 goto out;
1964 }
1965 pssPtr->blockCount--;
1966 Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d.\n",
1967 raidPtr->raidid, psid, pssPtr->blockCount);
1968 if (pssPtr->blockCount == 0) {
1969 /* If recon blockage has been released. */
1970
1971 /*
1972 * Unblock recon before calling CauseReconEvent in case
1973 * CauseReconEvent causes us to try to issue a new read before
1974 * returning here.
1975 */
1976 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;
1977
1978
1979 while (pssPtr->blockWaitList) {
1980 /*
1981 * Spin through the block-wait list and
1982 * release all the waiters.
1983 */
1984 cb = pssPtr->blockWaitList;
1985 pssPtr->blockWaitList = cb->next;
1986 cb->next = NULL;
1987 rf_CauseReconEvent(raidPtr, cb->row, cb->col, NULL,
1988 RF_REVENT_BLOCKCLEAR);
1989 rf_FreeCallbackDesc(cb);
1990 }
1991 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
1992 /* If no recon was requested while recon was blocked. */
1993 rf_PSStatusDelete(raidPtr, raidPtr->reconControl[row]
1994 ->pssTable, pssPtr);
1995 }
1996 }
1997 out:
1998 RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
1999 return (0);
2000 }