root/dev/raidframe/rf_reconstruct.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. rf_SignalReconDone
  2. rf_RegisterReconDoneProc
  3. rf_ShutdownReconstruction
  4. rf_ConfigureReconstruction
  5. rf_AllocRaidReconDesc
  6. rf_FreeReconDesc
  7. rf_ReconstructFailedDisk
  8. rf_ReconstructFailedDiskBasic
  9. rf_ReconstructInPlace
  10. rf_ContinueReconstructFailedDisk
  11. rf_ProcessReconEvent
  12. rf_IssueNextReadRequest
  13. rf_TryToRead
  14. rf_ComputePSDiskOffsets
  15. rf_IssueNextWriteRequest
  16. rf_ReconReadDoneProc
  17. rf_ReconWriteDoneProc
  18. rf_CheckForNewMinHeadSep
  19. rf_CheckHeadSeparation
  20. rf_CheckForcedOrBlockedReconstruction
  21. rf_ForceOrBlockRecon
  22. rf_ForceReconReadDoneProc
  23. rf_UnblockRecon

    1 /*      $OpenBSD: rf_reconstruct.c,v 1.16 2007/06/05 00:38:22 deraadt Exp $     */
    2 /*      $NetBSD: rf_reconstruct.c,v 1.26 2000/06/04 02:05:13 oster Exp $        */
    3 
    4 /*
    5  * Copyright (c) 1995 Carnegie-Mellon University.
    6  * All rights reserved.
    7  *
    8  * Author: Mark Holland
    9  *
   10  * Permission to use, copy, modify and distribute this software and
   11  * its documentation is hereby granted, provided that both the copyright
   12  * notice and this permission notice appear in all copies of the
   13  * software, derivative works or modified versions, and any portions
   14  * thereof, and that both notices appear in supporting documentation.
   15  *
   16  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
   17  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
   18  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
   19  *
   20  * Carnegie Mellon requests users of this software to return to
   21  *
   22  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
   23  *  School of Computer Science
   24  *  Carnegie Mellon University
   25  *  Pittsburgh PA 15213-3890
   26  *
   27  * any improvements or extensions that they make and grant Carnegie the
   28  * rights to redistribute these changes.
   29  */
   30 
   31 /**************************************************************
   32  *
   33  * rf_reconstruct.c -- Code to perform on-line reconstruction.
   34  *
   35  **************************************************************/
   36 
   37 #include "rf_types.h"
   38 #include <sys/time.h>
   39 #include <sys/buf.h>
   40 #include <sys/errno.h>
   41 
   42 #include <sys/types.h>
   43 #include <sys/param.h>
   44 #include <sys/systm.h>
   45 #include <sys/proc.h>
   46 #include <sys/ioctl.h>
   47 #include <sys/fcntl.h>
   48 #if     __NETBSD__
   49 #include <sys/vnode.h>
   50 #endif
   51 
   52 #include "rf_raid.h"
   53 #include "rf_reconutil.h"
   54 #include "rf_revent.h"
   55 #include "rf_reconbuffer.h"
   56 #include "rf_acctrace.h"
   57 #include "rf_etimer.h"
   58 #include "rf_dag.h"
   59 #include "rf_desc.h"
   60 #include "rf_general.h"
   61 #include "rf_freelist.h"
   62 #include "rf_debugprint.h"
   63 #include "rf_driver.h"
   64 #include "rf_utils.h"
   65 #include "rf_shutdown.h"
   66 
   67 #include "rf_kintf.h"
   68 
   69 /*
   70  * Setting these to -1 causes them to be set to their default values if not set
   71  * by debug options.
   72  */
   73 
   74 #define Dprintf(s)                                                      \
   75 do {                                                                    \
   76         if (rf_reconDebug)                                              \
   77                 rf_debug_printf(s,                                      \
   78                     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);    \
   79 } while (0)
   80 #define Dprintf1(s,a)                                                   \
   81 do {                                                                    \
   82         if (rf_reconDebug)                                              \
   83                 rf_debug_printf(s,                                      \
   84                     (void *)((unsigned long)a),                         \
   85                     NULL, NULL, NULL, NULL, NULL, NULL, NULL);          \
   86 } while (0)
   87 #define Dprintf2(s,a,b)                                                 \
   88 do {                                                                    \
   89         if (rf_reconDebug)                                              \
   90                 rf_debug_printf(s,                                      \
   91                     (void *)((unsigned long)a),                         \
   92                     (void *)((unsigned long)b),                         \
   93                     NULL, NULL, NULL, NULL, NULL, NULL);                \
   94 } while (0)
   95 #define Dprintf3(s,a,b,c)                                               \
   96 do {                                                                    \
   97         if (rf_reconDebug)                                              \
   98                 rf_debug_printf(s,                                      \
   99                     (void *)((unsigned long)a),                         \
  100                     (void *)((unsigned long)b),                         \
  101                     (void *)((unsigned long)c),                         \
  102                     NULL, NULL, NULL, NULL, NULL);                      \
  103 } while (0)
  104 #define Dprintf4(s,a,b,c,d)                                             \
  105 do {                                                                    \
  106         if (rf_reconDebug)                                              \
  107                 rf_debug_printf(s,                                      \
  108                     (void *)((unsigned long)a),                         \
  109                     (void *)((unsigned long)b),                         \
  110                     (void *)((unsigned long)c),                         \
  111                     (void *)((unsigned long)d),                         \
  112                     NULL, NULL, NULL, NULL);                            \
  113 } while (0)
  114 #define Dprintf5(s,a,b,c,d,e)                                           \
  115 do {                                                                    \
  116         if (rf_reconDebug)                                              \
  117                 rf_debug_printf(s,                                      \
  118                     (void *)((unsigned long)a),                         \
  119                     (void *)((unsigned long)b),                         \
  120                     (void *)((unsigned long)c),                         \
  121                     (void *)((unsigned long)d),                         \
  122                     (void *)((unsigned long)e),                         \
  123                     NULL, NULL, NULL);                                  \
  124 } while (0)
  125 #define Dprintf6(s,a,b,c,d,e,f)                                         \
  126 do {                                                                    \
  127         if (rf_reconDebug)                                              \
  128                 rf_debug_printf(s,                                      \
  129                     (void *)((unsigned long)a),                         \
  130                     (void *)((unsigned long)b),                         \
  131                     (void *)((unsigned long)c),                         \
  132                     (void *)((unsigned long)d),                         \
  133                     (void *)((unsigned long)e),                         \
  134                     (void *)((unsigned long)f),                         \
  135                     NULL, NULL);                                        \
  136 } while (0)
  137 #define Dprintf7(s,a,b,c,d,e,f,g)                                       \
  138 do {                                                                    \
  139         if (rf_reconDebug)                                              \
  140                 rf_debug_printf(s,                                      \
  141                     (void *)((unsigned long)a),                         \
  142                     (void *)((unsigned long)b),                         \
  143                     (void *)((unsigned long)c),                         \
  144                     (void *)((unsigned long)d),                         \
  145                     (void *)((unsigned long)e),                         \
  146                     (void *)((unsigned long)f),                         \
  147                     (void *)((unsigned long)g),                         \
  148                     NULL);                                              \
  149 } while (0)
  150 
  151 #define DDprintf1(s,a)                                                  \
  152 do {                                                                    \
  153         if (rf_reconDebug)                                              \
  154                 rf_debug_printf(s,                                      \
  155                     (void *)((unsigned long)a),                         \
  156                     NULL, NULL, NULL, NULL, NULL, NULL, NULL);          \
  157 } while (0)
  158 #define DDprintf2(s,a,b)                                                \
  159 do {                                                                    \
  160         if (rf_reconDebug)                                              \
  161                 rf_debug_printf(s,                                      \
  162                     (void *)((unsigned long)a),                         \
  163                     (void *)((unsigned long)b),                         \
  164                     NULL, NULL, NULL, NULL, NULL, NULL);                \
  165 } while (0)
  166 
  167 static RF_FreeList_t *rf_recond_freelist;
  168 #define RF_MAX_FREE_RECOND      4
  169 #define RF_RECOND_INC           1
  170 
  171 RF_RaidReconDesc_t *rf_AllocRaidReconDesc(RF_Raid_t *,
  172         RF_RowCol_t, RF_RowCol_t, RF_RaidDisk_t *, int,
  173         RF_RowCol_t, RF_RowCol_t);
  174 int  rf_ProcessReconEvent(RF_Raid_t *, RF_RowCol_t, RF_ReconEvent_t *);
  175 int  rf_IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t, RF_RowCol_t);
  176 int  rf_TryToRead(RF_Raid_t *, RF_RowCol_t, RF_RowCol_t);
  177 int  rf_ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t,
  178         RF_RowCol_t, RF_RowCol_t, RF_SectorNum_t *, RF_SectorNum_t *,
  179         RF_RowCol_t *, RF_RowCol_t *, RF_SectorNum_t *);
  180 int  rf_ReconReadDoneProc(void *, int);
  181 int  rf_ReconWriteDoneProc(void *, int);
  182 void rf_CheckForNewMinHeadSep(RF_Raid_t *, RF_RowCol_t, RF_HeadSepLimit_t);
  183 int  rf_CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *,
  184         RF_RowCol_t, RF_RowCol_t, RF_HeadSepLimit_t, RF_ReconUnitNum_t);
  185 void rf_ForceReconReadDoneProc(void *, int);
  186 void rf_ShutdownReconstruction(void *);
  187 
/*
 * These functions are inlined on gcc. If they are used more than
 * once, it is strongly advised to un-inline them.
 */
  192 void rf_FreeReconDesc(RF_RaidReconDesc_t *);
  193 int  rf_IssueNextWriteRequest(RF_Raid_t *, RF_RowCol_t);
  194 int  rf_CheckForcedOrBlockedReconstruction(RF_Raid_t *,
  195         RF_ReconParityStripeStatus_t *, RF_PerDiskReconCtrl_t *,
  196         RF_RowCol_t, RF_RowCol_t, RF_StripeNum_t, RF_ReconUnitNum_t);
  197 void rf_SignalReconDone(RF_Raid_t *);
  198 
  199 struct RF_ReconDoneProc_s {
  200         void                    (*proc) (RF_Raid_t *, void *);
  201         void                     *arg;
  202         RF_ReconDoneProc_t       *next;
  203 };
  204 
  205 static RF_FreeList_t *rf_rdp_freelist;
  206 #define RF_MAX_FREE_RDP         4
  207 #define RF_RDP_INC              1
  208 
  209 void
  210 rf_SignalReconDone(RF_Raid_t *raidPtr)
  211 {
  212         RF_ReconDoneProc_t *p;
  213 
  214         RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex);
  215         for (p = raidPtr->recon_done_procs; p; p = p->next) {
  216                 p->proc(raidPtr, p->arg);
  217         }
  218         RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex);
  219 }
  220 
  221 int
  222 rf_RegisterReconDoneProc(RF_Raid_t *raidPtr, void (*proc) (RF_Raid_t *, void *),
  223     void *arg, RF_ReconDoneProc_t **handlep)
  224 {
  225         RF_ReconDoneProc_t *p;
  226 
  227         RF_FREELIST_GET(rf_rdp_freelist, p, next, (RF_ReconDoneProc_t *));
  228         if (p == NULL)
  229                 return (ENOMEM);
  230         p->proc = proc;
  231         p->arg = arg;
  232         RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex);
  233         p->next = raidPtr->recon_done_procs;
  234         raidPtr->recon_done_procs = p;
  235         RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex);
  236         if (handlep)
  237                 *handlep = p;
  238         return (0);
  239 }
  240 
/*****************************************************************************
 *
 * rf_ConfigureReconstruction() sets up the parameters that will be used by
 * the reconstruction process. Currently there are none, except for those
 * that the layout-specific configuration routine (e.g.
 * rf_ConfigureDeclustered) sets up.
 *
 * In the kernel, we fire off the recon thread.
 *
 *****************************************************************************/
  250 void
  251 rf_ShutdownReconstruction(void *ignored)
  252 {
  253         RF_FREELIST_DESTROY(rf_recond_freelist, next, (RF_RaidReconDesc_t *));
  254         RF_FREELIST_DESTROY(rf_rdp_freelist, next, (RF_ReconDoneProc_t *));
  255 }
  256 
  257 int
  258 rf_ConfigureReconstruction(RF_ShutdownList_t **listp)
  259 {
  260         int rc;
  261 
  262         RF_FREELIST_CREATE(rf_recond_freelist, RF_MAX_FREE_RECOND,
  263             RF_RECOND_INC, sizeof(RF_RaidReconDesc_t));
  264         if (rf_recond_freelist == NULL)
  265                 return (ENOMEM);
  266         RF_FREELIST_CREATE(rf_rdp_freelist, RF_MAX_FREE_RDP,
  267             RF_RDP_INC, sizeof(RF_ReconDoneProc_t));
  268         if (rf_rdp_freelist == NULL) {
  269                 RF_FREELIST_DESTROY(rf_recond_freelist, next,
  270                     (RF_RaidReconDesc_t *));
  271                 return (ENOMEM);
  272         }
  273         rc = rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);
  274         if (rc) {
  275                 RF_ERRORMSG3("Unable to add to shutdown list file %s line %d"
  276                     " rc=%d.\n", __FILE__, __LINE__, rc);
  277                 rf_ShutdownReconstruction(NULL);
  278                 return (rc);
  279         }
  280         return (0);
  281 }
  282 
  283 RF_RaidReconDesc_t *
  284 rf_AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col,
  285     RF_RaidDisk_t *spareDiskPtr, int numDisksDone, RF_RowCol_t srow,
  286     RF_RowCol_t scol)
  287 {
  288 
  289         RF_RaidReconDesc_t *reconDesc;
  290 
  291         RF_FREELIST_GET(rf_recond_freelist, reconDesc, next,
  292             (RF_RaidReconDesc_t *));
  293 
  294         reconDesc->raidPtr = raidPtr;
  295         reconDesc->row = row;
  296         reconDesc->col = col;
  297         reconDesc->spareDiskPtr = spareDiskPtr;
  298         reconDesc->numDisksDone = numDisksDone;
  299         reconDesc->srow = srow;
  300         reconDesc->scol = scol;
  301         reconDesc->state = 0;
  302         reconDesc->next = NULL;
  303 
  304         return (reconDesc);
  305 }
  306 
  307 void
  308 rf_FreeReconDesc(RF_RaidReconDesc_t *reconDesc)
  309 {
  310 #if     RF_RECON_STATS > 0
  311         printf("RAIDframe: %qu recon event waits, %qu recon delays.\n",
  312             reconDesc->numReconEventWaits, reconDesc->numReconExecDelays);
  313 #endif  /* RF_RECON_STATS > 0 */
  314 
  315         printf("RAIDframe: %qu max exec ticks.\n",
  316             reconDesc->maxReconExecTicks);
  317 
  318 #if     (RF_RECON_STATS > 0) || defined(_KERNEL)
  319         printf("\n");
  320 #endif  /* (RF_RECON_STATS > 0) || _KERNEL */
  321         RF_FREELIST_FREE(rf_recond_freelist, reconDesc, next);
  322 }
  323 
  324 
  325 /*****************************************************************************
  326  *
  327  * Primary routine to reconstruct a failed disk. This should be called from
  328  * within its own thread. It won't return until reconstruction completes,
  329  * fails, or is aborted.
  330  *
  331  *****************************************************************************/
  332 int
  333 rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col)
  334 {
  335         RF_LayoutSW_t *lp;
  336         int rc;
  337 
  338         lp = raidPtr->Layout.map;
  339         if (lp->SubmitReconBuffer) {
  340                 /*
  341                  * The current infrastructure only supports reconstructing one
  342                  * disk at a time for each array.
  343                  */
  344                 RF_LOCK_MUTEX(raidPtr->mutex);
  345                 while (raidPtr->reconInProgress) {
  346                         RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
  347                 }
  348                 raidPtr->reconInProgress++;
  349                 RF_UNLOCK_MUTEX(raidPtr->mutex);
  350                 rc = rf_ReconstructFailedDiskBasic(raidPtr, row, col);
  351                 RF_LOCK_MUTEX(raidPtr->mutex);
  352                 raidPtr->reconInProgress--;
  353                 RF_UNLOCK_MUTEX(raidPtr->mutex);
  354         } else {
  355                 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for"
  356                     " arch %c.\n", lp->parityConfig);
  357                 rc = EIO;
  358         }
  359         RF_SIGNAL_COND(raidPtr->waitForReconCond);
  360         wakeup(&raidPtr->waitForReconCond);     /*
  361                                                  * XXX Methinks this will be
  362                                                  * needed at some point... GO
  363                                                  */
  364         return (rc);
  365 }
  366 
  367 int
  368 rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t row,
  369     RF_RowCol_t col)
  370 {
  371         RF_ComponentLabel_t c_label;
  372         RF_RaidDisk_t *spareDiskPtr = NULL;
  373         RF_RaidReconDesc_t *reconDesc;
  374         RF_RowCol_t srow, scol;
  375         int numDisksDone = 0, rc;
  376 
  377         /* First look for a spare drive onto which to reconstruct the data. */
  378         /*
  379          * Spare disk descriptors are stored in row 0. This may have to
  380          * change eventually.
  381          */
  382 
  383         RF_LOCK_MUTEX(raidPtr->mutex);
  384         RF_ASSERT(raidPtr->Disks[row][col].status == rf_ds_failed);
  385 
  386         if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
  387                 if (raidPtr->status[row] != rf_rs_degraded) {
  388                         RF_ERRORMSG2("Unable to reconstruct disk at row %d"
  389                             " col %d because status not degraded.\n", row, col);
  390                         RF_UNLOCK_MUTEX(raidPtr->mutex);
  391                         return (EINVAL);
  392                 }
  393                 srow = row;
  394                 scol = (-1);
  395         } else {
  396                 srow = 0;
  397                 for (scol = raidPtr->numCol;
  398                      scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
  399                         if (raidPtr->Disks[srow][scol].status == rf_ds_spare) {
  400                                 spareDiskPtr = &raidPtr->Disks[srow][scol];
  401                                 spareDiskPtr->status = rf_ds_used_spare;
  402                                 break;
  403                         }
  404                 }
  405                 if (!spareDiskPtr) {
  406                         RF_ERRORMSG2("Unable to reconstruct disk at row %d"
  407                             " col %d because no spares are available.\n",
  408                             row, col);
  409                         RF_UNLOCK_MUTEX(raidPtr->mutex);
  410                         return (ENOSPC);
  411                 }
  412                 printf("RECON: initiating reconstruction on row %d col %d"
  413                     " -> spare at row %d col %d.\n", row, col, srow, scol);
  414         }
  415         RF_UNLOCK_MUTEX(raidPtr->mutex);
  416 
  417         reconDesc = rf_AllocRaidReconDesc((void *) raidPtr, row, col,
  418             spareDiskPtr, numDisksDone, srow, scol);
  419         raidPtr->reconDesc = (void *) reconDesc;
  420 #if     RF_RECON_STATS > 0
  421         reconDesc->hsStallCount = 0;
  422         reconDesc->numReconExecDelays = 0;
  423         reconDesc->numReconEventWaits = 0;
  424 #endif  /* RF_RECON_STATS > 0 */
  425         reconDesc->reconExecTimerRunning = 0;
  426         reconDesc->reconExecTicks = 0;
  427         reconDesc->maxReconExecTicks = 0;
  428         rc = rf_ContinueReconstructFailedDisk(reconDesc);
  429 
  430         if (!rc) {
  431                 /* Fix up the component label. */
  432                 /* Don't actually need the read here... */
  433                 raidread_component_label(
  434                     raidPtr->raid_cinfo[srow][scol].ci_dev,
  435                     raidPtr->raid_cinfo[srow][scol].ci_vp,
  436                     &c_label);
  437 
  438                 raid_init_component_label(raidPtr, &c_label);
  439                 c_label.row = row;
  440                 c_label.column = col;
  441                 c_label.clean = RF_RAID_DIRTY;
  442                 c_label.status = rf_ds_optimal;
  443 
  444                 /* XXXX MORE NEEDED HERE. */
  445 
  446                 raidwrite_component_label(
  447                     raidPtr->raid_cinfo[srow][scol].ci_dev,
  448                     raidPtr->raid_cinfo[srow][scol].ci_vp,
  449                     &c_label);
  450 
  451         }
  452         return (rc);
  453 }
  454 
  455 /*
  456  *
  457  * Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL,
  458  * and you don't get a spare until the next Monday. With this function
  459  * (and hot-swappable drives) you can now put your new disk containing
  460  * /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to
  461  * rebuild the data "on the spot".
  462  *
  463  */
  464 
  465 int
  466 rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col)
  467 {
  468         RF_RaidDisk_t *spareDiskPtr = NULL;
  469         RF_RaidReconDesc_t *reconDesc;
  470         RF_LayoutSW_t *lp;
  471         RF_RaidDisk_t *badDisk;
  472         RF_ComponentLabel_t c_label;
  473         int numDisksDone = 0, rc;
  474         struct partinfo dpart;
  475         struct vnode *vp;
  476         struct vattr va;
  477         struct proc *proc;
  478         int retcode;
  479         int ac;
  480 
  481         lp = raidPtr->Layout.map;
  482         if (lp->SubmitReconBuffer) {
  483                 /*
  484                  * The current infrastructure only supports reconstructing one
  485                  * disk at a time for each array.
  486                  */
  487                 RF_LOCK_MUTEX(raidPtr->mutex);
  488                 if ((raidPtr->Disks[row][col].status == rf_ds_optimal) &&
  489                     (raidPtr->numFailures > 0)) {
  490                         /* XXX 0 above shouldn't be constant !!! */
  491                         /*
  492                          * Some component other than this has failed.
  493                          * Let's not make things worse than they already
  494                          * are...
  495                          */
  496 #ifdef  RAIDDEBUG
  497                         printf("RAIDFRAME: Unable to reconstruct to disk at:\n"
  498                             "      Row: %d Col: %d   Too many failures.\n",
  499                             row, col);
  500 #endif  /* RAIDDEBUG */
  501                         RF_UNLOCK_MUTEX(raidPtr->mutex);
  502                         return (EINVAL);
  503                 }
  504                 if (raidPtr->Disks[row][col].status == rf_ds_reconstructing) {
  505 #ifdef  RAIDDEBUG
  506                         printf("RAIDFRAME: Unable to reconstruct to disk at:\n"
  507                             "      Row: %d Col: %d   Reconstruction already"
  508                             " occurring !\n", row, col);
  509 #endif  /* RAIDDEBUG */
  510 
  511                         RF_UNLOCK_MUTEX(raidPtr->mutex);
  512                         return (EINVAL);
  513                 }
  514 
  515 
  516                 if (raidPtr->Disks[row][col].status != rf_ds_failed) {
  517                         /* "It's gone..." */
  518                         raidPtr->numFailures++;
  519                         raidPtr->Disks[row][col].status = rf_ds_failed;
  520                         raidPtr->status[row] = rf_rs_degraded;
  521                         rf_update_component_labels(raidPtr,
  522                             RF_NORMAL_COMPONENT_UPDATE);
  523                 }
  524 
  525                 while (raidPtr->reconInProgress) {
  526                         RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
  527                 }
  528 
  529                 raidPtr->reconInProgress++;
  530 
  531                 /*
  532                  * First look for a spare drive onto which to reconstruct
  533                  * the data. Spare disk descriptors are stored in row 0.
  534                  * This may have to change eventually.
  535                  */
  536 
  537                 /*
  538                  * Actually, we don't care if it's failed or not...
  539                  * On a RAID set with correct parity, this function
  540                  * should be callable on any component without ill effects.
  541                  */
  542                 /*
  543                  * RF_ASSERT(raidPtr->Disks[row][col].status == rf_ds_failed);
  544                  */
  545 
  546                 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
  547                         RF_ERRORMSG2("Unable to reconstruct to disk at row %d"
  548                             " col %d: operation not supported for"
  549                             " RF_DISTRIBUTE_SPARE.\n", row, col);
  550 
  551                         raidPtr->reconInProgress--;
  552                         RF_UNLOCK_MUTEX(raidPtr->mutex);
  553                         return (EINVAL);
  554                 }
  555 
  556                 /*
  557                  * XXX Need goop here to see if the disk is alive,
  558                  * and, if not, make it so...
  559                  */
  560 
  561                 badDisk = &raidPtr->Disks[row][col];
  562 
  563                 proc = raidPtr->recon_thread;
  564 
  565                 /*
  566                  * This device may have been opened successfully the
  567                  * first time. Close it before trying to open it again...
  568                  */
  569 
  570                 if (raidPtr->raid_cinfo[row][col].ci_vp != NULL) {
  571                         printf("Closing the opened device: %s\n",
  572                             raidPtr->Disks[row][col].devname);
  573                         vp = raidPtr->raid_cinfo[row][col].ci_vp;
  574                         ac = raidPtr->Disks[row][col].auto_configured;
  575                         rf_close_component(raidPtr, vp, ac);
  576                         raidPtr->raid_cinfo[row][col].ci_vp = NULL;
  577                 }
  578                 /*
  579                  * Note that this disk was *not* auto_configured (any longer).
  580                  */
  581                 raidPtr->Disks[row][col].auto_configured = 0;
  582 
  583                 printf("About to (re-)open the device for rebuilding: %s\n",
  584                     raidPtr->Disks[row][col].devname);
  585 
  586                 retcode = raidlookup(raidPtr->Disks[row][col].devname,
  587                     proc, &vp);
  588 
  589                 if (retcode) {
  590                         printf("raid%d: rebuilding: raidlookup on device: %s"
  591                             " failed: %d !\n", raidPtr->raidid,
  592                             raidPtr->Disks[row][col].devname, retcode);
  593 
  594                         /*
  595                          * XXX the component isn't responding properly...
  596                          * Must still be dead :-(
  597                          */
  598                         raidPtr->reconInProgress--;
  599                         RF_UNLOCK_MUTEX(raidPtr->mutex);
  600                         return(retcode);
  601 
  602                 } else {
  603 
  604                         /*
  605                          * Ok, so we can at least do a lookup...
  606                          * How about actually getting a vp for it ?
  607                          */
  608 
  609                         if ((retcode =
  610                              VOP_GETATTR(vp, &va, proc->p_ucred, proc)) != 0) {
  611                                 raidPtr->reconInProgress--;
  612                                 RF_UNLOCK_MUTEX(raidPtr->mutex);
  613                                 return(retcode);
  614                         }
  615                         retcode = VOP_IOCTL(vp, DIOCGPART, (caddr_t) & dpart,
  616                             FREAD, proc->p_ucred, proc);
  617                         if (retcode) {
  618                                 raidPtr->reconInProgress--;
  619                                 RF_UNLOCK_MUTEX(raidPtr->mutex);
  620                                 return(retcode);
  621                         }
  622                         raidPtr->Disks[row][col].blockSize =
  623                             dpart.disklab->d_secsize;
  624 
  625                         raidPtr->Disks[row][col].numBlocks =
  626                             DL_GETPSIZE(dpart.part) - rf_protectedSectors;
  627 
  628                         raidPtr->raid_cinfo[row][col].ci_vp = vp;
  629                         raidPtr->raid_cinfo[row][col].ci_dev = va.va_rdev;
  630 
  631                         raidPtr->Disks[row][col].dev = va.va_rdev;
  632 
  633                         /*
  634                          * We allow the user to specify that only a
  635                          * fraction of the disks should be used this is
  636                          * just for debug:  it speeds up the parity scan.
  637                          */
  638                         raidPtr->Disks[row][col].numBlocks =
  639                             raidPtr->Disks[row][col].numBlocks *
  640                             rf_sizePercentage / 100;
  641                 }
  642 
  643                 spareDiskPtr = &raidPtr->Disks[row][col];
  644                 spareDiskPtr->status = rf_ds_used_spare;
  645 
  646                 printf("RECON: Initiating in-place reconstruction on\n");
  647                 printf("       row %d col %d -> spare at row %d col %d.\n",
  648                     row, col, row, col);
  649 
  650                 RF_UNLOCK_MUTEX(raidPtr->mutex);
  651 
  652                 reconDesc = rf_AllocRaidReconDesc((void *) raidPtr, row, col,
  653                     spareDiskPtr, numDisksDone, row, col);
  654                 raidPtr->reconDesc = (void *) reconDesc;
  655 #if     RF_RECON_STATS > 0
  656                 reconDesc->hsStallCount = 0;
  657                 reconDesc->numReconExecDelays = 0;
  658                 reconDesc->numReconEventWaits = 0;
  659 #endif  /* RF_RECON_STATS > 0 */
  660                 reconDesc->reconExecTimerRunning = 0;
  661                 reconDesc->reconExecTicks = 0;
  662                 reconDesc->maxReconExecTicks = 0;
  663                 rc = rf_ContinueReconstructFailedDisk(reconDesc);
  664 
  665                 RF_LOCK_MUTEX(raidPtr->mutex);
  666                 raidPtr->reconInProgress--;
  667                 RF_UNLOCK_MUTEX(raidPtr->mutex);
  668 
  669         } else {
  670                 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for"
  671                     " arch %c.\n", lp->parityConfig);
  672                 rc = EIO;
  673         }
  674         RF_LOCK_MUTEX(raidPtr->mutex);
  675 
  676         if (!rc) {
  677                 /*
  678                  * Need to set these here, as at this point it'll be claiming
  679                  * that the disk is in rf_ds_spared !  But we know better :-)
  680                  */
  681 
  682                 raidPtr->Disks[row][col].status = rf_ds_optimal;
  683                 raidPtr->status[row] = rf_rs_optimal;
  684 
  685                 /* Fix up the component label. */
  686                 /* Don't actually need the read here... */
  687                 raidread_component_label(
  688                     raidPtr->raid_cinfo[row][col].ci_dev,
  689                     raidPtr->raid_cinfo[row][col].ci_vp,
  690                     &c_label);
  691 
  692                 raid_init_component_label(raidPtr, &c_label);
  693 
  694                 c_label.row = row;
  695                 c_label.column = col;
  696 
  697                 raidwrite_component_label(raidPtr->raid_cinfo[row][col].ci_dev,
  698                     raidPtr->raid_cinfo[row][col].ci_vp, &c_label);
  699 
  700         }
  701         RF_UNLOCK_MUTEX(raidPtr->mutex);
  702         RF_SIGNAL_COND(raidPtr->waitForReconCond);
  703         wakeup(&raidPtr->waitForReconCond);
  704         return (rc);
  705 }
  706 
  707 
  708 int
  709 rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc)
  710 {
  711         RF_Raid_t *raidPtr = reconDesc->raidPtr;
  712         RF_RowCol_t row = reconDesc->row;
  713         RF_RowCol_t col = reconDesc->col;
  714         RF_RowCol_t srow = reconDesc->srow;
  715         RF_RowCol_t scol = reconDesc->scol;
  716         RF_ReconMap_t *mapPtr;
  717 
  718         RF_ReconEvent_t *event;
  719         struct timeval etime, elpsd;
  720         unsigned long xor_s, xor_resid_us;
  721         int retcode, i, ds;
  722 
  723         switch (reconDesc->state) {
  724         case 0:
  725                 raidPtr->accumXorTimeUs = 0;
  726 
  727                 /* Create one trace record per physical disk. */
  728                 RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol *
  729                     sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
  730 
  731                 /*
  732                  * Quiesce the array prior to starting recon. This is needed
  733                  * to assure no nasty interactions with pending user writes.
  734                  * We need to do this before we change the disk or row status.
  735                  */
  736                 reconDesc->state = 1;
  737 
  738                 Dprintf("RECON: begin request suspend.\n");
  739                 retcode = rf_SuspendNewRequestsAndWait(raidPtr);
  740                 Dprintf("RECON: end request suspend.\n");
  741                 rf_StartUserStats(raidPtr);     /*
  742                                                  * Zero out the stats kept on
  743                                                  * user accs.
  744                                                  */
  745                 /* Fall through to state 1. */
  746         case 1:
  747                 RF_LOCK_MUTEX(raidPtr->mutex);
  748 
  749                 /*
  750                  * Create the reconstruction control pointer and install it in
  751                  * the right slot.
  752                  */
  753                 raidPtr->reconControl[row] =
  754                     rf_MakeReconControl(reconDesc, row, col, srow, scol);
  755                 mapPtr = raidPtr->reconControl[row]->reconMap;
  756                 raidPtr->status[row] = rf_rs_reconstructing;
  757                 raidPtr->Disks[row][col].status = rf_ds_reconstructing;
  758                 raidPtr->Disks[row][col].spareRow = srow;
  759                 raidPtr->Disks[row][col].spareCol = scol;
  760 
  761                 RF_UNLOCK_MUTEX(raidPtr->mutex);
  762 
  763                 RF_GETTIME(raidPtr->reconControl[row]->starttime);
  764 
  765                 /*
  766                  * Now start up the actual reconstruction: issue a read for
  767                  * each surviving disk.
  768                  */
  769 
  770                 reconDesc->numDisksDone = 0;
  771                 for (i = 0; i < raidPtr->numCol; i++) {
  772                         if (i != col) {
  773                                 /*
  774                                  * Find and issue the next I/O on the
  775                                  * indicated disk.
  776                                  */
  777                                 if (rf_IssueNextReadRequest(raidPtr, row, i)) {
  778                                         Dprintf2("RECON: done issuing for r%d"
  779                                             " c%d.\n", row, i);
  780                                         reconDesc->numDisksDone++;
  781                                 }
  782                         }
  783                 }
  784 
  785                 reconDesc->state = 2;
  786 
  787         case 2:
  788                 Dprintf("RECON: resume requests.\n");
  789                 rf_ResumeNewRequests(raidPtr);
  790 
  791                 reconDesc->state = 3;
  792 
  793         case 3:
  794 
  795                 /*
  796                  * Process reconstruction events until all disks report that
  797                  * they've completed all work.
  798                  */
  799                 mapPtr = raidPtr->reconControl[row]->reconMap;
  800 
  801                 while (reconDesc->numDisksDone < raidPtr->numCol - 1) {
  802 
  803                         event = rf_GetNextReconEvent(reconDesc, row,
  804                            (void (*) (void *)) rf_ContinueReconstructFailedDisk,
  805                             reconDesc);
  806                         RF_ASSERT(event);
  807 
  808                         if (rf_ProcessReconEvent(raidPtr, row, event))
  809                                 reconDesc->numDisksDone++;
  810                         raidPtr->reconControl[row]->numRUsTotal =
  811                                 mapPtr->totalRUs;
  812                         raidPtr->reconControl[row]->numRUsComplete =
  813                                 mapPtr->totalRUs -
  814                                 rf_UnitsLeftToReconstruct(mapPtr);
  815 
  816                         raidPtr->reconControl[row]->percentComplete =
  817                             (raidPtr->reconControl[row]->numRUsComplete * 100 /
  818                              raidPtr->reconControl[row]->numRUsTotal);
  819                         if (rf_prReconSched) {
  820                                 rf_PrintReconSchedule(
  821                                     raidPtr->reconControl[row]->reconMap,
  822                                     &(raidPtr->reconControl[row]->starttime));
  823                         }
  824                 }
  825 
  826                 reconDesc->state = 4;
  827 
  828         case 4:
  829                 mapPtr = raidPtr->reconControl[row]->reconMap;
  830                 if (rf_reconDebug) {
  831                         printf("RECON: all reads completed.\n");
  832                 }
  833                 /*
  834                  * At this point all the reads have completed. We now wait
  835                  * for any pending writes to complete, and then we're done.
  836                  */
  837 
  838                 while (rf_UnitsLeftToReconstruct(
  839                     raidPtr->reconControl[row]->reconMap) > 0) {
  840 
  841                         event = rf_GetNextReconEvent(reconDesc, row,
  842                            (void (*) (void *)) rf_ContinueReconstructFailedDisk,
  843                             reconDesc);
  844                         RF_ASSERT(event);
  845 
  846                         /* Ignore return code. */
  847                         (void) rf_ProcessReconEvent(raidPtr, row, event);
  848                         raidPtr->reconControl[row]->percentComplete =
  849                             100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 /
  850                             mapPtr->totalRUs);
  851                         if (rf_prReconSched) {
  852                                 rf_PrintReconSchedule(
  853                                     raidPtr->reconControl[row]->reconMap,
  854                                     &(raidPtr->reconControl[row]->starttime));
  855                         }
  856                 }
  857                 reconDesc->state = 5;
  858 
  859         case 5:
  860                 /*
  861                  * Success:  mark the dead disk as reconstructed. We quiesce
  862                  * the array here to assure no nasty interactions with pending
  863                  * user accesses, when we free up the psstatus structure as
  864                  * part of FreeReconControl().
  865                  */
  866 
  867                 reconDesc->state = 6;
  868 
  869                 retcode = rf_SuspendNewRequestsAndWait(raidPtr);
  870                 rf_StopUserStats(raidPtr);
  871                 rf_PrintUserStats(raidPtr);     /*
  872                                                  * Print out the stats on user
  873                                                  * accs accumulated during
  874                                                  * recon.
  875                                                  */
  876 
  877                 /* Fall through to state 6. */
  878         case 6:
  879                 RF_LOCK_MUTEX(raidPtr->mutex);
  880                 raidPtr->numFailures--;
  881                 ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
  882                 raidPtr->Disks[row][col].status = (ds) ? rf_ds_dist_spared :
  883                                                          rf_ds_spared;
  884                 raidPtr->status[row] = (ds) ? rf_rs_reconfigured :
  885                                               rf_rs_optimal;
  886                 RF_UNLOCK_MUTEX(raidPtr->mutex);
  887                 RF_GETTIME(etime);
  888                 RF_TIMEVAL_DIFF(&(raidPtr->reconControl[row]->starttime),
  889                     &etime, &elpsd);
  890 
  891                 /*
  892                  * XXX -- Why is state 7 different from state 6 if there is no
  893                  * return() here ? -- XXX Note that I set elpsd above & use it
  894                  * below, so if you put a return here you'll have to fix this.
  895                  * (also, FreeReconControl is called below).
  896                  */
  897 
  898         case 7:
  899 
  900                 rf_ResumeNewRequests(raidPtr);
  901 
  902                 printf("Reconstruction of disk at row %d col %d completed.\n",
  903                     row, col);
  904                 xor_s = raidPtr->accumXorTimeUs / 1000000;
  905                 xor_resid_us = raidPtr->accumXorTimeUs % 1000000;
  906                 printf("Recon time was %d.%06d seconds, accumulated XOR time"
  907                     " was %ld us (%ld.%06ld).\n", (int) elpsd.tv_sec,
  908                     (int) elpsd.tv_usec, raidPtr->accumXorTimeUs, xor_s,
  909                     xor_resid_us);
  910                 printf("  (start time %d sec %d usec, end time %d sec %d"
  911                     " usec)\n",
  912                     (int) raidPtr->reconControl[row]->starttime.tv_sec,
  913                     (int) raidPtr->reconControl[row]->starttime.tv_usec,
  914                     (int) etime.tv_sec, (int) etime.tv_usec);
  915 
  916 #if     RF_RECON_STATS > 0
  917                 printf("Total head-sep stall count was %d.\n",
  918                     (int) reconDesc->hsStallCount);
  919 #endif  /* RF_RECON_STATS > 0 */
  920                 rf_FreeReconControl(raidPtr, row);
  921                 RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol *
  922                     sizeof(RF_AccTraceEntry_t));
  923                 rf_FreeReconDesc(reconDesc);
  924 
  925         }
  926 
  927         rf_SignalReconDone(raidPtr);
  928         return (0);
  929 }
  930 
  931 
  932 /*****************************************************************************
  933  * Do the right thing upon each reconstruction event.
  934  * Returns nonzero if and only if there is nothing left unread on the
  935  * indicated disk.
  936  *****************************************************************************/
  937 int
  938 rf_ProcessReconEvent(RF_Raid_t *raidPtr, RF_RowCol_t frow,
  939     RF_ReconEvent_t *event)
  940 {
  941         int retcode = 0, submitblocked;
  942         RF_ReconBuffer_t *rbuf;
  943         RF_SectorCount_t sectorsPerRU;
  944 
  945         Dprintf1("RECON: rf_ProcessReconEvent type %d.\n", event->type);
  946 
  947         switch (event->type) {
  948 
  949                 /* A read I/O has completed. */
  950         case RF_REVENT_READDONE:
  951                 rbuf = raidPtr->reconControl[frow]
  952                     ->perDiskInfo[event->col].rbuf;
  953                 Dprintf3("RECON: READDONE EVENT: row %d col %d psid %ld.\n",
  954                     frow, event->col, rbuf->parityStripeID);
  955                 Dprintf7("RECON: done read  psid %ld buf %lx  %02x %02x %02x"
  956                     " %02x %02x.\n", rbuf->parityStripeID, rbuf->buffer,
  957                     rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
  958                     rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff,
  959                     rbuf->buffer[4] & 0xff);
  960                 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
  961                 submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
  962                 Dprintf1("RECON: submitblocked=%d.\n", submitblocked);
  963                 if (!submitblocked)
  964                         retcode = rf_IssueNextReadRequest(raidPtr, frow,
  965                             event->col);
  966                 break;
  967 
  968                 /* A write I/O has completed. */
  969         case RF_REVENT_WRITEDONE:
  970                 if (rf_floatingRbufDebug) {
  971                         rf_CheckFloatingRbufCount(raidPtr, 1);
  972                 }
  973                 sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit *
  974                     raidPtr->Layout.SUsPerRU;
  975                 rbuf = (RF_ReconBuffer_t *) event->arg;
  976                 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
  977                 Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d"
  978                     " (%d %% complete).\n",
  979                     rbuf->parityStripeID, rbuf->which_ru,
  980                     raidPtr->reconControl[frow]->percentComplete);
  981                 rf_ReconMapUpdate(raidPtr, raidPtr->reconControl[frow]
  982                     ->reconMap, rbuf->failedDiskSectorOffset,
  983                     rbuf->failedDiskSectorOffset + sectorsPerRU - 1);
  984                 rf_RemoveFromActiveReconTable(raidPtr, frow,
  985                     rbuf->parityStripeID, rbuf->which_ru);
  986 
  987                 if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
  988                         RF_LOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex);
  989                         raidPtr->numFullReconBuffers--;
  990                         rf_ReleaseFloatingReconBuffer(raidPtr, frow, rbuf);
  991                         RF_UNLOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex);
  992                 } else
  993                         if (rbuf->type == RF_RBUF_TYPE_FORCED)
  994                                 rf_FreeReconBuffer(rbuf);
  995                         else
  996                                 RF_ASSERT(0);
  997                 break;
  998 
  999                 /* A buffer-stall condition has been cleared. */
 1000         case RF_REVENT_BUFCLEAR:
 1001                 Dprintf2("RECON: BUFCLEAR EVENT: row %d col %d.\n", frow,
 1002                     event->col);
 1003                 submitblocked = rf_SubmitReconBuffer(raidPtr
 1004                     ->reconControl[frow]->perDiskInfo[event->col].rbuf, 0,
 1005                     (int) (long) event->arg);
 1006                 RF_ASSERT(!submitblocked);      /*
 1007                                                  * We wouldn't have gotten the
 1008                                                  * BUFCLEAR event if we
 1009                                                  * couldn't submit.
 1010                                                  */
 1011                 retcode = rf_IssueNextReadRequest(raidPtr, frow, event->col);
 1012                 break;
 1013 
 1014                 /* A user-write reconstruction blockage has been cleared. */
 1015         case RF_REVENT_BLOCKCLEAR:
 1016                 DDprintf2("RECON: BLOCKCLEAR EVENT: row %d col %d.\n",
 1017                     frow, event->col);
 1018                 retcode = rf_TryToRead(raidPtr, frow, event->col);
 1019                 break;
 1020 
 1021                 /*
 1022                  * A max-head-separation reconstruction blockage has been
 1023                  * cleared.
 1024                  */
 1025         case RF_REVENT_HEADSEPCLEAR:
 1026                 Dprintf2("RECON: HEADSEPCLEAR EVENT: row %d col %d.\n",
 1027                     frow, event->col);
 1028                 retcode = rf_TryToRead(raidPtr, frow, event->col);
 1029                 break;
 1030 
 1031                 /* A buffer has become ready to write. */
 1032         case RF_REVENT_BUFREADY:
 1033                 Dprintf2("RECON: BUFREADY EVENT: row %d col %d.\n",
 1034                     frow, event->col);
 1035                 retcode = rf_IssueNextWriteRequest(raidPtr, frow);
 1036                 if (rf_floatingRbufDebug) {
 1037                         rf_CheckFloatingRbufCount(raidPtr, 1);
 1038                 }
 1039                 break;
 1040 
 1041                 /*
 1042                  * We need to skip the current RU entirely because it got
 1043                  * recon'd while we were waiting for something else to happen.
 1044                  */
 1045         case RF_REVENT_SKIP:
 1046                 DDprintf2("RECON: SKIP EVENT: row %d col %d.\n",
 1047                     frow, event->col);
 1048                 retcode = rf_IssueNextReadRequest(raidPtr, frow, event->col);
 1049                 break;
 1050 
 1051                 /*
 1052                  * A forced-reconstruction read access has completed. Just
 1053                  * submit the buffer.
 1054                  */
 1055         case RF_REVENT_FORCEDREADDONE:
 1056                 rbuf = (RF_ReconBuffer_t *) event->arg;
 1057                 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
 1058                 DDprintf2("RECON: FORCEDREADDONE EVENT: row %d col %d.\n",
 1059                     frow, event->col);
 1060                 submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
 1061                 RF_ASSERT(!submitblocked);
 1062                 break;
 1063 
 1064         default:
 1065                 RF_PANIC();
 1066         }
 1067         rf_FreeReconEventDesc(event);
 1068         return (retcode);
 1069 }
 1070 
 1071 /*****************************************************************************
 1072  *
 1073  * Find the next thing that's needed on the indicated disk, and issue
 1074  * a read request for it. We assume that the reconstruction buffer
 1075  * associated with this process is free to receive the data. If
 1076  * reconstruction is blocked on the indicated RU, we issue a
 1077  * blockage-release request instead of a physical disk read request.
 1078  * If the current disk gets too far ahead of the others, we issue a
 1079  * head-separation wait request and return.
 1080  *
 1081  * ctrl->{ru_count, curPSID, diskOffset} and
 1082  * rbuf->failedDiskSectorOffset are maintained to point to the unit
 1083  * we're currently accessing. Note that this deviates from the
 1084  * standard C idiom of having counters point to the next thing to be
 1085  * accessed. This allows us to easily retry when we're blocked by
 1086  * head separation or reconstruction-blockage events.
 1087  *
 1088  * Returns nonzero if and only if there is nothing left unread on the
 1089  * indicated disk.
 1090  *
 1091  *****************************************************************************/
 1092 int
 1093 rf_IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col)
 1094 {
 1095         RF_PerDiskReconCtrl_t *ctrl =
 1096             &raidPtr->reconControl[row]->perDiskInfo[col];
 1097         RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
 1098         RF_ReconBuffer_t *rbuf = ctrl->rbuf;
 1099         RF_ReconUnitCount_t RUsPerPU =
 1100             layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
 1101         RF_SectorCount_t sectorsPerRU =
 1102             layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
 1103         int do_new_check = 0, retcode = 0, status;
 1104 
 1105         /*
 1106          * If we are currently the slowest disk, mark that we have to do a new
 1107          * check.
 1108          */
 1109         if (ctrl->headSepCounter <=
 1110             raidPtr->reconControl[row]->minHeadSepCounter)
 1111                 do_new_check = 1;
 1112 
 1113         while (1) {
 1114 
 1115                 ctrl->ru_count++;
 1116                 if (ctrl->ru_count < RUsPerPU) {
 1117                         ctrl->diskOffset += sectorsPerRU;
 1118                         rbuf->failedDiskSectorOffset += sectorsPerRU;
 1119                 } else {
 1120                         ctrl->curPSID++;
 1121                         ctrl->ru_count = 0;
 1122                         /* code left over from when head-sep was based on
 1123                          * parity stripe id */
 1124                         if (ctrl->curPSID >=
 1125                             raidPtr->reconControl[row]->lastPSID) {
 1126                                 rf_CheckForNewMinHeadSep(raidPtr, row,
 1127                                     ++(ctrl->headSepCounter));
 1128                                 return (1);     /* Finito ! */
 1129                         }
 1130                         /*
 1131                          * Find the disk offsets of the start of the parity
 1132                          * stripe on both the current disk and the failed
 1133                          * disk. Skip this entire parity stripe if either disk
 1134                          * does not appear in the indicated PS.
 1135                          */
 1136                         status = rf_ComputePSDiskOffsets(raidPtr,
 1137                             ctrl->curPSID, row, col, &ctrl->diskOffset,
 1138                             &rbuf->failedDiskSectorOffset, &rbuf->spRow,
 1139                             &rbuf->spCol, &rbuf->spOffset);
 1140                         if (status) {
 1141                                 ctrl->ru_count = RUsPerPU - 1;
 1142                                 continue;
 1143                         }
 1144                 }
 1145                 rbuf->which_ru = ctrl->ru_count;
 1146 
 1147                 /* Skip this RU if it's already been reconstructed. */
 1148                 if (rf_CheckRUReconstructed(raidPtr->reconControl[row]
 1149                     ->reconMap, rbuf->failedDiskSectorOffset)) {
 1150                         Dprintf2("Skipping psid %ld ru %d: already"
 1151                             " reconstructed.\n", ctrl->curPSID, ctrl->ru_count);
 1152                         continue;
 1153                 }
 1154                 break;
 1155         }
 1156         ctrl->headSepCounter++;
 1157         if (do_new_check)       /* Update min if needed. */
 1158                 rf_CheckForNewMinHeadSep(raidPtr, row, ctrl->headSepCounter);
 1159 
 1160 
 1161         /*
 1162          * At this point, we have definitely decided what to do, and we have
 1163          * only to see if we can actually do it now.
 1164          */
 1165         rbuf->parityStripeID = ctrl->curPSID;
 1166         rbuf->which_ru = ctrl->ru_count;
 1167         bzero((char *) &raidPtr->recon_tracerecs[col],
 1168             sizeof(raidPtr->recon_tracerecs[col]));
 1169         raidPtr->recon_tracerecs[col].reconacc = 1;
 1170         RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
 1171         retcode = rf_TryToRead(raidPtr, row, col);
 1172         return (retcode);
 1173 }
 1174 
 1175 /*
 1176  * Tries to issue the next read on the indicated disk. We may be
 1177  * blocked by (a) the heads being too far apart, or (b) recon on the
 1178  * indicated RU being blocked due to a write by a user thread. In
 1179  * this case, we issue a head-sep or blockage wait request, which will
 1180  * cause this same routine to be invoked again later when the blockage
 1181  * has cleared.
 1182  */
 1183 
 1184 int
 1185 rf_TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col)
 1186 {
 1187         RF_PerDiskReconCtrl_t *ctrl =
 1188             &raidPtr->reconControl[row]->perDiskInfo[col];
 1189         RF_SectorCount_t sectorsPerRU =
 1190             raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
 1191         RF_StripeNum_t psid = ctrl->curPSID;
 1192         RF_ReconUnitNum_t which_ru = ctrl->ru_count;
 1193         RF_DiskQueueData_t *req;
 1194         int status, created = 0;
 1195         RF_ReconParityStripeStatus_t *pssPtr;
 1196 
 1197         /*
 1198          * If the current disk is too far ahead of the others, issue a
 1199          * head-separation wait and return.
 1200          */
 1201         if (rf_CheckHeadSeparation(raidPtr, ctrl, row, col,
 1202             ctrl->headSepCounter, which_ru))
 1203                 return (0);
 1204         RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
 1205         pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]
 1206             ->pssTable, psid, which_ru, RF_PSS_CREATE, &created);
 1207 
 1208         /*
 1209          * If recon is blocked on the indicated parity stripe, issue a
 1210          * block-wait request and return. This also must mark the indicated RU
 1211          * in the stripe as under reconstruction if not blocked.
 1212          */
 1213         status = rf_CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl,
 1214             row, col, psid, which_ru);
 1215         if (status == RF_PSS_RECON_BLOCKED) {
 1216                 Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked.\n",
 1217                     psid, which_ru);
 1218                 goto out;
 1219         } else
 1220                 if (status == RF_PSS_FORCED_ON_WRITE) {
 1221                         rf_CauseReconEvent(raidPtr, row, col, NULL,
 1222                             RF_REVENT_SKIP);
 1223                         goto out;
 1224                 }
 1225         /*
 1226          * Make one last check to be sure that the indicated RU didn't get
 1227          * reconstructed while we were waiting for something else to happen.
 1228          * This is unfortunate in that it causes us to make this check twice
 1229          * in the normal case. Might want to make some attempt to re-work
 1230          * this so that we only do this check if we've definitely blocked on
 1231          * one of the above checks. When this condition is detected, we may
 1232          * have just created a bogus status entry, which we need to delete.
 1233          */
 1234         if (rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap,
 1235             ctrl->rbuf->failedDiskSectorOffset)) {
 1236                 Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after"
 1237                     " stall.\n", psid, which_ru);
 1238                 if (created)
 1239                         rf_PSStatusDelete(raidPtr,
 1240                             raidPtr->reconControl[row]->pssTable, pssPtr);
 1241                 rf_CauseReconEvent(raidPtr, row, col, NULL, RF_REVENT_SKIP);
 1242                 goto out;
 1243         }
 1244         /* Found something to read. Issue the I/O. */
 1245         Dprintf5("RECON: Read for psid %ld on row %d col %d offset %ld"
 1246             " buf %lx.\n", psid, row, col, ctrl->diskOffset,
 1247             ctrl->rbuf->buffer);
 1248         RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
 1249         RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
 1250         raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
 1251             RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
 1252         RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
 1253 
 1254         /*
 1255          * Should be ok to use a NULL proc pointer here, all the bufs we use
 1256          * should be in kernel space.
 1257          */
 1258         req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset,
 1259             sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
 1260             rf_ReconReadDoneProc, (void *) ctrl, NULL,
 1261             &raidPtr->recon_tracerecs[col], (void *) raidPtr, 0, NULL);
 1262 
 1263         RF_ASSERT(req);         /* XXX -- Fix this. -- XXX */
 1264 
 1265         ctrl->rbuf->arg = (void *) req;
 1266         rf_DiskIOEnqueue(&raidPtr->Queues[row][col], req, RF_IO_RECON_PRIORITY);
 1267         pssPtr->issued[col] = 1;
 1268 
 1269 out:
 1270         RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
 1271         return (0);
 1272 }
 1273 
 1274 
 1275 /*
 1276  * Given a parity stripe ID, we want to find out whether both the
 1277  * current disk and the failed disk exist in that parity stripe. If
 1278  * not, we want to skip this whole PS. If so, we want to find the
 1279  * disk offset of the start of the PS on both the current disk and the
 1280  * failed disk.
 1281  *
 1282  * This works by getting a list of disks comprising the indicated
 1283  * parity stripe, and searching the list for the current and failed
 1284  * disks. Once we've decided they both exist in the parity stripe, we
 1285  * need to decide whether each is data or parity, so that we'll know
 1286  * which mapping function to call to get the corresponding disk
 1287  * offsets.
 1288  *
 1289  * This is kind of unpleasant, but doing it this way allows the
 1290  * reconstruction code to use parity stripe IDs rather than physical
 1291  * disk addresses to march through the failed disk, which greatly
 1292  * simplifies a lot of code, as well as eliminating the need for a
 1293  * reverse-mapping function. I also think it will execute faster,
 1294  * since the calls to the mapping module are kept to a minimum.
 1295  *
 1296  * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING
 1297  * THE STRIPE IN THE CORRECT ORDER.
 1298  */
 1299 
 1300 int
 1301 rf_ComputePSDiskOffsets(
 1302     RF_Raid_t           *raidPtr,       /* RAID descriptor. */
 1303     RF_StripeNum_t       psid,          /* Parity stripe identifier. */
 1304     RF_RowCol_t          row,           /*
 1305                                          * Row and column of disk to find
 1306                                          * the offsets for.
 1307                                          */
 1308     RF_RowCol_t          col,
 1309     RF_SectorNum_t      *outDiskOffset,
 1310     RF_SectorNum_t      *outFailedDiskSectorOffset,
 1311     RF_RowCol_t         *spRow,         /*
 1312                                          * OUT: Row,col of spare unit for
 1313                                          * failed unit.
 1314                                          */
 1315     RF_RowCol_t         *spCol,
 1316     RF_SectorNum_t      *spOffset       /*
 1317                                          * OUT: Offset into disk containing
 1318                                          * spare unit.
 1319                                          */
 1320 )
 1321 {
 1322         RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
 1323         RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol;
 1324         RF_RaidAddr_t sosRaidAddress;   /* start-of-stripe */
 1325         RF_RowCol_t *diskids;
 1326         u_int i, j, k, i_offset, j_offset;
 1327         RF_RowCol_t prow, pcol;
 1328         int testcol, testrow;
 1329         RF_RowCol_t stripe;
 1330         RF_SectorNum_t poffset;
 1331         char i_is_parity = 0, j_is_parity = 0;
 1332         RF_RowCol_t stripeWidth =
 1333             layoutPtr->numDataCol + layoutPtr->numParityCol;
 1334 
 1335         /* Get a listing of the disks comprising that stripe. */
 1336         sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid);
 1337         (layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids,
 1338             &stripe);
 1339         RF_ASSERT(diskids);
 1340 
 1341         /*
 1342          * Reject this entire parity stripe if it does not contain the
 1343          * indicated disk or it does not contain the failed disk.
 1344          */
 1345         if (row != stripe)
 1346                 goto skipit;
 1347         for (i = 0; i < stripeWidth; i++) {
 1348                 if (col == diskids[i])
 1349                         break;
 1350         }
 1351         if (i == stripeWidth)
 1352                 goto skipit;
 1353         for (j = 0; j < stripeWidth; j++) {
 1354                 if (fcol == diskids[j])
 1355                         break;
 1356         }
 1357         if (j == stripeWidth) {
 1358                 goto skipit;
 1359         }
 1360         /* Find out which disk the parity is on. */
 1361         (layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &prow, &pcol,
 1362             &poffset, RF_DONT_REMAP);
 1363 
 1364         /* Find out if either the current RU or the failed RU is parity. */
 1365         /*
 1366          * Also, if the parity occurs in this stripe prior to the data and/or
 1367          * failed col, we need to decrement i and/or j.
 1368          */
 1369         for (k = 0; k < stripeWidth; k++)
 1370                 if (diskids[k] == pcol)
 1371                         break;
 1372         RF_ASSERT(k < stripeWidth);
 1373         i_offset = i;
 1374         j_offset = j;
 1375         if (k < i)
 1376                 i_offset--;
 1377         else
 1378                 if (k == i) {
 1379                         i_is_parity = 1;
 1380                         i_offset = 0;
 1381                 }               /*
 1382                                  * Set offsets to zero to disable multiply
 1383                                  * below.
 1384                                  */
 1385         if (k < j)
 1386                 j_offset--;
 1387         else
 1388                 if (k == j) {
 1389                         j_is_parity = 1;
 1390                         j_offset = 0;
 1391                 }
 1392         /*
 1393          * At this point, [ij]_is_parity tells us whether the [current,failed]
 1394          * disk is parity at the start of this RU, and, if data, "[ij]_offset"
 1395          * tells us how far into the stripe the [current,failed] disk is.
 1396          */
 1397 
 1398         /*
 1399          * Call the mapping routine to get the offset into the current disk,
 1400          * repeat for failed disk.
 1401          */
 1402         if (i_is_parity)
 1403                 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset *
 1404                     layoutPtr->sectorsPerStripeUnit, &testrow, &testcol,
 1405                     outDiskOffset, RF_DONT_REMAP);
 1406         else
 1407                 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset *
 1408                     layoutPtr->sectorsPerStripeUnit, &testrow, &testcol,
 1409                     outDiskOffset, RF_DONT_REMAP);
 1410 
 1411         RF_ASSERT(row == testrow && col == testcol);
 1412 
 1413         if (j_is_parity)
 1414                 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset *
 1415                     layoutPtr->sectorsPerStripeUnit, &testrow, &testcol,
 1416                     outFailedDiskSectorOffset, RF_DONT_REMAP);
 1417         else
 1418                 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset *
 1419                     layoutPtr->sectorsPerStripeUnit, &testrow, &testcol,
 1420                     outFailedDiskSectorOffset, RF_DONT_REMAP);
 1421         RF_ASSERT(row == testrow && fcol == testcol);
 1422 
 1423         /* Now locate the spare unit for the failed unit. */
 1424         if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
 1425                 if (j_is_parity)
 1426                         layoutPtr->map->MapParity(raidPtr, sosRaidAddress +
 1427                             j_offset * layoutPtr->sectorsPerStripeUnit, spRow,
 1428                             spCol, spOffset, RF_REMAP);
 1429                 else
 1430                         layoutPtr->map->MapSector(raidPtr, sosRaidAddress +
 1431                             j_offset * layoutPtr->sectorsPerStripeUnit, spRow,
 1432                             spCol, spOffset, RF_REMAP);
 1433         } else {
 1434                 *spRow = raidPtr->reconControl[row]->spareRow;
 1435                 *spCol = raidPtr->reconControl[row]->spareCol;
 1436                 *spOffset = *outFailedDiskSectorOffset;
 1437         }
 1438 
 1439         return (0);
 1440 
 1441 skipit:
 1442         Dprintf3("RECON: Skipping psid %ld: nothing needed from r%d c%d.\n",
 1443             psid, row, col);
 1444         return (1);
 1445 }
 1446 
 1447 
 1448 /*
 1449  * This is called when a buffer has become ready to write to the replacement
 1450  * disk.
 1451  */
 1452 int
 1453 rf_IssueNextWriteRequest(RF_Raid_t *raidPtr, RF_RowCol_t row)
 1454 {
 1455         RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
 1456         RF_SectorCount_t sectorsPerRU =
 1457             layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
 1458         RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol;
 1459         RF_ReconBuffer_t *rbuf;
 1460         RF_DiskQueueData_t *req;
 1461 
 1462         rbuf = rf_GetFullReconBuffer(raidPtr->reconControl[row]);
 1463         RF_ASSERT(rbuf);        /*
 1464                                  * There must be one available, or we wouldn't
 1465                                  * have gotten the event that sent us here.
 1466                                  */
 1467         RF_ASSERT(rbuf->pssPtr);
 1468 
 1469         rbuf->pssPtr->writeRbuf = rbuf;
 1470         rbuf->pssPtr = NULL;
 1471 
 1472         Dprintf7("RECON: New write (r %d c %d offs %d) for psid %ld ru %d"
 1473             " (failed disk offset %ld) buf %lx.\n",
 1474             rbuf->spRow, rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID,
 1475             rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer);
 1476         Dprintf6("RECON: new write psid %ld   %02x %02x %02x %02x %02x.\n",
 1477             rbuf->parityStripeID, rbuf->buffer[0] & 0xff,
 1478             rbuf->buffer[1] & 0xff, rbuf->buffer[2] & 0xff,
 1479             rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
 1480 
 1481         /*
 1482          * Should be ok to use a NULL b_proc here b/c all addrs should be in
 1483          * kernel space.
 1484          */
 1485         req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset,
 1486             sectorsPerRU, rbuf->buffer, rbuf->parityStripeID, rbuf->which_ru,
 1487             rf_ReconWriteDoneProc, (void *) rbuf, NULL,
 1488             &raidPtr->recon_tracerecs[fcol], (void *) raidPtr, 0, NULL);
 1489 
 1490         RF_ASSERT(req);         /* XXX -- Fix this. -- XXX */
 1491 
 1492         rbuf->arg = (void *) req;
 1493         rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spRow][rbuf->spCol], req,
 1494             RF_IO_RECON_PRIORITY);
 1495 
 1496         return (0);
 1497 }
 1498 
 1499 /*
 1500  * This gets called upon the completion of a reconstruction read
 1501  * operation. The arg is a pointer to the per-disk reconstruction
 1502  * control structure for the process that just finished a read.
 1503  *
 1504  * Called at interrupt context in the kernel, so don't do anything
 1505  * illegal here.
 1506  */
 1507 int
 1508 rf_ReconReadDoneProc(void *arg, int status)
 1509 {
 1510         RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg;
 1511         RF_Raid_t *raidPtr = ctrl->reconCtrl->reconDesc->raidPtr;
 1512 
 1513         if (status) {
 1514                 /*
 1515                  * XXX
 1516                  */
 1517                 printf("Recon read failed !\n");
 1518                 RF_PANIC();
 1519         }
 1520         RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
 1521         RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
 1522         raidPtr->recon_tracerecs[ctrl->col].specific.recon.
 1523            recon_fetch_to_return_us =
 1524              RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
 1525         RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
 1526 
 1527         rf_CauseReconEvent(raidPtr, ctrl->row, ctrl->col, NULL,
 1528             RF_REVENT_READDONE);
 1529         return (0);
 1530 }
 1531 
 1532 
 1533 /*
 1534  * This gets called upon the completion of a reconstruction write operation.
 1535  * The arg is a pointer to the rbuf that was just written.
 1536  *
 1537  * Called at interrupt context in the kernel, so don't do anything illegal here.
 1538  */
 1539 int
 1540 rf_ReconWriteDoneProc(void *arg, int status)
 1541 {
 1542         RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg;
 1543 
 1544         Dprintf2("Reconstruction completed on psid %ld ru %d.\n",
 1545             rbuf->parityStripeID, rbuf->which_ru);
 1546         if (status) {
 1547                 /* fprintf(stderr, "Recon write failed !\n"); */
 1548                 printf("Recon write failed !\n");
 1549                 RF_PANIC();
 1550         }
 1551         rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col,
 1552             arg, RF_REVENT_WRITEDONE);
 1553         return (0);
 1554 }
 1555 
 1556 
 1557 /*
 1558  * Computes a new minimum head sep, and wakes up anyone who needs to
 1559  * be woken as a result.
 1560  */
 1561 void
 1562 rf_CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_RowCol_t row,
 1563     RF_HeadSepLimit_t hsCtr)
 1564 {
 1565         RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row];
 1566         RF_HeadSepLimit_t new_min;
 1567         RF_RowCol_t i;
 1568         RF_CallbackDesc_t *p;
 1569         /* From the definition of a minimum. */
 1570         RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter);
 1571 
 1572 
 1573         RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
 1574 
 1575         new_min = ~(1L << (8 * sizeof(long) - 1));      /* 0x7FFF....FFF */
 1576         for (i = 0; i < raidPtr->numCol; i++)
 1577                 if (i != reconCtrlPtr->fcol) {
 1578                         if (reconCtrlPtr->perDiskInfo[i].headSepCounter <
 1579                             new_min)
 1580                                 new_min =
 1581                                     reconCtrlPtr->perDiskInfo[i].headSepCounter;
 1582                 }
 1583         /* Set the new minimum and wake up anyone who can now run again. */
 1584         if (new_min != reconCtrlPtr->minHeadSepCounter) {
 1585                 reconCtrlPtr->minHeadSepCounter = new_min;
 1586                 Dprintf1("RECON:  new min head pos counter val is %ld.\n",
 1587                     new_min);
 1588                 while (reconCtrlPtr->headSepCBList) {
 1589                         if (reconCtrlPtr->headSepCBList->callbackArg.v >
 1590                             new_min)
 1591                                 break;
 1592                         p = reconCtrlPtr->headSepCBList;
 1593                         reconCtrlPtr->headSepCBList = p->next;
 1594                         p->next = NULL;
 1595                         rf_CauseReconEvent(raidPtr, p->row, p->col, NULL,
 1596                             RF_REVENT_HEADSEPCLEAR);
 1597                         rf_FreeCallbackDesc(p);
 1598                 }
 1599 
 1600         }
 1601         RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
 1602 }
 1603 
 1604 /*
 1605  * Checks to see that the maximum head separation will not be violated
 1606  * if we initiate a reconstruction I/O on the indicated disk.
 1607  * Limiting the maximum head separation between two disks eliminates
 1608  * the nasty buffer-stall conditions that occur when one disk races
 1609  * ahead of the others and consumes all of the floating recon buffers.
 1610  * This code is complex and unpleasant but it's necessary to avoid
 1611  * some very nasty, albeit fairly rare, reconstruction behavior.
 1612  *
 1613  * Returns non-zero if and only if we have to stop working on the
 1614  * indicated disk due to a head-separation delay.
 1615  */
 1616 int
 1617 rf_CheckHeadSeparation(
 1618     RF_Raid_t                   *raidPtr,
 1619     RF_PerDiskReconCtrl_t       *ctrl,
 1620     RF_RowCol_t                  row,
 1621     RF_RowCol_t                  col,
 1622     RF_HeadSepLimit_t            hsCtr,
 1623     RF_ReconUnitNum_t            which_ru
 1624 )
 1625 {
 1626         RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row];
 1627         RF_CallbackDesc_t *cb, *p, *pt;
 1628         int retval = 0;
 1629 
 1630         /*
 1631          * If we're too far ahead of the slowest disk, stop working on this
 1632          * disk until the slower ones catch up. We do this by scheduling a
 1633          * wakeup callback for the time when the slowest disk has caught up.
 1634          * We define "caught up" with 20% hysteresis, i.e. the head separation
 1635          * must have fallen to at most 80% of the max allowable head
 1636          * separation before we'll wake up.
 1637          */
 1638         RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
 1639         if ((raidPtr->headSepLimit >= 0) &&
 1640             ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) >
 1641              raidPtr->headSepLimit)) {
 1642                 Dprintf6("raid%d: RECON: head sep stall: row %d col %d hsCtr"
 1643                     " %ld minHSCtr %ld limit %ld.\n",
 1644                     raidPtr->raidid, row, col, ctrl->headSepCounter,
 1645                     reconCtrlPtr->minHeadSepCounter, raidPtr->headSepLimit);
 1646                 cb = rf_AllocCallbackDesc();
 1647                 /*
 1648                  * The minHeadSepCounter value we have to get to before we'll
 1649                  * wake up. Build in 20% hysteresis.
 1650                  */
 1651                 cb->callbackArg.v = (ctrl->headSepCounter -
 1652                     raidPtr->headSepLimit + raidPtr->headSepLimit / 5);
 1653                 cb->row = row;
 1654                 cb->col = col;
 1655                 cb->next = NULL;
 1656 
 1657                 /*
 1658                  * Insert this callback descriptor into the sorted list of
 1659                  * pending head-sep callbacks.
 1660                  */
 1661                 p = reconCtrlPtr->headSepCBList;
 1662                 if (!p)
 1663                         reconCtrlPtr->headSepCBList = cb;
 1664                 else
 1665                         if (cb->callbackArg.v < p->callbackArg.v) {
 1666                                 cb->next = reconCtrlPtr->headSepCBList;
 1667                                 reconCtrlPtr->headSepCBList = cb;
 1668                         } else {
 1669                                 for (pt = p, p = p->next;
 1670                                     p && (p->callbackArg.v < cb->callbackArg.v);
 1671                                     pt = p, p = p->next);
 1672                                 cb->next = p;
 1673                                 pt->next = cb;
 1674                         }
 1675                 retval = 1;
 1676 #if     RF_RECON_STATS > 0
 1677                 ctrl->reconCtrl->reconDesc->hsStallCount++;
 1678 #endif  /* RF_RECON_STATS > 0 */
 1679         }
 1680         RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
 1681 
 1682         return (retval);
 1683 }
 1684 
 1685 
 1686 
 1687 /*
 1688  * Checks to see if reconstruction has been either forced or blocked
 1689  * by a user operation. If forced, we skip this RU entirely. Else if
 1690  * blocked, put ourselves on the wait list. Else return 0.
 1691  *
 1692  * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY.
 1693  */
 1694 int
 1695 rf_CheckForcedOrBlockedReconstruction(
 1696     RF_Raid_t                    *raidPtr,
 1697     RF_ReconParityStripeStatus_t *pssPtr,
 1698     RF_PerDiskReconCtrl_t        *ctrl,
 1699     RF_RowCol_t                   row,
 1700     RF_RowCol_t                   col,
 1701     RF_StripeNum_t                psid,
 1702     RF_ReconUnitNum_t             which_ru
 1703 )
 1704 {
 1705         RF_CallbackDesc_t *cb;
 1706         int retcode = 0;
 1707 
 1708         if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) ||
 1709             (pssPtr->flags & RF_PSS_FORCED_ON_WRITE))
 1710                 retcode = RF_PSS_FORCED_ON_WRITE;
 1711         else
 1712                 if (pssPtr->flags & RF_PSS_RECON_BLOCKED) {
 1713                         Dprintf4("RECON: row %d col %d blocked at psid %ld"
 1714                             " ru %d.\n", row, col, psid, which_ru);
 1715                         cb = rf_AllocCallbackDesc();    /*
 1716                                                          * Append ourselves to
 1717                                                          * the blockage-wait
 1718                                                          * list.
 1719                                                          */
 1720                         cb->row = row;
 1721                         cb->col = col;
 1722                         cb->next = pssPtr->blockWaitList;
 1723                         pssPtr->blockWaitList = cb;
 1724                         retcode = RF_PSS_RECON_BLOCKED;
 1725                 }
 1726         if (!retcode)
 1727                 pssPtr->flags |= RF_PSS_UNDER_RECON;    /*
 1728                                                          * Mark this RU as under
 1729                                                          * reconstruction.
 1730                                                          */
 1731 
 1732         return (retcode);
 1733 }
 1734 
 1735 
 1736 /*
 1737  * If reconstruction is currently ongoing for the indicated stripeID,
 1738  * reconstruction is forced to completion and we return non-zero to
 1739  * indicate that the caller must wait. If not, then reconstruction is
 1740  * blocked on the indicated stripe and the routine returns zero. If
 1741  * and only if we return non-zero, we'll cause the cbFunc to get
 1742  * invoked with the cbArg when the reconstruction has completed.
 1743  */
 1744 int
 1745 rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
 1746         void (*cbFunc) (RF_Raid_t *, void *), void *cbArg)
 1747 {
 1748         RF_RowCol_t row = asmap->physInfo->row; /*
 1749                                                  * Which row of the array
 1750                                                  * we're working on.
 1751                                                  */
 1752         RF_StripeNum_t stripeID = asmap->stripeID;      /*
 1753                                                          * The stripe ID we're
 1754                                                          * forcing recon on.
 1755                                                          */
 1756         RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit *
 1757             raidPtr->Layout.SUsPerRU;           /* Num sects in one RU. */
 1758         RF_ReconParityStripeStatus_t *pssPtr;   /*
 1759                                                  * A pointer to the parity
 1760                                                  * stripe status structure.
 1761                                                  */
 1762         RF_StripeNum_t psid;                    /* Parity stripe id. */
 1763         RF_SectorNum_t offset, fd_offset;       /*
 1764                                                  * Disk offset, failed-disk
 1765                                                  * offset.
 1766                                                  */
 1767         RF_RowCol_t *diskids;
 1768         RF_RowCol_t stripe;
 1769         RF_ReconUnitNum_t which_ru;     /* RU within parity stripe. */
 1770         RF_RowCol_t fcol, diskno, i;
 1771         RF_ReconBuffer_t *new_rbuf;     /* Ptr to newly allocated rbufs. */
 1772         RF_DiskQueueData_t *req;        /* Disk I/O req to be enqueued. */
 1773         RF_CallbackDesc_t *cb;
 1774         int created = 0, nPromoted;
 1775 
 1776         psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID,
 1777             &which_ru);
 1778 
 1779         RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
 1780 
 1781         pssPtr = rf_LookupRUStatus(raidPtr,
 1782             raidPtr->reconControl[row]->pssTable, psid, which_ru,
 1783             RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, &created);
 1784 
 1785         /* If recon is not ongoing on this PS, just return. */
 1786         if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
 1787                 RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
 1788                 return (0);
 1789         }
 1790         /*
 1791          * Otherwise, we have to wait for reconstruction to complete on this
 1792          * RU.
 1793          */
 1794         /*
 1795          * In order to avoid waiting for a potentially large number of
 1796          * low-priority accesses to complete, we force a normal-priority (i.e.
 1797          * not low-priority) reconstruction on this RU.
 1798          */
 1799         if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) &&
 1800             !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) {
 1801                 DDprintf1("Forcing recon on psid %ld.\n", psid);
 1802                 /* Mark this RU as under forced recon. */
 1803                 pssPtr->flags |= RF_PSS_FORCED_ON_WRITE;
 1804                 /* Clear the blockage that we just set. */
 1805                 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;
 1806                 fcol = raidPtr->reconControl[row]->fcol;
 1807 
 1808                 /*
 1809                  * Get a listing of the disks comprising the indicated stripe.
 1810                  */
 1811                 (raidPtr->Layout.map->IdentifyStripe) (raidPtr,
 1812                     asmap->raidAddress, &diskids, &stripe);
 1813                 RF_ASSERT(row == stripe);
 1814 
 1815                 /*
 1816                  * For previously issued reads, elevate them to normal
 1817                  * priority. If the I/O has already completed, it won't be
 1818                  * found in the queue, and hence this will be a no-op. For
 1819                  * unissued reads, allocate buffers and issue new reads. The
 1820                  * fact that we've set the FORCED bit means that the regular
 1821                  * recon procs will not re-issue these reqs.
 1822                  */
 1823                 for (i = 0; i < raidPtr->Layout.numDataCol +
 1824                     raidPtr->Layout.numParityCol; i++)
 1825                         if ((diskno = diskids[i]) != fcol) {
 1826                                 if (pssPtr->issued[diskno]) {
 1827                                         nPromoted = rf_DiskIOPromote(&raidPtr
 1828                                             ->Queues[row][diskno], psid,
 1829                                             which_ru);
 1830                                         if (rf_reconDebug && nPromoted)
 1831                                                 printf("raid%d: promoted read"
 1832                                                     " from row %d col %d.\n",
 1833                                                     raidPtr->raidid, row,
 1834                                                     diskno);
 1835                                 } else {
 1836                                         /* Create new buf. */
 1837                                         new_rbuf = rf_MakeReconBuffer(raidPtr,
 1838                                             row, diskno, RF_RBUF_TYPE_FORCED);
 1839                                         /* Find offsets & spare locationp */
 1840                                         rf_ComputePSDiskOffsets(raidPtr, psid,
 1841                                             row, diskno, &offset, &fd_offset,
 1842                                             &new_rbuf->spRow, &new_rbuf->spCol,
 1843                                             &new_rbuf->spOffset);
 1844                                         new_rbuf->parityStripeID = psid;
 1845                                         /* Fill in the buffer. */
 1846                                         new_rbuf->which_ru = which_ru;
 1847                                         new_rbuf->failedDiskSectorOffset =
 1848                                             fd_offset;
 1849                                         new_rbuf->priority =
 1850                                             RF_IO_NORMAL_PRIORITY;
 1851 
 1852                                         /*
 1853                                          * Use NULL b_proc b/c all addrs
 1854                                          * should be in kernel space.
 1855                                          */
 1856                                         req = rf_CreateDiskQueueData(
 1857                                             RF_IO_TYPE_READ, offset +
 1858                                             which_ru * sectorsPerRU,
 1859                                             sectorsPerRU, new_rbuf->buffer,
 1860                                             psid, which_ru, (int (*)
 1861                                             (void *, int))
 1862                                               rf_ForceReconReadDoneProc,
 1863                                             (void *) new_rbuf, NULL,
 1864                                             NULL, (void *) raidPtr, 0, NULL);
 1865 
 1866                                         RF_ASSERT(req); /*
 1867                                                          * XXX -- Fix this. --
 1868                                                          * XXX
 1869                                                          */
 1870 
 1871                                         new_rbuf->arg = req;
 1872                                         /* Enqueue the I/O. */
 1873                                         rf_DiskIOEnqueue(&raidPtr
 1874                                             ->Queues[row][diskno], req,
 1875                                             RF_IO_NORMAL_PRIORITY);
 1876                                         Dprintf3("raid%d: Issued new read req"
 1877                                             " on row %d col %d.\n",
 1878                                             raidPtr->raidid, row, diskno);
 1879                                 }
 1880                         }
 1881                 /*
 1882                  * If the write is sitting in the disk queue, elevate its
 1883                  * priority.
 1884                  */
 1885                 if (rf_DiskIOPromote(&raidPtr->Queues[row][fcol],
 1886                     psid, which_ru))
 1887                         printf("raid%d: promoted write to row %d col %d.\n",
 1888                             raidPtr->raidid, row, fcol);
 1889         }
 1890         /*
 1891          * Install a callback descriptor to be invoked when recon completes on
 1892          * this parity stripe.
 1893          */
 1894         cb = rf_AllocCallbackDesc();
 1895         /*
 1896          * XXX The following is bogus... These functions don't really match !!!
 1897          * GO
 1898          */
 1899         cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc;
 1900         cb->callbackArg.p = (void *) cbArg;
 1901         cb->next = pssPtr->procWaitList;
 1902         pssPtr->procWaitList = cb;
 1903         DDprintf2("raid%d: Waiting for forced recon on psid %ld.\n",
 1904             raidPtr->raidid, psid);
 1905 
 1906         RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
 1907         return (1);
 1908 }
 1909 
 1910 
 1911 /*
 1912  * Called upon the completion of a forced reconstruction read.
 1913  * All we do is schedule the FORCEDREADONE event.
 1914  * Called at interrupt context in the kernel, so don't do anything illegal here.
 1915  */
 1916 void
 1917 rf_ForceReconReadDoneProc(void *arg, int status)
 1918 {
 1919         RF_ReconBuffer_t *rbuf = arg;
 1920 
 1921         if (status) {
 1922                 /* fprintf(stderr, "Forced recon read failed !\n"); */
 1923                 printf("Forced recon read failed !\n");
 1924                 RF_PANIC();
 1925         }
 1926         rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col,
 1927             (void *) rbuf, RF_REVENT_FORCEDREADDONE);
 1928 }
 1929 
 1930 
 1931 /* Releases a block on the reconstruction of the indicated stripe. */
 1932 int
 1933 rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
 1934 {
 1935         RF_RowCol_t row = asmap->origRow;
 1936         RF_StripeNum_t stripeID = asmap->stripeID;
 1937         RF_ReconParityStripeStatus_t *pssPtr;
 1938         RF_ReconUnitNum_t which_ru;
 1939         RF_StripeNum_t psid;
 1940         int created = 0;
 1941         RF_CallbackDesc_t *cb;
 1942 
 1943         psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID,
 1944             &which_ru);
 1945         RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
 1946         pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]
 1947             ->pssTable, psid, which_ru, RF_PSS_NONE, &created);
 1948 
 1949         /*
 1950          * When recon is forced, the pss desc can get deleted before we get
 1951          * back to unblock recon. But, this can _only_ happen when recon is
 1952          * forced. It would be good to put some kind of sanity check here, but
 1953          * how to decide if recon was just forced or not ?
 1954          */
 1955         if (!pssPtr) {
 1956                 /*
 1957                  * printf("Warning: no pss descriptor upon unblock on psid %ld"
 1958                  *     " RU %d.\n", psid, which_ru);
 1959                  */
 1960                 if (rf_reconDebug || rf_pssDebug)
 1961                         printf("Warning: no pss descriptor upon unblock on"
 1962                             " psid %ld RU %d.\n", (long) psid, which_ru);
 1963                 goto out;
 1964         }
 1965         pssPtr->blockCount--;
 1966         Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d.\n",
 1967             raidPtr->raidid, psid, pssPtr->blockCount);
 1968         if (pssPtr->blockCount == 0) {
 1969                 /* If recon blockage has been released. */
 1970 
 1971                 /*
 1972                  * Unblock recon before calling CauseReconEvent in case
 1973                  * CauseReconEvent causes us to try to issue a new read before
 1974                  * returning here.
 1975                  */
 1976                 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;
 1977 
 1978 
 1979                 while (pssPtr->blockWaitList) {
 1980                         /*
 1981                          * Spin through the block-wait list and
 1982                          * release all the waiters.
 1983                          */
 1984                         cb = pssPtr->blockWaitList;
 1985                         pssPtr->blockWaitList = cb->next;
 1986                         cb->next = NULL;
 1987                         rf_CauseReconEvent(raidPtr, cb->row, cb->col, NULL,
 1988                             RF_REVENT_BLOCKCLEAR);
 1989                         rf_FreeCallbackDesc(cb);
 1990                 }
 1991                 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
 1992                         /* If no recon was requested while recon was blocked. */
 1993                         rf_PSStatusDelete(raidPtr, raidPtr->reconControl[row]
 1994                             ->pssTable, pssPtr);
 1995                 }
 1996         }
 1997 out:
 1998         RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
 1999         return (0);
 2000 }

/* [<][>][^][v][top][bottom][index][help] */