root/dev/raidframe/rf_dagutils.c


DEFINITIONS

This source file includes the following definitions.
  1. rf_InitNode
  2. rf_FreeDAG
  3. rf_MakePropListEntry
  4. rf_ShutdownDAGs
  5. rf_ConfigureDAGs
  6. rf_AllocDAGHeader
  7. rf_FreeDAGHeader
  8. rf_AllocBuffer
  9. rf_NodeStatusString
  10. rf_PrintNodeInfoString
  11. rf_RecurPrintDAG
  12. rf_PrintDAG
  13. rf_AssignNodeNums
  14. rf_RecurAssignNodeNums
  15. rf_ResetDAGHeaderPointers
  16. rf_RecurResetDAGHeaderPointers
  17. rf_PrintDAGList
  18. rf_ValidateBranch
  19. rf_ValidateBranchVisitedBits
  20. rf_ValidateVisitedBits
  21. rf_ValidateDAG
  22. rf_redirect_asm
  23. rf_MapUnaccessedPortionOfStripe
  24. rf_PDAOverlap
  25. rf_GenerateFailedAccessASMs
  26. rf_RangeRestrictPDA
  27. rf_compute_workload_shift
  28. rf_SelectMirrorDiskIdle
  29. rf_SelectMirrorDiskPartition

    1 /*      $OpenBSD: rf_dagutils.c,v 1.4 2002/12/16 07:01:03 tdeval Exp $  */
    2 /*      $NetBSD: rf_dagutils.c,v 1.6 1999/12/09 02:26:09 oster Exp $    */
    3 
    4 /*
    5  * Copyright (c) 1995 Carnegie-Mellon University.
    6  * All rights reserved.
    7  *
    8  * Authors: Mark Holland, William V. Courtright II, Jim Zelenka
    9  *
   10  * Permission to use, copy, modify and distribute this software and
   11  * its documentation is hereby granted, provided that both the copyright
   12  * notice and this permission notice appear in all copies of the
   13  * software, derivative works or modified versions, and any portions
   14  * thereof, and that both notices appear in supporting documentation.
   15  *
   16  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
   17  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
   18  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
   19  *
   20  * Carnegie Mellon requests users of this software to return to
   21  *
   22  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
   23  *  School of Computer Science
   24  *  Carnegie Mellon University
   25  *  Pittsburgh PA 15213-3890
   26  *
   27  * any improvements or extensions that they make and grant Carnegie the
   28  * rights to redistribute these changes.
   29  */
   30 
   31 /*****************************************************************************
   32  *
   33  * rf_dagutils.c -- Utility routines for manipulating dags.
   34  *
   35  *****************************************************************************/
   36 
   37 #include "rf_archs.h"
   38 #include "rf_types.h"
   39 #include "rf_threadstuff.h"
   40 #include "rf_raid.h"
   41 #include "rf_dag.h"
   42 #include "rf_dagutils.h"
   43 #include "rf_dagfuncs.h"
   44 #include "rf_general.h"
   45 #include "rf_freelist.h"
   46 #include "rf_map.h"
   47 #include "rf_shutdown.h"
   48 
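       /*
        * Absolute difference between two sector numbers; used by the mirror
        * disk selection code below to compare seek distances.
        */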
   49 #define SNUM_DIFF(_a_,_b_)      (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_)))
   50 
   51 RF_RedFuncs_t rf_xorFuncs = {
   52         rf_RegularXorFunc, "Reg Xr", rf_SimpleXorFunc, "Simple Xr"
   53 };
   54 
   55 RF_RedFuncs_t rf_xorRecoveryFuncs = {
   56         rf_RecoveryXorFunc, "Recovery Xr", rf_RecoveryXorFunc, "Recovery Xr"
   57 };
   58 
   59 void rf_RecurPrintDAG(RF_DagNode_t *, int, int);
   60 void rf_PrintDAG(RF_DagHeader_t *);
   61 int  rf_ValidateBranch(RF_DagNode_t *, int *, int *, RF_DagNode_t **, int);
   62 void rf_ValidateBranchVisitedBits(RF_DagNode_t *, int, int);
   63 void rf_ValidateVisitedBits(RF_DagHeader_t *);
   64 
   65 /*****************************************************************************
   66  *
   67  * InitNode - Initialize a dag node.
   68  *
   69  * The size of the propList array is always the same as that of the
   70  * successors array.
   71  *
   72  *****************************************************************************/
   73 void
   74 rf_InitNode(
   75         RF_DagNode_t     *node,
   76         RF_NodeStatus_t   initstatus,
   77         int               commit,
   78         int             (*doFunc) (RF_DagNode_t *),
   79         int             (*undoFunc) (RF_DagNode_t *node),
   80         int             (*wakeFunc) (RF_DagNode_t *node, int),
   81         int               nSucc,
   82         int               nAnte,
   83         int               nParam,
   84         int               nResult,
   85         RF_DagHeader_t   *hdr,
   86         char             *name,
   87         RF_AllocListElem_t *alist
   88 )
   89 {
   90         void **ptrs;
   91         int nptrs;
   92 
   93         if (nAnte > RF_MAX_ANTECEDENTS)
   94                 RF_PANIC();
   95         node->status = initstatus;
   96         node->commitNode = commit;
   97         node->doFunc = doFunc;
   98         node->undoFunc = undoFunc;
   99         node->wakeFunc = wakeFunc;
  100         node->numParams = nParam;
  101         node->numResults = nResult;
  102         node->numAntecedents = nAnte;
  103         node->numAntDone = 0;
  104         node->next = NULL;
  105         node->numSuccedents = nSucc;
  106         node->name = name;
  107         node->dagHdr = hdr;
  108         node->visited = 0;
  109 
  110         /* Allocate all the pointers with one call to malloc. */
  111         nptrs = nSucc + nAnte + nResult + nSucc;
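               /*
                * Layout of the single allocation (nSucc appears twice because
                * the propList array is sized to match the succedents array):
                *
                *   ptrs:  [ succedents | antecedents | results | propList ]
                *             nSucc        nAnte         nResult   nSucc
                */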
  112 
  113         if (nptrs <= RF_DAG_PTRCACHESIZE) {
  114                 /*
  115                  * The dag_ptrs field of the node is basically some scribble
  116                  * space to be used here. We could get rid of it, and always
  117                  * allocate the range of pointers, but that's expensive. So,
  118                  * we pick a "common case" size for the pointer cache.
  119                  * Hopefully, we'll find that:
   120                  * (1) nptrs rarely exceeds RF_DAG_PTRCACHESIZE, and when
   121                  *     it does, only by a little (the least efficient case).
   122                  * (2) nptrs is rarely much smaller than
   123                  *     RF_DAG_PTRCACHESIZE (wasted memory).
  124                  */
  125                 ptrs = (void **) node->dag_ptrs;
  126         } else {
  127                 RF_CallocAndAdd(ptrs, nptrs, sizeof(void *), (void **), alist);
  128         }
  129         node->succedents = (nSucc) ? (RF_DagNode_t **) ptrs : NULL;
  130         node->antecedents = (nAnte) ? (RF_DagNode_t **) (ptrs + nSucc) : NULL;
  131         node->results = (nResult) ? (void **) (ptrs + nSucc + nAnte) : NULL;
  132         node->propList = (nSucc) ? (RF_PropHeader_t **)
  133             (ptrs + nSucc + nAnte + nResult) : NULL;
  134 
  135         if (nParam) {
  136                 if (nParam <= RF_DAG_PARAMCACHESIZE) {
  137                         node->params = (RF_DagParam_t *) node->dag_params;
  138                 } else {
  139                         RF_CallocAndAdd(node->params, nParam,
  140                             sizeof(RF_DagParam_t), (RF_DagParam_t *), alist);
  141                 }
  142         } else {
  143                 node->params = NULL;
  144         }
  145 }
  146 
  147 
  148 
  149 /*****************************************************************************
  150  *
  151  * Allocation and deallocation routines.
  152  *
  153  *****************************************************************************/
  154 
  155 void
  156 rf_FreeDAG(RF_DagHeader_t *dag_h)
  157 {
  158         RF_AccessStripeMapHeader_t *asmap, *t_asmap;
  159         RF_DagHeader_t *nextDag;
  160         int i;
  161 
  162         while (dag_h) {
  163                 nextDag = dag_h->next;
   164                 for (i = 0; i < RF_MAXCHUNKS && dag_h->memChunk[i]; i++) {
  165                         /* Release mem chunks. */
  166                         rf_ReleaseMemChunk(dag_h->memChunk[i]);
  167                         dag_h->memChunk[i] = NULL;
  168                 }
  169 
  170                 RF_ASSERT(i == dag_h->chunkIndex);
  171                 if (dag_h->xtraChunkCnt > 0) {
  172                         /* Free xtraMemChunks. */
   173                         for (i = 0; i < dag_h->xtraChunkIndex &&
   174                              dag_h->xtraMemChunk[i]; i++) {
  175                                 rf_ReleaseMemChunk(dag_h->xtraMemChunk[i]);
  176                                 dag_h->xtraMemChunk[i] = NULL;
  177                         }
  178                         RF_ASSERT(i == dag_h->xtraChunkIndex);
  179                         /* Free ptrs to xtraMemChunks. */
  180                         RF_Free(dag_h->xtraMemChunk, dag_h->xtraChunkCnt *
  181                             sizeof(RF_ChunkDesc_t *));
  182                 }
  183                 rf_FreeAllocList(dag_h->allocList);
  184                 for (asmap = dag_h->asmList; asmap;) {
  185                         t_asmap = asmap;
  186                         asmap = asmap->next;
  187                         rf_FreeAccessStripeMap(t_asmap);
  188                 }
  189                 rf_FreeDAGHeader(dag_h);
  190                 dag_h = nextDag;
  191         }
  192 }
  193 
  194 RF_PropHeader_t *
  195 rf_MakePropListEntry(RF_DagHeader_t *dag_h, int resultNum, int paramNum,
  196     RF_PropHeader_t *next, RF_AllocListElem_t *allocList)
  197 {
  198         RF_PropHeader_t *p;
  199 
  200         RF_CallocAndAdd(p, 1, sizeof(RF_PropHeader_t), (RF_PropHeader_t *),
  201             allocList);
  202         p->resultNum = resultNum;
  203         p->paramNum = paramNum;
  204         p->next = next;
  205         return (p);
  206 }
  207 
  208 static RF_FreeList_t *rf_dagh_freelist;
  209 
  210 #define RF_MAX_FREE_DAGH        128
  211 #define RF_DAGH_INC              16
  212 #define RF_DAGH_INITIAL          32
  213 
  214 void rf_ShutdownDAGs(void *);
  215 void
  216 rf_ShutdownDAGs(void *ignored)
  217 {
  218         RF_FREELIST_DESTROY(rf_dagh_freelist, next, (RF_DagHeader_t *));
  219 }
  220 
  221 int
  222 rf_ConfigureDAGs(RF_ShutdownList_t **listp)
  223 {
  224         int rc;
  225 
  226         RF_FREELIST_CREATE(rf_dagh_freelist, RF_MAX_FREE_DAGH, RF_DAGH_INC,
  227             sizeof(RF_DagHeader_t));
  228         if (rf_dagh_freelist == NULL)
  229                 return (ENOMEM);
  230         rc = rf_ShutdownCreate(listp, rf_ShutdownDAGs, NULL);
  231         if (rc) {
  232                 RF_ERRORMSG3("Unable to add to shutdown list file %s line"
  233                     " %d rc=%d\n", __FILE__, __LINE__, rc);
  234                 rf_ShutdownDAGs(NULL);
  235                 return (rc);
  236         }
  237         RF_FREELIST_PRIME(rf_dagh_freelist, RF_DAGH_INITIAL, next,
  238             (RF_DagHeader_t *));
  239         return (0);
  240 }
  241 
  242 RF_DagHeader_t *
  243 rf_AllocDAGHeader(void)
  244 {
  245         RF_DagHeader_t *dh;
  246 
  247         RF_FREELIST_GET(rf_dagh_freelist, dh, next, (RF_DagHeader_t *));
  248         if (dh) {
  249                 bzero((char *) dh, sizeof(RF_DagHeader_t));
  250         }
  251         return (dh);
  252 }
  253 
  254 void
  255 rf_FreeDAGHeader(RF_DagHeader_t *dh)
  256 {
  257         RF_FREELIST_FREE(rf_dagh_freelist, dh, next);
  258 }
  259 
  260 /* Allocate a buffer big enough to hold the data described by pda. */
  261 void *
  262 rf_AllocBuffer(RF_Raid_t *raidPtr, RF_DagHeader_t *dag_h,
  263     RF_PhysDiskAddr_t *pda, RF_AllocListElem_t *allocList)
  264 {
  265         char *p;
  266 
  267         RF_MallocAndAdd(p, pda->numSector << raidPtr->logBytesPerSector,
  268             (char *), allocList);
  269         return ((void *) p);
  270 }
  271 
  272 
  273 /*****************************************************************************
  274  *
  275  * Debug routines.
  276  *
  277  *****************************************************************************/
  278 
  279 char *
  280 rf_NodeStatusString(RF_DagNode_t *node)
  281 {
  282         switch (node->status) {
  283         case rf_wait:
  284                 return ("wait");
  285         case rf_fired:
  286                 return ("fired");
  287         case rf_good:
  288                 return ("good");
  289         case rf_bad:
  290                 return ("bad");
  291         default:
  292                 return ("?");
  293         }
  294 }
  295 
  296 void
  297 rf_PrintNodeInfoString(RF_DagNode_t *node)
  298 {
  299         RF_PhysDiskAddr_t *pda;
  300         int (*df) (RF_DagNode_t *) = node->doFunc;
  301         int i, lk, unlk;
  302         void *bufPtr;
  303 
  304         if ((df == rf_DiskReadFunc) || (df == rf_DiskWriteFunc) ||
  305             (df == rf_DiskReadMirrorIdleFunc) ||
  306             (df == rf_DiskReadMirrorPartitionFunc)) {
  307                 pda = (RF_PhysDiskAddr_t *) node->params[0].p;
  308                 bufPtr = (void *) node->params[1].p;
  309                 lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
  310                 unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
  311                 RF_ASSERT(!(lk && unlk));
  312                 printf("r %d c %d offs %ld nsect %d buf 0x%lx %s\n", pda->row,
  313                     pda->col, (long) pda->startSector, (int) pda->numSector,
  314                     (long) bufPtr, (lk) ? "LOCK" : ((unlk) ? "UNLK" : " "));
  315                 return;
  316         }
  317         if (df == rf_DiskUnlockFunc) {
  318                 pda = (RF_PhysDiskAddr_t *) node->params[0].p;
  319                 lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
  320                 unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
  321                 RF_ASSERT(!(lk && unlk));
  322                 printf("r %d c %d %s\n", pda->row, pda->col,
  323                     (lk) ? "LOCK" : ((unlk) ? "UNLK" : "nop"));
  324                 return;
  325         }
  326         if ((df == rf_SimpleXorFunc) || (df == rf_RegularXorFunc)
  327             || (df == rf_RecoveryXorFunc)) {
  328                 printf("result buf 0x%lx\n", (long) node->results[0]);
  329                 for (i = 0; i < node->numParams - 1; i += 2) {
  330                         pda = (RF_PhysDiskAddr_t *) node->params[i].p;
  331                         bufPtr = (RF_PhysDiskAddr_t *) node->params[i + 1].p;
  332                         printf("    buf 0x%lx r%d c%d offs %ld nsect %d\n",
  333                             (long) bufPtr, pda->row, pda->col,
  334                             (long) pda->startSector, (int) pda->numSector);
  335                 }
  336                 return;
  337         }
  338 #if     RF_INCLUDE_PARITYLOGGING > 0
  339         if (df == rf_ParityLogOverwriteFunc || df == rf_ParityLogUpdateFunc) {
  340                 for (i = 0; i < node->numParams - 1; i += 2) {
  341                         pda = (RF_PhysDiskAddr_t *) node->params[i].p;
  342                         bufPtr = (RF_PhysDiskAddr_t *) node->params[i + 1].p;
  343                         printf(" r%d c%d offs %ld nsect %d buf 0x%lx\n",
  344                             pda->row, pda->col, (long) pda->startSector,
  345                             (int) pda->numSector, (long) bufPtr);
  346                 }
  347                 return;
  348         }
  349 #endif  /* RF_INCLUDE_PARITYLOGGING > 0 */
  350 
  351         if ((df == rf_TerminateFunc) || (df == rf_NullNodeFunc)) {
  352                 printf("\n");
  353                 return;
  354         }
  355         printf("?\n");
  356 }
  357 
  358 void
  359 rf_RecurPrintDAG(RF_DagNode_t *node, int depth, int unvisited)
  360 {
  361         char *anttype;
  362         int i;
  363 
  364         node->visited = (unvisited) ? 0 : 1;
  365         printf("(%d) %d C%d %s: %s,s%d %d/%d,a%d/%d,p%d,r%d S{", depth,
  366             node->nodeNum, node->commitNode, node->name,
  367             rf_NodeStatusString(node), node->numSuccedents,
  368             node->numSuccFired, node->numSuccDone,
  369             node->numAntecedents, node->numAntDone,
  370             node->numParams, node->numResults);
  371         for (i = 0; i < node->numSuccedents; i++) {
  372                 printf("%d%s", node->succedents[i]->nodeNum,
  373                     ((i == node->numSuccedents - 1) ? "\0" : " "));
  374         }
  375         printf("} A{");
  376         for (i = 0; i < node->numAntecedents; i++) {
  377                 switch (node->antType[i]) {
  378                 case rf_trueData:
  379                         anttype = "T";
  380                         break;
  381                 case rf_antiData:
  382                         anttype = "A";
  383                         break;
  384                 case rf_outputData:
  385                         anttype = "O";
  386                         break;
  387                 case rf_control:
  388                         anttype = "C";
  389                         break;
  390                 default:
  391                         anttype = "?";
  392                         break;
  393                 }
  394                 printf("%d(%s)%s", node->antecedents[i]->nodeNum, anttype,
  395                     (i == node->numAntecedents - 1) ? "\0" : " ");
  396         }
  397         printf("}; ");
  398         rf_PrintNodeInfoString(node);
  399         for (i = 0; i < node->numSuccedents; i++) {
  400                 if (node->succedents[i]->visited == unvisited)
  401                         rf_RecurPrintDAG(node->succedents[i], depth + 1,
  402                             unvisited);
  403         }
  404 }
  405 
  406 void
  407 rf_PrintDAG(RF_DagHeader_t *dag_h)
  408 {
  409         int unvisited, i;
  410         char *status;
  411 
  412         /* Set dag status. */
  413         switch (dag_h->status) {
  414         case rf_enable:
  415                 status = "enable";
  416                 break;
  417         case rf_rollForward:
  418                 status = "rollForward";
  419                 break;
  420         case rf_rollBackward:
  421                 status = "rollBackward";
  422                 break;
  423         default:
  424                 status = "illegal !";
  425                 break;
  426         }
  427         /* Find out if visited bits are currently set or cleared. */
  428         unvisited = dag_h->succedents[0]->visited;
  429 
  430         printf("DAG type:  %s\n", dag_h->creator);
  431         printf("format is (depth) num commit type: status,nSucc nSuccFired/n"
  432             "SuccDone,nAnte/nAnteDone,nParam,nResult S{x} A{x(type)};  info\n");
  433         printf("(0) %d Hdr: %s, s%d, (commit %d/%d) S{", dag_h->nodeNum,
  434             status, dag_h->numSuccedents, dag_h->numCommitNodes,
  435             dag_h->numCommits);
  436         for (i = 0; i < dag_h->numSuccedents; i++) {
  437                 printf("%d%s", dag_h->succedents[i]->nodeNum,
  438                     ((i == dag_h->numSuccedents - 1) ? "\0" : " "));
  439         }
  440         printf("};\n");
  441         for (i = 0; i < dag_h->numSuccedents; i++) {
  442                 if (dag_h->succedents[i]->visited == unvisited)
  443                         rf_RecurPrintDAG(dag_h->succedents[i], 1, unvisited);
  444         }
  445 }
  446 
  447 /* Assign node numbers. */
  448 int
  449 rf_AssignNodeNums(RF_DagHeader_t *dag_h)
  450 {
  451         int unvisited, i, nnum;
  452         RF_DagNode_t *node;
  453 
  454         nnum = 0;
  455         unvisited = dag_h->succedents[0]->visited;
  456 
  457         dag_h->nodeNum = nnum++;
  458         for (i = 0; i < dag_h->numSuccedents; i++) {
  459                 node = dag_h->succedents[i];
  460                 if (node->visited == unvisited) {
  461                         nnum = rf_RecurAssignNodeNums(dag_h->succedents[i],
  462                             nnum, unvisited);
  463                 }
  464         }
  465         return (nnum);
  466 }
  467 
  468 int
  469 rf_RecurAssignNodeNums(RF_DagNode_t *node, int num, int unvisited)
  470 {
  471         int i;
  472 
  473         node->visited = (unvisited) ? 0 : 1;
  474 
  475         node->nodeNum = num++;
  476         for (i = 0; i < node->numSuccedents; i++) {
  477                 if (node->succedents[i]->visited == unvisited) {
  478                         num = rf_RecurAssignNodeNums(node->succedents[i],
  479                             num, unvisited);
  480                 }
  481         }
  482         return (num);
  483 }
  484 
  485 /* Set the header pointers in each node to "newptr". */
  486 void
  487 rf_ResetDAGHeaderPointers(RF_DagHeader_t *dag_h, RF_DagHeader_t *newptr)
  488 {
  489         int i;
  490 
  491         for (i = 0; i < dag_h->numSuccedents; i++)
  492                 if (dag_h->succedents[i]->dagHdr != newptr)
  493                         rf_RecurResetDAGHeaderPointers(dag_h->succedents[i],
  494                             newptr);
  495 }
  496 
  497 void
  498 rf_RecurResetDAGHeaderPointers(RF_DagNode_t *node, RF_DagHeader_t *newptr)
  499 {
  500         int i;
  501 
  502         node->dagHdr = newptr;
  503         for (i = 0; i < node->numSuccedents; i++)
  504                 if (node->succedents[i]->dagHdr != newptr)
  505                         rf_RecurResetDAGHeaderPointers(node->succedents[i],
  506                             newptr);
  507 }
  508 
  509 void
  510 rf_PrintDAGList(RF_DagHeader_t *dag_h)
  511 {
  512         int i = 0;
  513 
  514         for (; dag_h; dag_h = dag_h->next) {
  515                 rf_AssignNodeNums(dag_h);
  516                 printf("\n\nDAG %d IN LIST:\n", i++);
  517                 rf_PrintDAG(dag_h);
  518         }
  519 }
  520 
  521 int
  522 rf_ValidateBranch(RF_DagNode_t *node, int *scount, int *acount,
  523     RF_DagNode_t **nodes, int unvisited)
  524 {
  525         int i, retcode = 0;
  526 
  527         /* Construct an array of node pointers indexed by node num. */
  528         node->visited = (unvisited) ? 0 : 1;
  529         nodes[node->nodeNum] = node;
  530 
  531         if (node->next != NULL) {
  532                 printf("INVALID DAG: next pointer in node is not NULL.\n");
  533                 retcode = 1;
  534         }
  535         if (node->status != rf_wait) {
  536                 printf("INVALID DAG: Node status is not wait.\n");
  537                 retcode = 1;
  538         }
  539         if (node->numAntDone != 0) {
  540                 printf("INVALID DAG: numAntDone is not zero.\n");
  541                 retcode = 1;
  542         }
  543         if (node->doFunc == rf_TerminateFunc) {
  544                 if (node->numSuccedents != 0) {
  545                         printf("INVALID DAG: Terminator node has"
  546                             " succedents.\n");
  547                         retcode = 1;
  548                 }
  549         } else {
  550                 if (node->numSuccedents == 0) {
  551                         printf("INVALID DAG: Non-terminator node has no"
  552                             " succedents\n");
  553                         retcode = 1;
  554                 }
  555         }
  556         for (i = 0; i < node->numSuccedents; i++) {
  557                 if (!node->succedents[i]) {
  558                         printf("INVALID DAG: succedent %d of node %s"
  559                             " is NULL.\n", i, node->name);
   560                         retcode = 1;
   561                 } else
   562                         scount[node->succedents[i]->nodeNum]++;
  563         }
  564         for (i = 0; i < node->numAntecedents; i++) {
  565                 if (!node->antecedents[i]) {
  566                         printf("INVALID DAG: antecedent %d of node %s is"
  567                             " NULL.\n", i, node->name);
   568                         retcode = 1;
   569                 } else
   570                         acount[node->antecedents[i]->nodeNum]++;
  571         }
  572         for (i = 0; i < node->numSuccedents; i++) {
  573                 if (node->succedents[i]->visited == unvisited) {
  574                         if (rf_ValidateBranch(node->succedents[i], scount,
  575                                 acount, nodes, unvisited)) {
  576                                 retcode = 1;
  577                         }
  578                 }
  579         }
  580         return (retcode);
  581 }
  582 
  583 void
  584 rf_ValidateBranchVisitedBits(RF_DagNode_t *node, int unvisited, int rl)
  585 {
  586         int i;
  587 
  588         RF_ASSERT(node->visited == unvisited);
  589         for (i = 0; i < node->numSuccedents; i++) {
  590                 if (node->succedents[i] == NULL) {
  591                         printf("node=%lx node->succedents[%d] is NULL.\n",
  592                             (long) node, i);
  593                         RF_ASSERT(0);
  594                 }
  595                 rf_ValidateBranchVisitedBits(node->succedents[i],
  596                     unvisited, rl + 1);
  597         }
  598 }
  599 
  600 /*
  601  * NOTE:  Never call this on a big dag, because it is exponential
  602  * in execution time.
  603  */
  604 void
  605 rf_ValidateVisitedBits(RF_DagHeader_t *dag)
  606 {
  607         int i, unvisited;
  608 
  609         unvisited = dag->succedents[0]->visited;
  610 
  611         for (i = 0; i < dag->numSuccedents; i++) {
  612                 if (dag->succedents[i] == NULL) {
  613                         printf("dag=%lx dag->succedents[%d] is NULL.\n",
  614                             (long) dag, i);
  615                         RF_ASSERT(0);
  616                 }
  617                 rf_ValidateBranchVisitedBits(dag->succedents[i], unvisited, 0);
  618         }
  619 }
  620 
  621 /*
  622  * Validate a DAG. _at entry_ verify that:
  623  *   -- numNodesCompleted is zero
  624  *   -- node queue is null
  625  *   -- dag status is rf_enable
  626  *   -- next pointer is null on every node
  627  *   -- all nodes have status wait
  628  *   -- numAntDone is zero in all nodes
  629  *   -- terminator node has zero successors
  630  *   -- no other node besides terminator has zero successors
  631  *   -- no successor or antecedent pointer in a node is NULL
  632  *   -- number of times that each node appears as a successor of another node
  633  *      is equal to the antecedent count on that node
  634  *   -- number of times that each node appears as an antecedent of another node
  635  *      is equal to the succedent count on that node
  636  *   -- what else ?
  637  */
  638 int
  639 rf_ValidateDAG(RF_DagHeader_t *dag_h)
  640 {
  641         int i, nodecount;
  642         int *scount, *acount;   /* Per-node successor and antecedent counts. */
  643         RF_DagNode_t **nodes;   /* Array of ptrs to nodes in dag. */
  644         int retcode = 0;
  645         int unvisited;
  646         int commitNodeCount = 0;
  647 
  648         if (rf_validateVisitedDebug)
  649                 rf_ValidateVisitedBits(dag_h);
  650 
  651         if (dag_h->numNodesCompleted != 0) {
  652                 printf("INVALID DAG: num nodes completed is %d, should be 0.\n",
  653                     dag_h->numNodesCompleted);
  654                 retcode = 1;
  655                 goto validate_dag_bad;
  656         }
  657         if (dag_h->status != rf_enable) {
  658                 printf("INVALID DAG: not enabled.\n");
  659                 retcode = 1;
  660                 goto validate_dag_bad;
  661         }
  662         if (dag_h->numCommits != 0) {
  663                 printf("INVALID DAG: numCommits != 0 (%d)\n",
  664                     dag_h->numCommits);
  665                 retcode = 1;
  666                 goto validate_dag_bad;
  667         }
  668         if (dag_h->numSuccedents != 1) {
  669                 /* Currently, all dags must have only one succedent. */
  670                 printf("INVALID DAG: numSuccedents != 1 (%d).\n",
  671                     dag_h->numSuccedents);
  672                 retcode = 1;
  673                 goto validate_dag_bad;
  674         }
  675         nodecount = rf_AssignNodeNums(dag_h);
  676 
  677         unvisited = dag_h->succedents[0]->visited;
  678 
  679         RF_Calloc(scount, nodecount, sizeof(int), (int *));
  680         RF_Calloc(acount, nodecount, sizeof(int), (int *));
  681         RF_Calloc(nodes, nodecount, sizeof(RF_DagNode_t *), (RF_DagNode_t **));
  682         for (i = 0; i < dag_h->numSuccedents; i++) {
  683                 if ((dag_h->succedents[i]->visited == unvisited)
  684                     && rf_ValidateBranch(dag_h->succedents[i], scount,
  685                         acount, nodes, unvisited)) {
  686                         retcode = 1;
  687                 }
  688         }
  689         /* Start at 1 to skip the header node. */
  690         for (i = 1; i < nodecount; i++) {
  691                 if (nodes[i]->commitNode)
  692                         commitNodeCount++;
  693                 if (nodes[i]->doFunc == NULL) {
  694                         printf("INVALID DAG: node %s has an undefined"
  695                             " doFunc.\n", nodes[i]->name);
  696                         retcode = 1;
  697                         goto validate_dag_out;
  698                 }
  699                 if (nodes[i]->undoFunc == NULL) {
  700                         printf("INVALID DAG: node %s has an undefined"
   701                             " undoFunc.\n", nodes[i]->name);
  702                         retcode = 1;
  703                         goto validate_dag_out;
  704                 }
  705                 if (nodes[i]->numAntecedents != scount[nodes[i]->nodeNum]) {
  706                         printf("INVALID DAG: node %s has %d antecedents but"
  707                             " appears as a succedent %d times.\n",
  708                             nodes[i]->name, nodes[i]->numAntecedents,
  709                             scount[nodes[i]->nodeNum]);
  710                         retcode = 1;
  711                         goto validate_dag_out;
  712                 }
  713                 if (nodes[i]->numSuccedents != acount[nodes[i]->nodeNum]) {
  714                         printf("INVALID DAG: node %s has %d succedents but"
  715                             " appears as an antecedent %d times.\n",
  716                             nodes[i]->name, nodes[i]->numSuccedents,
  717                             acount[nodes[i]->nodeNum]);
  718                         retcode = 1;
  719                         goto validate_dag_out;
  720                 }
  721         }
  722 
  723         if (dag_h->numCommitNodes != commitNodeCount) {
  724                 printf("INVALID DAG: incorrect commit node count. "
  725                     "hdr->numCommitNodes (%d) found (%d) commit nodes"
  726                     " in graph.\n",
  727                     dag_h->numCommitNodes, commitNodeCount);
  728                 retcode = 1;
  729                 goto validate_dag_out;
  730         }
  731 
  732 validate_dag_out:
  733         RF_Free(scount, nodecount * sizeof(int));
  734         RF_Free(acount, nodecount * sizeof(int));
  735         RF_Free(nodes, nodecount * sizeof(RF_DagNode_t *));
  736         if (retcode)
  737                 rf_PrintDAGList(dag_h);
  738 
  739         if (rf_validateVisitedDebug)
  740                 rf_ValidateVisitedBits(dag_h);
  741 
  742         return (retcode);
  743 
  744 validate_dag_bad:
  745         rf_PrintDAGList(dag_h);
  746         return (retcode);
  747 }
  748 
  749 
  750 /*****************************************************************************
  751  *
  752  * Misc construction routines.
  753  *
  754  *****************************************************************************/
  755 
  756 void
  757 rf_redirect_asm(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
  758 {
  759         int ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) ? 1 : 0;
  760         int row = asmap->physInfo->row;
  761         int fcol = raidPtr->reconControl[row]->fcol;
  762         int srow = raidPtr->reconControl[row]->spareRow;
  763         int scol = raidPtr->reconControl[row]->spareCol;
  764         RF_PhysDiskAddr_t *pda;
  765 
  766         RF_ASSERT(raidPtr->status[row] == rf_rs_reconstructing);
  767         for (pda = asmap->physInfo; pda; pda = pda->next) {
  768                 if (pda->col == fcol) {
  769                         if (rf_dagDebug) {
  770                                 if (!rf_CheckRUReconstructed(
  771                                     raidPtr->reconControl[row]->reconMap,
  772                                     pda->startSector)) {
  773                                         RF_PANIC();
  774                                 }
  775                         }
  776                         /*printf("Remapped data for large write\n");*/
  777                         if (ds) {
  778                                 raidPtr->Layout.map->MapSector(raidPtr,
  779                                     pda->raidAddress, &pda->row, &pda->col,
  780                                     &pda->startSector, RF_REMAP);
  781                         } else {
  782                                 pda->row = srow;
  783                                 pda->col = scol;
  784                         }
  785                 }
  786         }
  787         for (pda = asmap->parityInfo; pda; pda = pda->next) {
   788                 if (pda->col != fcol)
   789                         continue;
   790                 if (rf_dagDebug) {
   791                         if (!rf_CheckRUReconstructed(
   792                             raidPtr->reconControl[row]->reconMap,
   793                             pda->startSector)) {
   794                                 RF_PANIC();
   795                         }
   796                 }
  797                 if (ds) {
  798                         (raidPtr->Layout.map->MapParity) (raidPtr,
  799                             pda->raidAddress, &pda->row, &pda->col,
  800                             &pda->startSector, RF_REMAP);
  801                 } else {
  802                         pda->row = srow;
  803                         pda->col = scol;
  804                 }
  805         }
  806 }
  807 
  808 
  809 /*
  810  * This routine allocates read buffers and generates stripe maps for the
  811  * regions of the array from the start of the stripe to the start of the
  812  * access, and from the end of the access to the end of the stripe. It also
  813  * computes and returns the number of DAG nodes needed to read all this data.
  814  * Note that this routine does the wrong thing if the access is fully
  815  * contained within one stripe unit, so we RF_ASSERT against this case at the
  816  * start.
  817  */
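       /*
        * Pictorially, the two regions mapped and read here ("sos" = start of
        * stripe, "eos" = end of stripe):
        *
        *   |----------------------- one stripe -----------------------|
        *   |  sos region  |        user access        |  eos region   |
        *       (read)                                      (read)
        */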
  818 void
  819 rf_MapUnaccessedPortionOfStripe(
  820         RF_Raid_t                *raidPtr,
  821         RF_RaidLayout_t          *layoutPtr,    /* in: layout information */
  822         RF_AccessStripeMap_t     *asmap,        /* in: access stripe map */
  823         RF_DagHeader_t           *dag_h,        /* in: header of the dag */
  824                                                 /*     to create */
  825         RF_AccessStripeMapHeader_t **new_asm_h, /* in: ptr to array of 2 */
  826                                                 /*     headers, to be */
  827                                                 /*     filled in */
  828         int                      *nRodNodes,    /* out: num nodes to be */
  829                                                 /*      generated to read */
  830                                                 /*      unaccessed data */
  831         char                    **sosBuffer,    /* out: pointers to newly */
  832                                                 /*      allocated buffer */
  833         char                    **eosBuffer,
  834         RF_AllocListElem_t       *allocList
  835 )
  836 {
  837         RF_RaidAddr_t sosRaidAddress, eosRaidAddress;
  838         RF_SectorNum_t sosNumSector, eosNumSector;
  839 
  840         RF_ASSERT(asmap->numStripeUnitsAccessed > (layoutPtr->numDataCol / 2));
  841         /*
  842          * Generate an access map for the region of the array from start of
  843          * stripe to start of access.
  844          */
  845         new_asm_h[0] = new_asm_h[1] = NULL;
  846         *nRodNodes = 0;
  847         if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->raidAddress)) {
  848                 sosRaidAddress = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
  849                     asmap->raidAddress);
  850                 sosNumSector = asmap->raidAddress - sosRaidAddress;
  851                 RF_MallocAndAdd(*sosBuffer, rf_RaidAddressToByte(raidPtr,
  852                     sosNumSector), (char *), allocList);
  853                 new_asm_h[0] = rf_MapAccess(raidPtr, sosRaidAddress,
  854                     sosNumSector, *sosBuffer, RF_DONT_REMAP);
  855                 new_asm_h[0]->next = dag_h->asmList;
  856                 dag_h->asmList = new_asm_h[0];
  857                 *nRodNodes += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
  858 
  859                 RF_ASSERT(new_asm_h[0]->stripeMap->next == NULL);
  860                 /* We're totally within one stripe here. */
  861                 if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
  862                         rf_redirect_asm(raidPtr, new_asm_h[0]->stripeMap);
  863         }
  864         /*
  865          * Generate an access map for the region of the array from end of
  866          * access to end of stripe.
  867          */
  868         if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->endRaidAddress)) {
  869                 eosRaidAddress = asmap->endRaidAddress;
  870                 eosNumSector = rf_RaidAddressOfNextStripeBoundary(layoutPtr,
  871                     eosRaidAddress) - eosRaidAddress;
  872                 RF_MallocAndAdd(*eosBuffer, rf_RaidAddressToByte(raidPtr,
  873                     eosNumSector), (char *), allocList);
  874                 new_asm_h[1] = rf_MapAccess(raidPtr, eosRaidAddress,
  875                     eosNumSector, *eosBuffer, RF_DONT_REMAP);
  876                 new_asm_h[1]->next = dag_h->asmList;
  877                 dag_h->asmList = new_asm_h[1];
  878                 *nRodNodes += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
  879 
  880                 RF_ASSERT(new_asm_h[1]->stripeMap->next == NULL);
  881                 /* We're totally within one stripe here. */
  882                 if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
  883                         rf_redirect_asm(raidPtr, new_asm_h[1]->stripeMap);
  884         }
  885 }
  886 
  887 
  888 /* Returns non-zero if the indicated ranges of stripe unit offsets overlap. */
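       /*
        * For example (illustrative offsets only): a source PDA covering
        * stripe unit offsets 3..10 and a dest PDA covering offsets 8..15
        * overlap, since RF_MAX(3, 8) = 8 <= RF_MIN(10, 15) = 10; ranges
        * 0..3 and 8..15 would not.
        */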
  889 int
  890 rf_PDAOverlap(RF_RaidLayout_t *layoutPtr, RF_PhysDiskAddr_t *src,
  891     RF_PhysDiskAddr_t *dest)
  892 {
  893         RF_SectorNum_t soffs =
  894             rf_StripeUnitOffset(layoutPtr, src->startSector);
  895         RF_SectorNum_t doffs =
  896             rf_StripeUnitOffset(layoutPtr, dest->startSector);
  897         /* Use -1 to be sure we stay within SU. */
  898         RF_SectorNum_t send =
  899             rf_StripeUnitOffset(layoutPtr, src->startSector +
  900             src->numSector - 1);
  901         RF_SectorNum_t dend =
  902             rf_StripeUnitOffset(layoutPtr, dest->startSector +
  903             dest->numSector - 1);
  904 
  905         return ((RF_MAX(soffs, doffs) <= RF_MIN(send, dend)) ? 1 : 0);
  906 }
  907 
  908 
  909 /*
  910  * GenerateFailedAccessASMs
  911  *
  912  * This routine figures out what portion of the stripe needs to be read
   913  * to effect the degraded read or write operation. Its primary function
  914  * is to identify everything required to recover the data, and then
  915  * eliminate anything that is already being accessed by the user.
  916  *
  917  * The main result is two new ASMs, one for the region from the start of the
  918  * stripe to the start of the access, and one for the region from the end of
  919  * the access to the end of the stripe. These ASMs describe everything that
  920  * needs to be read to effect the degraded access. Other results are:
  921  *    nXorBufs -- The total number of buffers that need to be XORed together
  922  *                to recover the lost data,
  923  *    rpBufPtr -- Ptr to a newly-allocated buffer to hold the parity. If NULL
  924  *                at entry, not allocated.
  925  *    overlappingPDAs --
  926  *                Describes which of the non-failed PDAs, in the user access,
  927  *                overlap data that needs to be read to effect recovery.
  928  *                overlappingPDAs[i]==1 if and only if, neglecting the failed
  929  *                PDA, the i'th pda in the input asm overlaps data that needs
  930  *                to be read for recovery.
  931  */
  932  /* in: asmap - ASM for the actual access, one stripe only. */
   933  /* in: failedPDA - Which component of the access has failed. */
  934  /* in: dag_h - Header of the DAG we're going to create. */
  935  /* out: new_asm_h - The two new ASMs. */
  936  /* out: nXorBufs - The total number of xor bufs required. */
  937  /* out: rpBufPtr - A buffer for the parity read. */
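       /*
        * Sketch of how the single read buffer allocated below is carved up
        * (pointers are handed out in this order in the code):
        *
        *   rdBuf:  [ sos-region data | eos-region data | parity (*rpBufPtr) ]
        *               numSect[0]        numSect[1]       numParitySect
        */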
  938 void
  939 rf_GenerateFailedAccessASMs(
  940         RF_Raid_t                *raidPtr,
  941         RF_AccessStripeMap_t     *asmap,
  942         RF_PhysDiskAddr_t        *failedPDA,
  943         RF_DagHeader_t           *dag_h,
  944         RF_AccessStripeMapHeader_t **new_asm_h,
  945         int                      *nXorBufs,
  946         char                    **rpBufPtr,
  947         char                     *overlappingPDAs,
  948         RF_AllocListElem_t       *allocList
  949 )
  950 {
  951         RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
  952 
  953         /* s=start, e=end, s=stripe, a=access, f=failed, su=stripe unit */
  954         RF_RaidAddr_t sosAddr, sosEndAddr, eosStartAddr, eosAddr;
  955 
  956         RF_SectorCount_t numSect[2], numParitySect;
  957         RF_PhysDiskAddr_t *pda;
  958         char *rdBuf, *bufP;
  959         int foundit, i;
  960 
  961         bufP = NULL;
  962         foundit = 0;
  963         /*
  964          * First compute the following raid addresses:
   965          * - (sosAddr)      Start of stripe.
   966          * - (sosEndAddr)   MIN(start of access, start of failed SU).
   967          * - (eosStartAddr) MAX(end of access, end of failed SU).
   968          * - (eosAddr)      End of stripe
   969          *                  (i.e. start of next stripe).
  970          */
  971         sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
  972             asmap->raidAddress);
  973         sosEndAddr = RF_MIN(asmap->raidAddress,
  974             rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
  975             failedPDA->raidAddress));
  976         eosStartAddr = RF_MAX(asmap->endRaidAddress,
  977             rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr,
  978             failedPDA->raidAddress));
  979         eosAddr = rf_RaidAddressOfNextStripeBoundary(layoutPtr,
  980             asmap->raidAddress);
  981 
  982         /*
  983          * Now generate access stripe maps for each of the above regions of
  984          * the stripe. Use a dummy (NULL) buf ptr for now.
  985          */
  986 
  987         new_asm_h[0] = (sosAddr != sosEndAddr) ?
  988             rf_MapAccess(raidPtr, sosAddr, sosEndAddr - sosAddr, NULL,
  989             RF_DONT_REMAP) : NULL;
  990         new_asm_h[1] = (eosStartAddr != eosAddr) ?
  991             rf_MapAccess(raidPtr, eosStartAddr, eosAddr - eosStartAddr, NULL,
  992             RF_DONT_REMAP) : NULL;
  993 
  994         /*
  995          * Walk through the PDAs and range-restrict each SU to the region of
  996          * the SU touched on the failed PDA. Also compute total data buffer
  997          * space requirements in this step. Ignore the parity for now.
  998          */
  999 
 1000         numSect[0] = numSect[1] = 0;
 1001         if (new_asm_h[0]) {
 1002                 new_asm_h[0]->next = dag_h->asmList;
 1003                 dag_h->asmList = new_asm_h[0];
 1004                 for (pda = new_asm_h[0]->stripeMap->physInfo; pda;
 1005                      pda = pda->next) {
 1006                         rf_RangeRestrictPDA(raidPtr, failedPDA, pda,
 1007                             RF_RESTRICT_NOBUFFER, 0);
 1008                         numSect[0] += pda->numSector;
 1009                 }
 1010         }
 1011         if (new_asm_h[1]) {
 1012                 new_asm_h[1]->next = dag_h->asmList;
 1013                 dag_h->asmList = new_asm_h[1];
 1014                 for (pda = new_asm_h[1]->stripeMap->physInfo;
 1015                      pda; pda = pda->next) {
 1016                         rf_RangeRestrictPDA(raidPtr, failedPDA, pda,
 1017                             RF_RESTRICT_NOBUFFER, 0);
 1018                         numSect[1] += pda->numSector;
 1019                 }
 1020         }
 1021         numParitySect = failedPDA->numSector;
 1022 
 1023         /*
 1024          * Allocate buffer space for the data & parity we have to read to
 1025          * recover from the failure.
 1026          */
 1027 
 1028         if (numSect[0] + numSect[1] + ((rpBufPtr) ? numParitySect : 0)) {
 1029                 /* Don't allocate parity buf if not needed. */
 1030                 RF_MallocAndAdd(rdBuf, rf_RaidAddressToByte(raidPtr,
 1031                     numSect[0] + numSect[1] + numParitySect), (char *),
 1032                     allocList);
 1033                 bufP = rdBuf;
 1034                 if (rf_degDagDebug)
 1035                         printf("Newly allocated buffer (%d bytes) is 0x%lx\n",
 1036                             (int) rf_RaidAddressToByte(raidPtr,
 1037                             numSect[0] + numSect[1] + numParitySect),
 1038                             (unsigned long) bufP);
 1039         }
 1040         /*
 1041          * Now walk through the pdas one last time and assign buffer pointers
 1042          * (ugh!). Again, ignore the parity. Also, count nodes to find out
 1043          * how many bufs need to be xored together.
 1044          */
 1045         (*nXorBufs) = 1;        /* In read case, 1 is for parity. */
 1046                                 /* In write case, 1 is for failed data. */
 1047         if (new_asm_h[0]) {
 1048                 for (pda = new_asm_h[0]->stripeMap->physInfo; pda;
 1049                      pda = pda->next) {
 1050                         pda->bufPtr = bufP;
 1051                         bufP += rf_RaidAddressToByte(raidPtr, pda->numSector);
 1052                 }
 1053                 *nXorBufs += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
 1054         }
 1055         if (new_asm_h[1]) {
 1056                 for (pda = new_asm_h[1]->stripeMap->physInfo; pda;
 1057                      pda = pda->next) {
 1058                         pda->bufPtr = bufP;
 1059                         bufP += rf_RaidAddressToByte(raidPtr, pda->numSector);
 1060                 }
 1061                 (*nXorBufs) += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
 1062         }
 1063         if (rpBufPtr)
 1064                 /* The rest of the buffer is for parity. */
 1065                 *rpBufPtr = bufP;
 1066 
 1067         /*
 1068          * The last step is to figure out how many more distinct buffers need
  1069          * to get xor'd to produce the missing unit. There's one for each
 1070          * user-data read node that overlaps the portion of the failed unit
 1071          * being accessed.
 1072          */
 1073 
 1074         for (foundit = i = 0, pda = asmap->physInfo;
 1075              pda; i++, pda = pda->next) {
 1076                 if (pda == failedPDA) {
 1077                         i--;
 1078                         foundit = 1;
 1079                         continue;
 1080                 }
 1081                 if (rf_PDAOverlap(layoutPtr, pda, failedPDA)) {
 1082                         overlappingPDAs[i] = 1;
 1083                         (*nXorBufs)++;
 1084                 }
 1085         }
 1086         if (!foundit) {
 1087                 RF_ERRORMSG("GenerateFailedAccessASMs: did not find failedPDA"
 1088                     " in asm list.\n");
 1089                 RF_ASSERT(0);
 1090         }
 1091         if (rf_degDagDebug) {
 1092                 if (new_asm_h[0]) {
 1093                         printf("First asm:\n");
 1094                         rf_PrintFullAccessStripeMap(new_asm_h[0], 1);
 1095                 }
 1096                 if (new_asm_h[1]) {
 1097                         printf("Second asm:\n");
 1098                         rf_PrintFullAccessStripeMap(new_asm_h[1], 1);
 1099                 }
 1100         }
 1101 }
 1102 
 1103 
 1104 /*
 1105  * Adjust the offset and number of sectors in the destination pda so that
 1106  * it covers at most the region of the SU covered by the source PDA. This
 1107  * is exclusively a restriction:  the number of sectors indicated by the
 1108  * target PDA can only shrink.
 1109  *
 1110  * For example:  s = sectors within SU indicated by source PDA
 1111  *               d = sectors within SU indicated by dest PDA
 1112  *               r = results, stored in dest PDA
 1113  *
 1114  * |--------------- one stripe unit ---------------------|
 1115  * |           sssssssssssssssssssssssssssssssss         |
 1116  * |    ddddddddddddddddddddddddddddddddddddddddddddd    |
 1117  * |           rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr         |
 1118  *
 1119  * Another example:
 1120  *
 1121  * |--------------- one stripe unit ---------------------|
 1122  * |           sssssssssssssssssssssssssssssssss         |
 1123  * |    ddddddddddddddddddddddd                          |
 1124  * |           rrrrrrrrrrrrrrrr                          |
 1125  *
 1126  */
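       /*
        * A made-up numeric instance of the first picture: with soffs = 11,
        * send = 43, doffs = 4 and dend = 52, the code below yields
        * dest->startSector = subAddr + 11 and dest->numSector =
        * 43 - 11 + 1 = 33, i.e. exactly the part of the SU covered by the
        * source PDA.
        */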
 1127 void
 1128 rf_RangeRestrictPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *src,
 1129     RF_PhysDiskAddr_t *dest, int dobuffer, int doraidaddr)
 1130 {
 1131         RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
 1132         RF_SectorNum_t soffs =
 1133             rf_StripeUnitOffset(layoutPtr, src->startSector);
 1134         RF_SectorNum_t doffs =
 1135             rf_StripeUnitOffset(layoutPtr, dest->startSector);
 1136         RF_SectorNum_t send =
 1137             rf_StripeUnitOffset(layoutPtr, src->startSector +
 1138             src->numSector - 1); /* Use -1 to be sure we stay within SU. */
 1139         RF_SectorNum_t dend =
 1140             rf_StripeUnitOffset(layoutPtr, dest->startSector +
 1141             dest->numSector - 1);
 1142         RF_SectorNum_t subAddr =
 1143             rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
 1144             dest->startSector); /* Stripe unit boundary. */
 1145 
 1146         dest->startSector = subAddr + RF_MAX(soffs, doffs);
 1147         dest->numSector = subAddr + RF_MIN(send, dend) + 1 - dest->startSector;
 1148 
 1149         if (dobuffer)
 1150                 dest->bufPtr += (soffs > doffs) ?
 1151                     rf_RaidAddressToByte(raidPtr, soffs - doffs) : 0;
 1152         if (doraidaddr) {
 1153                 dest->raidAddress =
 1154                     rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
 1155                     dest->raidAddress) +
 1156                     rf_StripeUnitOffset(layoutPtr, dest->startSector);
 1157         }
 1158 }
 1159 
 1160 /*
 1161  * Want the highest of these primes to be the largest one
 1162  * less than the max expected number of columns (won't hurt
 1163  * to be too small or too large, but won't be optimal, either)
 1164  * --jimz
 1165  */
 1166 #define NLOWPRIMES      8
 1167 static int lowprimes[NLOWPRIMES] = {2, 3, 5, 7, 11, 13, 17, 19};
 1168 
 1169 /*****************************************************************************
 1170  * Compute the workload shift factor. (chained declustering)
 1171  *
 1172  * Return nonzero if access should shift to secondary, otherwise,
 1173  * access is to primary.
 1174  *****************************************************************************/
 1175 int
 1176 rf_compute_workload_shift(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda)
 1177 {
 1178         /*
 1179          * Variables:
 1180          *  d   = Column of disk containing primary.
 1181          *  f   = Column of failed disk.
 1182          *  n   = Number of disks in array.
 1183          *  sd  = "shift distance"
 1184          *        (number of columns that d is to the right of f).
 1185          *  row = Row of array the access is in.
 1186          *  v   = Numerator of redirection ratio.
 1187          *  k   = Denominator of redirection ratio.
 1188          */
 1189         RF_RowCol_t d, f, sd, row, n;
 1190         int k, v, ret, i;
 1191 
 1192         row = pda->row;
 1193         n = raidPtr->numCol;
 1194 
 1195         /* Assign column of primary copy to d. */
 1196         d = pda->col;
 1197 
 1198         /* Assign column of dead disk to f. */
  1199         for (f = 0; ((f < n) &&
  1200              (!RF_DEAD_DISK(raidPtr->Disks[row][f].status))); f++);
 1201 
 1202         RF_ASSERT(f < n);
 1203         RF_ASSERT(f != d);
 1204 
 1205         sd = (f > d) ? (n + d - f) : (d - f);
 1206         RF_ASSERT(sd < n);
 1207 
 1208         /*
 1209          * v of every k accesses should be redirected.
 1210          *
 1211          * v/k := (n-1-sd)/(n-1)
 1212          */
 1213         v = (n - 1 - sd);
 1214         k = (n - 1);
 1215 
 1216 #if 1
 1217         /*
 1218          * XXX
 1219          * Is this worth it ?
 1220          *
 1221          * Now reduce the fraction, by repeatedly factoring
 1222          * out primes (just like they teach in elementary school !).
 1223          */
 1224         for (i = 0; i < NLOWPRIMES; i++) {
 1225                 if (lowprimes[i] > v)
 1226                         break;
 1227                 while (((v % lowprimes[i]) == 0) && ((k % lowprimes[i]) == 0)) {
 1228                         v /= lowprimes[i];
 1229                         k /= lowprimes[i];
 1230                 }
 1231         }
 1232 #endif
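               /*
                * Worked example (illustrative numbers only): with n = 5
                * columns, failed column f = 1 and primary column d = 3,
                * sd = d - f = 2, so v/k = (n-1-sd)/(n-1) = 2/4, reduced
                * above to 1/2: one of every two accesses to this primary
                * is redirected to the secondary copy.
                */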
 1233 
 1234         raidPtr->hist_diskreq[row][d]++;
 1235         if (raidPtr->hist_diskreq[row][d] > v) {
 1236                 ret = 0;        /* Do not redirect. */
 1237         } else {
 1238                 ret = 1;        /* Redirect. */
 1239         }
 1240 
 1241 #if 0
 1242         printf("d=%d f=%d sd=%d v=%d k=%d ret=%d h=%d\n", d, f, sd, v, k, ret,
 1243             raidPtr->hist_diskreq[row][d]);
 1244 #endif
 1245 
 1246         if (raidPtr->hist_diskreq[row][d] >= k) {
 1247                 /* Reset counter. */
 1248                 raidPtr->hist_diskreq[row][d] = 0;
 1249         }
 1250         return (ret);
 1251 }
 1252 
 1253 /*
 1254  * Disk selection routines.
 1255  */
 1256 
 1257 /*
 1258  * Select the disk with the shortest queue from a mirror pair.
 1259  * Both the disk I/Os queued in RAIDframe as well as those at the physical
 1260  * disk are counted as members of the "queue".
 1261  */
 1262 void
 1263 rf_SelectMirrorDiskIdle(RF_DagNode_t *node)
 1264 {
 1265         RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
 1266         RF_RowCol_t rowData, colData, rowMirror, colMirror;
 1267         int dataQueueLength, mirrorQueueLength, usemirror;
 1268         RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *) node->params[0].p;
 1269         RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *) node->params[4].p;
 1270         RF_PhysDiskAddr_t *tmp_pda;
 1271         RF_RaidDisk_t **disks = raidPtr->Disks;
 1272         RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
 1273 
 1274         /* Return the [row col] of the disk with the shortest queue. */
 1275         rowData = data_pda->row;
 1276         colData = data_pda->col;
 1277         rowMirror = mirror_pda->row;
 1278         colMirror = mirror_pda->col;
 1279         dataQueue = &(dqs[rowData][colData]);
 1280         mirrorQueue = &(dqs[rowMirror][colMirror]);
 1281 
 1282 #ifdef  RF_LOCK_QUEUES_TO_READ_LEN
 1283         RF_LOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
 1284 #endif  /* RF_LOCK_QUEUES_TO_READ_LEN */
 1285         dataQueueLength = dataQueue->queueLength + dataQueue->numOutstanding;
 1286 #ifdef  RF_LOCK_QUEUES_TO_READ_LEN
 1287         RF_UNLOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
 1288         RF_LOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
 1289 #endif  /* RF_LOCK_QUEUES_TO_READ_LEN */
 1290         mirrorQueueLength = mirrorQueue->queueLength +
 1291             mirrorQueue->numOutstanding;
 1292 #ifdef  RF_LOCK_QUEUES_TO_READ_LEN
 1293         RF_UNLOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
 1294 #endif  /* RF_LOCK_QUEUES_TO_READ_LEN */
 1295 
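               /*
                * Selection policy, in order: never use a dead disk; if the
                * parity is not known to be clean, trust only the primary
                * copy; otherwise pick the shorter queue, breaking ties by
                * whichever disk's last dequeued sector is closer to the
                * requested start sector.
                */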
 1296         usemirror = 0;
 1297         if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
 1298                 usemirror = 0;
 1299         } else
 1300                 if (RF_DEAD_DISK(disks[rowData][colData].status)) {
 1301                         usemirror = 1;
 1302                 } else
 1303                         if (raidPtr->parity_good == RF_RAID_DIRTY) {
 1304                                 /* Trust only the main disk. */
 1305                                 usemirror = 0;
 1306                         } else
 1307                         if (dataQueueLength < mirrorQueueLength) {
 1308                                 usemirror = 0;
 1309                         } else
 1310                                 if (mirrorQueueLength < dataQueueLength) {
 1311                                         usemirror = 1;
 1312                                 } else {
 1313                                         /* Queues are equal length. */
 1314                                         /* Attempt cleverness. */
  1315                                         if (SNUM_DIFF(
  1316                                             dataQueue->last_deq_sector,
  1317                                             data_pda->startSector) <=
  1318                                             SNUM_DIFF(
  1319                                             mirrorQueue->last_deq_sector,
  1320                                             mirror_pda->startSector)) {
 1321                                                 usemirror = 0;
 1322                                         } else {
 1323                                                 usemirror = 1;
 1324                                         }
 1325                                 }
 1326 
 1327         if (usemirror) {
 1328                 /* Use mirror (parity) disk, swap params 0 & 4. */
 1329                 tmp_pda = data_pda;
 1330                 node->params[0].p = mirror_pda;
 1331                 node->params[4].p = tmp_pda;
 1332         } else {
 1333                 /* Use data disk, leave param 0 unchanged. */
 1334         }
 1335         /*printf("dataQueueLength %d, mirrorQueueLength %d\n", dataQueueLength,
 1336             mirrorQueueLength);*/
 1337 }
 1338 
 1339 /*
 1340  * Do simple partitioning. This assumes that
 1341  * the data and parity disks are laid out identically.
 1342  */
 1343 void
 1344 rf_SelectMirrorDiskPartition(RF_DagNode_t *node)
 1345 {
 1346         RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
 1347         RF_RowCol_t rowData, colData, rowMirror, colMirror;
 1348         RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *) node->params[0].p;
 1349         RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *) node->params[4].p;
 1350         RF_PhysDiskAddr_t *tmp_pda;
 1351         RF_RaidDisk_t **disks = raidPtr->Disks;
 1352         RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
 1353         int usemirror;
 1354 
 1355         /* Return the [row col] of the disk with the shortest queue. */
 1356         rowData = data_pda->row;
 1357         colData = data_pda->col;
 1358         rowMirror = mirror_pda->row;
 1359         colMirror = mirror_pda->col;
 1360         dataQueue = &(dqs[rowData][colData]);
 1361         mirrorQueue = &(dqs[rowMirror][colMirror]);
 1362 
 1363         usemirror = 0;
 1364         if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
 1365                 usemirror = 0;
 1366         } else
 1367                 if (RF_DEAD_DISK(disks[rowData][colData].status)) {
 1368                         usemirror = 1;
 1369                 } else
 1370                         if (raidPtr->parity_good == RF_RAID_DIRTY) {
 1371                                 /* Trust only the main disk. */
 1372                                 usemirror = 0;
  1373                         } else
  1374                                 if (data_pda->startSector <
  1375                                     (disks[rowData][colData].numBlocks / 2)) {
  1376                                         usemirror = 0;
  1377                                 } else {
  1378                                         usemirror = 1;
  1379                                 }
 1380 
 1381         if (usemirror) {
 1382                 /* Use mirror (parity) disk, swap params 0 & 4. */
 1383                 tmp_pda = data_pda;
 1384                 node->params[0].p = mirror_pda;
 1385                 node->params[4].p = tmp_pda;
 1386         } else {
 1387                 /* Use data disk, leave param 0 unchanged. */
 1388         }
 1389 }
