root/dev/raidframe/rf_pqdegdags.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. RF_CREATE_DAG_FUNC_DECL
  2. rf_applyPDA
  3. rf_PQDoubleRecoveryFunc
  4. rf_PQWriteDoubleRecoveryFunc
  5. RF_CREATE_DAG_FUNC_DECL
  6. RF_CREATE_DAG_FUNC_DECL
  7. RF_CREATE_DAG_FUNC_DECL

    1 /*      $OpenBSD: rf_pqdegdags.c,v 1.5 2002/12/16 07:01:04 tdeval Exp $ */
    2 /*      $NetBSD: rf_pqdegdags.c,v 1.5 1999/08/15 02:36:40 oster Exp $   */
    3 
    4 /*
    5  * Copyright (c) 1995 Carnegie-Mellon University.
    6  * All rights reserved.
    7  *
    8  * Author: Daniel Stodolsky
    9  *
   10  * Permission to use, copy, modify and distribute this software and
   11  * its documentation is hereby granted, provided that both the copyright
   12  * notice and this permission notice appear in all copies of the
   13  * software, derivative works or modified versions, and any portions
   14  * thereof, and that both notices appear in supporting documentation.
   15  *
   16  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
   17  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
   18  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
   19  *
   20  * Carnegie Mellon requests users of this software to return to
   21  *
   22  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
   23  *  School of Computer Science
   24  *  Carnegie Mellon University
   25  *  Pittsburgh PA 15213-3890
   26  *
   27  * any improvements or extensions that they make and grant Carnegie the
   28  * rights to redistribute these changes.
   29  */
   30 
   31 /*
   32  * rf_pqdegdags.c
   33  * Degraded mode dags for double fault cases.
   34  */
   35 
   36 
   37 #include "rf_archs.h"
   38 
   39 #if     (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
   40 
   41 #include "rf_types.h"
   42 #include "rf_raid.h"
   43 #include "rf_dag.h"
   44 #include "rf_dagdegrd.h"
   45 #include "rf_dagdegwr.h"
   46 #include "rf_dagfuncs.h"
   47 #include "rf_dagutils.h"
   48 #include "rf_etimer.h"
   49 #include "rf_acctrace.h"
   50 #include "rf_general.h"
   51 #include "rf_pqdegdags.h"
   52 #include "rf_pq.h"
   53 
   54 void rf_applyPDA(RF_Raid_t *, RF_PhysDiskAddr_t *, RF_PhysDiskAddr_t *,
   55         RF_PhysDiskAddr_t *, void *);
   56 
   57 /*
   58  * Two data drives have failed, and we are doing a read that covers one of them.
   59  * We may also be reading some of the surviving drives.
   60  */
   61 
   62 
   63 /*****************************************************************************
   64  *
   65  * Creates a DAG to perform a degraded-mode read of data within one stripe.
   66  * This DAG is as follows:
   67  *
   68  *                                      Hdr
   69  *                                       |
   70  *                                     Block
   71  *                       /         /           \         \     \   \
   72  *                      Rud  ...  Rud         Rrd  ...  Rrd    Rp  Rq
   73  *                      | \       | \         | \       | \    | \ | \
   74  *
   75  *                                 |                 |
   76  *                              Unblock              X
   77  *                                  \               /
   78  *                                   ------ T ------
   79  *
   80  * Each R node is a successor of the Block node.
   81  * One successor arc from each R node goes to Unblock, and the other to X.
   82  * There is one Rud for each chunk of surviving user data requested by the
   83  * user, and one Rrd for each chunk of surviving user data _not_ being read
   84  * by the user.
   85  * R = read, ud = user data, rd = recovery (surviving) data, p = P data,
   86  * q = Qdata, X = pq recovery node, T = terminate
   87  *
   88  * The block & unblock nodes are leftovers from a previous version. They
   89  * do nothing, but I haven't deleted them because it would be a tremendous
   90  * effort to put them back in.
   91  *
   92  * Note:  The target buffer for the XOR node is set to the actual user buffer
   93  * where the failed data is supposed to end up. This buffer is zero'd by the
   94  * code here. Thus, if you create a degraded read dag, use it, and then
   95  * re-use it, you must zero the target buffer prior to the re-use.
   96  *
   97  * Every buffer read is passed to the pq recovery node, whose job it is to
   98  * sort out what's needed and what's not.
   99  *****************************************************************************/
  100 
  /*
   * Helper macros for wiring up disk read nodes.  They expand in the
   * caller's scope and pick up the following names from it:
   *
   *   INIT_DISK_NODE    dag_h, allocList, blockNode, unblockNode,
   *                     recoveryNode
   *   DISK_NODE_PARAMS  parityStripeID, which_ru
   *
   * Each macro argument is expanded more than once, so do not pass
   * expressions with side effects.
   */

  /*
   * Initialise `dnode' (a node pointer) as a disk read node called `label'
   * with two successors (unblock node, then recovery node) and a single
   * control predecessor (the block node).
   */
  #define INIT_DISK_NODE(dnode, label)                                    \
  do {                                                                    \
          rf_InitNode(dnode, rf_wait, RF_FALSE,                           \
              rf_DiskReadFunc, rf_DiskReadUndoFunc,                       \
              rf_GenericWakeupFunc,                                       \
              2, 1, 4, 0,                                                 \
              dag_h, label, allocList);                                   \
          (dnode)->succedents[0] = unblockNode;                           \
          (dnode)->succedents[1] = recoveryNode;                          \
          (dnode)->antecedents[0] = blockNode;                            \
          (dnode)->antType[0] = rf_control;                               \
  } while (0)

  /*
   * Fill the four parameter slots of `nodeval' (a node object, not a
   * pointer) from `addr', a pointer to the physical disk address to read:
   * slot 0 is the address itself, slot 1 its buffer, slot 2 the parity
   * stripe ID and slot 3 the priority / reconstruction-unit word.
   */
  #define DISK_NODE_PARAMS(nodeval, addr)                                 \
  do {                                                                    \
          (nodeval).params[0].p = (addr);                                 \
          (nodeval).params[1].p = (addr)->bufPtr;                         \
          (nodeval).params[2].v = parityStripeID;                         \
          (nodeval).params[3].v =                                         \
              RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);    \
  } while (0)

  /* Fetch the disk address stored in parameter slot 0 of a node pointer. */
  #define DISK_NODE_PDA(dnode)    ((dnode)->params[0].p)
  123 
  124 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
  125 {
  126         rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
  127             "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
  128 }
  129 
  130 void
  131 rf_applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda,
  132     RF_PhysDiskAddr_t *ppda, RF_PhysDiskAddr_t *qpda, void *bp)
  133 {
  134         RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
  135         RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
  136         RF_SectorCount_t s0len = ppda->numSector, len;
  137         RF_SectorNum_t suoffset;
  138         unsigned coeff;
  139         char *pbuf = ppda->bufPtr;
  140         char *qbuf = qpda->bufPtr;
  141         char *buf;
  142         int delta;
  143 
  144         suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
  145         len = pda->numSector;
  146         /* See if pda intersects a recovery pda. */
  147         if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
  148                 buf = pda->bufPtr;
  149                 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
  150                     pda->raidAddress);
  151                 coeff = (coeff % raidPtr->Layout.numDataCol);
  152 
  153                 if (suoffset < s0off) {
  154                         delta = s0off - suoffset;
  155                         buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
  156                             delta);
  157                         suoffset = s0off;
  158                         len -= delta;
  159                 }
  160                 if (suoffset > s0off) {
  161                         delta = suoffset - s0off;
  162                         pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
  163                             delta);
  164                         qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
  165                             delta);
  166                 }
  167                 if ((suoffset + len) > (s0len + s0off))
  168                         len = s0len + s0off - suoffset;
  169 
  170                 /* Src, dest, len. */
  171                 rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);
  172 
  173                 /* Dest, src, len, coeff. */
  174                 rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf,
  175                     rf_RaidAddressToByte(raidPtr, len), coeff);
  176         }
  177 }
  178 
  179 
  180 /*
  181  * Recover data in the case of a double failure. There can be two
  182  * result buffers, one for each chunk of data trying to be recovered.
  183  * The params are pda's that have not been range restricted or otherwise
  184  * politely massaged - this should be done here. The last params are the
  185  * pdas of P and Q, followed by the raidPtr. The list can look like
  186  *
  187  *   pda, pda, ..., p pda, q pda, raidptr, asm
  188  *
  189  * or
  190  *
  191  *   pda, pda, ..., p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
  192  *
  193  * depending on whether two chunks of recovery data were required.
  194  *
  195  * The second condition only arises if there are two failed buffers
  196  * whose lengths do not add up a stripe unit.
  197  */
  198 
  199 int
  200 rf_PQDoubleRecoveryFunc(RF_DagNode_t *node)
  201 {
  202         int np = node->numParams;
  203         RF_AccessStripeMap_t *asmap =
  204             (RF_AccessStripeMap_t *) node->params[np - 1].p;
  205         RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
  206         RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
  207         int d, i;
  208         unsigned coeff;
  209         RF_RaidAddr_t sosAddr, suoffset;
  210         RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
  211         int two = 0;
  212         RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
  213         char *buf;
  214         int numDataCol = layoutPtr->numDataCol;
  215         RF_Etimer_t timer;
  216         RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
  217 
  218         RF_ETIMER_START(timer);
  219 
  220         if (asmap->failedPDAs[1] &&
  221             (asmap->failedPDAs[1]->numSector +
  222              asmap->failedPDAs[0]->numSector < secPerSU)) {
  223                 RF_ASSERT(0);
  224                 ppda = node->params[np - 6].p;
  225                 ppda2 = node->params[np - 5].p;
  226                 qpda = node->params[np - 4].p;
  227                 qpda2 = node->params[np - 3].p;
  228                 d = (np - 6);
  229                 two = 1;
  230         } else {
  231                 ppda = node->params[np - 4].p;
  232                 qpda = node->params[np - 3].p;
  233                 d = (np - 4);
  234         }
  235 
  236         for (i = 0; i < d; i++) {
  237                 pda = node->params[i].p;
  238                 buf = pda->bufPtr;
  239                 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
  240                 len = pda->numSector;
  241                 coeff = rf_RaidAddressToStripeUnitID(layoutPtr,
  242                     pda->raidAddress);
  243                 /* Compute the data unit offset within the column. */
  244                 coeff = (coeff % raidPtr->Layout.numDataCol);
  245                 /* See if pda intersects a recovery pda. */
  246                 rf_applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
  247                 if (two)
  248                         rf_applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
  249         }
  250 
  251         /*
  252          * Ok, we got the parity back to the point where we can recover. We
  253          * now need to determine the coeff of the columns that need to be
  254          * recovered. We can also only need to recover a single stripe unit.
  255          */
  256 
  257         if (asmap->failedPDAs[1] == NULL) {     /*
  258                                                  * Only a single stripe unit
  259                                                  * to recover.
  260                                                  */
  261                 pda = asmap->failedPDAs[0];
  262                 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
  263                     asmap->raidAddress);
  264                 /* Need to determine the column of the other failed disk. */
  265                 coeff = rf_RaidAddressToStripeUnitID(layoutPtr,
  266                     pda->raidAddress);
  267                 /* Compute the data unit offset within the column. */
  268                 coeff = (coeff % raidPtr->Layout.numDataCol);
  269                 for (i = 0; i < numDataCol; i++) {
  270                         npda.raidAddress = sosAddr + (i * secPerSU);
  271                         (raidPtr->Layout.map->MapSector) (raidPtr,
  272                             npda.raidAddress, &(npda.row), &(npda.col),
  273                             &(npda.startSector), 0);
  274                         /* Skip over dead disks. */
  275                         if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col]
  276                             .status))
  277                                 if (i != coeff)
  278                                         break;
  279                 }
  280                 RF_ASSERT(i < numDataCol);
  281                 RF_ASSERT(two == 0);
  282                 /*
  283                  * Recover the data. Since we need only to recover one
  284                  * column, we overwrite the parity with the other one.
  285                  */
  286                 if (coeff < i)  /* Recovering 'a'. */
  287                         rf_PQ_recover((unsigned long *) ppda->bufPtr,
  288                             (unsigned long *) qpda->bufPtr,
  289                             (unsigned long *) pda->bufPtr,
  290                             (unsigned long *) ppda->bufPtr,
  291                             rf_RaidAddressToByte(raidPtr, pda->numSector),
  292                             coeff, i);
  293                 else            /* Recovering 'b'. */
  294                         rf_PQ_recover((unsigned long *) ppda->bufPtr,
  295                             (unsigned long *) qpda->bufPtr,
  296                             (unsigned long *) ppda->bufPtr,
  297                             (unsigned long *) pda->bufPtr,
  298                             rf_RaidAddressToByte(raidPtr, pda->numSector),
  299                             i, coeff);
  300         } else
  301                 RF_PANIC();
  302 
  303         RF_ETIMER_STOP(timer);
  304         RF_ETIMER_EVAL(timer);
  305         if (tracerec)
  306                 tracerec->q_us += RF_ETIMER_VAL_US(timer);
  307         rf_GenericWakeupFunc(node, 0);
  308         return (0);
  309 }
  310 
  311 int
  312 rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node)
  313 {
  314         /*
  315          * The situation:
  316          *
  317          * We are doing a write that hits only one failed data unit. The other
  318          * failed data unit is not being overwritten, so we need to generate
  319          * it.
  320          *
  321          * For the moment, we assume all the nonfailed data being written is in
  322          * the shadow of the failed data unit. (i.e., either a single data
  323          * unit write or the entire failed stripe unit is being overwritten.)
  324          *
  325          * Recovery strategy: apply the recovery data to the parity and Q.
  326          * Use P & Q to recover the second failed data unit in P. Zero fill
  327          * Q, then apply the recovered data to P. Then apply the data being
  328          * written to the failed drive. Then walk through the surviving drives,
  329          * applying new data when it exists, othewise the recovery data.
  330          * Quite a mess.
  331          *
  332          *
  333          * The params:
  334          *
  335          *   read pda0, read pda1, ..., read pda (numDataCol-3),
  336          *   write pda0, ..., write pda (numStripeUnitAccess - numDataFailed),
  337          *   failed pda, raidPtr, asmap
  338          */
  339 
  340         int np = node->numParams;
  341         RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *)
  342             node->params[np - 1].p;
  343         RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
  344         RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
  345         int i;
  346         RF_RaidAddr_t sosAddr;
  347         unsigned coeff;
  348         RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
  349         RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
  350         int numDataCol = layoutPtr->numDataCol;
  351         RF_Etimer_t timer;
  352         RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
  353 
  354         RF_ASSERT(node->numResults == 2);
  355         RF_ASSERT(asmap->failedPDAs[1] == NULL);
  356         RF_ETIMER_START(timer);
  357         ppda = node->results[0];
  358         qpda = node->results[1];
  359         /* apply the recovery data */
  360         for (i = 0; i < numDataCol - 2; i++)
  361                 rf_applyPDA(raidPtr, node->params[i].p, ppda, qpda,
  362                     node->dagHdr->bp);
  363 
  364         /* Determine the other failed data unit. */
  365         pda = asmap->failedPDAs[0];
  366         sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
  367             asmap->raidAddress);
  368         /* Need to determine the column of the other failed disk. */
  369         coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
  370         /* Compute the data unit offset within the column. */
  371         coeff = (coeff % raidPtr->Layout.numDataCol);
  372         for (i = 0; i < numDataCol; i++) {
  373                 npda.raidAddress = sosAddr + (i * secPerSU);
  374                 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress,
  375                     &(npda.row), &(npda.col), &(npda.startSector), 0);
  376                 /* Skip over dead disks. */
  377                 if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
  378                         if (i != coeff)
  379                                 break;
  380         }
  381         RF_ASSERT(i < numDataCol);
  382         /*
  383          * Recover the data. The column we want to recover, we write over the
  384          * parity. The column we don't care about, we dump in q.
  385          */
  386         if (coeff < i)          /* Recovering 'a'. */
  387                 rf_PQ_recover((unsigned long *) ppda->bufPtr,
  388                     (unsigned long *) qpda->bufPtr,
  389                     (unsigned long *) ppda->bufPtr,
  390                     (unsigned long *) qpda->bufPtr,
  391                     rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
  392         else                    /* Recovering 'b'. */
  393                 rf_PQ_recover((unsigned long *) ppda->bufPtr,
  394                     (unsigned long *) qpda->bufPtr,
  395                     (unsigned long *) qpda->bufPtr,
  396                     (unsigned long *) ppda->bufPtr,
  397                     rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
  398 
  399         /* OK. The valid data is in P. Zero fill Q, then inc it into it. */
  400         bzero(qpda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector));
  401         rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr,
  402             rf_RaidAddressToByte(raidPtr, qpda->numSector), i);
  403 
  404         /* Now apply all the write data to the buffer. */
  405         /*
  406          * Single stripe unit write case: The failed data is the only thing
  407          * we are writing.
  408          */
  409         RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
  410         /* Dest, src, len, coeff. */
  411         rf_IncQ((unsigned long *) qpda->bufPtr,
  412             (unsigned long *) asmap->failedPDAs[0]->bufPtr,
  413             rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
  414         rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr,
  415             rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);
  416 
  417         /* Now apply all the recovery data. */
  418         for (i = 0; i < numDataCol - 2; i++)
  419                 rf_applyPDA(raidPtr, node->params[i].p, ppda, qpda,
  420                     node->dagHdr->bp);
  421 
  422         RF_ETIMER_STOP(timer);
  423         RF_ETIMER_EVAL(timer);
  424         if (tracerec)
  425                 tracerec->q_us += RF_ETIMER_VAL_US(timer);
  426 
  427         rf_GenericWakeupFunc(node, 0);
  428         return (0);
  429 }
  430 
  /*
   * DAG creation for the large-write case with two failed data units.
   * There is no implementation: the body is an unconditional RF_PANIC().
   */
  RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
  {
          /* Unimplemented. */
          RF_PANIC();
  }
  435 
  436 
  437 /*
  438  * Two lost data unit write case.
  439  *
  440  * There are really two cases here:
  441  *
  442  * (1) The write completely covers the two lost data units.
  443  *     In that case, a reconstruct write that doesn't write the
  444  *     failed data units will do the correct thing. So in this case,
  445  *     the dag looks like
  446  *
  447  *         Full stripe read of surviving data units (not being overwritten)
  448  *         Write new data (ignoring failed units)
  449  *         Compute P&Q
  450  *         Write P&Q
  451  *
  452  *
  453  * (2) The write does not completely cover both failed data units
  454  *     (but touches at least one of them). Then we need to do the
  455  *     equivalent of a reconstruct read to recover the missing data
  456  *     unit from the other stripe.
  457  *
  458  *     For any data we are writing that is not in the "shadow"
  459  *     of the failed units, we need to do a four cycle update.
  460  *     PANIC on this case. For now.
  461  *
  462  */
  463 
  464 RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
  465 {
  466         RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
  467         RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
  468         int sum;
  469         int nf = asmap->numDataFailed;
  470 
  471         sum = asmap->failedPDAs[0]->numSector;
  472         if (nf == 2)
  473                 sum += asmap->failedPDAs[1]->numSector;
  474 
  475         if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
  476                 /* Large write case. */
  477                 rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
  478                 return;
  479         }
  480         if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
  481                 /* Small write case, no user data not in shadow. */
  482                 rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags,
  483                     allocList);
  484                 return;
  485         }
  486         RF_PANIC();
  487 }
  488 
  489 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
  490 {
  491         rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList,
  492             "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
  493 }
  494 
  495 #endif  /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */

/* [<][>][^][v][top][bottom][index][help] */