root/dev/raidframe/rf_dagfuncs.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. rf_ConfigureDAGFuncs
  2. rf_TerminateFunc
  3. rf_TerminateUndoFunc
  4. rf_DiskReadMirrorIdleFunc
  5. rf_DiskReadMirrorPartitionFunc
  6. rf_DiskReadMirrorUndoFunc
  7. rf_ParityLogUpdateFunc
  8. rf_ParityLogOverwriteFunc
  9. rf_ParityLogUpdateFunc
  10. rf_ParityLogOverwriteFunc
  11. rf_ParityLogUpdateUndoFunc
  12. rf_ParityLogOverwriteUndoFunc
  13. rf_NullNodeFunc
  14. rf_NullNodeUndoFunc
  15. rf_DiskReadFuncForThreads
  16. rf_DiskWriteFuncForThreads
  17. rf_DiskUndoFunc
  18. rf_DiskUnlockFuncForThreads
  19. rf_GenericWakeupFunc
  20. rf_RegularXorFunc
  21. rf_SimpleXorFunc
  22. rf_RecoveryXorFunc
  23. rf_XorIntoBuffer
  24. rf_bxor
  25. rf_longword_bxor
  26. rf_longword_bxor3
  27. rf_bxor3

    1 /*      $OpenBSD: rf_dagfuncs.c,v 1.7 2004/09/20 17:51:07 miod Exp $    */
    2 /*      $NetBSD: rf_dagfuncs.c,v 1.6 2000/03/30 12:45:40 augustss Exp $ */
    3 
    4 /*
    5  * Copyright (c) 1995 Carnegie-Mellon University.
    6  * All rights reserved.
    7  *
    8  * Author: Mark Holland, William V. Courtright II
    9  *
   10  * Permission to use, copy, modify and distribute this software and
   11  * its documentation is hereby granted, provided that both the copyright
   12  * notice and this permission notice appear in all copies of the
   13  * software, derivative works or modified versions, and any portions
   14  * thereof, and that both notices appear in supporting documentation.
   15  *
   16  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
   17  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
   18  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
   19  *
   20  * Carnegie Mellon requests users of this software to return to
   21  *
   22  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
   23  *  School of Computer Science
   24  *  Carnegie Mellon University
   25  *  Pittsburgh PA 15213-3890
   26  *
   27  * any improvements or extensions that they make and grant Carnegie the
   28  * rights to redistribute these changes.
   29  */
   30 
   31 /*
   32  * dagfuncs.c -- DAG node execution routines.
   33  *
   34  * Rules:
   35  * 1. Every DAG execution function must eventually cause node->status to
   36  *    get set to "good" or "bad", and "FinishNode" to be called. In the
   37  *    case of nodes that complete immediately (xor, NullNodeFunc, etc),
   38  *    the node execution function can do these two things directly. In
   39  *    the case of nodes that have to wait for some event (a disk read to
   40  *    complete, a lock to be released, etc) to occur before they can
   41  *    complete, this is typically achieved by having whatever module
   42  *    is doing the operation call GenericWakeupFunc upon completion.
   43  * 2. DAG execution functions should check the status in the DAG header
   44  *    and NOP out their operations if the status is not "enable". However,
   45  *    execution functions that release resources must be sure to release
   46  *    them even when they NOP out the function that would use them.
   47  *    Functions that acquire resources should go ahead and acquire them
   48  *    even when they NOP, so that a downstream release node will not have
   49  *    to check to find out whether or not the acquire was suppressed.
   50  */
   51 
   52 #include <sys/ioctl.h>
   53 #include <sys/param.h>
   54 
   55 #include "rf_archs.h"
   56 #include "rf_raid.h"
   57 #include "rf_dag.h"
   58 #include "rf_layout.h"
   59 #include "rf_etimer.h"
   60 #include "rf_acctrace.h"
   61 #include "rf_diskqueue.h"
   62 #include "rf_dagfuncs.h"
   63 #include "rf_general.h"
   64 #include "rf_engine.h"
   65 #include "rf_dagutils.h"
   66 
   67 #include "rf_kintf.h"
   68 
   69 #if     RF_INCLUDE_PARITYLOGGING > 0
   70 #include "rf_paritylog.h"
   71 #endif  /* RF_INCLUDE_PARITYLOGGING > 0 */
   72 
   73 int     (*rf_DiskReadFunc) (RF_DagNode_t *);
   74 int     (*rf_DiskWriteFunc) (RF_DagNode_t *);
   75 int     (*rf_DiskReadUndoFunc) (RF_DagNode_t *);
   76 int     (*rf_DiskWriteUndoFunc) (RF_DagNode_t *);
   77 int     (*rf_DiskUnlockFunc) (RF_DagNode_t *);
   78 int     (*rf_DiskUnlockUndoFunc) (RF_DagNode_t *);
   79 int     (*rf_RegularXorUndoFunc) (RF_DagNode_t *);
   80 int     (*rf_SimpleXorUndoFunc) (RF_DagNode_t *);
   81 int     (*rf_RecoveryXorUndoFunc) (RF_DagNode_t *);
   82 
   83 /*****************************************************************************
   84  * Main (only) configuration routine for this module.
   85  *****************************************************************************/
   86 int
   87 rf_ConfigureDAGFuncs(RF_ShutdownList_t **listp)
   88 {
   89         RF_ASSERT(((sizeof(long) == 8) && RF_LONGSHIFT == 3) ||
   90             ((sizeof(long) == 4) && RF_LONGSHIFT == 2));
   91         rf_DiskReadFunc = rf_DiskReadFuncForThreads;
   92         rf_DiskReadUndoFunc = rf_DiskUndoFunc;
   93         rf_DiskWriteFunc = rf_DiskWriteFuncForThreads;
   94         rf_DiskWriteUndoFunc = rf_DiskUndoFunc;
   95         rf_DiskUnlockFunc = rf_DiskUnlockFuncForThreads;
   96         rf_DiskUnlockUndoFunc = rf_NullNodeUndoFunc;
   97         rf_RegularXorUndoFunc = rf_NullNodeUndoFunc;
   98         rf_SimpleXorUndoFunc = rf_NullNodeUndoFunc;
   99         rf_RecoveryXorUndoFunc = rf_NullNodeUndoFunc;
  100         return (0);
  101 }
  102 
  103 
  104 /*****************************************************************************
  105  * The execution function associated with a terminate node.
  106  *****************************************************************************/
  107 int
  108 rf_TerminateFunc(RF_DagNode_t *node)
  109 {
  110         RF_ASSERT(node->dagHdr->numCommits == node->dagHdr->numCommitNodes);
  111         node->status = rf_good;
  112         return (rf_FinishNode(node, RF_THREAD_CONTEXT));
  113 }
  114 
  115 int
  116 rf_TerminateUndoFunc(RF_DagNode_t *node)
  117 {
  118         return (0);
  119 }
  120 
  121 
  122 /*****************************************************************************
  123  * Execution functions associated with a mirror node.
  124  *
  125  * parameters:
  126  *
  127  * 0 - Physical disk address of data.
  128  * 1 - Buffer for holding read data.
  129  * 2 - Parity stripe ID.
  130  * 3 - Flags.
  131  * 4 - Physical disk address of mirror (parity).
  132  *
  133  *****************************************************************************/
  134 
  135 int
  136 rf_DiskReadMirrorIdleFunc(RF_DagNode_t *node)
  137 {
  138         /*
  139          * Select the mirror copy with the shortest queue and fill in node
  140          * parameters with physical disk address.
  141          */
  142 
  143         rf_SelectMirrorDiskIdle(node);
  144         return (rf_DiskReadFunc(node));
  145 }
  146 
  147 int
  148 rf_DiskReadMirrorPartitionFunc(RF_DagNode_t *node)
  149 {
  150         /*
  151          * Select the mirror copy with the shortest queue and fill in node
  152          * parameters with physical disk address.
  153          */
  154 
  155         rf_SelectMirrorDiskPartition(node);
  156         return (rf_DiskReadFunc(node));
  157 }
  158 
  159 int
  160 rf_DiskReadMirrorUndoFunc(RF_DagNode_t *node)
  161 {
  162         return (0);
  163 }
  164 
  165 
  166 
  167 #if     RF_INCLUDE_PARITYLOGGING > 0
  168 /*****************************************************************************
  169  * The execution function associated with a parity log update node.
  170  *****************************************************************************/
  171 int
  172 rf_ParityLogUpdateFunc(RF_DagNode_t *node)
  173 {
  174         RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
  175         caddr_t buf = (caddr_t) node->params[1].p;
  176         RF_ParityLogData_t *logData;
  177         RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
  178         RF_Etimer_t timer;
  179 
  180         if (node->dagHdr->status == rf_enable) {
  181                 RF_ETIMER_START(timer);
  182                 logData = rf_CreateParityLogData(RF_UPDATE, pda, buf,
  183                     (RF_Raid_t *) (node->dagHdr->raidPtr),
  184                     node->wakeFunc, (void *) node,
  185                     node->dagHdr->tracerec, timer);
  186                 if (logData)
  187                         rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
  188                 else {
  189                         RF_ETIMER_STOP(timer);
  190                         RF_ETIMER_EVAL(timer);
  191                         tracerec->plog_us += RF_ETIMER_VAL_US(timer);
  192                         (node->wakeFunc) (node, ENOMEM);
  193                 }
  194         }
  195         return (0);
  196 }
  197 
  198 
  199 /*****************************************************************************
  200  * The execution function associated with a parity log overwrite node.
  201  *****************************************************************************/
  202 int
  203 rf_ParityLogOverwriteFunc(RF_DagNode_t *node)
  204 {
  205         RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
  206         caddr_t buf = (caddr_t) node->params[1].p;
  207         RF_ParityLogData_t *logData;
  208         RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
  209         RF_Etimer_t timer;
  210 
  211         if (node->dagHdr->status == rf_enable) {
  212                 RF_ETIMER_START(timer);
  213                 logData = rf_CreateParityLogData(RF_OVERWRITE, pda, buf,
  214                     (RF_Raid_t *) (node->dagHdr->raidPtr), node->wakeFunc,
  215                     (void *) node, node->dagHdr->tracerec, timer);
  216                 if (logData)
  217                         rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
  218                 else {
  219                         RF_ETIMER_STOP(timer);
  220                         RF_ETIMER_EVAL(timer);
  221                         tracerec->plog_us += RF_ETIMER_VAL_US(timer);
  222                         (node->wakeFunc) (node, ENOMEM);
  223                 }
  224         }
  225         return (0);
  226 }
  227 #else   /* RF_INCLUDE_PARITYLOGGING > 0 */
  228 
  229 int
  230 rf_ParityLogUpdateFunc(RF_DagNode_t *node)
  231 {
  232         return (0);
  233 }
  234 
  235 int
  236 rf_ParityLogOverwriteFunc(RF_DagNode_t *node)
  237 {
  238         return (0);
  239 }
  240 #endif  /* RF_INCLUDE_PARITYLOGGING > 0 */
  241 
  242 int
  243 rf_ParityLogUpdateUndoFunc(RF_DagNode_t *node)
  244 {
  245         return (0);
  246 }
  247 
  248 int
  249 rf_ParityLogOverwriteUndoFunc(RF_DagNode_t *node)
  250 {
  251         return (0);
  252 }
  253 
  254 /*****************************************************************************
  255  * The execution function associated with a NOP node.
  256  *****************************************************************************/
  257 int
  258 rf_NullNodeFunc(RF_DagNode_t *node)
  259 {
  260         node->status = rf_good;
  261         return (rf_FinishNode(node, RF_THREAD_CONTEXT));
  262 }
  263 
  264 int
  265 rf_NullNodeUndoFunc(RF_DagNode_t *node)
  266 {
  267         node->status = rf_undone;
  268         return (rf_FinishNode(node, RF_THREAD_CONTEXT));
  269 }
  270 
  271 
  272 /*****************************************************************************
  273  * The execution function associated with a disk-read node.
  274  *****************************************************************************/
  275 int
  276 rf_DiskReadFuncForThreads(RF_DagNode_t *node)
  277 {
  278         RF_DiskQueueData_t *req;
  279         RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
  280         caddr_t buf = (caddr_t) node->params[1].p;
  281         RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v;
  282         unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
  283         unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
  284         unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
  285         unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
  286         RF_DiskQueueDataFlags_t flags = 0;
  287         RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ?
  288             RF_IO_TYPE_READ : RF_IO_TYPE_NOP;
  289         RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
  290         void *b_proc = NULL;
  291 
  292         if (node->dagHdr->bp)
  293                 b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc;
  294 
  295         RF_ASSERT(!(lock && unlock));
  296         flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
  297         flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
  298 
  299         req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
  300             buf, parityStripeID, which_ru,
  301             (int (*) (void *, int)) node->wakeFunc,
  302             node, NULL, node->dagHdr->tracerec,
  303             (void *) (node->dagHdr->raidPtr), flags, b_proc);
  304         if (!req) {
  305                 (node->wakeFunc) (node, ENOMEM);
  306         } else {
  307                 node->dagFuncData = (void *) req;
  308                 rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, priority);
  309         }
  310         return (0);
  311 }
  312 
  313 
  314 /*****************************************************************************
  315  * the execution function associated with a disk-write node
  316  *****************************************************************************/
  317 int
  318 rf_DiskWriteFuncForThreads(RF_DagNode_t *node)
  319 {
  320         RF_DiskQueueData_t *req;
  321         RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
  322         caddr_t buf = (caddr_t) node->params[1].p;
  323         RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v;
  324         unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
  325         unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
  326         unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
  327         unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
  328         RF_DiskQueueDataFlags_t flags = 0;
  329         RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ?
  330             RF_IO_TYPE_WRITE : RF_IO_TYPE_NOP;
  331         RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
  332         void *b_proc = NULL;
  333 
  334         if (node->dagHdr->bp)
  335                 b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc;
  336 
  337         /* Normal processing (rollaway or forward recovery) begins here. */
  338         RF_ASSERT(!(lock && unlock));
  339         flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
  340         flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
  341         req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
  342             buf, parityStripeID, which_ru,
  343             (int (*) (void *, int)) node->wakeFunc, (void *) node, NULL,
  344             node->dagHdr->tracerec, (void *) (node->dagHdr->raidPtr),
  345             flags, b_proc);
  346 
  347         if (!req) {
  348                 (node->wakeFunc) (node, ENOMEM);
  349         } else {
  350                 node->dagFuncData = (void *) req;
  351                 rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, priority);
  352         }
  353 
  354         return (0);
  355 }
  356 /*****************************************************************************
  357  * The undo function for disk nodes.
  358  * Note:  This is not a proper undo of a write node, only locks are released.
  359  *        old data is not restored to disk !
  360  *****************************************************************************/
  361 int
  362 rf_DiskUndoFunc(RF_DagNode_t *node)
  363 {
  364         RF_DiskQueueData_t *req;
  365         RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
  366         RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
  367 
  368         req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP, 0L, 0, NULL, 0L, 0,
  369             (int (*) (void *, int)) node->wakeFunc, (void *) node,
  370             NULL, node->dagHdr->tracerec, (void *) (node->dagHdr->raidPtr),
  371             RF_UNLOCK_DISK_QUEUE, NULL);
  372         if (!req)
  373                 (node->wakeFunc) (node, ENOMEM);
  374         else {
  375                 node->dagFuncData = (void *) req;
  376                 rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req,
  377                     RF_IO_NORMAL_PRIORITY);
  378         }
  379 
  380         return (0);
  381 }
  382 
  383 /*****************************************************************************
  384  * The execution function associated with an "unlock disk queue" node.
  385  *****************************************************************************/
  386 int
  387 rf_DiskUnlockFuncForThreads(RF_DagNode_t *node)
  388 {
  389         RF_DiskQueueData_t *req;
  390         RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
  391         RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
  392 
  393         req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP, 0L, 0, NULL, 0L, 0,
  394             (int (*) (void *, int)) node->wakeFunc, (void *) node,
  395             NULL, node->dagHdr->tracerec, (void *) (node->dagHdr->raidPtr),
  396             RF_UNLOCK_DISK_QUEUE, NULL);
  397         if (!req)
  398                 (node->wakeFunc) (node, ENOMEM);
  399         else {
  400                 node->dagFuncData = (void *) req;
  401                 rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req,
  402                     RF_IO_NORMAL_PRIORITY);
  403         }
  404 
  405         return (0);
  406 }
  407 
  408 /*****************************************************************************
  409  * Callback routine for DiskRead and DiskWrite nodes. When the disk op
  410  * completes, the routine is called to set the node status and inform
  411  * the execution engine that the node has fired.
  412  *****************************************************************************/
  413 int
  414 rf_GenericWakeupFunc(RF_DagNode_t *node, int status)
  415 {
  416         switch (node->status) {
  417         case rf_bwd1:
  418                 node->status = rf_bwd2;
  419                 if (node->dagFuncData)
  420                         rf_FreeDiskQueueData((RF_DiskQueueData_t *)
  421                             node->dagFuncData);
  422                 return (rf_DiskWriteFuncForThreads(node));
  423                 break;
  424         case rf_fired:
  425                 if (status)
  426                         node->status = rf_bad;
  427                 else
  428                         node->status = rf_good;
  429                 break;
  430         case rf_recover:
  431                 /* Probably should never reach this case. */
  432                 if (status)
  433                         node->status = rf_panic;
  434                 else
  435                         node->status = rf_undone;
  436                 break;
  437         default:
  438                 printf("rf_GenericWakeupFunc:");
  439                 printf("node->status is %d,", node->status);
  440                 printf("status is %d \n", status);
  441                 RF_PANIC();
  442                 break;
  443         }
  444         if (node->dagFuncData)
  445                 rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
  446         return (rf_FinishNode(node, RF_INTR_CONTEXT));
  447 }
  448 
  449 
  450 /*****************************************************************************
  451  * There are three distinct types of xor nodes.
  452  *
  453  * A "regular xor" is used in the fault-free case where the access spans
  454  * a complete stripe unit. It assumes that the result buffer is one full
  455  * stripe unit in size, and uses the stripe-unit-offset values that it
  456  * computes from the PDAs to determine where within the stripe unit to
  457  * XOR each argument buffer.
  458  *
  459  * A "simple xor" is used in the fault-free case where the access touches
  460  * only a portion of one (or two, in some cases) stripe unit(s). It assumes
  461  * that all the argument buffers are of the same size and have the same
  462  * stripe unit offset.
  463  *
  464  * A "recovery xor" is used in the degraded-mode case. It's similar to
  465  * the regular xor function except that it takes the failed PDA as an
  466  * additional parameter, and uses it to determine what portions of the
  467  * argument buffers need to be xor'd into the result buffer, and where
  468  * in the result buffer they should go.
  469  *****************************************************************************/
  470 
  471 /*
  472  * Xor the params together and store the result in the result field.
  473  * Assume the result field points to a buffer that is the size of one SU,
  474  * and use the pda params to determine where within the buffer to XOR
  475  * the input buffers.
  476  */
  477 int
  478 rf_RegularXorFunc(RF_DagNode_t *node)
  479 {
  480         RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
  481         RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
  482         RF_Etimer_t timer;
  483         int i, retcode;
  484 
  485         retcode = 0;
  486         if (node->dagHdr->status == rf_enable) {
  487                 /* Don't do the XOR if the input is the same as the output. */
  488                 RF_ETIMER_START(timer);
  489                 for (i = 0; i < node->numParams - 1; i += 2)
  490                         if (node->params[i + 1].p != node->results[0]) {
  491                                 retcode = rf_XorIntoBuffer(raidPtr,
  492                                     (RF_PhysDiskAddr_t *) node->params[i].p,
  493                                     (char *) node->params[i + 1].p,
  494                                     (char *) node->results[0],
  495                                     node->dagHdr->bp);
  496                         }
  497                 RF_ETIMER_STOP(timer);
  498                 RF_ETIMER_EVAL(timer);
  499                 tracerec->xor_us += RF_ETIMER_VAL_US(timer);
  500         }
  501         /* Call wake func explicitly since no I/O in this node. */
  502         return (rf_GenericWakeupFunc(node, retcode));
  503 }
  504 
  505 /* Xor the inputs into the result buffer, ignoring placement issues. */
  506 int
  507 rf_SimpleXorFunc(RF_DagNode_t *node)
  508 {
  509         RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
  510         int i, retcode = 0;
  511         RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
  512         RF_Etimer_t timer;
  513 
  514         if (node->dagHdr->status == rf_enable) {
  515                 RF_ETIMER_START(timer);
  516                 /* Don't do the XOR if the input is the same as the output. */
  517                 for (i = 0; i < node->numParams - 1; i += 2)
  518                         if (node->params[i + 1].p != node->results[0]) {
  519                                 retcode = rf_bxor((char *)
  520                                     node->params[i + 1].p,
  521                                     (char *) node->results[0],
  522                                     rf_RaidAddressToByte(raidPtr,
  523                                     ((RF_PhysDiskAddr_t *)
  524                                     node->params[i].p)->numSector),
  525                                     (struct buf *) node->dagHdr->bp);
  526                         }
  527                 RF_ETIMER_STOP(timer);
  528                 RF_ETIMER_EVAL(timer);
  529                 tracerec->xor_us += RF_ETIMER_VAL_US(timer);
  530         }
  531         /* Call wake func explicitly since no I/O in this node. */
  532         return (rf_GenericWakeupFunc(node, retcode));
  533 }
  534 
  535 /*
  536  * This xor is used by the degraded-mode dag functions to recover lost data.
  537  * The second-to-last parameter is the PDA for the failed portion of the access.
  538  * The code here looks at this PDA and assumes that the xor target buffer is
  539  * equal in size to the number of sectors in the failed PDA. It then uses
  540  * the other PDAs in the parameter list to determine where within the target
  541  * buffer the corresponding data should be xored.
  542  */
  543 int
  544 rf_RecoveryXorFunc(RF_DagNode_t *node)
  545 {
  546         RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
  547         RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
  548         RF_PhysDiskAddr_t *failedPDA =
  549             (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
  550         int i, retcode = 0;
  551         RF_PhysDiskAddr_t *pda;
  552         int suoffset, failedSUOffset =
  553             rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
  554         char *srcbuf, *destbuf;
  555         RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
  556         RF_Etimer_t timer;
  557 
  558         if (node->dagHdr->status == rf_enable) {
  559                 RF_ETIMER_START(timer);
  560                 for (i = 0; i < node->numParams - 2; i += 2)
  561                         if (node->params[i + 1].p != node->results[0]) {
  562                                 pda = (RF_PhysDiskAddr_t *) node->params[i].p;
  563                                 srcbuf = (char *) node->params[i + 1].p;
  564                                 suoffset = rf_StripeUnitOffset(layoutPtr,
  565                                     pda->startSector);
  566                                 destbuf = ((char *) node->results[0]) +
  567                                     rf_RaidAddressToByte(raidPtr,
  568                                     suoffset - failedSUOffset);
  569                                 retcode = rf_bxor(srcbuf, destbuf,
  570                                     rf_RaidAddressToByte(raidPtr,
  571                                     pda->numSector), node->dagHdr->bp);
  572                         }
  573                 RF_ETIMER_STOP(timer);
  574                 RF_ETIMER_EVAL(timer);
  575                 tracerec->xor_us += RF_ETIMER_VAL_US(timer);
  576         }
  577         return (rf_GenericWakeupFunc(node, retcode));
  578 }
  579 
  580 
  581 /*****************************************************************************
  582  * The next three functions are utilities used by the above xor-execution
  583  * functions.
  584  *****************************************************************************/
  585 
  586 /*
  587  * This is just a glorified buffer xor. Targbuf points to a buffer that is
  588  * one full stripe unit in size. srcbuf points to a buffer that may be less
  589  * than 1 SU, but never more. When the access described by pda is one SU in
  590  * size (which by implication means it's SU-aligned), all that happens is
  591  * (targbuf) <- (srcbuf ^ targbuf). When the access is less than one SU in
  592  * size the XOR occurs on only the portion of targbuf identified in the pda.
  593  */
  594 
  595 int
  596 rf_XorIntoBuffer(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, char *srcbuf,
  597     char *targbuf, void *bp)
  598 {
  599         char *targptr;
  600         int sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
  601         int SUOffset = pda->startSector % sectPerSU;
  602         int length, retcode = 0;
  603 
  604         RF_ASSERT(pda->numSector <= sectPerSU);
  605 
  606         targptr = targbuf + rf_RaidAddressToByte(raidPtr, SUOffset);
  607         length = rf_RaidAddressToByte(raidPtr, pda->numSector);
  608         retcode = rf_bxor(srcbuf, targptr, length, bp);
  609         return (retcode);
  610 }
  611 
  612 /*
  613  * It really should be the case that the buffer pointers (returned by malloc)
  614  * are aligned to the natural word size of the machine, so this is the only
  615  * case we optimize for. The length should always be a multiple of the sector
  616  * size, so there should be no problem with leftover bytes at the end.
  617  */
  618 int
  619 rf_bxor(char *src, char *dest, int len, void *bp)
  620 {
  621         unsigned mask = sizeof(long) - 1, retcode = 0;
  622 
  623         if (!(((unsigned long) src) & mask) &&
  624             !(((unsigned long) dest) & mask) && !(len & mask)) {
  625                 retcode = rf_longword_bxor((unsigned long *) src,
  626                     (unsigned long *) dest, len >> RF_LONGSHIFT, bp);
  627         } else {
  628                 RF_ASSERT(0);
  629         }
  630         return (retcode);
  631 }
  632 
  633 /* Map a user buffer into kernel space, if necessary. */
  634 #define REMAP_VA(_bp,x,y)       (y) = (x)
  635 
  636 /*
  637  * When XORing in kernel mode, we need to map each user page to kernel
  638  * space before we can access it.
  639  * We don't want to assume anything about which input buffers are in
  640  * kernel/user space, nor about their alignment, so in each loop we
  641  * compute the maximum number of bytes that we can xor without crossing
  642  * any page boundaries, and do only this many bytes before the next remap.
  643  */
  644 int
  645 rf_longword_bxor(unsigned long *src, unsigned long *dest, int len, void *bp)
  646 {
  647         unsigned long *end = src + len; /* len in longwords. */
  648         unsigned long d0, d1, d2, d3, s0, s1, s2, s3; /* temps */
  649         unsigned long *pg_src, *pg_dest; /* Per-page source/dest pointers. */
  650         int longs_this_time; /* # longwords to xor in the current iteration. */
  651 
  652         REMAP_VA(bp, src, pg_src);
  653         REMAP_VA(bp, dest, pg_dest);
  654         if (!pg_src || !pg_dest)
  655                 return (EFAULT);
  656 
  657         while (len >= 4) {
  658                 longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(pg_src),
  659                     RF_BLIP(pg_dest)) >> RF_LONGSHIFT);
  660                 src += longs_this_time;
  661                 dest += longs_this_time;
  662                 len -= longs_this_time;
  663                 while (longs_this_time >= 4) {
  664                         d0 = pg_dest[0];
  665                         d1 = pg_dest[1];
  666                         d2 = pg_dest[2];
  667                         d3 = pg_dest[3];
  668                         s0 = pg_src[0];
  669                         s1 = pg_src[1];
  670                         s2 = pg_src[2];
  671                         s3 = pg_src[3];
  672                         pg_dest[0] = d0 ^ s0;
  673                         pg_dest[1] = d1 ^ s1;
  674                         pg_dest[2] = d2 ^ s2;
  675                         pg_dest[3] = d3 ^ s3;
  676                         pg_src += 4;
  677                         pg_dest += 4;
  678                         longs_this_time -= 4;
  679                 }
  680                 while (longs_this_time > 0) {
  681                         /* Cannot cross any page boundaries here. */
  682                         *pg_dest++ ^= *pg_src++;
  683                         longs_this_time--;
  684                 }
  685 
  686                 /*
  687                  * Either we're done, or we've reached a page boundary on one
  688                  * (or possibly both) of the pointers.
  689                  */
  690                 if (len) {
  691                         if (RF_PAGE_ALIGNED(src))
  692                                 REMAP_VA(bp, src, pg_src);
  693                         if (RF_PAGE_ALIGNED(dest))
  694                                 REMAP_VA(bp, dest, pg_dest);
  695                         if (!pg_src || !pg_dest)
  696                                 return (EFAULT);
  697                 }
  698         }
  699         while (src < end) {
  700                 *pg_dest++ ^= *pg_src++;
  701                 src++;
  702                 dest++;
  703                 len--;
  704                 if (RF_PAGE_ALIGNED(src))
  705                         REMAP_VA(bp, src, pg_src);
  706                 if (RF_PAGE_ALIGNED(dest))
  707                         REMAP_VA(bp, dest, pg_dest);
  708         }
  709         RF_ASSERT(len == 0);
  710         return (0);
  711 }
  712 
  713 
  714 /*
  715  * dst = a ^ b ^ c;
  716  * a may equal dst
  717  * see comment above longword_bxor
  718  */
  719 int
  720 rf_longword_bxor3(unsigned long *dst, unsigned long *a, unsigned long *b,
  721     unsigned long *c, int len, void *bp)
  722 {
  723         unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
  724         /* Per-page source/dest pointers. */
  725         unsigned long *pg_a, *pg_b, *pg_c, *pg_dst;
  726         int longs_this_time;    /* # longs to xor in the current iteration */
  727         char dst_is_a = 0;
  728 
  729         /* Note: The length (len) is in longwords. */
  730 
  731         REMAP_VA(bp, a, pg_a);
  732         REMAP_VA(bp, b, pg_b);
  733         REMAP_VA(bp, c, pg_c);
  734         if (a == dst) {
  735                 pg_dst = pg_a;
  736                 dst_is_a = 1;
  737         } else {
  738                 REMAP_VA(bp, dst, pg_dst);
  739         }
  740 
  741         /* Align dest to cache line. Can't cross a pg boundary on dst here. */
  742         while ((((unsigned long) pg_dst) & 0x1f)) {
  743                 *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
  744                 dst++;
  745                 a++;
  746                 b++;
  747                 c++;
  748                 if (RF_PAGE_ALIGNED(a)) {
  749                         REMAP_VA(bp, a, pg_a);
  750                         if (!pg_a)
  751                                 return (EFAULT);
  752                 }
  753                 if (RF_PAGE_ALIGNED(b)) {
  754                         REMAP_VA(bp, a, pg_b);
  755                         if (!pg_b)
  756                                 return (EFAULT);
  757                 }
  758                 if (RF_PAGE_ALIGNED(c)) {
  759                         REMAP_VA(bp, a, pg_c);
  760                         if (!pg_c)
  761                                 return (EFAULT);
  762                 }
  763                 len--;
  764         }
  765 
  766         while (len > 4) {
  767                 longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(a),
  768                     RF_MIN(RF_BLIP(b), RF_MIN(RF_BLIP(c), RF_BLIP(dst)))) >>
  769                     RF_LONGSHIFT);
  770                 a += longs_this_time;
  771                 b += longs_this_time;
  772                 c += longs_this_time;
  773                 dst += longs_this_time;
  774                 len -= longs_this_time;
  775                 while (longs_this_time >= 4) {
  776                         a0 = pg_a[0];
  777                         longs_this_time -= 4;
  778 
  779                         a1 = pg_a[1];
  780                         a2 = pg_a[2];
  781 
  782                         a3 = pg_a[3];
  783                         pg_a += 4;
  784 
  785                         b0 = pg_b[0];
  786                         b1 = pg_b[1];
  787 
  788                         b2 = pg_b[2];
  789                         b3 = pg_b[3];
  790                         /* Start dual issue. */
  791                         a0 ^= b0;
  792                         b0 = pg_c[0];
  793 
  794                         pg_b += 4;
  795                         a1 ^= b1;
  796 
  797                         a2 ^= b2;
  798                         a3 ^= b3;
  799 
  800                         b1 = pg_c[1];
  801                         a0 ^= b0;
  802 
  803                         b2 = pg_c[2];
  804                         a1 ^= b1;
  805 
  806                         b3 = pg_c[3];
  807                         a2 ^= b2;
  808 
  809                         pg_dst[0] = a0;
  810                         a3 ^= b3;
  811                         pg_dst[1] = a1;
  812                         pg_c += 4;
  813                         pg_dst[2] = a2;
  814                         pg_dst[3] = a3;
  815                         pg_dst += 4;
  816                 }
  817                 while (longs_this_time > 0) {
  818                         /* Cannot cross any page boundaries here. */
  819                         *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
  820                         longs_this_time--;
  821                 }
  822 
  823                 if (len) {
  824                         if (RF_PAGE_ALIGNED(a)) {
  825                                 REMAP_VA(bp, a, pg_a);
  826                                 if (!pg_a)
  827                                         return (EFAULT);
  828                                 if (dst_is_a)
  829                                         pg_dst = pg_a;
  830                         }
  831                         if (RF_PAGE_ALIGNED(b)) {
  832                                 REMAP_VA(bp, b, pg_b);
  833                                 if (!pg_b)
  834                                         return (EFAULT);
  835                         }
  836                         if (RF_PAGE_ALIGNED(c)) {
  837                                 REMAP_VA(bp, c, pg_c);
  838                                 if (!pg_c)
  839                                         return (EFAULT);
  840                         }
  841                         if (!dst_is_a)
  842                                 if (RF_PAGE_ALIGNED(dst)) {
  843                                         REMAP_VA(bp, dst, pg_dst);
  844                                         if (!pg_dst)
  845                                                 return (EFAULT);
  846                                 }
  847                 }
  848         }
  849         while (len) {
  850                 *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
  851                 dst++;
  852                 a++;
  853                 b++;
  854                 c++;
  855                 if (RF_PAGE_ALIGNED(a)) {
  856                         REMAP_VA(bp, a, pg_a);
  857                         if (!pg_a)
  858                                 return (EFAULT);
  859                         if (dst_is_a)
  860                                 pg_dst = pg_a;
  861                 }
  862                 if (RF_PAGE_ALIGNED(b)) {
  863                         REMAP_VA(bp, b, pg_b);
  864                         if (!pg_b)
  865                                 return (EFAULT);
  866                 }
  867                 if (RF_PAGE_ALIGNED(c)) {
  868                         REMAP_VA(bp, c, pg_c);
  869                         if (!pg_c)
  870                                 return (EFAULT);
  871                 }
  872                 if (!dst_is_a)
  873                         if (RF_PAGE_ALIGNED(dst)) {
  874                                 REMAP_VA(bp, dst, pg_dst);
  875                                 if (!pg_dst)
  876                                         return (EFAULT);
  877                         }
  878                 len--;
  879         }
  880         return (0);
  881 }
  882 
  883 int
  884 rf_bxor3(unsigned char *dst, unsigned char *a, unsigned char *b,
  885     unsigned char *c, unsigned long len, void *bp)
  886 {
  887         RF_ASSERT(((RF_UL(dst) | RF_UL(a) | RF_UL(b) | RF_UL(c) | len) & 0x7)
  888             == 0);
  889 
  890         return (rf_longword_bxor3((unsigned long *) dst, (unsigned long *) a,
  891                 (unsigned long *) b, (unsigned long *) c, len >> RF_LONGSHIFT,
  892                  bp));
  893 }

/* [<][>][^][v][top][bottom][index][help] */