/*
 * Refinement plugin
 */

/****************************************************************************
 *                                                                          *
 *  Module  :   refinesse.c                                                 *
 *                                                                          *
 *  Purpose :   SSE Surface Refinement plugin (rpRefine.c)                  *
 *                                                                          *
 ****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "rpplugin.h"
#include "time.h"
#include <rpdbgerr.h>
#include <rwcore.h>
#include "rprefine.h"

/* Check for SSE */

#if ( ((defined(__ICL)) && (400 <= __ICL)) && defined(RWSIMD) )

#include <rtintel.h>

#include <rpworld.h>

#include "bbtpsse.h"
#include "refinevars.h"

#include "nodeRefine.h"
#include "nodeSSERefine.h"

/*
 * Revision-control identification string for this module.  Nothing in the
 * visible part of this file reads it; __RWUNUSED__ is presumably what keeps
 * the compiler from warning about that (confirm against the RenderWare
 * core headers).
 */
static const char   rcsid[] __RWUNUSED__ =
    "@@(#)$Id: refinesse.c,v 1.23 2001/01/26 11:05:25 johns Exp $";

/****************************************************************************
 Local types
 */

/****************************************************************************
 Local (static) globals
 */

/****************************************************************************
 Local defines
 */

/*****************************************************************************
                           Utilities
 ****************************************************************************/

/*
 * Per-lane weighted sum of three packed vectors:
 *
 *     _res = (_vt0 * _t1 + _vt1 * _t2) + _vt2 * _t3
 *
 * Every argument is accessed through its .m128 member.  The expansion is a
 * bare assignment expression: it carries no trailing semicolon and is not
 * wrapped in MACRO_START / MACRO_STOP.
 */
#define _SSEBARYCENTRICINTERPOLATION(_res, _t1, _t2, _t3, _vt0, _vt1, _vt2) \
    (_res).m128 =                                                       \
        _mm_add_ps(                                                     \
            _mm_add_ps(_mm_mul_ps((_vt0).m128, (_t1).m128),             \
                       _mm_mul_ps((_vt1).m128, (_t2).m128)),            \
            _mm_mul_ps((_vt2).m128, (_t3).m128))

/*
 * Broadcast six scalar fields of _cam into packed registers.
 *
 *   _cW, _cH         <- camWidth,   camHeight
 *   _xOff, _yOff     <- camOffsetX, camOffsetY
 *   _zScale, _zShift <- zScale,     zShift
 *
 * Each destination receives the scalar replicated across all four lanes of
 * its .m128 member.
 */
#define _refineSSECAMERALOAD(_cW, _cH, _xOff, _yOff, _zScale, _zShift, _cam) \
MACRO_START                                              \
{                                                        \
    /* camWidth / camHeight */                           \
    (_cW).m128     = _mm_set_ps1((_cam).camWidth);       \
    (_cH).m128     = _mm_set_ps1((_cam).camHeight);      \
                                                         \
    /* camOffsetX / camOffsetY */                        \
    (_xOff).m128   = _mm_set_ps1((_cam).camOffsetX);     \
    (_yOff).m128   = _mm_set_ps1((_cam).camOffsetY);     \
                                                         \
    /* zScale / zShift */                                \
    (_zScale).m128 = _mm_set_ps1((_cam).zScale);         \
    (_zShift).m128 = _mm_set_ps1((_cam).zShift);         \
}                                                        \
MACRO_STOP

#ifdef RW_WNI

/*
 * Load the clip-test constants for the WNI (integer) path.
 *
 *   _nC, _fC      <- (_cam).nearClip / (_cam).farClip, replicated across the
 *                    four float lanes of .m128
 *   _xLo .. _zHi  <- rwXLOCLIP .. rwZHICLIP, replicated across the four
 *                    32-bit integer lanes of .m128i
 *
 * Unlike _refineSSECLIPLOAD, the flag values are stored as integers directly,
 * so no float/integer reinterpretation temporary is needed.
 */
#define _refineWNICLIPLOAD(_nC, _fC, _xLo, _xHi, _yLo, _yHi, _zLo, _zHi, _cam)\
MACRO_START                                       \
{                                                 \
    (_nC).m128 = _mm_set_ps1((_cam).nearClip);    \
    (_fC).m128 = _mm_set_ps1((_cam).farClip);     \
                                                  \
    (_xLo).m128i = _mm_set1_epi32(rwXLOCLIP);     \
    (_xHi).m128i = _mm_set1_epi32(rwXHICLIP);     \
                                                  \
    (_yLo).m128i = _mm_set1_epi32(rwYLOCLIP);     \
    (_yHi).m128i = _mm_set1_epi32(rwYHICLIP);     \
                                                  \
    (_zLo).m128i = _mm_set1_epi32(rwZLOCLIP);     \
    (_zHi).m128i = _mm_set1_epi32(rwZHICLIP);     \
}                                                 \
MACRO_STOP

#endif /* RW_WNI */

/*
 * Load the clip-test constants for the SSE (float register) path.
 *
 *   _nC, _fC      <- (_cam).nearClip / (_cam).farClip, replicated across all
 *                    four lanes
 *   _xLo .. _zHi  <- the bit patterns of rwXLOCLIP .. rwZHICLIP, replicated
 *                    across all four lanes
 *
 * The clip flags are integers but the destinations are float vectors, so
 * each flag is written to the integer view of an RwSplitBits temporary and
 * read back through its real view before being broadcast.
 */
#define _refineSSECLIPLOAD(_nC, _fC, _xLo, _xHi, _yLo, _yHi, _zLo, _zHi, _cam)\
MACRO_START                                       \
{                                                 \
    RwSplitBits _split;                           \
                                                  \
    /* Near and far clip distances. */            \
    (_nC).m128 = _mm_set_ps1((_cam).nearClip);    \
    (_fC).m128 = _mm_set_ps1((_cam).farClip);     \
                                                  \
    /* X clip flags. */                           \
    _split.nUInt = (RwUInt32) rwXLOCLIP;          \
    (_xLo).m128 = _mm_set_ps1(_split.nReal);      \
    _split.nUInt = (RwUInt32) rwXHICLIP;          \
    (_xHi).m128 = _mm_set_ps1(_split.nReal);      \
                                                  \
    /* Y clip flags. */                           \
    _split.nUInt = (RwUInt32) rwYLOCLIP;          \
    (_yLo).m128 = _mm_set_ps1(_split.nReal);      \
    _split.nUInt = (RwUInt32) rwYHICLIP;          \
    (_yHi).m128 = _mm_set_ps1(_split.nReal);      \
                                                  \
    /* Z clip flags. */                           \
    _split.nUInt = (RwUInt32) rwZLOCLIP;          \
    (_zLo).m128 = _mm_set_ps1(_split.nReal);      \
    _split.nUInt = (RwUInt32) rwZHICLIP;          \
    (_zHi).m128 = _mm_set_ps1(_split.nReal);      \
}                                                 \
MACRO_STOP

/*****************************************************************************
 * Shuffle the values around.
 */
/*
 * 4x4 -> 3x4 transposing gather.  Reads four input vectors and writes three
 * output vectors, taking lanes 1, 2 and 3 of every input:
 *
 *     _out[0] = { _in[0][1], _in[1][1], _in[2][1], _in[3][1] }
 *     _out[1] = { _in[0][2], _in[1][2], _in[2][2], _in[3][2] }
 *     _out[2] = { _in[0][3], _in[1][3], _in[2][3], _in[3][3] }
 *
 * Lane 0 of each input is discarded.  The first four shuffles pair up the
 * low (0x44) and high (0xEE) halves of adjacent inputs; the last three pick
 * the even (0x88) or odd (0xDD) elements of those pairs.  All inputs are
 * read into _row before any output is written.
 *
 * NOTE(review): this is the inverse of _refine_transpose_v3d_in_SSE, which
 * places a zero in lane 0; presumably lane 0 is an unused/padding component
 * of the RpSSEOverlayM128 v4d view - confirm against its declaration.
 */
#define _refine_transpose_v3d_out_SSE(_out, _in)                             \
MACRO_START                                                                  \
{                                                                            \
    RpSSEOverlayM128      _row[4];                                           \
                                                                             \
    _row[0].m128 = _mm_shuffle_ps(((_in)[0].m128), ((_in)[1].m128), 0x44);   \
    _row[2].m128 = _mm_shuffle_ps(((_in)[0].m128), ((_in)[1].m128), 0xEE);   \
    _row[1].m128 = _mm_shuffle_ps(((_in)[2].m128), ((_in)[3].m128), 0x44);   \
    _row[3].m128 = _mm_shuffle_ps(((_in)[2].m128), ((_in)[3].m128), 0xEE);   \
                                                                             \
    ((_out)[0].m128) = _mm_shuffle_ps(_row[0].m128, _row[1].m128, 0xDD);     \
    ((_out)[1].m128) = _mm_shuffle_ps(_row[2].m128, _row[3].m128, 0x88);     \
    ((_out)[2].m128) = _mm_shuffle_ps(_row[2].m128, _row[3].m128, 0xDD);     \
}                                                                            \
MACRO_STOP

/*
 * 3x4 -> 4x4 transposing scatter.  Reads three input vectors and writes
 * four output vectors, with a zero inserted in lane 0 of every output:
 *
 *     _out[k] = { 0.0f, _in[0][k], _in[1][k], _in[2][k] }     for k = 0..3
 *
 * A static all-zero vector stands in for the missing first input row; the
 * rest is the same two-stage shuffle used for a 4x4 transpose (0x44 / 0xEE
 * pair the low / high halves, 0x88 / 0xDD select even / odd elements).
 *
 * NOTE(review): the callers visible in this file read each result through
 * the overlay's .v4d.v3d member; presumably that view maps onto lanes 1..3 -
 * confirm against the RpSSEOverlayM128 declaration.
 */
#define _refine_transpose_v3d_in_SSE(_out, _in)                              \
MACRO_START                                                                  \
{                                                                            \
    static const RpSSEOverlayM128 _mm_zero = { {0.0f, 0.0f, 0.0f, 0.0f} };   \
    RpSSEOverlayM128      _row[4];                                           \
                                                                             \
    _row[0].m128 = _mm_shuffle_ps((_mm_zero.m128), ((_in)[0].m128), 0x44);   \
    _row[2].m128 = _mm_shuffle_ps((_mm_zero.m128), ((_in)[0].m128), 0xEE);   \
    _row[1].m128 = _mm_shuffle_ps(((_in)[1].m128), ((_in)[2].m128), 0x44);   \
    _row[3].m128 = _mm_shuffle_ps(((_in)[1].m128), ((_in)[2].m128), 0xEE);   \
                                                                             \
    (_out)[0].m128 = _mm_shuffle_ps(_row[0].m128, _row[1].m128, 0x88);       \
    (_out)[1].m128 = _mm_shuffle_ps(_row[0].m128, _row[1].m128, 0xDD);       \
    (_out)[2].m128 = _mm_shuffle_ps(_row[2].m128, _row[3].m128, 0x88);       \
    (_out)[3].m128 = _mm_shuffle_ps(_row[2].m128, _row[3].m128, 0xDD);       \
}                                                                            \
MACRO_STOP

/*
 * Full 4x4 transpose of packed floats:
 *
 *     _out[k] = { _in[0][k], _in[1][k], _in[2][k], _in[3][k] }   for k = 0..3
 *
 * Stage one pairs the low (0x44) and high (0xEE) halves of adjacent input
 * rows; stage two selects the even (0x88) or odd (0xDD) elements of those
 * pairs.  All inputs are read into _row before any output is written.
 */
#define _refine_transpose_v4d_SSE(_out, _in)                                 \
MACRO_START                                                                  \
{                                                                            \
    RpSSEOverlayM128 _row[4];                                                \
                                                                             \
    _row[0].m128 = _mm_shuffle_ps(((_in)[0].m128), ((_in)[1].m128), 0x44);   \
    _row[2].m128 = _mm_shuffle_ps(((_in)[0].m128), ((_in)[1].m128), 0xEE);   \
    _row[1].m128 = _mm_shuffle_ps(((_in)[2].m128), ((_in)[3].m128), 0x44);   \
    _row[3].m128 = _mm_shuffle_ps(((_in)[2].m128), ((_in)[3].m128), 0xEE);   \
                                                                             \
    (_out)[0].m128 = _mm_shuffle_ps(_row[0].m128, _row[1].m128, 0x88);       \
    (_out)[1].m128 = _mm_shuffle_ps(_row[0].m128, _row[1].m128, 0xDD);       \
    (_out)[2].m128 = _mm_shuffle_ps(_row[2].m128, _row[3].m128, 0x88);       \
    (_out)[3].m128 = _mm_shuffle_ps(_row[2].m128, _row[3].m128, 0xDD);       \
}                                                                            \
MACRO_STOP

#ifdef RW_WNI

/*
 * Full 4x4 transpose of packed 32-bit integers:
 *
 *     _out[k] = { _in[0][k], _in[1][k], _in[2][k], _in[3][k] }   for k = 0..3
 *
 * _mm_shuffle_epi32 permutes a single source register and so cannot merge
 * two rows the way _mm_shuffle_ps does in the float transpose.  The integer
 * unpack instructions are used instead:
 *
 *   stage one interleaves the 32-bit lanes of row pairs (0,1) and (2,3),
 *     e.g. _row[0] = { _in[0][0], _in[1][0], _in[0][1], _in[1][1] };
 *   stage two joins the matching 64-bit halves of those interleaved rows.
 *
 * All inputs are read into _row before any output is written.
 */
#define _refine_transpose_v4d_WNI(_out, _in)                                     \
MACRO_START                                                                      \
{                                                                                \
    RpWNIOverlayM128i _row[4];                                                   \
                                                                                 \
    _row[0].m128i = _mm_unpacklo_epi32(((_in)[0].m128i), ((_in)[1].m128i));      \
    _row[1].m128i = _mm_unpacklo_epi32(((_in)[2].m128i), ((_in)[3].m128i));      \
    _row[2].m128i = _mm_unpackhi_epi32(((_in)[0].m128i), ((_in)[1].m128i));      \
    _row[3].m128i = _mm_unpackhi_epi32(((_in)[2].m128i), ((_in)[3].m128i));      \
                                                                                 \
    (_out)[0].m128i = _mm_unpacklo_epi64(_row[0].m128i, _row[1].m128i);          \
    (_out)[1].m128i = _mm_unpackhi_epi64(_row[0].m128i, _row[1].m128i);          \
    (_out)[2].m128i = _mm_unpacklo_epi64(_row[2].m128i, _row[3].m128i);          \
    (_out)[3].m128i = _mm_unpackhi_epi64(_row[2].m128i, _row[3].m128i);          \
}                                                                                \
MACRO_STOP

#endif /* RW_WNI */

/*****************************************************************************
 * Get and set the extra col.
 */
/*
 * Read one RwRGBAReal through _vert and broadcast each channel:
 * _col[0..3] receive red, green, blue and alpha respectively, each
 * replicated across all four lanes.
 */
#define _refineExtraGetColSSE(_vert, _col)                                   \
MACRO_START                                                                  \
{                                                                            \
    /* _col[0] = red, [1] = green, [2] = blue, [3] = alpha */                \
    (_col)[0].m128 = _mm_set_ps1(((RwRGBAReal *) (_vert))->red);             \
    (_col)[1].m128 = _mm_set_ps1(((RwRGBAReal *) (_vert))->green);           \
    (_col)[2].m128 = _mm_set_ps1(((RwRGBAReal *) (_vert))->blue);            \
    (_col)[3].m128 = _mm_set_ps1(((RwRGBAReal *) (_vert))->alpha);           \
}                                                                            \
MACRO_STOP

/*
 * Scatter four packed colours to four RwRGBAReal destinations.
 * _col[0..3] hold red, green, blue and alpha; lane k of each is written to
 * the matching channel of _vertk.
 */
#define _refineExtraSetColSSE(_vert0, _vert1, _vert2, _vert3, _col)          \
MACRO_START                                                                  \
{                                                                            \
    /* lane 0 -> _vert0 */                                                   \
    ((RwRGBAReal *) (_vert0))->red   = (_col)[0]._f[0];                      \
    ((RwRGBAReal *) (_vert0))->green = (_col)[1]._f[0];                      \
    ((RwRGBAReal *) (_vert0))->blue  = (_col)[2]._f[0];                      \
    ((RwRGBAReal *) (_vert0))->alpha = (_col)[3]._f[0];                      \
                                                                             \
    /* lane 1 -> _vert1 */                                                   \
    ((RwRGBAReal *) (_vert1))->red   = (_col)[0]._f[1];                      \
    ((RwRGBAReal *) (_vert1))->green = (_col)[1]._f[1];                      \
    ((RwRGBAReal *) (_vert1))->blue  = (_col)[2]._f[1];                      \
    ((RwRGBAReal *) (_vert1))->alpha = (_col)[3]._f[1];                      \
                                                                             \
    /* lane 2 -> _vert2 */                                                   \
    ((RwRGBAReal *) (_vert2))->red   = (_col)[0]._f[2];                      \
    ((RwRGBAReal *) (_vert2))->green = (_col)[1]._f[2];                      \
    ((RwRGBAReal *) (_vert2))->blue  = (_col)[2]._f[2];                      \
    ((RwRGBAReal *) (_vert2))->alpha = (_col)[3]._f[2];                      \
                                                                             \
    /* lane 3 -> _vert3 */                                                   \
    ((RwRGBAReal *) (_vert3))->red   = (_col)[0]._f[3];                      \
    ((RwRGBAReal *) (_vert3))->green = (_col)[1]._f[3];                      \
    ((RwRGBAReal *) (_vert3))->blue  = (_col)[2]._f[3];                      \
    ((RwRGBAReal *) (_vert3))->alpha = (_col)[3]._f[3];                      \
}                                                                            \
MACRO_STOP

/*****************************************************************************
 * Get and set the extra UV.
 */
/*
 * Read one RxUV through _vert and broadcast it: _uv[0] receives u and
 * _uv[1] receives v, each replicated across all four lanes.
 */
#define _refineExtraGetUVSSE(_vert, _uv)                                     \
MACRO_START                                                                  \
{                                                                            \
    /* _uv[0] = u, _uv[1] = v */                                             \
    (_uv)[0].m128 = _mm_set_ps1(((RxUV *) (_vert))->u);                      \
    (_uv)[1].m128 = _mm_set_ps1(((RxUV *) (_vert))->v);                      \
}                                                                            \
MACRO_STOP

/*
 * Scatter four packed texture coordinates to four RxUV destinations.
 * _uv[0] holds the u values and _uv[1] the v values; lane k goes to _vertk.
 */
#define _refineExtraSetUVSSE(_vert0, _vert1, _vert2, _vert3, _uv)            \
MACRO_START                                                                  \
{                                                                            \
    /* lane 0 -> _vert0 */                                                   \
    ((RxUV *) (_vert0))->u = (_uv)[0]._f[0];                                 \
    ((RxUV *) (_vert0))->v = (_uv)[1]._f[0];                                 \
                                                                             \
    /* lane 1 -> _vert1 */                                                   \
    ((RxUV *) (_vert1))->u = (_uv)[0]._f[1];                                 \
    ((RxUV *) (_vert1))->v = (_uv)[1]._f[1];                                 \
                                                                             \
    /* lane 2 -> _vert2 */                                                   \
    ((RxUV *) (_vert2))->u = (_uv)[0]._f[2];                                 \
    ((RxUV *) (_vert2))->v = (_uv)[1]._f[2];                                 \
                                                                             \
    /* lane 3 -> _vert3 */                                                   \
    ((RxUV *) (_vert3))->u = (_uv)[0]._f[3];                                 \
    ((RxUV *) (_vert3))->v = (_uv)[1]._f[3];                                 \
}                                                                            \
MACRO_STOP

/*****************************************************************************
 *  Get and set the Object verts' pos.
 */
/*
 * Fetch the position of an object-space vertex via RxObjSpace3DVertexGetPos
 * and broadcast it: _pos[0], _pos[1] and _pos[2] receive x, y and z, each
 * replicated across all four lanes.
 */
#define _refineObjVertsGetPosSSE(_vert, _pos)                                \
MACRO_START                                                                  \
{                                                                            \
    RwV3d _v3d;                                                              \
                                                                             \
    /* Pull the position out through the accessor first ... */               \
    RxObjSpace3DVertexGetPos((RxObjSpace3DVertex *) (_vert), &_v3d);         \
                                                                             \
    /* ... then splat each component. */                                     \
    (_pos)[0].m128 = _mm_set_ps1(_v3d.x);                                    \
    (_pos)[1].m128 = _mm_set_ps1(_v3d.y);                                    \
    (_pos)[2].m128 = _mm_set_ps1(_v3d.z);                                    \
}                                                                            \
MACRO_STOP

/*****************************************************************************
 * Get and set the Object verts' normal.
 */
/*
 * Fetch the normal of an object-space vertex via
 * RxObjSpace3DVertexGetNormal and broadcast it: _nrm[0], _nrm[1] and
 * _nrm[2] receive x, y and z, each replicated across all four lanes.
 */
#define _refineObjVertsGetNrmSSE(_vert, _nrm)                                \
MACRO_START                                                                  \
{                                                                            \
    RwV3d _v3d;                                                              \
                                                                             \
    /* Pull the normal out through the accessor first ... */                 \
    RxObjSpace3DVertexGetNormal((RxObjSpace3DVertex *) (_vert), &_v3d);      \
                                                                             \
    /* ... then splat each component. */                                     \
    (_nrm)[0].m128 = _mm_set_ps1(_v3d.x);                                    \
    (_nrm)[1].m128 = _mm_set_ps1(_v3d.y);                                    \
    (_nrm)[2].m128 = _mm_set_ps1(_v3d.z);                                    \
}                                                                            \
MACRO_STOP

/*****************************************************************************
 * Get and set the device verts' colour.
 */
/*
 * Read the colour of a screen-space vertex through the RwIm2DVertexGet*
 * accessors, convert each channel to RwReal and broadcast it:
 * _col[0..3] receive red, green, blue and alpha, each replicated across all
 * four lanes.
 */
#define _refineDevVertsGetColSSE(_vert, _col)                                \
MACRO_START                                                                  \
{                                                                            \
    /* red */                                                                \
    (_col)[0].m128 = _mm_set_ps1((RwReal)                                    \
        RwIm2DVertexGetRed((RxScrSpace2DVertex *) (_vert)));                 \
    /* green */                                                              \
    (_col)[1].m128 = _mm_set_ps1((RwReal)                                    \
        RwIm2DVertexGetGreen((RxScrSpace2DVertex *) (_vert)));               \
    /* blue */                                                               \
    (_col)[2].m128 = _mm_set_ps1((RwReal)                                    \
        RwIm2DVertexGetBlue((RxScrSpace2DVertex *) (_vert)));                \
    /* alpha */                                                              \
    (_col)[3].m128 = _mm_set_ps1((RwReal)                                    \
        RwIm2DVertexGetAlpha((RxScrSpace2DVertex *) (_vert)));               \
}                                                                            \
MACRO_STOP

/*
 * Scatter four packed colours to four screen-space vertices.
 * _col[0..3] hold red, green, blue and alpha; lane k of each is passed to
 * RwIm2DVertexSetRealRGBA for _vertk.
 */
#define _refineDevVertsSetColSSE(_vert0, _vert1, _vert2, _vert3, _col)       \
MACRO_START                                                                  \
{                                                                            \
    /* lane 0 -> _vert0 */                                                   \
    RwIm2DVertexSetRealRGBA((RxScrSpace2DVertex *) (_vert0),                 \
                            (_col)[0]._f[0],                                 \
                            (_col)[1]._f[0],                                 \
                            (_col)[2]._f[0],                                 \
                            (_col)[3]._f[0]);                                \
                                                                             \
    /* lane 1 -> _vert1 */                                                   \
    RwIm2DVertexSetRealRGBA((RxScrSpace2DVertex *) (_vert1),                 \
                            (_col)[0]._f[1],                                 \
                            (_col)[1]._f[1],                                 \
                            (_col)[2]._f[1],                                 \
                            (_col)[3]._f[1]);                                \
                                                                             \
    /* lane 2 -> _vert2 */                                                   \
    RwIm2DVertexSetRealRGBA((RxScrSpace2DVertex *) (_vert2),                 \
                            (_col)[0]._f[2],                                 \
                            (_col)[1]._f[2],                                 \
                            (_col)[2]._f[2],                                 \
                            (_col)[3]._f[2]);                                \
                                                                             \
    /* lane 3 -> _vert3 */                                                   \
    RwIm2DVertexSetRealRGBA((RxScrSpace2DVertex *) (_vert3),                 \
                            (_col)[0]._f[3],                                 \
                            (_col)[1]._f[3],                                 \
                            (_col)[2]._f[3],                                 \
                            (_col)[3]._f[3]);                                \
}                                                                            \
MACRO_STOP

#ifdef RW_WNI

/*
 * WNI variant of _refineDevVertsGetColSSE.  Each colour channel returned by
 * the RwIm2DVertexGet* accessors is replicated into four 32-bit integer
 * lanes and then converted to packed floats, so _col[0..3] receive red,
 * green, blue and alpha broadcast across all four lanes of .m128.
 *
 * The vertex is cast to RxScrSpace2DVertex, the same type used by every
 * other device-vertex macro in this file.
 */
#define _refineDevVertsGetColWNI(_vert, _col)                                \
MACRO_START                                                                  \
{                                                                            \
    (_col)[0].m128 = _mm_cvtepi32_ps(_mm_set1_epi32(                         \
         RwIm2DVertexGetRed((RxScrSpace2DVertex *) (_vert))));               \
    (_col)[1].m128 = _mm_cvtepi32_ps(_mm_set1_epi32(                         \
         RwIm2DVertexGetGreen((RxScrSpace2DVertex *) (_vert))));             \
    (_col)[2].m128 = _mm_cvtepi32_ps(_mm_set1_epi32(                         \
         RwIm2DVertexGetBlue((RxScrSpace2DVertex *) (_vert))));              \
    (_col)[3].m128 = _mm_cvtepi32_ps(_mm_set1_epi32(                         \
         RwIm2DVertexGetAlpha((RxScrSpace2DVertex *) (_vert))));             \
}                                                                            \
MACRO_STOP

/*
 * WNI variant of _refineDevVertsSetColSSE.  The four packed float colour
 * vectors _col[0..3] (red, green, blue, alpha) are converted to packed
 * 32-bit integers, and lane k of each is passed to RwIm2DVertexSetIntRGBA
 * for _vertk.
 *
 * _col is parenthesised at every use so that an expression argument such as
 * `base + 4` indexes correctly.
 */
#define _refineDevVertsSetColWNI(_vert0, _vert1, _vert2, _vert3, _col)       \
MACRO_START                                                                  \
{                                                                            \
    RpWNIOverlayM128i    _tmp[4];                                            \
                                                                             \
    /* float -> int32, one channel per register */                           \
    _tmp[0].m128i = _mm_cvtps_epi32((_col)[0].m128);                         \
    _tmp[1].m128i = _mm_cvtps_epi32((_col)[1].m128);                         \
    _tmp[2].m128i = _mm_cvtps_epi32((_col)[2].m128);                         \
    _tmp[3].m128i = _mm_cvtps_epi32((_col)[3].m128);                         \
                                                                             \
    RwIm2DVertexSetIntRGBA((RxScrSpace2DVertex *) (_vert0),                  \
                        _tmp[0]._d[0], _tmp[1]._d[0],                        \
                        _tmp[2]._d[0], _tmp[3]._d[0]);                       \
                                                                             \
    RwIm2DVertexSetIntRGBA((RxScrSpace2DVertex *) (_vert1),                  \
                        _tmp[0]._d[1], _tmp[1]._d[1],                        \
                        _tmp[2]._d[1], _tmp[3]._d[1]);                       \
                                                                             \
    RwIm2DVertexSetIntRGBA((RxScrSpace2DVertex *) (_vert2),                  \
                        _tmp[0]._d[2], _tmp[1]._d[2],                        \
                        _tmp[2]._d[2], _tmp[3]._d[2]);                       \
                                                                             \
    RwIm2DVertexSetIntRGBA((RxScrSpace2DVertex *) (_vert3),                  \
                        _tmp[0]._d[3], _tmp[1]._d[3],                        \
                        _tmp[2]._d[3], _tmp[3]._d[3]);                       \
}                                                                            \
MACRO_STOP

#endif /* RW_WNI */

/*
 * Scatter four packed screen positions to four screen-space vertices.
 * _pos[0], _pos[1] and _pos[2] hold the X, Y and Z values; lane k of each
 * is passed to RwIm2DVertexSetScreenX/Y/Z for _vertk.  For any one vertex
 * the setters are still called in X, Y, Z order.
 */
#define _refineDevVertsSetScrPosSSE(_vert0, _vert1, _vert2, _vert3, _pos)    \
MACRO_START                                                                  \
{                                                                            \
    /* lane 0 -> _vert0 */                                                   \
    RwIm2DVertexSetScreenX((RxScrSpace2DVertex *) (_vert0), (_pos)[0]._f[0]);\
    RwIm2DVertexSetScreenY((RxScrSpace2DVertex *) (_vert0), (_pos)[1]._f[0]);\
    RwIm2DVertexSetScreenZ((RxScrSpace2DVertex *) (_vert0), (_pos)[2]._f[0]);\
                                                                             \
    /* lane 1 -> _vert1 */                                                   \
    RwIm2DVertexSetScreenX((RxScrSpace2DVertex *) (_vert1), (_pos)[0]._f[1]);\
    RwIm2DVertexSetScreenY((RxScrSpace2DVertex *) (_vert1), (_pos)[1]._f[1]);\
    RwIm2DVertexSetScreenZ((RxScrSpace2DVertex *) (_vert1), (_pos)[2]._f[1]);\
                                                                             \
    /* lane 2 -> _vert2 */                                                   \
    RwIm2DVertexSetScreenX((RxScrSpace2DVertex *) (_vert2), (_pos)[0]._f[2]);\
    RwIm2DVertexSetScreenY((RxScrSpace2DVertex *) (_vert2), (_pos)[1]._f[2]);\
    RwIm2DVertexSetScreenZ((RxScrSpace2DVertex *) (_vert2), (_pos)[2]._f[2]);\
                                                                             \
    /* lane 3 -> _vert3 */                                                   \
    RwIm2DVertexSetScreenX((RxScrSpace2DVertex *) (_vert3), (_pos)[0]._f[3]);\
    RwIm2DVertexSetScreenY((RxScrSpace2DVertex *) (_vert3), (_pos)[1]._f[3]);\
    RwIm2DVertexSetScreenZ((RxScrSpace2DVertex *) (_vert3), (_pos)[2]._f[3]);\
}                                                                            \
MACRO_STOP

/*
 * Scatter four packed camera-space positions, plus a packed reciprocal-Z,
 * to four screen-space vertices.
 *
 *   _pos : array; [0], [1], [2] hold the X, Y and Z values
 *   _z   : pointer to the packed reciprocal camera Z (read via ->_f[k])
 *
 * Lane k goes to _vertk.  For any one vertex the setters are still called
 * in CameraX, CameraY, CameraZ, RecipCameraZ order.
 */
#define _refineDevVertsSetCamPosSSE(_vert0, _vert1, _vert2, _vert3, _pos, _z) \
MACRO_START                                                                  \
{                                                                            \
    /* lane 0 -> _vert0 */                                                   \
    RwIm2DVertexSetCameraX((RxScrSpace2DVertex *) (_vert0), (_pos)[0]._f[0]);\
    RwIm2DVertexSetCameraY((RxScrSpace2DVertex *) (_vert0), (_pos)[1]._f[0]);\
    RwIm2DVertexSetCameraZ((RxScrSpace2DVertex *) (_vert0), (_pos)[2]._f[0]);\
    RwIm2DVertexSetRecipCameraZ((RxScrSpace2DVertex *) (_vert0),             \
                                (_z)->_f[0]);                                \
                                                                             \
    /* lane 1 -> _vert1 */                                                   \
    RwIm2DVertexSetCameraX((RxScrSpace2DVertex *) (_vert1), (_pos)[0]._f[1]);\
    RwIm2DVertexSetCameraY((RxScrSpace2DVertex *) (_vert1), (_pos)[1]._f[1]);\
    RwIm2DVertexSetCameraZ((RxScrSpace2DVertex *) (_vert1), (_pos)[2]._f[1]);\
    RwIm2DVertexSetRecipCameraZ((RxScrSpace2DVertex *) (_vert1),             \
                                (_z)->_f[1]);                                \
                                                                             \
    /* lane 2 -> _vert2 */                                                   \
    RwIm2DVertexSetCameraX((RxScrSpace2DVertex *) (_vert2), (_pos)[0]._f[2]);\
    RwIm2DVertexSetCameraY((RxScrSpace2DVertex *) (_vert2), (_pos)[1]._f[2]);\
    RwIm2DVertexSetCameraZ((RxScrSpace2DVertex *) (_vert2), (_pos)[2]._f[2]);\
    RwIm2DVertexSetRecipCameraZ((RxScrSpace2DVertex *) (_vert2),             \
                                (_z)->_f[2]);                                \
                                                                             \
    /* lane 3 -> _vert3 */                                                   \
    RwIm2DVertexSetCameraX((RxScrSpace2DVertex *) (_vert3), (_pos)[0]._f[3]);\
    RwIm2DVertexSetCameraY((RxScrSpace2DVertex *) (_vert3), (_pos)[1]._f[3]);\
    RwIm2DVertexSetCameraZ((RxScrSpace2DVertex *) (_vert3), (_pos)[2]._f[3]);\
    RwIm2DVertexSetRecipCameraZ((RxScrSpace2DVertex *) (_vert3),             \
                                (_z)->_f[3]);                                \
}                                                                            \
MACRO_STOP

/*
 * Scatter four packed texture coordinates to four screen-space vertices.
 *
 *   _uv : array; [0] holds the u values, [1] the v values
 *   _z  : pointer to a packed vector whose lane k is passed as the third
 *         argument of RwIm2DVertexSetU / RwIm2DVertexSetV
 *
 * Lane k goes to _vertk.  For any one vertex U is still set before V.
 */
#define _refineDevVertsSetUVSSE(_vert0, _vert1, _vert2, _vert3, _uv, _z)     \
MACRO_START                                                                  \
{                                                                            \
    /* lane 0 -> _vert0 */                                                   \
    RwIm2DVertexSetU((RxScrSpace2DVertex *) (_vert0),                        \
                     (_uv)[0]._f[0], (_z)->_f[0]);                           \
    RwIm2DVertexSetV((RxScrSpace2DVertex *) (_vert0),                        \
                     (_uv)[1]._f[0], (_z)->_f[0]);                           \
                                                                             \
    /* lane 1 -> _vert1 */                                                   \
    RwIm2DVertexSetU((RxScrSpace2DVertex *) (_vert1),                        \
                     (_uv)[0]._f[1], (_z)->_f[1]);                           \
    RwIm2DVertexSetV((RxScrSpace2DVertex *) (_vert1),                        \
                     (_uv)[1]._f[1], (_z)->_f[1]);                           \
                                                                             \
    /* lane 2 -> _vert2 */                                                   \
    RwIm2DVertexSetU((RxScrSpace2DVertex *) (_vert2),                        \
                     (_uv)[0]._f[2], (_z)->_f[2]);                           \
    RwIm2DVertexSetV((RxScrSpace2DVertex *) (_vert2),                        \
                     (_uv)[1]._f[2], (_z)->_f[2]);                           \
                                                                             \
    /* lane 3 -> _vert3 */                                                   \
    RwIm2DVertexSetU((RxScrSpace2DVertex *) (_vert3),                        \
                     (_uv)[0]._f[3], (_z)->_f[3]);                           \
    RwIm2DVertexSetV((RxScrSpace2DVertex *) (_vert3),                        \
                     (_uv)[1]._f[3], (_z)->_f[3]);                           \
}                                                                            \
MACRO_STOP

/*****************************************************************************
 * Get and set the Camera verts' pos.
 */
/*
 * Read the cameraVertex member of an RxCamSpace3DVertex and broadcast it:
 * _pos[0], _pos[1] and _pos[2] receive x, y and z, each replicated across
 * all four lanes.
 */
#define _refineCamVertsGetPosSSE(_vert, _pos)                                \
MACRO_START                                                                  \
{                                                                            \
    /* x */                                                                  \
    (_pos)[0].m128 =                                                         \
        _mm_set_ps1(((RxCamSpace3DVertex *) (_vert))->cameraVertex.x);       \
    /* y */                                                                  \
    (_pos)[1].m128 =                                                         \
        _mm_set_ps1(((RxCamSpace3DVertex *) (_vert))->cameraVertex.y);       \
    /* z */                                                                  \
    (_pos)[2].m128 =                                                         \
        _mm_set_ps1(((RxCamSpace3DVertex *) (_vert))->cameraVertex.z);       \
}                                                                            \
MACRO_STOP

/*
 * Scatter four packed camera-space positions to four RxCamSpace3DVertex
 * destinations.  _pos is run through _refine_transpose_v3d_in_SSE to get
 * one vector per vertex, and the .v4d.v3d view of vector k is assigned to
 * the cameraVertex member of _vertk.
 */
#define _refineCamVertsSetPosSSE(_vert0, _vert1, _vert2, _vert3, _pos)       \
MACRO_START                                                                  \
{                                                                            \
    RpSSEOverlayM128 _transpose[4];                                          \
                                                                             \
    /* Component-major -> vertex-major. */                                   \
    _refine_transpose_v3d_in_SSE(_transpose, (_pos));                        \
                                                                             \
    /* One whole-struct copy per vertex. */                                  \
    ((RxCamSpace3DVertex *) (_vert0))->cameraVertex = _transpose[0].v4d.v3d; \
    ((RxCamSpace3DVertex *) (_vert1))->cameraVertex = _transpose[1].v4d.v3d; \
    ((RxCamSpace3DVertex *) (_vert2))->cameraVertex = _transpose[2].v4d.v3d; \
    ((RxCamSpace3DVertex *) (_vert3))->cameraVertex = _transpose[3].v4d.v3d; \
}                                                                            \
MACRO_STOP

/*****************************************************************************
 * Get and set the Camera verts' Nrm.
 */
/*
 * Read one RwV3d through _vert and broadcast it: _nrm[0], _nrm[1] and
 * _nrm[2] receive x, y and z, each replicated across all four lanes.
 */
#define _refineCamVertsGetNrmSSE(_vert, _nrm)                                \
MACRO_START                                                                  \
{                                                                            \
    /* _nrm[0] = x, [1] = y, [2] = z */                                      \
    (_nrm)[0].m128 = _mm_set_ps1(((RwV3d *) (_vert))->x);                    \
    (_nrm)[1].m128 = _mm_set_ps1(((RwV3d *) (_vert))->y);                    \
    (_nrm)[2].m128 = _mm_set_ps1(((RwV3d *) (_vert))->z);                    \
}                                                                            \
MACRO_STOP

/*
 * Store four camera-space normals in one go.
 *
 * (_nrm) is handed to _refine_transpose_v3d_in_SSE() together with a
 * four-entry scratch array; entry k of that array (its .v4d.v3d view) is then
 * copied to the RwV3d that (_vertK) points at.  Stores are issued in vertex
 * order 0..3.
 *
 * NOTE(review): presumably (_nrm) holds x, y and z as three packed registers
 * with one lane per vertex - confirm against _refine_transpose_v3d_in_SSE.
 */
#define _refineCamVertsSetNrmSSE(_vert0, _vert1, _vert2, _vert3, _nrm)       \
MACRO_START                                                                  \
{                                                                            \
    RpSSEOverlayM128      _transpose[4];                                     \
    RwV3d                *_nrmDst0 = (RwV3d *) (_vert0);                     \
    RwV3d                *_nrmDst1 = (RwV3d *) (_vert1);                     \
    RwV3d                *_nrmDst2 = (RwV3d *) (_vert2);                     \
    RwV3d                *_nrmDst3 = (RwV3d *) (_vert3);                     \
                                                                             \
    _refine_transpose_v3d_in_SSE(_transpose, (_nrm));                        \
                                                                             \
    *_nrmDst0 = _transpose[0].v4d.v3d;                                       \
    *_nrmDst1 = _transpose[1].v4d.v3d;                                       \
    *_nrmDst2 = _transpose[2].v4d.v3d;                                       \
    *_nrmDst3 = _transpose[3].v4d.v3d;                                       \
}                                                                            \
MACRO_STOP

/*****************************************************************************
 * Get and set the Camera verts' UV.
 */
/*
 * Load one camera vertex's texture coordinates for SIMD use.
 *
 * (_vert) is read as an RxCamSpace3DVertex; its u is replicated across all
 * four lanes of (_uv)[0] and its v across all four lanes of (_uv)[1].
 */
#define _refineCamVertsGetUVSSE(_vert, _uv)                                  \
MACRO_START                                                                  \
{                                                                            \
    const RxCamSpace3DVertex *_uvSrc =                                       \
        (const RxCamSpace3DVertex *) (_vert);                                \
                                                                             \
    (_uv)[0].m128 = _mm_set_ps1(_uvSrc->u);                                  \
    (_uv)[1].m128 = _mm_set_ps1(_uvSrc->v);                                  \
}                                                                            \
MACRO_STOP

/*
 * Scatter packed texture coordinates to four camera vertices.
 *
 * Lane k of (_uv)[0] becomes the u of (_vertK) and lane k of (_uv)[1] becomes
 * its v; every (_vertK) is accessed as an RxCamSpace3DVertex.  All u values
 * are written first, then all v values, each in vertex order 0..3.
 */
#define _refineCamVertsSetUVSSE(_vert0, _vert1, _vert2, _vert3, _uv)         \
MACRO_START                                                                  \
{                                                                            \
    RxCamSpace3DVertex   *_uvDst0 = (RxCamSpace3DVertex *) (_vert0);         \
    RxCamSpace3DVertex   *_uvDst1 = (RxCamSpace3DVertex *) (_vert1);         \
    RxCamSpace3DVertex   *_uvDst2 = (RxCamSpace3DVertex *) (_vert2);         \
    RxCamSpace3DVertex   *_uvDst3 = (RxCamSpace3DVertex *) (_vert3);         \
                                                                             \
    _uvDst0->u = (_uv)[0]._f[0];                                             \
    _uvDst1->u = (_uv)[0]._f[1];                                             \
    _uvDst2->u = (_uv)[0]._f[2];                                             \
    _uvDst3->u = (_uv)[0]._f[3];                                             \
                                                                             \
    _uvDst0->v = (_uv)[1]._f[0];                                             \
    _uvDst1->v = (_uv)[1]._f[1];                                             \
    _uvDst2->v = (_uv)[1]._f[2];                                             \
    _uvDst3->v = (_uv)[1]._f[3];                                             \
}                                                                            \
MACRO_STOP

/*
 * Scatter packed clip flags to four camera vertices (SSE path).
 *
 * (_flag) points at a register whose lanes are read through the float view
 * (_f).  Each lane goes through an RwSplitBits (written via nReal, read back
 * via nUInt) so that it lands in clipFlags as an unsigned integer.  Lane k is
 * stored to (_vertK), accessed as an RxCamSpace3DVertex, in vertex order
 * 0..3.
 */
#define _refineCamVertsSetFlagSSE(_vert0, _vert1, _vert2, _vert3, _flag)     \
MACRO_START                                                                  \
{                                                                            \
    RwSplitBits         _lane[4];                                            \
                                                                             \
    _lane[0].nReal = (_flag)->_f[0];                                         \
    _lane[1].nReal = (_flag)->_f[1];                                         \
    _lane[2].nReal = (_flag)->_f[2];                                         \
    _lane[3].nReal = (_flag)->_f[3];                                         \
                                                                             \
    ((RxCamSpace3DVertex *) (_vert0))->clipFlags = _lane[0].nUInt;           \
    ((RxCamSpace3DVertex *) (_vert1))->clipFlags = _lane[1].nUInt;           \
    ((RxCamSpace3DVertex *) (_vert2))->clipFlags = _lane[2].nUInt;           \
    ((RxCamSpace3DVertex *) (_vert3))->clipFlags = _lane[3].nUInt;           \
}                                                                            \
MACRO_STOP

#ifdef RW_WNI

/*
 * Scatter packed clip flags to four camera vertices (WNI path).
 *
 * (_flag) points at a register whose lanes are read through the _d view;
 * lane k is stored directly into the clipFlags of (_vertK), accessed as an
 * RxCamSpace3DVertex, in vertex order 0..3.
 */
#define _refineCamVertsSetFlagWNI(_vert0, _vert1, _vert2, _vert3, _flag)     \
MACRO_START                                                                  \
{                                                                            \
    RxCamSpace3DVertex   *_flagDst0 = (RxCamSpace3DVertex *) (_vert0);       \
    RxCamSpace3DVertex   *_flagDst1 = (RxCamSpace3DVertex *) (_vert1);       \
    RxCamSpace3DVertex   *_flagDst2 = (RxCamSpace3DVertex *) (_vert2);       \
    RxCamSpace3DVertex   *_flagDst3 = (RxCamSpace3DVertex *) (_vert3);       \
                                                                             \
    _flagDst0->clipFlags = (_flag)->_d[0];                                   \
    _flagDst1->clipFlags = (_flag)->_d[1];                                   \
    _flagDst2->clipFlags = (_flag)->_d[2];                                   \
    _flagDst3->clipFlags = (_flag)->_d[3];                                   \
}                                                                            \
MACRO_STOP

#endif /* RW_WNI */

/*****************************************************************************
 * Get the triangles' obj verts.
 */
/*
 * Fetch the object-space data of one triangle.
 *
 * For each of the three corner indices (idx0, idx1, idx2) the matching entry
 * of the objVerts cluster is looked up and passed to
 * _refineObjVertsGetPosSSE / _refineObjVertsGetNrmSSE, which fill objPos[k]
 * and objNrm[k] for corner k.
 */
static void
_refineTriangleGetObjVertsSSE(refineSSERefineData * refineData,
                              RwInt32 idx0, RwInt32 idx1, RwInt32 idx2,
                              RpSSEOverlayM128 objPos[3][3],
                              RpSSEOverlayM128 objNrm[3][3])
{
    RxCluster          *srcCluster;
    RwInt32             cornerIdx[3];
    RwInt32             corner;

    RWFUNCTION(RWSTRING("_refineTriangleGetObjVertsSSE"));

    srcCluster = refineData->refineData.objVerts;

    cornerIdx[0] = idx0;
    cornerIdx[1] = idx1;
    cornerIdx[2] = idx2;

    for (corner = 0; corner < 3; corner++)
    {
        RwChar             *srcVert =
            RxClusterGetIndexedData(srcCluster, RwChar,
                                    cornerIdx[corner]);

        _refineObjVertsGetPosSSE(srcVert, objPos[corner]);
        _refineObjVertsGetNrmSSE(srcVert, objNrm[corner]);
    }

    RWRETURNVOID();
}

/*
 * Fetch the camera-space data of one triangle.
 *
 * For each corner index (idx0, idx1, idx2) the matching entries of the
 * camVerts and camNorms clusters are looked up.  The camVerts entry supplies
 * the position (camPos[k]) and texture coordinates (camUV[k]); the camNorms
 * entry supplies the normal (camNrm[k]).
 */
static void
_refineTriangleGetCamVertsSSE(refineSSERefineData * refineData,
                              RwInt32 idx0, RwInt32 idx1, RwInt32 idx2,
                              RpSSEOverlayM128 camPos[3][3],
                              RpSSEOverlayM128 camNrm[3][3],
                              RpSSEOverlayM128 camUV[3][2])
{
    RxCluster          *vertCluster;
    RxCluster          *normCluster;
    RwInt32             cornerIdx[3];
    RwInt32             corner;

    RWFUNCTION(RWSTRING("_refineTriangleGetCamVertsSSE"));

    vertCluster = refineData->refineData.camVerts;
    normCluster = refineData->refineData.camNorms;

    cornerIdx[0] = idx0;
    cornerIdx[1] = idx1;
    cornerIdx[2] = idx2;

    for (corner = 0; corner < 3; corner++)
    {
        RwChar             *srcVert =
            RxClusterGetIndexedData(vertCluster, RwChar,
                                    cornerIdx[corner]);
        RwChar             *srcNorm =
            RxClusterGetIndexedData(normCluster, RwChar,
                                    cornerIdx[corner]);

        _refineCamVertsGetPosSSE(srcVert, camPos[corner]);
        _refineCamVertsGetNrmSSE(srcNorm, camNrm[corner]);
        _refineCamVertsGetUVSSE(srcVert, camUV[corner]);
    }

    RWRETURNVOID();
}

/*
 * Fetch the device-vertex colours of one triangle.
 *
 * For each corner index (idx0, idx1, idx2) the matching entry of the
 * devVerts cluster is looked up and passed to _refineDevVertsGetColSSE,
 * which fills devCol[k] for corner k.
 */
static void
_refineTriangleGetDevVertsSSE(refineSSERefineData * refineData,
                              RwInt32 idx0, RwInt32 idx1, RwInt32 idx2,
                              RpSSEOverlayM128 devCol[3][4])
{
    RxCluster          *srcCluster;
    RwInt32             cornerIdx[3];
    RwInt32             corner;

    RWFUNCTION(RWSTRING("_refineTriangleGetDevVertsSSE"));

    srcCluster = refineData->refineData.devVerts;

    cornerIdx[0] = idx0;
    cornerIdx[1] = idx1;
    cornerIdx[2] = idx2;

    for (corner = 0; corner < 3; corner++)
    {
        RwChar             *srcVert =
            RxClusterGetIndexedData(srcCluster, RwChar,
                                    cornerIdx[corner]);

        _refineDevVertsGetColSSE(srcVert, devCol[corner]);
    }

    RWRETURNVOID();
}

/*
 * Fetch the extra UV sets and extra RGBA sets of one triangle.
 *
 * For every present UV set s and corner k (k = 0..2 for idx0, idx1, idx2)
 * the set's cluster entry is passed to _refineExtraGetUVSSE with destination
 * &extraUV[(3 * s + k) * 2].  For every present RGBA set s and corner k the
 * entry is passed to _refineExtraGetColSSE with destination
 * &extraRGBA[(3 * s + k) * 4].  Nothing is touched for a set count of zero.
 *
 * NOTE(review): the slot spacing suggests each UV fetch fills 2 registers and
 * each colour fetch fills 4 - confirm against the two fetch macros.
 */
static void
_refineTriangleExtraGetSSE(refineSSERefineData * refineData,
                           RwInt32 idx0, RwInt32 idx1, RwInt32 idx2,
                           RpSSEOverlayM128 * extraUV,
                           RpSSEOverlayM128 * extraRGBA)
{
    RwInt32             cornerIdx[3];
    RwInt32             set, corner;
    RwChar             *srcVert;

    RWFUNCTION(RWSTRING("_refineTriangleExtraGetSSE"));

    cornerIdx[0] = idx0;
    cornerIdx[1] = idx1;
    cornerIdx[2] = idx2;

    /* Extra texture coordinate sets: two slots per corner. */
    for (set = 0; set < refineData->refineData.numPresentUVs; set++)
    {
        for (corner = 0; corner < 3; corner++)
        {
            srcVert =
                RxClusterGetIndexedData(refineData->refineData.
                                        extraUVs[set], RwChar,
                                        cornerIdx[corner]);

            _refineExtraGetUVSSE(srcVert,
                                 &extraUV[(3 * set + corner) * 2]);
        }
    }

    /* Extra colour sets: four slots per corner. */
    for (set = 0; set < refineData->refineData.numPresentRGBAs; set++)
    {
        for (corner = 0; corner < 3; corner++)
        {
            srcVert =
                RxClusterGetIndexedData(refineData->refineData.
                                        extraRGBAs[set], RwChar,
                                        cornerIdx[corner]);

            _refineExtraGetColSSE(srcVert,
                                  &extraRGBA[(3 * set + corner) * 4]);
        }
    }

    RWRETURNVOID();
}

/* I thought that this would be an RxPipeline node??? */

/*****************************************************************************
 * Project the verts.
 */
/*
 * Project four vertices at once and record their clip flags (SSE path).
 *
 * camPos[0..2] hold the packed x, y and z of the four vertices and camUV the
 * packed texture coordinates.  The steps, in order:
 *   1. take the approximate reciprocal of z (_mm_rcp_ps) and hand it, with
 *      the unprojected camPos, to _refineDevVertsSetCamPosSSE;
 *   2. build the clip flags per lane:
 *        x: xHiClip where x > z, otherwise xLoClip where x < 0;
 *        y: yHiClip where y > z, otherwise yLoClip where y < 0;
 *        z: zLoClip where z < clipNear, otherwise zHiClip where z > clipFar;
 *      and OR the three together;
 *   3. overwrite camPos with the projected values
 *        x' = (x * 1/z) * camWidth  + camOffsetX
 *        y' = (y * 1/z) * camHeight + camOffsetY
 *        z' = zScale * 1/z + zShift
 *      (the flags have to be built first because of this overwrite);
 *   4. pass the results to _refineDevVertsSetScrPosSSE,
 *      _refineDevVertsSetUVSSE and _refineCamVertsSetFlagSSE.
 *
 * camData is not used.
 */
static void
_refineProjectVertsSSE(refineSSERefineData * refineData,
                       refineCameraData * camData __RWUNUSED__,
                       RwChar * camVert0, RwChar * camVert1,
                       RwChar * camVert2, RwChar * camVert3,
                       RwChar * devVert0, RwChar * devVert1,
                       RwChar * devVert2, RwChar * devVert3,
                       RpSSEOverlayM128 * camPos,
                       RpSSEOverlayM128 * camUV)
{
    static const RpSSEOverlayM128 origin = { {(RwReal) 0.0,
                                              (RwReal) 0.0,
                                              (RwReal) 0.0,
                                              (RwReal) 0.0}
    };

    RpSSEOverlayM128    recipZ;
    RpSSEOverlayM128    overTest, overHit, underHit;
    RpSSEOverlayM128    nearTest, nearHit, farHit;
    RpSSEOverlayM128    axisBits, clipBits;
    RpSSEOverlayM128    scaled;

    RWFUNCTION(RWSTRING("_refineProjectVertsSSE"));

    recipZ.m128 = _mm_rcp_ps(camPos[2].m128);

    _refineDevVertsSetCamPosSSE(devVert0, devVert1, devVert2, devVert3,
                                camPos, &recipZ);

    /* X flags: the "x > z" test overrides the "x < 0" test. */
    overTest.m128 = _mm_cmpgt_ps(camPos[0].m128, camPos[2].m128);
    overHit.m128 = _mm_and_ps(overTest.m128,
                              refineData->sse.xHiClip.m128);
    underHit.m128 = _mm_and_ps(_mm_cmplt_ps(camPos[0].m128, origin.m128),
                               refineData->sse.xLoClip.m128);
    clipBits.m128 = _mm_or_ps(overHit.m128,
                              _mm_andnot_ps(overTest.m128,
                                            underHit.m128));

    /* Y flags: same scheme as X. */
    overTest.m128 = _mm_cmpgt_ps(camPos[1].m128, camPos[2].m128);
    overHit.m128 = _mm_and_ps(overTest.m128,
                              refineData->sse.yHiClip.m128);
    underHit.m128 = _mm_and_ps(_mm_cmplt_ps(camPos[1].m128, origin.m128),
                               refineData->sse.yLoClip.m128);
    axisBits.m128 = _mm_or_ps(overHit.m128,
                              _mm_andnot_ps(overTest.m128,
                                            underHit.m128));
    clipBits.m128 = _mm_or_ps(clipBits.m128, axisBits.m128);

    /* Z flags: the near-plane test overrides the far-plane test. */
    nearTest.m128 = _mm_cmplt_ps(camPos[2].m128,
                                 refineData->clipNear.m128);
    nearHit.m128 = _mm_and_ps(nearTest.m128,
                              refineData->sse.zLoClip.m128);
    farHit.m128 = _mm_and_ps(_mm_cmpgt_ps(camPos[2].m128,
                                          refineData->clipFar.m128),
                             refineData->sse.zHiClip.m128);
    axisBits.m128 = _mm_or_ps(nearHit.m128,
                              _mm_andnot_ps(nearTest.m128, farHit.m128));
    clipBits.m128 = _mm_or_ps(clipBits.m128, axisBits.m128);

    /* Perspective projection.  This overwrites camPos, which is why the
     * clip flags were derived first.
     */
    scaled.m128 = _mm_mul_ps(camPos[0].m128, recipZ.m128);
    scaled.m128 = _mm_mul_ps(scaled.m128, refineData->camWidth.m128);
    camPos[0].m128 = _mm_add_ps(scaled.m128,
                                refineData->camOffsetX.m128);

    scaled.m128 = _mm_mul_ps(camPos[1].m128, recipZ.m128);
    scaled.m128 = _mm_mul_ps(scaled.m128, refineData->camHeight.m128);
    camPos[1].m128 = _mm_add_ps(scaled.m128,
                                refineData->camOffsetY.m128);

    scaled.m128 = _mm_mul_ps(refineData->zScale.m128, recipZ.m128);
    camPos[2].m128 = _mm_add_ps(scaled.m128, refineData->zShift.m128);

    /* Hand the screen position, texture coordinates and flags over. */
    _refineDevVertsSetScrPosSSE(devVert0, devVert1, devVert2, devVert3,
                                camPos);
    _refineDevVertsSetUVSSE(devVert0, devVert1, devVert2, devVert3,
                            camUV, &recipZ);

    _refineCamVertsSetFlagSSE(camVert0, camVert1, camVert2, camVert3,
                              &clipBits);

    RWRETURNVOID();
}

#ifdef RW_WNI

/*
 * Project four vertices at once and record their clip flags (WNI path).
 *
 * Same sequence as the SSE projection, except that the clip flags are
 * combined with integer operations: each float comparison mask is rebuilt
 * as an integer register from its four _d lanes (_mm_setr_epi32) and then
 * masked against the refineData->wni.* clip constants.
 *
 * Per lane:
 *   x: xHiClip where x > z, otherwise xLoClip where x < 0;
 *   y: yHiClip where y > z, otherwise yLoClip where y < 0;
 *   z: zLoClip where z < clipNear, otherwise zHiClip where z > clipFar.
 * The three results are ORed and stored with _refineCamVertsSetFlagWNI after
 * camPos has been overwritten by the projected values
 *   x' = (x * 1/z) * camWidth  + camOffsetX
 *   y' = (y * 1/z) * camHeight + camOffsetY
 *   z' = zScale * 1/z + zShift
 * where 1/z is the approximate reciprocal from _mm_rcp_ps.
 *
 * camData is not used.
 */
static void
_refineProjectVertsWNI(refineSSERefineData * refineData,
                       refineCameraData * camData __RWUNUSED__,
                       RwChar * camVert0, RwChar * camVert1,
                       RwChar * camVert2, RwChar * camVert3,
                       RwChar * devVert0, RwChar * devVert1,
                       RwChar * devVert2, RwChar * devVert3,
                       RpSSEOverlayM128 * camPos,
                       RpSSEOverlayM128 * camUV)
{
    static const RpSSEOverlayM128 origin = { {(RwReal) 0.0,
                                              (RwReal) 0.0,
                                              (RwReal) 0.0,
                                              (RwReal) 0.0}
    };

    RpSSEOverlayM128    recipZ, cmp, scaled;
    RpWNIOverlayM128i   overTest, overHit, underHit;
    RpWNIOverlayM128i   nearTest, nearHit, farHit;
    RpWNIOverlayM128i   axisBits, clipBits;

    RWFUNCTION(RWSTRING("_refineProjectVertsWNI"));

    recipZ.m128 = _mm_rcp_ps(camPos[2].m128);

    _refineDevVertsSetCamPosSSE(devVert0, devVert1, devVert2, devVert3,
                                camPos, &recipZ);

    /* X flags: the "x > z" test overrides the "x < 0" test. */
    cmp.m128 = _mm_cmpgt_ps(camPos[0].m128, camPos[2].m128);
    overTest.m128i =
        _mm_setr_epi32(cmp._d[0], cmp._d[1], cmp._d[2], cmp._d[3]);
    overHit.m128i = _mm_and_si128(overTest.m128i,
                                  refineData->wni.xHiClip.m128i);

    cmp.m128 = _mm_cmplt_ps(camPos[0].m128, origin.m128);
    underHit.m128i =
        _mm_setr_epi32(cmp._d[0], cmp._d[1], cmp._d[2], cmp._d[3]);
    underHit.m128i = _mm_and_si128(underHit.m128i,
                                   refineData->wni.xLoClip.m128i);

    clipBits.m128i = _mm_or_si128(overHit.m128i,
                                  _mm_andnot_si128(overTest.m128i,
                                                   underHit.m128i));

    /* Y flags: same scheme as X. */
    cmp.m128 = _mm_cmpgt_ps(camPos[1].m128, camPos[2].m128);
    overTest.m128i =
        _mm_setr_epi32(cmp._d[0], cmp._d[1], cmp._d[2], cmp._d[3]);
    overHit.m128i = _mm_and_si128(overTest.m128i,
                                  refineData->wni.yHiClip.m128i);

    cmp.m128 = _mm_cmplt_ps(camPos[1].m128, origin.m128);
    underHit.m128i =
        _mm_setr_epi32(cmp._d[0], cmp._d[1], cmp._d[2], cmp._d[3]);
    underHit.m128i = _mm_and_si128(underHit.m128i,
                                   refineData->wni.yLoClip.m128i);

    axisBits.m128i = _mm_or_si128(overHit.m128i,
                                  _mm_andnot_si128(overTest.m128i,
                                                   underHit.m128i));
    clipBits.m128i = _mm_or_si128(clipBits.m128i, axisBits.m128i);

    /* Z flags: the near-plane test overrides the far-plane test. */
    cmp.m128 = _mm_cmplt_ps(camPos[2].m128, refineData->clipNear.m128);
    nearTest.m128i =
        _mm_setr_epi32(cmp._d[0], cmp._d[1], cmp._d[2], cmp._d[3]);
    nearHit.m128i = _mm_and_si128(nearTest.m128i,
                                  refineData->wni.zLoClip.m128i);

    cmp.m128 = _mm_cmpgt_ps(camPos[2].m128, refineData->clipFar.m128);
    farHit.m128i =
        _mm_setr_epi32(cmp._d[0], cmp._d[1], cmp._d[2], cmp._d[3]);
    farHit.m128i = _mm_and_si128(farHit.m128i,
                                 refineData->wni.zHiClip.m128i);

    axisBits.m128i = _mm_or_si128(nearHit.m128i,
                                  _mm_andnot_si128(nearTest.m128i,
                                                   farHit.m128i));
    clipBits.m128i = _mm_or_si128(clipBits.m128i, axisBits.m128i);

    /* Perspective projection.  This overwrites camPos, which is why the
     * clip flags were derived first.
     */
    scaled.m128 = _mm_mul_ps(camPos[0].m128, recipZ.m128);
    scaled.m128 = _mm_mul_ps(scaled.m128, refineData->camWidth.m128);
    camPos[0].m128 = _mm_add_ps(scaled.m128,
                                refineData->camOffsetX.m128);

    scaled.m128 = _mm_mul_ps(camPos[1].m128, recipZ.m128);
    scaled.m128 = _mm_mul_ps(scaled.m128, refineData->camHeight.m128);
    camPos[1].m128 = _mm_add_ps(scaled.m128,
                                refineData->camOffsetY.m128);

    scaled.m128 = _mm_mul_ps(refineData->zScale.m128, recipZ.m128);
    camPos[2].m128 = _mm_add_ps(scaled.m128, refineData->zShift.m128);

    /* Hand the screen position, texture coordinates and flags over. */
    _refineDevVertsSetScrPosSSE(devVert0, devVert1, devVert2, devVert3,
                                camPos);
    _refineDevVertsSetUVSSE(devVert0, devVert1, devVert2, devVert3,
                            camUV, &recipZ);

    _refineCamVertsSetFlagWNI(camVert0, camVert1, camVert2, camVert3,
                              &clipBits);

    RWRETURNVOID();
}

#endif /* RW_WNI */

/*****************************************************************************
 * Generate the refinement.
 */
RwBool
_rtrefineGenerateRefinementSSE(refineSSERefineData * refineData,
                               refineCameraData * camData,
                               RxHeap * heap)
{
    static const RpSSEOverlayM128 one = { {(RwReal) 1.0,
                                           (RwReal) 1.0,
                                           (RwReal) 1.0,
                                           (RwReal) 1.0}
    };
    BBTPSSEOrdinates    ords;

    RwChar             *idx;
    RwChar             *camVert0, *camVert1, *camVert2, *camVert3;
    RwChar             *camNorm0, *camNorm1, *camNorm2, *camNorm3;
    RwChar             *devVert0, *devVert1, *devVert2, *devVert3;
    RwChar            **extraUV0, **extraUV1, **extraUV2, **extraUV3;
    RwChar            **extraRGBA0, **extraRGBA1, **extraRGBA2,
        **extraRGBA3;
    RwInt32             idxsStride, camVertsStride, camNormsStride,
        devVertsStride, extraUVStride, extraRGBAStride;
    RwInt32             i, j, k, i0, i1, i2, i3, d, newVerts,
        oldNumVerts, numElems, numTris, numPresentRGBAs, numPresentUVs;
    RwReal             *k1, *k2;

    RwInt32             i0_start, i1_start, i2_start, i3_start,
        idx0, idx1, idx2;

    RpSSEOverlayM128    tmpPos[3], tmpNrm[3], tmpCol[4], tmpUV[2];
    RpSSEOverlayM128    camPos[3][3], camNrm[3][3], camUV[3][2];
    RpSSEOverlayM128    devCol[3][4];
    RpSSEOverlayM128   *extraRGBAs, *extraUVs;
    RpSSEOverlayM128    l1, l2; /* multi-indices */
    RpSSEOverlayM128    t1, t2, t3; /* barycentric coords */
    RpSSEOverlayM128    height, depth, recip;
    RwBool              result;

    RWFUNCTION(RWSTRING("_rtrefineGenerateRefinementSSE"));

    result = TRUE;

    /* Set up the weights index. */
    d = refineData->refineData.depth;

    k1 = (RwReal *) RxHeapAlloc(heap, (d * d) * sizeof(RwReal));
    k2 = (RwReal *) RxHeapAlloc(heap, (d * d) * sizeof(RwReal));

    newVerts = 0;

    if ((k1 != NULL) && (k2 != NULL))
    {
        for (i = 0; i <= d; i++)
        {
            i1 = i;

            for (j = 0; j <= d - i; j++)
            {
                i2 = d - i - j;
                i3 = d - i1 - i2;

                if ((i1 == d) || (i2 == d) || (i3 == d))
                {
                    /* vertices, nothing to do */
                }
                else
                {
                    k1[newVerts] = (RwReal) i1;
                    k2[newVerts] = (RwReal) i2;

                    newVerts++;
                }
            }
        }
    }
    else
        result = FALSE;

    /* Extra props. */
    numPresentUVs = refineData->refineData.numPresentUVs;
    numPresentRGBAs = refineData->refineData.numPresentRGBAs;

    extraUVs = NULL;
    extraUV0 = NULL;
    extraUV1 = NULL;
    extraUV2 = NULL;
    extraUV3 = NULL;

    if (result == TRUE)
    {
        if (numPresentUVs != 0)
        {
            extraUVs =
                RxHeapAlloc(heap,
                            2 * (3 * numPresentUVs *
                                 sizeof(RpSSEOverlayM128)));

            extraUV0 =
                RxHeapAlloc(heap, numPresentUVs * sizeof(RwChar *));

            extraUV1 =
                RxHeapAlloc(heap, numPresentUVs * sizeof(RwChar *));

            extraUV2 =
                RxHeapAlloc(heap, numPresentUVs * sizeof(RwChar *));

            extraUV3 =
                RxHeapAlloc(heap, numPresentUVs * sizeof(RwChar *));

            if ((extraUVs == NULL) ||
                (extraUV0 == NULL) ||
                (extraUV1 == NULL) ||
                (extraUV2 == NULL) || (extraUV3 == NULL))
                result = FALSE;
        }
    }

    extraRGBAs = NULL;
    extraRGBA0 = NULL;
    extraRGBA1 = NULL;
    extraRGBA2 = NULL;
    extraRGBA3 = NULL;

    if (result == TRUE)
    {
        if (numPresentRGBAs != 0)
        {
            extraRGBAs =
                RxHeapAlloc(heap,
                            4 * (3 * numPresentRGBAs *
                                 sizeof(RpSSEOverlayM128)));

            extraRGBA0 =
                RxHeapAlloc(heap, numPresentRGBAs * sizeof(RwChar *));

            extraRGBA1 =
                RxHeapAlloc(heap, numPresentRGBAs * sizeof(RwChar *));

            extraRGBA2 =
                RxHeapAlloc(heap, numPresentRGBAs * sizeof(RwChar *));

            extraRGBA3 =
                RxHeapAlloc(heap, numPresentRGBAs * sizeof(RwChar *));

            if ((extraRGBAs == NULL) ||
                (extraRGBA0 == NULL) ||
                (extraRGBA1 == NULL) ||
                (extraRGBA2 == NULL) || (extraRGBA3 == NULL))
                result = FALSE;
        }
    }

    if (result == TRUE)
    {
        /* copy the globals into local */
        numTris = refineData->refineData.oldNumTris;
        oldNumVerts = refineData->refineData.oldNumVerts;

        depth.m128 = _mm_set_ps1((RwReal) d);
        recip.m128 = _mm_rcp_ps(depth.m128);

        /* setup for the main loop. */
        idxsStride = refineData->idxsStride * 3;
        camVertsStride = refineData->camVertsStride;
        camNormsStride = refineData->camNormsStride;
        devVertsStride = refineData->devVertsStride;
        extraUVStride = refineData->extraUVStride;
        extraRGBAStride = refineData->extraRGBAStride;

        idx =
            RxClusterGetCursorData(refineData->refineData.idxs, RwChar);
        camNorm0 =
            RxClusterGetIndexedData(refineData->refineData.camNorms,
                                    RwChar, oldNumVerts);
        camVert0 =
            RxClusterGetIndexedData(refineData->refineData.camVerts,
                                    RwChar, oldNumVerts);
        devVert0 =
            RxClusterGetIndexedData(refineData->refineData.devVerts,
                                    RwChar, oldNumVerts);

        for (i = 0; i < numPresentUVs; i++)
        {
            extraUV0[i] =
                RxClusterGetIndexedData(refineData->refineData.
                                        extraUVs[i], RwChar,
                                        oldNumVerts);
        }
        for (i = 0; i < numPresentRGBAs; i++)
        {
            extraRGBA0[i] =
                RxClusterGetIndexedData(refineData->refineData.
                                        extraRGBAs[i], RwChar,
                                        oldNumVerts);
        }

        i1_start = 0;
        i2_start = 0;
        i3_start = 0;

        switch (newVerts & 3)
        {
            case 0:
                i1_start = 1;

            case 3:
                i2_start = i1_start + 1;

            case 2:
                i3_start = i2_start + 1;

            default:
                ;
        }

        /* Main process loop. */
        while (--numTris >= 0)
        {
            idx0 = ((RxVertexIndex *) idx)[0];
            idx1 = ((RxVertexIndex *) idx)[1];
            idx2 = ((RxVertexIndex *) idx)[2];

            /* copy the obj vertex data into SSEOverlayM128.
             * We only need the obj verts at this point so we borrow the cam
             * vars.
             */
            _refineTriangleGetObjVertsSSE(refineData, idx0, idx1, idx2,
                                          camPos, camNrm);

            /* generate ords */
            _rtbbtpSSEGenerateOrdinates(&ords,
                                        camPos[0], camNrm[0], camPos[1],
                                        camNrm[1], camPos[2],
                                        camNrm[2]);

            /* copy the cam and dev vertex data. */
            _refineTriangleGetCamVertsSSE(refineData, idx0, idx1, idx2,
                                          camPos, camNrm, camUV);
            _refineTriangleGetDevVertsSSE(refineData, idx0, idx1, idx2,
                                          devCol);
            _refineTriangleExtraGetSSE(refineData, idx0, idx1, idx2,
                                       extraUVs, extraRGBAs);

            /* Setup the initial pointers for the triangle refinement loop. */
            i0 = 0;
            i1 = i1_start;
            i2 = i2_start;
            i3 = i3_start;

            camNorm1 = camNorm0 + (i1_start * camNormsStride);
            camVert1 = camVert0 + (i1_start * camVertsStride);
            devVert1 = devVert0 + (i1_start * devVertsStride);

            camNorm2 = camNorm0 + (i2_start * camNormsStride);
            camVert2 = camVert0 + (i2_start * camVertsStride);
            devVert2 = devVert0 + (i2_start * devVertsStride);

            camNorm3 = camNorm0 + (i3_start * camNormsStride);
            camVert3 = camVert0 + (i3_start * camVertsStride);
            devVert3 = devVert0 + (i3_start * devVertsStride);

            for (i = 0; i < numPresentRGBAs; i++)
            {
                extraRGBA1[i] =
                    extraRGBA0[i] + (i1_start * extraRGBAStride);
                extraRGBA2[i] =
                    extraRGBA0[i] + (i2_start * extraRGBAStride);
                extraRGBA3[i] =
                    extraRGBA0[i] + (i3_start * extraRGBAStride);
            }

            for (i = 0; i < numPresentUVs; i++)
            {
                extraUV1[i] = extraUV0[i] + (i1_start * extraUVStride);
                extraUV2[i] = extraUV0[i] + (i2_start * extraUVStride);
                extraUV3[i] = extraUV0[i] + (i3_start * extraUVStride);
            }

            /* Triangle refinement loop. */
            numElems = (newVerts + 3) >> 2;
            while (--numElems >= 0)
            {
                /* Need to load the constant in i3, i2, i1, i0 so they
                 * will be in i0, i1, i2, i3 in the SSE.
                 */
                l1.m128 = _mm_set_ps(k1[i3], k1[i2], k1[i1], k1[i0]);
                l2.m128 = _mm_set_ps(k2[i3], k2[i2], k2[i1], k2[i0]);

                /* compute barycentric coords */

                t1.m128 = _mm_mul_ps((recip.m128), (l1.m128));
                t2.m128 = _mm_mul_ps((recip.m128), (l2.m128));
                t3.m128 = _mm_sub_ps((one.m128),
                                     _mm_add_ps((t1.m128), (t2.m128)));

                _rtbbtpSSEPatchEvaluate(&height, &ords, &t2, &t1, &t3);

                /* process the nrms, must be done before pos */

                _SSEBARYCENTRICINTERPOLATION(tmpNrm[0],
                                             t2, t1, t3,
                                             camNrm[0][0], camNrm[1][0],
                                             camNrm[2][0]);

                _SSEBARYCENTRICINTERPOLATION(tmpNrm[1],
                                             t2, t1, t3,
                                             camNrm[0][1], camNrm[1][1],
                                             camNrm[2][1]);

                _SSEBARYCENTRICINTERPOLATION(tmpNrm[2],
                                             t2, t1, t3,
                                             camNrm[0][2], camNrm[1][2],
                                             camNrm[2][2]);

                /* process the pos, must be after nrm  */

                _SSEBARYCENTRICINTERPOLATION(tmpPos[0],
                                             t2, t1, t3,
                                             camPos[0][0], camPos[1][0],
                                             camPos[2][0]);

                _SSEBARYCENTRICINTERPOLATION(tmpPos[1],
                                             t2, t1, t3,
                                             camPos[0][1], camPos[1][1],
                                             camPos[2][1]);
                _SSEBARYCENTRICINTERPOLATION(tmpPos[2], t2, t1, t3,
                                             camPos[0][2], camPos[1][2],
                                             camPos[2][2]);

                SSEV3dMultConstant(tmpNrm, tmpNrm, height);
                SSEV3dAdd(tmpPos, tmpPos, tmpNrm);

                /* process the rgba */

                _SSEBARYCENTRICINTERPOLATION(tmpCol[0],
                                             t2, t1, t3,
                                             devCol[0][0], devCol[1][0],
                                             devCol[2][0]);

                _SSEBARYCENTRICINTERPOLATION(tmpCol[1],
                                             t2, t1, t3,
                                             devCol[0][1], devCol[1][1],
                                             devCol[2][1]);

                _SSEBARYCENTRICINTERPOLATION(tmpCol[2],
                                             t2, t1, t3,
                                             devCol[0][2], devCol[1][2],
                                             devCol[2][2]);
                _SSEBARYCENTRICINTERPOLATION(tmpCol[3], t2, t1, t3,
                                             devCol[0][3], devCol[1][3],
                                             devCol[2][3]);

                /* process the uv */

                _SSEBARYCENTRICINTERPOLATION(tmpUV[0],
                                             t2, t1, t3,
                                             camUV[0][0], camUV[1][0],
                                             camUV[2][0]);

                _SSEBARYCENTRICINTERPOLATION(tmpUV[1],
                                             t2, t1, t3,
                                             camUV[0][1], camUV[1][1],
                                             camUV[2][1]);

                _refineCamVertsSetUVSSE(camVert0, camVert1, camVert2,
                                        camVert3, tmpUV);

                _refineCamVertsSetPosSSE(camVert0, camVert1, camVert2,
                                         camVert3, tmpPos);

                _refineCamVertsSetNrmSSE(camNorm0, camNorm1, camNorm2,
                                         camNorm3, tmpNrm);

                /* Project the verts. */

#ifdef RW_WNI

                if (rpRefineGlobals.sseFlag & RPREFINE_WNI)
                {
                    _refineDevVertsSetColWNI(devVert0, devVert1,
                                             devVert2, devVert3,
                                             tmpCol);

                    _refineProjectVertsWNI(refineData, camData,
                                           camVert0, camVert1, camVert2,
                                           camVert3, devVert0, devVert1,
                                           devVert2, devVert3, tmpPos,
                                           tmpUV);
                }
                else
                {
                    _refineDevVertsSetColSSE(devVert0, devVert1,
                                             devVert2, devVert3,
                                             tmpCol);

                    _refineProjectVertsSSE(refineData, camData,
                                           camVert0, camVert1, camVert2,
                                           camVert3, devVert0, devVert1,
                                           devVert2, devVert3, tmpPos,
                                           tmpUV);
                }

#else /* RW_WNI */

                _refineDevVertsSetColSSE(devVert0, devVert1, devVert2,
                                         devVert3, tmpCol);

                _refineProjectVertsSSE(refineData, camData,
                                       camVert0, camVert1, camVert2,
                                       camVert3, devVert0, devVert1,
                                       devVert2, devVert3, tmpPos,
                                       tmpUV);

#endif /* RW_WNI */

                /* process extra Col. */
                /* the extraRGBAs is in the form of RGBARGBARGBA */
                j = 0;
                for (i = 0; i < numPresentRGBAs; i++)
                {
                    j = i * 12;

                    _SSEBARYCENTRICINTERPOLATION(tmpCol[0],
                                                 t2, t1, t3,
                                                 extraRGBAs[j],
                                                 extraRGBAs[j + 4],
                                                 extraRGBAs[j + 8]);
                    j++;

                    _SSEBARYCENTRICINTERPOLATION(tmpCol[1],
                                                 t2, t1, t3,
                                                 extraRGBAs[j],
                                                 extraRGBAs[j + 4],
                                                 extraRGBAs[j + 8]);
                    j++;

                    _SSEBARYCENTRICINTERPOLATION(tmpCol[2],
                                                 t2, t1, t3,
                                                 extraRGBAs[j],
                                                 extraRGBAs[j + 4],
                                                 extraRGBAs[j + 8]);
                    j++;

                    _SSEBARYCENTRICINTERPOLATION(tmpCol[3],
                                                 t2, t1, t3,
                                                 extraRGBAs[j],
                                                 extraRGBAs[j + 4],
                                                 extraRGBAs[j + 8]);
                    j++;

                    _refineExtraSetColSSE(extraRGBA0[i], extraRGBA1[i],
                                          extraRGBA2[i], extraRGBA3[i],
                                          tmpCol);

                    extraRGBA0[i] = extraRGBA3[i] + extraRGBAStride;
                    extraRGBA1[i] = extraRGBA0[i] + extraRGBAStride;
                    extraRGBA2[i] = extraRGBA1[i] + extraRGBAStride;
                    extraRGBA3[i] = extraRGBA2[i] + extraRGBAStride;
                }

                /* process extra UV. */
                /* the extraUVs is in the form of UVUVUV */
                j = 0;
                for (i = 0; i < numPresentUVs; i++)
                {
                    j = i * 6;
                    _SSEBARYCENTRICINTERPOLATION(tmpUV[0],
                                                 t2, t1, t3,
                                                 extraUVs[j],
                                                 extraUVs[j + 2],
                                                 extraUVs[j + 4]);
                    j++;

                    _SSEBARYCENTRICINTERPOLATION(tmpUV[1],
                                                 t2, t1, t3,
                                                 extraUVs[j],
                                                 extraUVs[j + 2],
                                                 extraUVs[j + 4]);
                    j++;

                    _refineExtraSetUVSSE(extraUV0[i], extraUV1[i],
                                         extraUV2[i], extraUV3[i],
                                         tmpUV);

                    extraUV0[i] = extraUV3[i] + extraUVStride;
                    extraUV1[i] = extraUV0[i] + extraUVStride;
                    extraUV2[i] = extraUV1[i] + extraUVStride;
                    extraUV3[i] = extraUV2[i] + extraUVStride;
                }

                /* Increment the pointers. */
                camVert0 = camVert3 + camVertsStride;
                camVert1 = camVert0 + camVertsStride;
                camVert2 = camVert1 + camVertsStride;
                camVert3 = camVert2 + camVertsStride;

                camNorm0 = camNorm3 + camNormsStride;
                camNorm1 = camNorm0 + camNormsStride;
                camNorm2 = camNorm1 + camNormsStride;
                camNorm3 = camNorm2 + camNormsStride;

                devVert0 = devVert3 + devVertsStride;
                devVert1 = devVert0 + devVertsStride;
                devVert2 = devVert1 + devVertsStride;
                devVert3 = devVert2 + devVertsStride;

                i0 = i3 + 1;
                i1 = i0 + 1;
                i2 = i1 + 1;
                i3 = i2 + 1;
            }

            idx = idx + idxsStride;
        }
    }

    if (k1 != NULL)
        RxHeapFree(heap, k1);

    if (k2 != NULL)
        RxHeapFree(heap, k2);

    if (extraUVs != NULL)
        RxHeapFree(heap, extraUVs);

    if (extraUV0 != NULL)
        RxHeapFree(heap, extraUV0);

    if (extraUV1 != NULL)
        RxHeapFree(heap, extraUV1);

    if (extraUV2 != NULL)
        RxHeapFree(heap, extraUV2);

    if (extraUV3 != NULL)
        RxHeapFree(heap, extraUV3);

    if (extraRGBAs != NULL)
        RxHeapFree(heap, extraRGBAs);

    if (extraRGBA0 != NULL)
        RxHeapFree(heap, extraRGBA0);

    if (extraRGBA1 != NULL)
        RxHeapFree(heap, extraRGBA1);

    if (extraRGBA2 != NULL)
        RxHeapFree(heap, extraRGBA2);

    if (extraRGBA3 != NULL)
        RxHeapFree(heap, extraRGBA3);

    RWRETURN(result);
}

/*
 * _rt_rtrefineSetupCameraSSE
 *
 * Fills in the camera- and clip-related members of refineData from camData.
 *
 * Steps, in order:
 *   1. _rtrefineSetupCamera() is run on camData.
 *   2. _refineSSECAMERALOAD is given the camWidth/camHeight,
 *      camOffsetX/camOffsetY and zScale/zShift members of refineData
 *      together with *camData.
 *   3. A clip-load macro is given clipNear/clipFar and one set of
 *      x/y/z lo/hi clip members.  When built with RW_WNI and the
 *      RPREFINE_WNI bit is set in rpRefineGlobals.sseFlag, the "wni"
 *      set is used via _refineWNICLIPLOAD; in every other case the
 *      "sse" set is used via _refineSSECLIPLOAD.
 *
 * NOTE(review): the macro bodies are not visible here; presumably they
 * write the listed refineData members from *camData - confirm against
 * the macro definitions.
 *
 * On entry : refineData - SSE refinement data whose members are passed to
 *                         the load macros.
 *            camData    - camera data handed to _rtrefineSetupCamera and
 *                         the load macros.
 * On exit  : always TRUE.
 */
RwBool
_rt_rtrefineSetupCameraSSE(refineSSERefineData * refineData,
                           refineCameraData * camData)
{
    RWFUNCTION(RWSTRING("_rt_rtrefineSetupCameraSSE"));

    /* The shared camera setup must run before anything is loaded. */
    _rtrefineSetupCamera(camData);

    /* Camera dimensions, offsets and z mapping. */
    _refineSSECAMERALOAD(refineData->camWidth, refineData->camHeight,
                         refineData->camOffsetX, refineData->camOffsetY,
                         refineData->zScale, refineData->zShift,
                         *camData);

    /*
     * Clip values: the WNI variant is only compiled in with RW_WNI and
     * is only chosen at run time when the RPREFINE_WNI flag is set;
     * otherwise control falls into the plain SSE block below.
     */
#ifdef RW_WNI
    if (rpRefineGlobals.sseFlag & RPREFINE_WNI)
    {
        _refineWNICLIPLOAD(refineData->clipNear, refineData->clipFar,
                           refineData->wni.xLoClip, refineData->wni.xHiClip,
                           refineData->wni.yLoClip, refineData->wni.yHiClip,
                           refineData->wni.zLoClip, refineData->wni.zHiClip,
                           *camData);
    }
    else
#endif /* RW_WNI */
    {
        _refineSSECLIPLOAD(refineData->clipNear, refineData->clipFar,
                           refineData->sse.xLoClip, refineData->sse.xHiClip,
                           refineData->sse.yLoClip, refineData->sse.yHiClip,
                           refineData->sse.zLoClip, refineData->sse.zHiClip,
                           *camData);
    }

    RWRETURN(TRUE);
}

#endif /* ( ((defined(__ICL)) && (400 <= __ICL)) && defined(RWSIMD) ) */
