/*
 * INTEL OVERLOAD
 *
 * Intel overloaded functions.
 *
 * Copyright (c) 1998 Criterion Software Ltd.
 *
 */

/****************************************************************************
 Includes
 */

#include <stdio.h>
#include <stdlib.h>

#include "rpplugin.h"
#include "rpdbgerr.h"
#include "rtintel.h"
#include "overload.h"

/*
 * Revision-control identification string for this file (expanded RCS
 * Id keyword).
 * NOTE(review): __RWUNUSED__ presumably suppresses the "defined but not
 * used" warning for this object -- confirm against its definition.
 */
static const char __RWUNUSED__   rcsid[] =
    "@@(#)$Id: overload.c,v 1.13 2001/08/22 21:41:01 Markj Exp $";

/*
 * OUTPUTDEBUGSTRING(_msg)
 *
 * When building with Microsoft Visual C++ (_MSC_VER >= 1000) and _WIN32
 * defined, the platform header is pulled in (xtl.h if _XBOX is defined,
 * windows.h and crtdbg.h otherwise) and the macro forwards _msg to
 * OutputDebugString(). If the macro is still undefined after that, it
 * is defined to expand to nothing.
 */
#if defined(_WIN32) && defined(_MSC_VER) && (_MSC_VER >= 1000)
#  if defined(_XBOX)
#    include <xtl.h>
#  else /* defined(_XBOX) */
#    include <windows.h>
#    include <crtdbg.h>
#  endif /* defined(_XBOX) */
#  define OUTPUTDEBUGSTRING(_msg) OutputDebugString(_msg)
#endif /* defined(_WIN32) && defined(_MSC_VER) && (_MSC_VER >= 1000) */

#ifndef OUTPUTDEBUGSTRING
#  define OUTPUTDEBUGSTRING(_msg) /* Null op */
#endif /* OUTPUTDEBUGSTRING */

/*
 * Fallback values for the flush-to-zero control-register constants.
 * Each one is defined here only if it has not been defined already.
 * _MM_FLUSH_ZERO_ON is OR-ed into the value handed to _mm_setcsr()
 * further down in this file.
 */
#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK   0x8000
#endif /* _MM_FLUSH_ZERO_MASK */

#ifndef _MM_FLUSH_ZERO_ON
#define _MM_FLUSH_ZERO_ON     0x8000
#endif /* _MM_FLUSH_ZERO_ON */

#ifndef _MM_FLUSH_ZERO_OFF
#define _MM_FLUSH_ZERO_OFF    0x0000
#endif /* _MM_FLUSH_ZERO_OFF */

#if (defined(__ICL))
#if (400<=__ICL)

/*
 * RW_PREFETCH_SSE(_p, _i)
 *
 * Expands to nothing when RW_SUPPRESS_PREFETCH is defined. Otherwise,
 * unless a definition already exists, it forwards both arguments to
 * _mm_prefetch().
 */
#if defined(RW_SUPPRESS_PREFETCH)
#define RW_PREFETCH_SSE(_p, _i) /* No op */
#elif !defined(RW_PREFETCH_SSE)
#define RW_PREFETCH_SSE(_p, _i) _mm_prefetch((_p), (_i))
#endif /* RW_SUPPRESS_PREFETCH / RW_PREFETCH_SSE */

/*
 * _rwSSEVECTORMULTPOINT(_src, _trg, _mx, _my, _mz, _mw, _v)
 *
 * Transforms the point _src and stores the result in _trg:
 *
 *     _v   = (_src.x * _mx + _src.y * _my) + (_src.z * _mz + _mw)
 *     _trg = v3d part of _v
 *
 * _mx, _my, _mz, _mw and the scratch variable _v are accessed through
 * their .m128 / .v4d members (the callers in this file pass
 * RpSSEOverlayM128 objects). _src is expanded three times, so it must
 * not have side effects.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed
 * by a semicolon forms exactly one statement, which keeps the macro safe
 * inside unbraced if/else bodies.
 */
#define _rwSSEVECTORMULTPOINT(_src, _trg, _mx, _my, _mz, _mw, _v)       \
    do                                                                  \
    {                                                                   \
        (_v).m128 =                                                     \
            _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_set_ps1((_src).x),     \
                                             (_mx).m128),               \
                                  _mm_mul_ps(_mm_set_ps1((_src).y),     \
                                             (_my).m128)),              \
                       _mm_add_ps(_mm_mul_ps(_mm_set_ps1((_src).z),     \
                                             (_mz).m128),               \
                                  (_mw).m128));                         \
                                                                        \
        (_trg) = (_v).v4d.v3d;                                          \
    }                                                                   \
    while (0)

/*
 * _rwSSEVECTORMULTVECTOR(_src, _trg, _mx, _my, _mz, _v)
 *
 * Transforms the vector _src (no translation term) and stores the
 * result in _trg:
 *
 *     _v   = (_src.x * _mx + _src.y * _my) + _src.z * _mz
 *     _trg = v3d part of _v
 *
 * _mx, _my, _mz and the scratch variable _v are accessed through their
 * .m128 / .v4d members (the callers in this file pass RpSSEOverlayM128
 * objects). _src is expanded three times, so it must not have side
 * effects.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed
 * by a semicolon forms exactly one statement, which keeps the macro safe
 * inside unbraced if/else bodies.
 */
#define _rwSSEVECTORMULTVECTOR(_src, _trg, _mx, _my, _mz, _v)           \
    do                                                                  \
    {                                                                   \
        (_v).m128 =                                                     \
            _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_set_ps1((_src).x),     \
                                             (_mx).m128),               \
                                  _mm_mul_ps(_mm_set_ps1((_src).y),     \
                                             (_my).m128)),              \
                       _mm_mul_ps(_mm_set_ps1((_src).z),                \
                                  (_mz).m128));                         \
                                                                        \
        (_trg) = (_v).v4d.v3d;                                          \
    }                                                                   \
    while (0)


/****************************************************************************
 _rwMatrixMultiplySSE

 Transforms each row of matA (right, up, at, pos) by matB and writes the
 resulting rows to dstMat. The rows of matB are copied into locals
 before anything is written to dstMat.

 On entry   : dstMat - matrix receiving the result
            : matA   - matrix whose rows are transformed
            : matB   - matrix the rows are transformed by
 On exit    : right, up, at and pos of dstMat contain the result
 */
static
RWASMAPI(void)
_rwMatrixMultiplySSE(RwMatrix * dstMat,
                     const RwMatrix * matA, const RwMatrix * matB)
{
    RpSSEOverlayM128    rowRight;
    RpSSEOverlayM128    rowUp;
    RpSSEOverlayM128    rowAt;
    RpSSEOverlayM128    rowPos;
    RpSSEOverlayM128    scratch;

    RWFUNCTION(RWSTRING("_rwMatrixMultiplySSE"));
    RWASSERT(dstMat);
    RWASSERT(matA);
    RWASSERT(matB);

    /* Widen the four rows of matB to four lanes each; the fourth lane
     * is 0 for the three axis rows and 1 for the position row.
     */
    rowRight.v4d.w = 0;
    rowRight.v4d.v3d = matB->right;

    rowUp.v4d.w = 0;
    rowUp.v4d.v3d = matB->up;

    rowAt.v4d.w = 0;
    rowAt.v4d.v3d = matB->at;

    rowPos.v4d.w = 1;
    rowPos.v4d.v3d = matB->pos;

    /* The three axis rows are transformed as vectors (no translation) */
    _rwSSEVECTORMULTVECTOR(matA->right, dstMat->right,
                           rowRight, rowUp, rowAt, scratch);
    _rwSSEVECTORMULTVECTOR(matA->up, dstMat->up,
                           rowRight, rowUp, rowAt, scratch);
    _rwSSEVECTORMULTVECTOR(matA->at, dstMat->at,
                           rowRight, rowUp, rowAt, scratch);

    /* The position row is transformed as a point: the position row of
     * matB is added on top (implicit 1 in the bottom right of matA).
     */
    _rwSSEVECTORMULTPOINT(matA->pos, dstMat->pos,
                          rowRight, rowUp, rowAt, rowPos, scratch);

    RWRETURNVOID();
}

/*
 * See
 * http://www.lysator.liu.se/c/duffs-device.html
 * for details of Duff's Device for dynamic loop unrolling optimization.
 * At time of writing, Tom Duff is at Lucas Films / Pixar.
 */

/****************************************************************************
 VectorMultPointSSE

 Transforms an array of points by a matrix, including the translation
 held in matrix->pos. Points are processed from index numPoints - 1 down
 to index 0, in groups of eight (Duff's Device); each point is read in
 full before its result is stored.

 On entry   : pointsOut - array receiving numPoints transformed points
            : pointsIn  - array of numPoints source points
            : numPoints - number of points; nothing is written if <= 0
            : matrix    - transformation matrix
 On exit    : returns pointsOut
 */

static RwV3d       *
VectorMultPointSSE(RwV3d * pointsOut,
                   const RwV3d * pointsIn,
                   RwInt32 numPoints, const RwMatrix * matrix)
{

    RpSSEOverlayM128    v;
    RpSSEOverlayM128    m_x;
    RpSSEOverlayM128    m_y;
    RpSSEOverlayM128    m_z;
    RpSSEOverlayM128    m_w;

    RWFUNCTION(RWSTRING("VectorMultPointSSE"));

    RWASSERT(pointsOut);
    RWASSERT(pointsIn);
    RWASSERT(matrix);

    if (numPoints > 0)
    {
        /* Index of the first element of the highest group of eight
         * (that group may be only partially filled).
         */
        RwUInt32            offset = (numPoints - 1) & ((RwUInt32) ~ 7);
        /* Number of groups of eight still to be processed */
        RwUInt32            blocks = (offset >> 3) + 1;
        const RwV3d        *sourceptr = &pointsIn[offset];
        RwV3d              *targetptr = &pointsOut[offset];

        m_x.v4d.w = 0;
        m_x.v4d.v3d = matrix->right;

        m_y.v4d.w = 0;
        m_y.v4d.v3d = matrix->up;

        m_z.v4d.w = 0;
        m_z.v4d.v3d = matrix->at;

        m_w.v4d.w = 1;
        m_w.v4d.v3d = matrix->pos;

        /* Jump into the unrolled loop so that the first pass handles
         * the (numPoints & 7) leftover points of the highest group.
         */
        switch (numPoints & 7)
        {
            case 0:
                for (;;)
                {
                    _rwSSEVECTORMULTPOINT(sourceptr[7], targetptr[7],
                                          m_x, m_y, m_z, m_w, v);
            case 7:
                    _rwSSEVECTORMULTPOINT(sourceptr[6], targetptr[6],
                                          m_x, m_y, m_z, m_w, v);
            case 6:
                    _rwSSEVECTORMULTPOINT(sourceptr[5], targetptr[5],
                                          m_x, m_y, m_z, m_w, v);
            case 5:
                    _rwSSEVECTORMULTPOINT(sourceptr[4], targetptr[4],
                                          m_x, m_y, m_z, m_w, v);
            case 4:
                    _rwSSEVECTORMULTPOINT(sourceptr[3], targetptr[3],
                                          m_x, m_y, m_z, m_w, v);
            case 3:
                    _rwSSEVECTORMULTPOINT(sourceptr[2], targetptr[2],
                                          m_x, m_y, m_z, m_w, v);
            case 2:
                    _rwSSEVECTORMULTPOINT(sourceptr[1], targetptr[1],
                                          m_x, m_y, m_z, m_w, v);
            case 1:
                    _rwSSEVECTORMULTPOINT(sourceptr[0], targetptr[0],
                                          m_x, m_y, m_z, m_w, v);

                    /* Stop before stepping the pointers, so that they
                     * are never moved below the start of the arrays.
                     */
                    if (--blocks == 0)
                    {
                        break;
                    }
                    sourceptr -= 8;
                    targetptr -= 8;
                }
        }
    }

    RWRETURN(pointsOut);
}

/****************************************************************************
 VectorMultVectorSSE

 Transforms an array of vectors by the right, up and at rows of a
 matrix; matrix->pos is not used. Vectors are processed from index
 numPoints - 1 down to index 0, in groups of eight (Duff's Device); each
 vector is read in full before its result is stored.

 On entry   : pointsOut - array receiving numPoints transformed vectors
            : pointsIn  - array of numPoints source vectors
            : numPoints - number of vectors; nothing is written if <= 0
            : matrix    - transformation matrix
 On exit    : returns pointsOut
 */

static RwV3d       *
VectorMultVectorSSE(RwV3d * pointsOut,
                    const RwV3d * pointsIn,
                    RwInt32 numPoints, const RwMatrix * matrix)
{

    RpSSEOverlayM128    v;
    RpSSEOverlayM128    m_x;
    RpSSEOverlayM128    m_y;
    RpSSEOverlayM128    m_z;

    RWFUNCTION(RWSTRING("VectorMultVectorSSE"));

    RWASSERT(pointsOut);
    RWASSERT(pointsIn);
    RWASSERT(matrix);

    if (numPoints > 0)
    {
        /* Index of the first element of the highest group of eight
         * (that group may be only partially filled).
         */
        RwUInt32            offset = (numPoints - 1) & ((RwUInt32) ~ 7);
        /* Number of groups of eight still to be processed */
        RwUInt32            blocks = (offset >> 3) + 1;
        const RwV3d        *sourceptr = &pointsIn[offset];
        RwV3d              *targetptr = &pointsOut[offset];

        m_x.v4d.w = 0;
        m_x.v4d.v3d = matrix->right;

        m_y.v4d.w = 0;
        m_y.v4d.v3d = matrix->up;

        m_z.v4d.w = 0;
        m_z.v4d.v3d = matrix->at;

        /* Jump into the unrolled loop so that the first pass handles
         * the (numPoints & 7) leftover vectors of the highest group.
         */
        switch (numPoints & 7)
        {
            case 0:
                for (;;)
                {
                    _rwSSEVECTORMULTVECTOR(sourceptr[7], targetptr[7],
                                           m_x, m_y, m_z, v);
            case 7:
                    _rwSSEVECTORMULTVECTOR(sourceptr[6], targetptr[6],
                                           m_x, m_y, m_z, v);
            case 6:
                    _rwSSEVECTORMULTVECTOR(sourceptr[5], targetptr[5],
                                           m_x, m_y, m_z, v);
            case 5:
                    _rwSSEVECTORMULTVECTOR(sourceptr[4], targetptr[4],
                                           m_x, m_y, m_z, v);
            case 4:
                    _rwSSEVECTORMULTVECTOR(sourceptr[3], targetptr[3],
                                           m_x, m_y, m_z, v);
            case 3:
                    _rwSSEVECTORMULTVECTOR(sourceptr[2], targetptr[2],
                                           m_x, m_y, m_z, v);
            case 2:
                    _rwSSEVECTORMULTVECTOR(sourceptr[1], targetptr[1],
                                           m_x, m_y, m_z, v);
            case 1:
                    _rwSSEVECTORMULTVECTOR(sourceptr[0], targetptr[0],
                                           m_x, m_y, m_z, v);

                    /* Stop before stepping the pointers, so that they
                     * are never moved below the start of the arrays.
                     */
                    if (--blocks == 0)
                    {
                        break;
                    }
                    sourceptr -= 8;
                    targetptr -= 8;
                }
        }
    }

    RWRETURN(pointsOut);
}

/* No SSE pipe-transform routine is defined in this file: the name
 * expands to NULL where it is used as the last initializer of the
 * overload table in _rtIntelOverloadGetHandle().
 */
#define PipeTransformSSE NULL

/****************************************************************************
 _rtIntelOverloadGetHandle

 Returns the table of SSE overload functions if RtIntelHaveSSE() reports
 SSE support, after OR-ing _MM_FLUSH_ZERO_ON into the value read back
 from _mm_getcsr(). Otherwise reports E_RW_NOTSSEENABLEDCPU through
 RWERROR.

 On entry   : nothing
 On exit    : pointer to the overload table, or NULL if SSE is unavailable
 */
RtIntelOverload    *
_rtIntelOverloadGetHandle(void)
{
    static RtIntelOverload sseOverloads = {
        _rwMatrixMultiplySSE,
        VectorMultPointSSE,
        VectorMultVectorSSE,
        PipeTransformSSE
    };
    RtIntelOverload    *handle;
    RwBool              haveSSE;

    RWFUNCTION(RWSTRING("_rtIntelOverloadGetHandle"));

    haveSSE = RtIntelHaveSSE();

    if (!haveSSE)
    {
        handle = NULL;
        RWERROR((E_RW_NOTSSEENABLEDCPU));
    }
    else
    {
        /* Same effect as _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON) */
        _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON);
        OUTPUTDEBUGSTRING(__FILE__ ":" RW_STRINGIFY_EXPANDED(__LINE__)
                          ":"
                          "_mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON);\n");
        handle = &sseOverloads;
    }

    RWMESSAGE(("SSEEnabledCPU %08x result %p", haveSSE, handle));
    RWRETURN(handle);
}

#endif /* (400<=__ICL) */
#endif /* (defined(__ICL)) */
