/* 
 * SSE emulation
 *
 * Copyright (c) 1998 Criterion Software Ltd.
 */

/*
 *      Generic software emulation of icl/SSE intrinsics.
 */

#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "rpplugin.h"
#include "rpdbgerr.h"
#include "rtintel.h"
#include "sse.h"

static const char  __RWUNUSED__  rcsid[] =
    "@@(#)$Id: sse.c,v 1.10 2001/06/12 08:55:19 johns Exp $";

/*
 * Comparison helpers used by the MIN/MAX and ordered/unordered emulations.
 *
 * These are plain macros, so an argument may be evaluated more than once.
 * _rw_max and _rw_min yield _y whenever the comparison is false, which
 * includes the case where either operand is a NaN.
 * _rw_isnan relies on a NaN being the only value that compares unequal
 * to itself.
 */
#define _rw_max(_x, _y) (((_x) > (_y)) ? (_x) : (_y))
#define _rw_min(_x, _y) (((_x) < (_y)) ? (_x) : (_y))
#define _rw_isnan(_x) ((_x) != (_x))

#if ( defined(_WIN32) && defined(_MSC_VER) && (_MSC_VER>=1000) )
#if (defined(_XBOX))
#include <xtl.h>
#else /* (defined(_XBOX)) */
#include <windows.h>
#include <crtdbg.h>
#endif /* (defined(_XBOX)) */

/* Send a diagnostic string to the debugger output. */
#define   OUTPUTDEBUGSTRING(_msg)   OutputDebugString(_msg)

/*
 * Emit a "<file>:<line>: <name> Unimplemented" diagnostic for an
 * intrinsic that has no emulation.
 *
 * _snprintf does not write a terminating NUL when the formatted text
 * fills the buffer, so one byte is held back and the terminator is
 * written explicitly before the buffer is handed on.
 *
 * The parameter is deliberately not called __func__, which is a
 * predefined identifier in C99 and later.
 */
#define INTEL_SSE_UNIMPLEMENTED(_funcname)              \
do                                                      \
{                                                       \
   char buffer[256];                                    \
                                                        \
   _snprintf( buffer, sizeof(buffer) - 1,               \
              "%s:%d: %s Unimplemented\n",              \
              __FILE__, __LINE__, _funcname );          \
   buffer[sizeof(buffer) - 1] = '\0';                   \
                                                        \
   OUTPUTDEBUGSTRING(buffer);                           \
                                                        \
} while (0)

#endif /* ( defined(_WIN32) && defined(_MSC_VER) && (_MSC_VER>=1000) ) */

/* Fallbacks: on other platforms both diagnostics expand to nothing. */
#if (!defined(OUTPUTDEBUGSTRING))
#define OUTPUTDEBUGSTRING(_msg) /* Null op */
#endif /* (!defined(OUTPUTDEBUGSTRING)) */

#if (!defined(INTEL_SSE_UNIMPLEMENTED))
#define INTEL_SSE_UNIMPLEMENTED(_funcname) /* Null op */
#endif /* (!defined(INTEL_SSE_UNIMPLEMENTED)) */

/* recip: 1/x, with numerator and operand both cast to float first. */
#define recip(x)  (((float)1)/((float)(x)))
/* sign: 1 when x is less than zero, otherwise 0. */
#define sign(x)   (((x)<0)?0x00000001:0x00000000)
/* hiword: x converted to long, shifted right 16 bits, truncated to short. */
#define hiword(x) ((short)( ((long)(x)) >> 16 ))

/*
 * Arithmetic Operations
 */

/**
 * \ingroup rtintel
 * \ref Rt_mm_add_ss adds the lower SP FP (single-precision,
 * floating-point) values of a and b; the upper 3 SP FP values are passed
 * through from a.
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_add_ss(Rt_m128 a, Rt_m128 b) /* ADDSS */
{
    /*
     * Scalar add: lane 0 of the result is a[0] + b[0];
     * lanes 1..3 are copied unchanged from a.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_add_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    out._f[0] = lhs._f[0] + rhs._f[0];

    for (lane = 1; lane < 4; lane++)
    {
        out._f[lane] = lhs._f[lane];
    }

    RWRETURN(out.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_add_ps adds the four SP FP values of a and b.
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_add_ps(Rt_m128 a, Rt_m128 b) /* ADDPS */
{
    /*
     * Packed add: result[i] = a[i] + b[i] for each of the four lanes.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_add_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        out._f[lane] = lhs._f[lane] + rhs._f[lane];
    }

    RWRETURN(out.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sub_ss subtracts the lower SP FP values of a and b.
 * The upper 3 SP FP values are passed through from a.
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_sub_ss(Rt_m128 a, Rt_m128 b) /* SUBSS */
{
    /*
     * Scalar subtract: lane 0 of the result is a[0] - b[0];
     * lanes 1..3 are copied unchanged from a.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sub_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    out._f[0] = lhs._f[0] - rhs._f[0];

    for (lane = 1; lane < 4; lane++)
    {
        out._f[lane] = lhs._f[lane];
    }

    RWRETURN(out.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sub_ps subtracts the four SP FP values of a and b.
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_sub_ps(Rt_m128 a, Rt_m128 b) /* SUBPS */
{
    /*
     * Packed subtract: result[i] = a[i] - b[i] for each of the four lanes.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sub_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        out._f[lane] = lhs._f[lane] - rhs._f[lane];
    }

    RWRETURN(out.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_mul_ss multiplies the lower SP FP values of a and b
 * the upper 3 SP FP values are passed through from a.
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_mul_ss(Rt_m128 a, Rt_m128 b) /* MULSS */
{
    /*
     * Scalar multiply: lane 0 of the result is a[0] * b[0];
     * lanes 1..3 are copied unchanged from a.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_mul_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    out._f[0] = lhs._f[0] * rhs._f[0];

    for (lane = 1; lane < 4; lane++)
    {
        out._f[lane] = lhs._f[lane];
    }

    RWRETURN(out.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_mul_ps multiplies the four SP FP values of a and b.
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_mul_ps(Rt_m128 a, Rt_m128 b) /* MULPS */
{
    /*
     * Packed multiply: result[i] = a[i] * b[i] for each of the four lanes.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_mul_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        out._f[lane] = lhs._f[lane] * rhs._f[lane];
    }

    RWRETURN(out.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_div_ss divides the lower SP FP values of a and b
 * the upper 3 SP FP values are passed through from a.
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_div_ss(Rt_m128 a, Rt_m128 b) /* DIVSS */
{
    /*
     * Scalar divide: lane 0 of the result is a[0] / b[0];
     * lanes 1..3 are copied unchanged from a.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_div_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    out._f[0] = lhs._f[0] / rhs._f[0];

    for (lane = 1; lane < 4; lane++)
    {
        out._f[lane] = lhs._f[lane];
    }

    RWRETURN(out.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_div_ps divides the four SP FP values of a and b
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_div_ps(Rt_m128 a, Rt_m128 b) /* DIVPS */
{
    /*
     * Packed divide: result[i] = a[i] / b[i] for each of the four lanes.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_div_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        out._f[lane] = lhs._f[lane] / rhs._f[lane];
    }

    RWRETURN(out.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sqrt_ss computes the square root of the lower SP FP
 * value of a; the upper 3 SP FP values are passed through from a.
 * \param a  Rt_m128 a
 */

Rt_m128
Rt_mm_sqrt_ss(Rt_m128 a)        /* SQRTSS */
{
    /*
     * Scalar square root: lane 0 of the result is rwSqrt(a[0]) cast to
     * float; lanes 1..3 are copied unchanged from a.
     */
    volatile RwOverlayM128 src;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sqrt_ss"));

    src.m128 = a;

    out._f[0] = (float) rwSqrt(src._f[0]);

    for (lane = 1; lane < 4; lane++)
    {
        out._f[lane] = src._f[lane];
    }

    RWRETURN(out.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sqrt_ps computes the square roots of the four SP FP
 * values of a
 * \param a  Rt_m128 a
 */

Rt_m128
Rt_mm_sqrt_ps(Rt_m128 a)        /* SQRTPS */
{
    /*
     * Packed square root: result[i] = rwSqrt(a[i]) cast to float,
     * for each of the four lanes.
     */
    volatile RwOverlayM128 src;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_sqrt_ps"));

    src.m128 = a;

    for (lane = 0; lane < 4; lane++)
    {
        out._f[lane] = (float) rwSqrt(src._f[lane]);
    }

    RWRETURN(out.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_rcp_ss computes the approximation of the reciprocal
 * of the lower SP FP value of a; the upper 3 SP FP values are passed
 * through from a.
 * \param a  Rt_m128 a
 */

Rt_m128
Rt_mm_rcp_ss(Rt_m128 a)         /* RCPSS */
{
    /*
     * Scalar reciprocal: lane 0 of the result is recip(a[0]);
     * lanes 1..3 are copied unchanged from a.
     */
    volatile RwOverlayM128 src;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_rcp_ss"));

    src.m128 = a;

    out._f[0] = (float) recip(src._f[0]);

    for (lane = 1; lane < 4; lane++)
    {
        out._f[lane] = src._f[lane];
    }

    RWRETURN(out.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_rcp_ps computes the approximations of the
 * (float)reciprocals of the four SP FP values of a
 * \param a  Rt_m128 a
 */

Rt_m128
Rt_mm_rcp_ps(Rt_m128 a)         /* RCPPS */
{
    /*
     * Packed reciprocal: result[i] = recip(a[i]) for each of the
     * four lanes.
     */
    volatile RwOverlayM128 src;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_rcp_ps"));

    src.m128 = a;

    for (lane = 0; lane < 4; lane++)
    {
        out._f[lane] = (float) recip(src._f[lane]);
    }

    RWRETURN(out.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_rsqrt_ss computes the approximation of the
 * (float)reciprocal of the square root of the lower SP FP value of a the
 * upper 3 SP FP values are passed through
 * \param a  Rt_m128 a
 */

Rt_m128
Rt_mm_rsqrt_ss(Rt_m128 a)       /* RSQRTSS */
{
    /*
     * Scalar reciprocal square root: lane 0 of the result is
     * recip(rwSqrt(a[0])); lanes 1..3 are copied unchanged from a.
     */
    volatile RwOverlayM128 src;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_rsqrt_ss"));

    src.m128 = a;

    out._f[0] = (float) recip(rwSqrt(src._f[0]));

    for (lane = 1; lane < 4; lane++)
    {
        out._f[lane] = src._f[lane];
    }

    RWRETURN(out.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_rsqrt_ps computes the approximations of the
 * (float)reciprocals of the square roots of the four SP FP values of a
 * \param a  Rt_m128 a
 */

Rt_m128
Rt_mm_rsqrt_ps(Rt_m128 a)       /* RSQRTPS */
{
    /*
     * Packed reciprocal square root:
     * result[i] = recip(rwSqrt(a[i])) for each of the four lanes.
     */
    volatile RwOverlayM128 src;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_rsqrt_ps"));

    src.m128 = a;

    for (lane = 0; lane < 4; lane++)
    {
        out._f[lane] = (float) recip(rwSqrt(src._f[lane]));
    }

    RWRETURN(out.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_min_ss computes the minimum of the lower SP FP
 * values of a and b the upper 3 SP FP values are passed through from a
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_min_ss(Rt_m128 a, Rt_m128 b) /* MINSS */
{
    /*
     * Scalar minimum: lane 0 of the result is _rw_min(a[0], b[0]);
     * lanes 1..3 are copied unchanged from a.
     * _rw_min yields b[0] whenever a[0] < b[0] is false.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_min_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    out._f[0] = _rw_min(lhs._f[0], rhs._f[0]);

    for (lane = 1; lane < 4; lane++)
    {
        out._f[lane] = lhs._f[lane];
    }

    RWRETURN(out.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_min_ps computes the minimums of the four SP FP
 * values of a and b
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */
Rt_m128
Rt_mm_min_ps(Rt_m128 a, Rt_m128 b) /* MINPS */
{
    /*
     * Packed minimum: result[i] = _rw_min(a[i], b[i]) for each of the
     * four lanes. _rw_min yields b[i] whenever a[i] < b[i] is false.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_min_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        out._f[lane] = _rw_min(lhs._f[lane], rhs._f[lane]);
    }

    RWRETURN(out.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_max_ss computes the maximum of the lower SP FP
 * values of a and b the upper 3 SP FP values are passed through from a
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_max_ss(Rt_m128 a, Rt_m128 b) /* MAXSS */
{
    /*
     * Scalar maximum: lane 0 of the result is _rw_max(a[0], b[0]);
     * lanes 1..3 are copied unchanged from a.
     * _rw_max yields b[0] whenever a[0] > b[0] is false.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_max_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    out._f[0] = _rw_max(lhs._f[0], rhs._f[0]);

    for (lane = 1; lane < 4; lane++)
    {
        out._f[lane] = lhs._f[lane];
    }

    RWRETURN(out.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_max_ps computes the maximums of the four SP FP
 * values of a and b
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_max_ps(Rt_m128 a, Rt_m128 b) /* MAXPS */
{
    /*
     * Packed maximum: result[i] = _rw_max(a[i], b[i]) for each of the
     * four lanes. _rw_max yields b[i] whenever a[i] > b[i] is false.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 out;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_max_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        out._f[lane] = _rw_max(lhs._f[lane], rhs._f[lane]);
    }

    RWRETURN(out.m128);
}

/*
 * Logical Operations
 */

/**
 * \ingroup rtintel
 * \ref Rt_mm_and_ps computes the bitwise And of the four SP FP
 * values of a and b
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_and_ps(Rt_m128 a, Rt_m128 b) /* ANDPS */
{
    /*
     * Bitwise AND, applied to the integer (ud) view of each of the
     * four lanes: result[i] = a[i] & b[i].
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 bits;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_and_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        bits.ud[lane] = lhs.ud[lane] & rhs.ud[lane];
    }

    RWRETURN(bits.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_andnot_ps computes the bitwise AND-NOT of the four
 * SP FP values of a and b
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_andnot_ps(Rt_m128 a, Rt_m128 b) /* ANDNPS */
{
    /*
     * Bitwise AND-NOT, applied to the integer (ud) view of each of the
     * four lanes: result[i] = (~a[i]) & b[i]. Note that it is a, not b,
     * that is complemented.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 bits;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_andnot_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        bits.ud[lane] = ~lhs.ud[lane] & rhs.ud[lane];
    }

    RWRETURN(bits.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_or_ps computes the bitwise OR of the four SP FP
 * values of a and b
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_or_ps(Rt_m128 a, Rt_m128 b) /* ORPS */
{
    /*
     * Bitwise OR, applied to the integer (ud) view of each of the
     * four lanes: result[i] = a[i] | b[i].
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 bits;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_or_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        bits.ud[lane] = lhs.ud[lane] | rhs.ud[lane];
    }

    RWRETURN(bits.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_xor_ps computes bitwise EXOR (exclusive-or) of the
 * four SP FP values of a and b
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_xor_ps(Rt_m128 a, Rt_m128 b) /* XORPS */
{
    /*
     * Bitwise exclusive-OR, applied to the integer (ud) view of each of
     * the four lanes: result[i] = a[i] ^ b[i].
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 bits;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_xor_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        bits.ud[lane] = lhs.ud[lane] ^ rhs.ud[lane];
    }

    RWRETURN(bits.m128);
}

/*
 * Comparisons
 */

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpeq_ss compare for equality
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpeq_ss(Rt_m128 a, Rt_m128 b) /* CMPEQSS */
{
    /*
     * Scalar equality test: lane 0 becomes an all-ones mask when
     * a[0] == b[0] and zero otherwise; lanes 1..3 are copied from a.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpeq_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] == rhs._f[0])
    {
        mask.ud[0] = 0xffffffffL;
    }
    else
    {
        mask.ud[0] = 0x0;
    }

    for (lane = 1; lane < 4; lane++)
    {
        mask._f[lane] = lhs._f[lane];
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpeq_ps compare for equality
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpeq_ps(Rt_m128 a, Rt_m128 b) /* CMPEQPS */
{
    /*
     * Packed equality test: each lane becomes an all-ones mask when
     * a[i] == b[i] and zero otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpeq_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        mask.ud[lane] =
            (lhs._f[lane] == rhs._f[lane]) ? 0xffffffffL : 0x0;
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmplt_ss compare for less-than
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmplt_ss(Rt_m128 a, Rt_m128 b) /* CMPLTSS */
{
    /*
     * Scalar less-than test: lane 0 becomes an all-ones mask when
     * a[0] < b[0] and zero otherwise; lanes 1..3 are copied from a.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmplt_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] < rhs._f[0])
    {
        mask.ud[0] = 0xffffffffL;
    }
    else
    {
        mask.ud[0] = 0x0;
    }

    for (lane = 1; lane < 4; lane++)
    {
        mask._f[lane] = lhs._f[lane];
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmplt_ps compare for less-than
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmplt_ps(Rt_m128 a, Rt_m128 b) /* CMPLTPS */
{
    /*
     * Packed less-than test: each lane becomes an all-ones mask when
     * a[i] < b[i] and zero otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmplt_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        mask.ud[lane] =
            (lhs._f[lane] < rhs._f[lane]) ? 0xffffffffL : 0x0;
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmple_ss compare for less-than-or-equal
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmple_ss(Rt_m128 a, Rt_m128 b) /* CMPLESS */
{
    /*
     * Scalar less-than-or-equal test: lane 0 becomes an all-ones mask
     * when a[0] <= b[0] and zero otherwise; lanes 1..3 are copied from a.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmple_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] <= rhs._f[0])
    {
        mask.ud[0] = 0xffffffffL;
    }
    else
    {
        mask.ud[0] = 0x0;
    }

    for (lane = 1; lane < 4; lane++)
    {
        mask._f[lane] = lhs._f[lane];
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmple_ps compare for less-than-or-equal
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmple_ps(Rt_m128 a, Rt_m128 b) /* CMPLEPS */
{
    /*
     * Packed less-than-or-equal test: each lane becomes an all-ones
     * mask when a[i] <= b[i] and zero otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmple_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        mask.ud[lane] =
            (lhs._f[lane] <= rhs._f[lane]) ? 0xffffffffL : 0x0;
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpgt_ss compare for greater-than
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpgt_ss(Rt_m128 a, Rt_m128 b) /*     CMPLTSS        r */
{
    /*
     * Scalar greater-than test: lane 0 becomes an all-ones mask when
     * a[0] > b[0] and zero otherwise; lanes 1..3 are copied from a.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpgt_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] > rhs._f[0])
    {
        mask.ud[0] = 0xffffffffL;
    }
    else
    {
        mask.ud[0] = 0x0;
    }

    for (lane = 1; lane < 4; lane++)
    {
        mask._f[lane] = lhs._f[lane];
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpgt_ps compare for greater-than
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpgt_ps(Rt_m128 a, Rt_m128 b) /*     CMPLTPS        r */
{
    /*
     * Packed greater-than test: each lane becomes an all-ones mask when
     * a[i] > b[i] and zero otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpgt_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        mask.ud[lane] =
            (lhs._f[lane] > rhs._f[lane]) ? 0xffffffffL : 0x0;
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpge_ss compare for greater-than-or-equal
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpge_ss(Rt_m128 a, Rt_m128 b) /*     CMPLESS        r */
{
    /*
     * Scalar greater-than-or-equal test: lane 0 becomes an all-ones mask
     * when a[0] >= b[0] and zero otherwise; lanes 1..3 are copied from a.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpge_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] >= rhs._f[0])
    {
        mask.ud[0] = 0xffffffffL;
    }
    else
    {
        mask.ud[0] = 0x0;
    }

    for (lane = 1; lane < 4; lane++)
    {
        mask._f[lane] = lhs._f[lane];
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpge_ps compare for greater-than-or-equal
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpge_ps(Rt_m128 a, Rt_m128 b) /*     CMPLEPS        r */
{
    /*
     * Packed greater-than-or-equal test: each lane becomes an all-ones
     * mask when a[i] >= b[i] and zero otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpge_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        mask.ud[lane] =
            (lhs._f[lane] >= rhs._f[lane]) ? 0xffffffffL : 0x0;
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpneq_ss compare for inequality
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpneq_ss(Rt_m128 a, Rt_m128 b) /* CMPNEQSS */
{
    /*
     * Scalar inequality test: lane 0 becomes an all-ones mask when
     * a[0] != b[0] and zero otherwise; lanes 1..3 are copied from a.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpneq_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] != rhs._f[0])
    {
        mask.ud[0] = 0xffffffffL;
    }
    else
    {
        mask.ud[0] = 0x0;
    }

    for (lane = 1; lane < 4; lane++)
    {
        mask._f[lane] = lhs._f[lane];
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpneq_ps compare for inequality
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpneq_ps(Rt_m128 a, Rt_m128 b) /* CMPNEQPS */
{
    /*
     * Packed inequality test: each lane becomes an all-ones mask when
     * a[i] != b[i] and zero otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpneq_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        mask.ud[lane] =
            (lhs._f[lane] != rhs._f[lane]) ? 0xffffffffL : 0x0;
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpnlt_ss compare for not-less-than
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpnlt_ss(Rt_m128 a, Rt_m128 b) /* CMPNLTSS */
{
    /*
     * Scalar not-less-than test: lane 0 is zero when a[0] < b[0] and an
     * all-ones mask otherwise; lanes 1..3 are copied from a.
     * The test is kept as a negated "<" (not rewritten as ">=") so that
     * a NaN operand still produces the all-ones mask.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpnlt_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] < rhs._f[0])
    {
        mask.ud[0] = 0x0;
    }
    else
    {
        mask.ud[0] = 0xffffffffL;
    }

    for (lane = 1; lane < 4; lane++)
    {
        mask._f[lane] = lhs._f[lane];
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpnlt_ps compare for not-less-than
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpnlt_ps(Rt_m128 a, Rt_m128 b) /* CMPNLTPS */
{
    /*
     * Packed not-less-than test: each lane is zero when a[i] < b[i] and
     * an all-ones mask otherwise (so a NaN operand yields all ones).
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpnlt_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        mask.ud[lane] =
            (lhs._f[lane] < rhs._f[lane]) ? 0x0 : 0xffffffffL;
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpnle_ss compare for not-less-than-or-equal
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpnle_ss(Rt_m128 a, Rt_m128 b) /* CMPNLESS */
{
    /*
     * Scalar not-less-than-or-equal test: lane 0 is zero when
     * a[0] <= b[0] and an all-ones mask otherwise; lanes 1..3 are
     * copied from a.
     * The test is kept as a negated "<=" (not rewritten as ">") so that
     * a NaN operand still produces the all-ones mask.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpnle_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] <= rhs._f[0])
    {
        mask.ud[0] = 0x0;
    }
    else
    {
        mask.ud[0] = 0xffffffffL;
    }

    for (lane = 1; lane < 4; lane++)
    {
        mask._f[lane] = lhs._f[lane];
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpnle_ps compare for not-less-than-or-equal
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpnle_ps(Rt_m128 a, Rt_m128 b) /* CMPNLEPS */
{
    /*
     * Packed not-less-than-or-equal test: each lane is zero when
     * a[i] <= b[i] and an all-ones mask otherwise (so a NaN operand
     * yields all ones).
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpnle_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        mask.ud[lane] =
            (lhs._f[lane] <= rhs._f[lane]) ? 0x0 : 0xffffffffL;
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpngt_ss compare for not-greater-than
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpngt_ss(Rt_m128 a, Rt_m128 b) /*     CMPNLTSS       r */
{
    /*
     * Scalar not-greater-than test: lane 0 is zero when a[0] > b[0] and
     * an all-ones mask otherwise; lanes 1..3 are copied from a.
     * The test is kept as a negated ">" (not rewritten as "<=") so that
     * a NaN operand still produces the all-ones mask.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpngt_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] > rhs._f[0])
    {
        mask.ud[0] = 0x0;
    }
    else
    {
        mask.ud[0] = 0xffffffffL;
    }

    for (lane = 1; lane < 4; lane++)
    {
        mask._f[lane] = lhs._f[lane];
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpngt_ps compare for not-greater-than
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpngt_ps(Rt_m128 a, Rt_m128 b) /*     CMPNLTPS       r */
{
    /*
     * Packed not-greater-than test: each lane is zero when a[i] > b[i]
     * and an all-ones mask otherwise (so a NaN operand yields all ones).
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpngt_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        mask.ud[lane] =
            (lhs._f[lane] > rhs._f[lane]) ? 0x0 : 0xffffffffL;
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpnge_ss compare for not-greater-than-or-equal
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpnge_ss(Rt_m128 a, Rt_m128 b) /*     CMPNLESS       r */
{
    /*
     * Scalar not-greater-than-or-equal test: lane 0 is zero when
     * a[0] >= b[0] and an all-ones mask otherwise; lanes 1..3 are
     * copied from a.
     * The test is kept as a negated ">=" (not rewritten as "<") so that
     * a NaN operand still produces the all-ones mask.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpnge_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] >= rhs._f[0])
    {
        mask.ud[0] = 0x0;
    }
    else
    {
        mask.ud[0] = 0xffffffffL;
    }

    for (lane = 1; lane < 4; lane++)
    {
        mask._f[lane] = lhs._f[lane];
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpnge_ps compare for not-greater-than-or-equal
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpnge_ps(Rt_m128 a, Rt_m128 b) /*     CMPNLEPS       r */
{
    /*
     * Packed not-greater-than-or-equal test: each lane is zero when
     * a[i] >= b[i] and an all-ones mask otherwise (so a NaN operand
     * yields all ones).
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpnge_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        mask.ud[lane] =
            (lhs._f[lane] >= rhs._f[lane]) ? 0x0 : 0xffffffffL;
    }

    RWRETURN(mask.m128);
}

/*
 * unordered
 *
 * When two source operands are compared and one or both
 * sources are a NaN (not a number), the result of the
 * comparison is unordered.
 */

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpord_ss compare for ordered
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpord_ss(Rt_m128 a, Rt_m128 b) /* CMPORDSS */
{
    /*
     * Scalar ordered test: lane 0 becomes an all-ones mask when neither
     * a[0] nor b[0] is a NaN, and zero otherwise; lanes 1..3 are copied
     * from a.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpord_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (!_rw_isnan(lhs._f[0]) && !_rw_isnan(rhs._f[0]))
    {
        mask.ud[0] = 0xffffffffL;
    }
    else
    {
        mask.ud[0] = 0x0;
    }

    for (lane = 1; lane < 4; lane++)
    {
        mask._f[lane] = lhs._f[lane];
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpord_ps compare for ordered
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpord_ps(Rt_m128 a, Rt_m128 b) /* CMPORDPS */
{
    /*
     * Packed ordered test: each lane becomes an all-ones mask when
     * neither a[i] nor b[i] is a NaN, and zero otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpord_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        mask.ud[lane] =
            (!_rw_isnan(lhs._f[lane]) && !_rw_isnan(rhs._f[lane]))
            ? 0xffffffffL : 0x0;
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpunord_ss compare for unordered
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_cmpunord_ss(Rt_m128 a, Rt_m128 b) /* CMPUNORDSS */
{
    /*
     * Scalar unordered test: lane 0 becomes an all-ones mask when a[0]
     * or b[0] is a NaN, and zero otherwise; lanes 1..3 are copied
     * from a.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpunord_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (_rw_isnan(lhs._f[0]) || _rw_isnan(rhs._f[0]))
    {
        mask.ud[0] = 0xffffffffL;
    }
    else
    {
        mask.ud[0] = 0x0;
    }

    for (lane = 1; lane < 4; lane++)
    {
        mask._f[lane] = lhs._f[lane];
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cmpunord_ps compare for unordered
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
*/

Rt_m128
Rt_mm_cmpunord_ps(Rt_m128 a, Rt_m128 b) /* CMPUNORDPS */
{
    /*
     * Packed unordered test: each lane becomes an all-ones mask when
     * a[i] or b[i] is a NaN, and zero otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    volatile RwOverlayM128 mask;
    int                 lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cmpunord_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 0; lane < 4; lane++)
    {
        mask.ud[lane] =
            (_rw_isnan(lhs._f[lane]) || _rw_isnan(rhs._f[lane]))
            ? 0xffffffffL : 0x0;
    }

    RWRETURN(mask.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_comieq_ss compares the lower SP FP value of a and b
 * for a equal to b If a and b are equal, 1 is returned Otherwise 0 is
 * returned
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

int
Rt_mm_comieq_ss(Rt_m128 a, Rt_m128 b) /* COMISS */
{
    /*
     * Scalar equality test on lane 0 of a and b.
     * Yields 1 when the two values compare equal, 0 otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int flag = 0x0;

    RWAPIFUNCTION(RWSTRING("Rt_mm_comieq_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] == rhs._f[0])
    {
        flag = 0x1;
    }

    RWRETURN(flag);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_comilt_ss compares the lower SP FP value of a and b
 * for a less than b If a is less than b, 1 is returned Otherwise 0 is
 * returned
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

int
Rt_mm_comilt_ss(Rt_m128 a, Rt_m128 b) /* COMISS */
{
    /*
     * Scalar less-than test on lane 0 of a and b.
     * Yields 1 when a is strictly below b, 0 otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int flag = 0x0;

    RWAPIFUNCTION(RWSTRING("Rt_mm_comilt_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] < rhs._f[0])
    {
        flag = 0x1;
    }

    RWRETURN(flag);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_comile_ss compares the lower SP FP value of a and b
 * for a less than or equal to b If a is less than or equal to b, 1 is
 * returned Otherwise 0 is returned
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

int
Rt_mm_comile_ss(Rt_m128 a, Rt_m128 b) /* COMISS */
{
    /*
     * Scalar less-than-or-equal test on lane 0 of a and b.
     * Yields 1 when a does not exceed b, 0 otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int flag = 0x0;

    RWAPIFUNCTION(RWSTRING("Rt_mm_comile_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] <= rhs._f[0])
    {
        flag = 0x1;
    }

    RWRETURN(flag);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_comigt_ss compares the lower SP FP value of a and b
 * for a greater than b. If a is greater than b, 1 is returned.
 * Otherwise 0 is returned.
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

int
Rt_mm_comigt_ss(Rt_m128 a, Rt_m128 b) /* COMISS */
{
    /*
     * Scalar greater-than test on lane 0 of a and b.
     * Yields 1 when a is strictly above b, 0 otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int flag = 0x0;

    RWAPIFUNCTION(RWSTRING("Rt_mm_comigt_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] > rhs._f[0])
    {
        flag = 0x1;
    }

    RWRETURN(flag);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_comige_ss compares the lower SP FP value of a and b
 * for a greater than or equal to b If a is greater than or equal to b, 1
 * is returned Otherwise 0 is returned
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

int
Rt_mm_comige_ss(Rt_m128 a, Rt_m128 b) /* COMISS */
{
    /*
     * Scalar greater-than-or-equal test on lane 0 of a and b.
     * Yields 1 when a is not below b, 0 otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int flag = 0x0;

    RWAPIFUNCTION(RWSTRING("Rt_mm_comige_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] >= rhs._f[0])
    {
        flag = 0x1;
    }

    RWRETURN(flag);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_comineq_ss compares the lower SP FP value of a and
 * b for a not equal to b If a and b are not equal, 1 is returned
 * Otherwise 0 is returned
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

int
Rt_mm_comineq_ss(Rt_m128 a, Rt_m128 b) /* COMISS */
{
    /*
     * Scalar inequality test on lane 0 of a and b.
     * Yields 1 when the two values differ, 0 otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int flag = 0x0;

    RWAPIFUNCTION(RWSTRING("Rt_mm_comineq_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] != rhs._f[0])
    {
        flag = 0x1;
    }

    RWRETURN(flag);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_ucomieq_ss compares the lower SP FP value of a and
 * b for a equal to b If a and b are equal, 1 is returned Otherwise 0 is
 * returned
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

int
Rt_mm_ucomieq_ss(Rt_m128 a, Rt_m128 b) /* UCOMISS */
{
    /*
     * Scalar equality test on lane 0 of a and b.
     * Yields 1 when the two values compare equal, 0 otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int flag = 0x0;

    RWAPIFUNCTION(RWSTRING("Rt_mm_ucomieq_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] == rhs._f[0])
    {
        flag = 0x1;
    }

    RWRETURN(flag);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_ucomilt_ss compares the lower SP FP value of a and
 * b for a less than b If a is less than b, 1 is returned Otherwise 0 is
 * returned
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

int
Rt_mm_ucomilt_ss(Rt_m128 a, Rt_m128 b) /* UCOMISS */
{
    /*
     * Scalar less-than test on lane 0 of a and b.
     * Yields 1 when a is strictly below b, 0 otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int flag = 0x0;

    RWAPIFUNCTION(RWSTRING("Rt_mm_ucomilt_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] < rhs._f[0])
    {
        flag = 0x1;
    }

    RWRETURN(flag);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_ucomile_ss compares the lower SP FP value of a and
 * b for a less than or equal to b If a is less than or equal to b, 1 is
 * returned Otherwise 0 is returned
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

int
Rt_mm_ucomile_ss(Rt_m128 a, Rt_m128 b) /* UCOMISS */
{
    /*
     * Scalar less-than-or-equal test on lane 0 of a and b.
     * Yields 1 when a does not exceed b, 0 otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int flag = 0x0;

    RWAPIFUNCTION(RWSTRING("Rt_mm_ucomile_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] <= rhs._f[0])
    {
        flag = 0x1;
    }

    RWRETURN(flag);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_ucomigt_ss compares the lower SP FP value of a and
 * b for a greater than b. If a is greater than b, 1 is returned.
 * Otherwise 0 is returned.
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

int
Rt_mm_ucomigt_ss(Rt_m128 a, Rt_m128 b) /* UCOMISS */
{
    /*
     * Scalar greater-than test on lane 0 of a and b.
     * Yields 1 when a is strictly above b, 0 otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int flag = 0x0;

    RWAPIFUNCTION(RWSTRING("Rt_mm_ucomigt_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] > rhs._f[0])
    {
        flag = 0x1;
    }

    RWRETURN(flag);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_ucomige_ss compares the lower SP FP value of a and
 * b for a greater than or equal to b If a is greater than or equal to b,
 * 1 is returned Otherwise 0 is returned
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

int
Rt_mm_ucomige_ss(Rt_m128 a, Rt_m128 b) /* UCOMISS */
{
    /*
     * Scalar greater-than-or-equal test on lane 0 of a and b.
     * Yields 1 when a is not below b, 0 otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int flag = 0x0;

    RWAPIFUNCTION(RWSTRING("Rt_mm_ucomige_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] >= rhs._f[0])
    {
        flag = 0x1;
    }

    RWRETURN(flag);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_ucomineq_ss compares the lower SP FP value of a and
 * b for a not equal to b If a and b are not equal, 1 is returned
 * Otherwise 0 is returned
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

int
Rt_mm_ucomineq_ss(Rt_m128 a, Rt_m128 b) /* UCOMISS */
{
    /*
     * Scalar inequality test on lane 0 of a and b.
     * Yields 1 when the two values differ, 0 otherwise.
     */
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int flag = 0x0;

    RWAPIFUNCTION(RWSTRING("Rt_mm_ucomineq_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    if (lhs._f[0] != rhs._f[0])
    {
        flag = 0x1;
    }

    RWRETURN(flag);
}

/*
 * Conversion Operations
 */

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvt_ss2si convert the lower SP FP value of a to a
 * 32-bit integer according to the current rounding mode
 * \param a  Rt_m128 a
 */

int
Rt_mm_cvt_ss2si(Rt_m128 a)      /* CVTSS2SI */
{
    /*
     * Lane 0 of a is converted to a 32-bit integer with a C cast.
     *
     * NOTE(review): a C float-to-int cast discards the fractional part
     * and does not consult the current rounding mode that CVTSS2SI is
     * documented to use -- confirm whether callers rely on rounding.
     */
    volatile RwOverlayM128 src;
    int converted;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvt_ss2si"));

    src.m128 = a;

    converted = (int) src._f[0];

    RWRETURN(converted);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvt_ps2pi convert the two lower SP FP values of a
 * to two 32-bit integers according to the current rounding mode,
 * returning the integers in packed form
 */

Rt_m64
Rt_mm_cvt_ps2pi(Rt_m128 a)      /* CVTPS2PI */
{
    /*
     * Lanes 0 and 1 of a are each converted to a 32-bit integer with a
     * C cast and returned in packed form.
     *
     * NOTE(review): a C float-to-int cast discards the fractional part
     * and does not consult the current rounding mode that CVTPS2PI is
     * documented to use -- confirm whether callers rely on rounding.
     */
    volatile RwOverlayM64 packed;
    volatile RwOverlayM128 src;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvt_ps2pi"));

    src.m128 = a;

    for (lane = 0; lane < 2; lane++)
    {
        packed._d[lane] = (int) src._f[lane];
    }

    RWRETURN(packed.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvtt_ss2si convert the lower SP FP value of a to a
 * 32-bit integer with truncation
 * \param a  Rt_m128 a
 */

int
Rt_mm_cvtt_ss2si(Rt_m128 a)     /* CVTTSS2SI */
{
    /*
     * Convert the lower SP FP value of a to a 32-bit integer with
     * truncation, i.e. rounding toward zero.
     *
     * A C float-to-int cast already discards the fractional part.  The
     * value must not be floored first: flooring rounds toward minus
     * infinity, so -1.5 would become -2 rather than the truncated -1.
     */
    int                 r;
    volatile RwOverlayM128 ao;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvtt_ss2si"));

    ao.m128 = a;

    r = (int) ao._f[0];

    RWRETURN((int) r);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvtt_ps2pi convert the two lower SP FP values of a
 * to two 32-bit integers with truncation, returning the integers in
 * packed form */

Rt_m64
Rt_mm_cvtt_ps2pi(Rt_m128 a)     /* CVTTPS2PI */
{
    /*
     * Convert the two lower SP FP values of a to two 32-bit integers
     * with truncation (rounding toward zero), returning the integers
     * in packed form.
     *
     * A C float-to-int cast already discards the fractional part.  The
     * values must not be floored first: flooring rounds toward minus
     * infinity, so -1.5 would become -2 rather than the truncated -1.
     */
    volatile RwOverlayM64 ro;
    volatile RwOverlayM128 ao;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvtt_ps2pi"));

    ao.m128 = a;

    ro._d[0] = (int) ao._f[0];
    ro._d[1] = (int) ao._f[1];

    RWRETURN(ro.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvt_si2ss convert the 32-bit integer value b to
 * a SP FP value; the upper three SP FP values are passed through from a
 * \param a  Rt_m128 a
 * \param b  int b
 */

Rt_m128
Rt_mm_cvt_si2ss(Rt_m128 a, int b) /*                CVTSI2SS */
{
    /*
     * Lane 0 of the result is b converted to float; lanes 1-3 are
     * copied from a.
     */
    volatile RwOverlayM128 result;
    volatile RwOverlayM128 src;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvt_si2ss"));

    src.m128 = a;

    result._f[0] = (float) b;

    for (lane = 1; lane < 4; lane++)
    {
        result._f[lane] = src._f[lane];
    }

    RWRETURN(result.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_cvt_pi2ps convert the two 32-bit integer values in
 * packed form in b to two SP FP values the upper two SP FP values are
 * passed through from a
 * \param a  Rt_m128 a
 * \param b  Rt_m64 b
 */

Rt_m128
Rt_mm_cvt_pi2ps(Rt_m128 a, Rt_m64 b) /* CVTPI2PS */
{
    /*
     * Lanes 0 and 1 of the result are the two packed integers of b
     * converted to float; lanes 2 and 3 are copied from a.
     */
    volatile RwOverlayM128 result;
    volatile RwOverlayM128 src;
    volatile RwOverlayM64 ints;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_cvt_pi2ps"));

    src.m128 = a;
    ints.m64 = b;

    for (lane = 0; lane < 2; lane++)
    {
        result._f[lane] = (float) (ints._d[lane]);
    }

    for (lane = 2; lane < 4; lane++)
    {
        result._f[lane] = src._f[lane];
    }

    RWRETURN(result.m128);
}

/*
 *     Miscellaneous
 */

/**
 * \ingroup rtintel
 * \ref Rt_mm_shuffle_ps selects four specific SP FP values from
 * a and b, based on the mask i The mask must be an immediate See ``
 * Macro Function for Shuffle'' in the end of this section for a
 * description of the shuffle semantics
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 * \param i  int i 
 */

Rt_m128
Rt_mm_shuffle_ps(Rt_m128 a, Rt_m128 b, int i) /*                SHUFPS */
{
    /*
     * The mask i holds four 2-bit lane selectors, lowest bits first.
     * Selectors 0 and 1 pick the low two result lanes from a;
     * selectors 2 and 3 pick the high two result lanes from b.
     */
    volatile RwOverlayM128 result;
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int sel0;
    int sel1;
    int sel2;
    int sel3;

    RWAPIFUNCTION(RWSTRING("Rt_mm_shuffle_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    sel0 = (i >> 0) & 3;
    sel1 = (i >> 2) & 3;
    sel2 = (i >> 4) & 3;
    sel3 = (i >> 6) & 3;

    result._f[0] = lhs._f[sel0];
    result._f[1] = lhs._f[sel1];
    result._f[2] = rhs._f[sel2];
    result._f[3] = rhs._f[sel3];

    RWRETURN(result.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_unpackhi_ps selects and interleaves the upper two
 * SP FP values from a and b
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_unpackhi_ps(Rt_m128 a, Rt_m128 b) /* UNPCKHPS */
{
    /*
     * Interleave the upper halves of a and b:
     * result = { a[2], b[2], a[3], b[3] }
     */
    volatile RwOverlayM128 result;
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int pair;

    RWAPIFUNCTION(RWSTRING("Rt_mm_unpackhi_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (pair = 0; pair < 2; pair++)
    {
        result._f[2 * pair] = lhs._f[2 + pair];
        result._f[2 * pair + 1] = rhs._f[2 + pair];
    }

    RWRETURN(result.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_unpacklo_ps selects and interleaves the lower two
 * SP FP values from a and b
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_unpacklo_ps(Rt_m128 a, Rt_m128 b) /* UNPCKLPS */
{
    /*
     * Interleave the lower halves of a and b:
     * result = { a[0], b[0], a[1], b[1] }
     */
    volatile RwOverlayM128 result;
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int pair;

    RWAPIFUNCTION(RWSTRING("Rt_mm_unpacklo_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (pair = 0; pair < 2; pair++)
    {
        result._f[2 * pair] = lhs._f[pair];
        result._f[2 * pair + 1] = rhs._f[pair];
    }

    RWRETURN(result.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_loadh_pi sets the upper two SP FP values with 64
 * bits of data loaded from the address p the lower two values are passed
 * through from a
 * \param a  Rt_m128 a
 * \param p  Rt_m64 * p
 */

Rt_m128
Rt_mm_loadh_pi(Rt_m128 a, Rt_m64 * p) /*     MOVHPS         reg, mem */
{
    /*
     * Sets the upper two SP FP values with 64 bits of data loaded from the
     * address p the lower two values are passed through from a
     *
     * The 64 bits are moved verbatim through the integer view of the
     * overlay.  They already hold SP FP bit patterns, so they must not
     * be run through an int-to-float conversion, which would change
     * the stored value.
     */
    volatile RwOverlayM128 ro;
    volatile RwOverlayM128 ao;
    volatile RwOverlayM64 *po;

    RWAPIFUNCTION(RWSTRING("Rt_mm_loadh_pi"));

    ao.m128 = a;
    po = (volatile RwOverlayM64 *) p;

    ro._f[0] = ao._f[0];
    ro._f[1] = ao._f[1];
    ro._d[2] = po->_d[0];
    ro._d[3] = po->_d[1];

    RWRETURN(ro.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_storeh_pi stores the upper two SP FP values of a to
 * the address p
 * \param p  Rt_m64 * p
 * \param a  Rt_m128 a
 */

void
Rt_mm_storeh_pi(Rt_m64 * p, Rt_m128 a) /*     MOVHPS         mem, reg */
{
    /*
     * Stores the upper two SP FP values of a to the address p.
     * The two lanes are copied bit-for-bit through the integer view
     * of the overlays.
     */
    volatile RwOverlayM128 ao;
    volatile RwOverlayM64 *po;

    RWAPIFUNCTION(RWSTRING("Rt_mm_storeh_pi"));

    ao.m128 = a;
    po = (volatile RwOverlayM64 *) p;
    po->_d[0] = ao._d[2];
    po->_d[1] = ao._d[3];

    /* Close the RWAPIFUNCTION() opened above, as every other exit does */
    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_movehl_ps moves the upper 2 SP FP values of b to
 * the lower 2 SP FP values of the result The upper 2 SP FP values of a
 * are passed through to the result
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
*/

Rt_m128
Rt_mm_movehl_ps(Rt_m128 a, Rt_m128 b) /* MOVHLPS */
{
    /*
     * result = { b[2], b[3], a[2], a[3] }
     * The upper half of b lands in the lower half of the result; the
     * upper half of a is kept in place.
     */
    volatile RwOverlayM128 result;
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_movehl_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 3; lane >= 2; lane--)
    {
        result._f[lane] = lhs._f[lane];
    }

    for (lane = 1; lane >= 0; lane--)
    {
        result._f[lane] = rhs._f[lane + 2];
    }

    RWRETURN(result.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_movelh_ps moves the lower 2 SP FP values of b to
 * the upper 2 SP FP values of the result The lower 2 SP FP values of a
 * are passed through to the result
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_movelh_ps(Rt_m128 a, Rt_m128 b) /* MOVLHPS */
{
    /*
     * result = { a[0], a[1], b[0], b[1] }
     * The lower half of b lands in the upper half of the result; the
     * lower half of a is kept in place.
     */
    volatile RwOverlayM128 result;
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_movelh_ps"));

    lhs.m128 = a;
    rhs.m128 = b;

    for (lane = 3; lane >= 2; lane--)
    {
        result._f[lane] = rhs._f[lane - 2];
    }

    for (lane = 1; lane >= 0; lane--)
    {
        result._f[lane] = lhs._f[lane];
    }

    RWRETURN(result.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_loadl_pi sets the lower two SP FP values with 64
 * bits of data loaded from the address p the upper two values are passed
 * through from a
 * \param a  Rt_m128 a
 * \param p  Rt_m64 * p
 */

Rt_m128
Rt_mm_loadl_pi(Rt_m128 a, Rt_m64 * p) /*     MOVLPS         reg,    mem */
{
    /*
     * The lower two lanes of the result receive the 64 bits found at p,
     * copied through the integer view of the overlays; the upper two
     * lanes are taken from a.
     */
    volatile RwOverlayM128 result;
    volatile RwOverlayM128 src;
    volatile RwOverlayM64 *mem;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_loadl_pi"));

    src.m128 = a;
    mem = (volatile RwOverlayM64 *) p;

    for (lane = 0; lane < 2; lane++)
    {
        result._d[lane] = mem->_d[lane];
    }

    for (lane = 2; lane < 4; lane++)
    {
        result._f[lane] = src._f[lane];
    }

    RWRETURN(result.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_storel_pi stores the lower two SP FP values of a to
 * the address p
 * \param p  Rt_m64 * p
 * \param a  Rt_m128 a
 */

void
Rt_mm_storel_pi(Rt_m64 * p, Rt_m128 a) /* MOVLPS         mem, reg */
{
    /*
     * The lower two lanes of a are written to the 64 bits at p, copied
     * through the integer view of the overlays.
     */
    volatile RwOverlayM128 src;
    volatile RwOverlayM64 *mem;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_storel_pi"));

    src.m128 = a;
    mem = (volatile RwOverlayM64 *) p;

    for (lane = 0; lane < 2; lane++)
    {
        mem->_d[lane] = src._d[lane];
    }

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_movemask_ps creates a 4-bit mask from the most
 * significant bits of the four SP FP values
 * \param a  Rt_m128 a
 */

int
Rt_mm_movemask_ps(Rt_m128 a)    /* MOVMSKPS */
{
    /*
     * Creates a 4-bit mask from the most significant bits of
     * the four SP FP values; bit n of the result is the sign bit of
     * lane n.
     *
     * The sign bit is read from the raw lane bits rather than tested
     * with "value < 0": that comparison is false for -0.0 and for any
     * NaN, even when their sign bit is set.
     */
    int                 r;
    int                 lane;
    volatile RwOverlayM128 ao;

    RWAPIFUNCTION(RWSTRING("Rt_mm_movemask_ps"));

    ao.m128 = a;

    r = 0;
    for (lane = 0; lane < 4; lane++)
    {
        /* Bit 31 of each 32-bit lane is the IEEE sign bit */
        r |= (int) ((ao.ud[lane] >> 31) & 0x1) << lane;
    }

    RWRETURN((int) r);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_getcsr returns the contents of the control register
 */
unsigned int
Rt_mm_getcsr(void)              /* STMXCSR */
{
    /*
     * Stub: no control register is emulated.  The unimplemented
     * notice is issued and zero is handed back.
     */
    unsigned int csr;

    RWAPIFUNCTION(RWSTRING("Rt_mm_getcsr"));

    csr = 0;

    INTEL_SSE_UNIMPLEMENTED("Rt_mm_getcsr");

    RWRETURN(csr);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_setcsr sets the control register to the value
 * specified
 * \param  i   i
 */

void
Rt_mm_setcsr(unsigned int __RWUNUSED__ i) /*  LDMXCSR */
{
    /*
     * Stub: no control register is emulated, so i is ignored and only
     * the unimplemented notice is issued.
     */
    RWAPIFUNCTION(RWSTRING("Rt_mm_setcsr"));

    INTEL_SSE_UNIMPLEMENTED("Rt_mm_setcsr");

    RWRETURNVOID();
}

/*
 * Load Operations
 */

/**
 * \ingroup rtintel
 * \ref Rt_mm_load_ss loads an SP FP value into the low word and
 * clears the upper three words
 * \param p  float *p
 */

Rt_m128
Rt_mm_load_ss(float *p)         /*  MOVSS */
{
    /*
     * Lane 0 receives the float at p; lanes 1-3 are set to zero.
     */
    volatile RwOverlayM128 result;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_load_ss"));

    result._f[0] = *p;

    for (lane = 1; lane < 4; lane++)
    {
        result._f[lane] = (float) 0.0;
    }

    RWRETURN(result.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_load_ps1 loads a single SP FP value, copying it
 * into all four words
 * \param p  float *p
 */

Rt_m128
Rt_mm_load_ps1(float *p)        /*  MOVSS + shuffling */
{
    /*
     * Broadcast: every lane receives the single float at p.
     */
    volatile RwOverlayM128 result;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_load_ps1"));

    for (lane = 0; lane < 4; lane++)
    {
        result._f[lane] = *p;
    }

    RWRETURN(result.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_load_ps loads four SP FP values. The address must be
 * 16-byte-aligned
 * \param p  float *p
 */

Rt_m128
Rt_mm_load_ps(float *p)         /*  MOVAPS */
{
    /*
     * Lane n receives p[n] for n = 0..3.  The real instruction needs a
     * 16-byte-aligned address; this emulation reads element by element.
     */
    volatile RwOverlayM128 result;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_load_ps"));

    for (lane = 0; lane < 4; lane++)
    {
        result._f[lane] = p[lane];
    }

    RWRETURN(result.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_loadu_ps loads four SP FP values. The address need
 * not be 16-byte-aligned
 * \param p  float *p
 */

Rt_m128
Rt_mm_loadu_ps(float *p)        /*  MOVUPS */
{
    /*
     * Lane n receives p[n] for n = 0..3; no alignment is required of p.
     */
    volatile RwOverlayM128 result;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_loadu_ps"));

    for (lane = 0; lane < 4; lane++)
    {
        result._f[lane] = p[lane];
    }

    RWRETURN(result.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_loadr_ps loads four SP FP values in reverse order
 * The address must be 16-byte-aligned
 * \param p  float *p
 */

Rt_m128
Rt_mm_loadr_ps(float *p)        /*  MOVAPS + shuffling  */
{
    /*
     * Reversed load: lane n receives p[3 - n] for n = 0..3.  The real
     * instruction sequence needs a 16-byte-aligned address.
     */
    volatile RwOverlayM128 result;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_loadr_ps"));

    for (lane = 0; lane < 4; lane++)
    {
        result._f[lane] = p[3 - lane];
    }

    RWRETURN(result.m128);
}

/*
 * Set Operations
 */

/**
 * \ingroup rtintel
 * \ref Rt_mm_set_ss sets the low word of an SP FP value to w and
 * clears the upper three words
 * \param w  float w
 */

Rt_m128
Rt_mm_set_ss(float w)           /* (composite) */
{
    /*
     * Lane 0 is set to w; lanes 1-3 are set to zero.
     */
    volatile RwOverlayM128 result;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_set_ss"));

    result._f[0] = w;

    for (lane = 1; lane < 4; lane++)
    {
        result._f[lane] = (float) 0.0;
    }

    RWRETURN(result.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_set_ps1 sets the four SP FP values to w
 * \param w float w
 */

Rt_m128
Rt_mm_set_ps1(float w)          /* (composite) */
{
    /*
     * Broadcast: every lane is set to w.
     */
    volatile RwOverlayM128 result;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_set_ps1"));

    for (lane = 0; lane < 4; lane++)
    {
        result._f[lane] = w;
    }

    RWRETURN(result.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_set_ps sets the four SP FP values to the four
 * inputs
 * \param z  float z
 * \param y  float y
 * \param x  float x
 * \param w  float w
 */

Rt_m128
Rt_mm_set_ps(float z, float y, float x, float w) /* (composite) */
{
    /*
     * The first argument goes to the highest lane and the last argument
     * to the lowest: result = { w, x, y, z } in lane order 0..3.
     */
    volatile RwOverlayM128 result;

    RWAPIFUNCTION(RWSTRING("Rt_mm_set_ps"));

    result._f[3] = z;
    result._f[2] = y;
    result._f[1] = x;
    result._f[0] = w;

    RWRETURN(result.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_setr_ps sets the four SP FP values to the four
 * inputs in reverse order
 * \param z  float z
 * \param y  float y
 * \param x  float x
 * \param w  float w
 */

Rt_m128
Rt_mm_setr_ps(float z, float y, float x, float w) /* (composite) */
{
    /*
     * Arguments are stored in the order given:
     * result = { z, y, x, w } in lane order 0..3.
     */
    volatile RwOverlayM128 result;

    RWAPIFUNCTION(RWSTRING("Rt_mm_setr_ps"));

    result._f[3] = w;
    result._f[2] = x;
    result._f[1] = y;
    result._f[0] = z;

    RWRETURN(result.m128);
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_setzero_ps clears the four SP FP values 
 */
Rt_m128
Rt_mm_setzero_ps(void)          /* (composite) */
{
    /*
     * Every lane of the result is set to zero.
     */
    volatile RwOverlayM128 result;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_setzero_ps"));

    for (lane = 0; lane < 4; lane++)
    {
        result._f[lane] = (float) 0.0;
    }

    RWRETURN(result.m128);
}

/* Store Operations */

/**
 * \ingroup rtintel
 * \ref Rt_mm_store_ss stores the lower SP FP value
 * \param p  float *p
 * \param a  Rt_m128 a
 */

void
Rt_mm_store_ss(float *p, Rt_m128 a) /*     MOVSS */
{
    /*
     * Lane 0 of a is written to the float at p.
     */
    volatile RwOverlayM128 src;

    RWAPIFUNCTION(RWSTRING("Rt_mm_store_ss"));

    src.m128 = a;

    p[0] = src._f[0];

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_store_ps1 stores the lower SP FP value across four
 * words
 * \param p  float *p
 * \param a  Rt_m128 a
 */

void
Rt_mm_store_ps1(float *p, Rt_m128 a) /*       MOVSS + shuffling */
{
    /*
     * Lane 0 of a is replicated into p[0]..p[3].
     */
    volatile RwOverlayM128 src;
    int slot;

    RWAPIFUNCTION(RWSTRING("Rt_mm_store_ps1"));

    src.m128 = a;

    for (slot = 0; slot < 4; slot++)
    {
        p[slot] = src._f[0];
    }

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_store_ps stores four SP FP values The address must
 * be 16-byte-aligned
 * \param p  float *p
 * \param a  Rt_m128 a
 */

void
Rt_mm_store_ps(float *p, Rt_m128 a) /*     MOVAPS */
{
    /*
     * Lane n of a is written to p[n] for n = 0..3.  The real
     * instruction needs a 16-byte-aligned address; this emulation
     * writes element by element.
     */
    volatile RwOverlayM128 src;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_store_ps"));

    src.m128 = a;

    for (lane = 0; lane < 4; lane++)
    {
        p[lane] = src._f[lane];
    }

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_storeu_ps stores four SP FP values The address need
 * not be 16-byte-aligned
 * \param p  float *p
 * \param a  Rt_m128 a
 */

void
Rt_mm_storeu_ps(float *p, Rt_m128 a) /*     MOVUPS */
{
    /*
     * Lane n of a is written to p[n] for n = 0..3; no alignment is
     * required of p.
     */
    volatile RwOverlayM128 src;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_storeu_ps"));

    src.m128 = a;

    for (lane = 0; lane < 4; lane++)
    {
        p[lane] = src._f[lane];
    }

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_storer_ps stores four SP FP values in reverse order
 * The address must be 16-byte-aligned
 * \param p  float *p
 * \param a  Rt_m128 a
 */

void
Rt_mm_storer_ps(float *p, Rt_m128 a) /*      MOVAPS + shuffling */
{
    /*
     * Reversed store: p[n] receives lane 3 - n of a for n = 0..3.  The
     * real instruction sequence needs a 16-byte-aligned address.
     */
    volatile RwOverlayM128 src;
    int slot;

    RWAPIFUNCTION(RWSTRING("Rt_mm_storer_ps"));

    src.m128 = a;

    for (slot = 0; slot < 4; slot++)
    {
        p[slot] = src._f[3 - slot];
    }

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_move_ss sets the low word to the SP FP value of b
 * The upper 3 SP FP values are passed through from a
 * \param a  Rt_m128 a
 * \param b  Rt_m128 b
 */

Rt_m128
Rt_mm_move_ss(Rt_m128 a, Rt_m128 b) /*     MOVSS */
{
    /*
     * Lane 0 of the result comes from b; lanes 1-3 come from a.
     */
    volatile RwOverlayM128 result;
    volatile RwOverlayM128 lhs;
    volatile RwOverlayM128 rhs;
    int lane;

    RWAPIFUNCTION(RWSTRING("Rt_mm_move_ss"));

    lhs.m128 = a;
    rhs.m128 = b;

    result._f[0] = rhs._f[0];

    for (lane = 1; lane < 4; lane++)
    {
        result._f[lane] = lhs._f[lane];
    }

    RWRETURN(result.m128);
}

/*
 * Integer Intrinsics
 */

/**
 * \ingroup rtintel
 * \ref Rt_m_pextrw extracts one of the four words of a The
 * selector n must be an immediate
 * \param a  Rt_m64 a
 * \param n  int n
 */

int
Rt_m_pextrw(Rt_m64 a, int n)    /*     PEXTRW */
{
    /*
     * Returns word n of a for n = 0, 1 or 2; any other selector
     * returns word 3.
     */
    volatile RwOverlayM64 src;
    int word;

    RWAPIFUNCTION(RWSTRING("Rt_m_pextrw"));

    src.m64 = a;

    switch (n)
    {
        case 0:
            word = src._w[0];
            break;

        case 1:
            word = src._w[1];
            break;

        case 2:
            word = src._w[2];
            break;

        default:
            word = src._w[3];
            break;
    }

    RWRETURN(word);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pinsrw inserts word d into one of four words of a
 * The selector n must be an immediate */

Rt_m64
Rt_m_pinsrw(Rt_m64 a, int d, int n) /* PINSRW */
{
    /*
     * The result is a copy of a with word n replaced by d.  When n is
     * outside 0..3 no word matches and a is returned unchanged.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 src;
    int word;

    RWAPIFUNCTION(RWSTRING("Rt_m_pinsrw"));

    src.m64 = a;

    for (word = 0; word < 4; word++)
    {
        if (n == word)
        {
            result._w[word] = d;
        }
        else
        {
            result._w[word] = src._w[word];
        }
    }

    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pmaxsw computes the elementwise maximum of the words
 * in a and b */
Rt_m64
Rt_m_pmaxsw(Rt_m64 a, Rt_m64 b) /* PMAXSW */
{
    /*
     * Computes the elementwise maximum of the words in a and b.
     * Each of the four result words is the larger of the matching
     * words of the two inputs.
     */
    volatile RwOverlayM64 ro;
    volatile RwOverlayM64 ao;
    volatile RwOverlayM64 bo;
    int                 i;

    RWAPIFUNCTION(RWSTRING("Rt_m_pmaxsw"));

    ao.m64 = a;
    bo.m64 = b;

    for (i = 0; i < 4; i++)
    {
        ro._w[i] = _rw_max(ao._w[i], bo._w[i]);
    }

    RWRETURN(ro.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pmaxub computes the elementwise maximum of the
 * unsigned bytes in a and b 
 */
Rt_m64
Rt_m_pmaxub(Rt_m64 a, Rt_m64 b) /* PMAXUB */
{
    /*
     * Computes the elementwise maximum of the unsigned bytes in a and b.
     * Each of the eight result bytes is the larger of the matching
     * bytes of the two inputs.
     *
     * The bytes are cast to unsigned char before comparing so that the
     * comparison is unsigned whatever the declared element type of _b.
     */
    volatile RwOverlayM64 ro;
    volatile RwOverlayM64 ao;
    volatile RwOverlayM64 bo;
    int                 i;

    RWAPIFUNCTION(RWSTRING("Rt_m_pmaxub"));

    ao.m64 = a;
    bo.m64 = b;

    for (i = 0; i < 8; i++)
    {
        const unsigned char x = (unsigned char) ao._b[i];
        const unsigned char y = (unsigned char) bo._b[i];

        ro._b[i] = _rw_max(x, y);
    }

    RWRETURN(ro.m64);
}

 /**
 * \ingroup rtintel
 * \ref Rt_m_pminsw computes the elementwise minimum of the words
  * in a and b 
  */
Rt_m64
Rt_m_pminsw(Rt_m64 a, Rt_m64 b) /* PMINSW */
{
    /*
     * Each of the four result words is the smaller of the matching
     * words of a and b.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 lhs;
    volatile RwOverlayM64 rhs;
    int word;

    RWAPIFUNCTION(RWSTRING("Rt_m_pminsw"));

    lhs.m64 = a;
    rhs.m64 = b;

    for (word = 0; word < 4; word++)
    {
        if (lhs._w[word] < rhs._w[word])
        {
            result._w[word] = lhs._w[word];
        }
        else
        {
            result._w[word] = rhs._w[word];
        }
    }

    RWRETURN(result.m64);
}

 /**
 * \ingroup rtintel
 * \ref Rt_m_pminub computes the elementwise minimum of the
  * unsigned bytes in a and b 
  */
Rt_m64
Rt_m_pminub(Rt_m64 a, Rt_m64 b) /* PMINUB */
{
    /*
     * Computes the elementwise minimum of the unsigned bytes in a and b.
     * Each of the eight result bytes is the smaller of the matching
     * bytes of the two inputs.
     *
     * The bytes are cast to unsigned char before comparing so that the
     * comparison is unsigned whatever the declared element type of _b
     * (a signed element type would order 0x80..0xff below 0x00..0x7f).
     */
    volatile RwOverlayM64 ro;
    volatile RwOverlayM64 ao;
    volatile RwOverlayM64 bo;
    int                 i;

    RWAPIFUNCTION(RWSTRING("Rt_m_pminub"));

    ao.m64 = a;
    bo.m64 = b;

    for (i = 0; i < 8; i++)
    {
        const unsigned char x = (unsigned char) ao._b[i];
        const unsigned char y = (unsigned char) bo._b[i];

        ro._b[i] = _rw_min(x, y);
    }

    RWRETURN(ro.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pmovmskb creates an 8-bit mask from the most
 * significant bits of the bytes in a.
 * \param a  Rt_m64 a
 */

int
Rt_m_pmovmskb(Rt_m64 a)         /*     PMOVMSKB */
{
    /*
     * Packs one flag per byte of a into an integer: bit k of the result
     * is sign() of byte k, for k = 0..7.  All higher bits are zero.
     */
    volatile RwOverlayM64 src;
    int                 mask = 0;
    int                 k;

    RWAPIFUNCTION(RWSTRING("Rt_m_pmovmskb"));

    src.m64 = a;

    for (k = 7; k >= 0; k--)
    {
        mask |= (sign(src._b[k]) << k);
    }

    RWRETURN((int) mask);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pmulhuw multiplies the unsigned words in a and b,
 * returning the upper 16 bits of the 32-bit intermediate results.
 */

Rt_m64
Rt_m_pmulhuw(Rt_m64 a, Rt_m64 b) /* PMULHUW */
{
    /*
     * Emulates PMULHUW: multiplies the four 16-bit words of a and b as
     * unsigned values and returns the upper 16 bits of each 32-bit
     * product.
     */
    volatile RwOverlayM64 ro;
    volatile RwOverlayM64 ao;
    volatile RwOverlayM64 bo;
    int                 i;

    RWAPIFUNCTION(RWSTRING("Rt_m_pmulhuw"));

    ao.m64 = a;
    bo.m64 = b;

    for (i = 0; i < 4; i++)
    {
        /*
         * Reduce each operand to its low 16 bits as an unsigned value
         * before multiplying.  This gives the unsigned product whether
         * the _w overlay member is a signed or an unsigned type, and the
         * largest product (0xFFFF * 0xFFFF) fits in unsigned long, so
         * the multiplication cannot overflow.
         */
        const unsigned long lhs = ((unsigned long) ao._w[i]) & 0xFFFFUL;
        const unsigned long rhs = ((unsigned long) bo._w[i]) & 0xFFFFUL;
        const unsigned long product = lhs * rhs;

        /* Same narrowing to short that the hiword() macro applies */
        ro._w[i] = (short) ((product >> 16) & 0xFFFFUL);
    }

    RWRETURN(ro.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_pshufw returns a combination of the four words of a.
 * The selector n must be an immediate.
 */
Rt_m64
Rt_m_pshufw(Rt_m64 a, int n)    /* PSHUFW */
{
    /*
     * Word shuffle: n holds four 2-bit fields, lowest field first.
     * Field k selects which word of a is copied into result word k.
     */
    volatile RwOverlayM64 result;
    volatile RwOverlayM64 src;
    int                 dst;

    RWAPIFUNCTION(RWSTRING("Rt_m_pshufw"));

    src.m64 = a;

    for (dst = 0; dst < 4; dst++)
    {
        const int           selector = (n >> (2 * dst)) & 0x3;

        result._w[dst] = src._w[selector];
    }

    RWRETURN(result.m64);
}

/**
 * \ingroup rtintel
 * \ref Rt_m_lwmaskmovq conditionally stores byte elements of a to
 * address p. The high bit of each byte in the selector b determines
 * whether the corresponding byte in a will be stored.
 * \param a  Rt_m64 a
 * \param b  Rt_m64 b
 * \param p  char *p
 */

void
Rt_m_lwmaskmovq(Rt_m64 a, Rt_m64 b, char *p) /*     MASKMOVQ */
{
    /*
     * Masked byte store: for k = 0..7 in ascending order, byte k of a is
     * written to p[k] only when sign() of byte k of the selector b is
     * non-zero.  Bytes of p whose selector test fails are left untouched.
     */
    volatile RwOverlayM64 data;
    volatile RwOverlayM64 selector;
    int                 k;

    RWAPIFUNCTION(RWSTRING("Rt_m_lwmaskmovq"));

    data.m64 = a;
    selector.m64 = b;

    for (k = 0; k < 8; k++)
    {
        if (sign(selector._b[k]))
        {
            p[k] = data._b[k];
        }
    }

    RWRETURNVOID();
}

/*
 * Cacheability Support
 */

/**
 * \ingroup rtintel
 * \ref Rt_mm_prefetch loads one cache line of data from address
 * p to a location ``closer'' to the processor. The value i specifies the
 * type of prefetch operation: the constants _rwmM_HINT_T0,
 * _rwmM_HINT_T1, _rwmM_HINT_T2, and _rwmM_HINT_NTA should be used,
 * corresponding to the type of prefetch instruction.
 * \param  p   p
 * \param  i   i
 */

void
Rt_mm_prefetch(char *__RWUNUSED__ p, int __RWUNUSED__ i) /*     PREFETCH */
{
    /*
     * Software stand-in for the PREFETCH instruction, which would load
     * the cache line at p closer to the processor according to the hint
     * i (_rwmM_HINT_T0, _rwmM_HINT_T1, _rwmM_HINT_T2 or _rwmM_HINT_NTA).
     *
     * This body reads neither p nor i and performs no prefetch.  Its only
     * action is the INTEL_SSE_UNIMPLEMENTED notice: per the macro
     * definitions at the top of this file, that formats an
     * "Unimplemented" message and passes it to OUTPUTDEBUGSTRING on
     * MSVC/Win32 builds, and expands to nothing on other builds.
     */
    RWAPIFUNCTION(RWSTRING("Rt_mm_prefetch"));

    INTEL_SSE_UNIMPLEMENTED("Rt_mm_prefetch");

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_stream_pi stores the data in a to the address p
 * without polluting the caches
 * \param  p   p
 * \param  a   a
 */

void
Rt_mm_stream_pi(Rt_m64 * p, Rt_m64 a) /*     MOVNTQ */
{
    /*
     * Emulates MOVNTQ by storing a to the address p.
     *
     * The cache-bypassing hint of the real instruction has no
     * counterpart in this software version, so an ordinary store is
     * performed; callers observe the same value at *p either way.
     * p must point to writable storage for one Rt_m64.
     */
    RWAPIFUNCTION(RWSTRING("Rt_mm_stream_pi"));

    *p = a;

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_stream_ps stores the data in a to the address p
 * without polluting the caches. The address must be 16-byte-aligned.
 * \param  p   p
 * \param  a   a
 */

void
Rt_mm_stream_ps(float *__RWUNUSED__ p, Rt_m128 __RWUNUSED__ a) /* MOVNTPS */
{
    /*
     * Stores the data in a to the address p without polluting the caches
     * The address must be 16-byte-aligned
     */
    RWAPIFUNCTION(RWSTRING("Rt_mm_stream_ps"));

    INTEL_SSE_UNIMPLEMENTED("Rt_mm_stream_ps");

    RWRETURNVOID();
}

/**
 * \ingroup rtintel
 * \ref Rt_mm_sfence guarantees that every preceding store is
 * globally visible before any subsequent store
 */

void
Rt_mm_sfence(void)              /* SFENCE */
{
    /*
     * Software stand-in for the SFENCE instruction, which guarantees
     * that every preceding store is globally visible before any
     * subsequent store.
     *
     * This body issues no fence or barrier of its own.  Its only action
     * is the INTEL_SSE_UNIMPLEMENTED notice: per the macro definitions at
     * the top of this file, that formats an "Unimplemented" message and
     * passes it to OUTPUTDEBUGSTRING on MSVC/Win32 builds, and expands to
     * nothing on other builds.
     */
    RWAPIFUNCTION(RWSTRING("Rt_mm_sfence"));

    INTEL_SSE_UNIMPLEMENTED("Rt_mm_sfence");

    RWRETURNVOID();
}
