Commit 9c300c2b authored by Manuel Jander's avatar Manuel Jander
Browse files

Remove FIX_1010_OPT_INV_USING_INVSQRT and FIX_1010_OPT_GIVENS_AMAX_BMIN....

Remove FIX_1010_OPT_INV_USING_INVSQRT and FIX_1010_OPT_GIVENS_AMAX_BMIN. Remove all debug/measurement code. Preparation for merge to main.
parent bb0333d8
Loading
Loading
Loading
Loading
Loading
+4 −515
Original line number Diff line number Diff line
@@ -53,11 +53,9 @@

#if 1
#define FIX_1010_OPT_DIV
// #define FIX_1010_OPT_INV_USING_INVSQRT
#define FIX_1010_OPT_SINGLE_RESCALE
#define FIX_1010_OPT_GIVENS
#define FIX_1010_OPT_GIVENS_INV
// #define FIX_1010_OPT_GIVENS_AMAX_BMIN
#define FIX_1010_OPT_NORM_NOSAT
#define FIX_1010_OPT_SEC_SINGLE_RESCALE
#endif
@@ -297,288 +295,6 @@ void svdMat2mat_fx(
    return;
}

#ifdef MORE_DEBUG2
static void matrixFx2Fl2(
    float r[][MAX_OUTPUT_CHANNELS],
    const Word32 a[][MAX_OUTPUT_CHANNELS],
    const Word16 a_e[][MAX_OUTPUT_CHANNELS],
    const int adim1,
    const int adim2 )
{
    for ( int i1 = 0; i1 < adim1; i1++ )
    {
        for ( int i2 = 0; i2 < adim2; i2++ )
        {
            r[i1][i2] = (float) a[i1][i2] * powf( 2.f, a_e[i1][i2] - 31 );
        }
    }
}

static void matrixPrint2(
    const float a[][MAX_OUTPUT_CHANNELS],
    const int dim1,
    const int dim2,
    const char *name )
{
    printf( "Matrix %s[%d][%d] = \n", name, dim1, dim2 );
    for ( int i1 = 0; i1 < dim1; i1++ )
    {
        printf( " { " );
        for ( int i2 = 0; i2 < dim2; i2++ )
        {
            printf( "%.10e, ", a[i1][i2] );
        }
        printf( " },\n" );
    }
}
#endif

#ifndef DEBUG_SVD_TEST
// #define DEBUG_SVD_PRECISION
#endif
// #define MORE_DEBUG

#if defined( DEBUG_SVD_PRECISION ) || defined( MORE_DEBUG )

#if ( MAX_INPUT_CHANNELS > MAX_OUTPUT_CHANNELS )
#define MAX_MATRIX MAX_INPUT_CHANNELS
#else
#define MAX_MATRIX MAX_OUTPUT_CHANNELS
#endif

static void matrixFx2Fl(
    float r[][MAX_MATRIX],
    const Word32 a[][MAX_MATRIX],
    const Word16 a_e[MAX_MATRIX],
    const int adim1,
    const int adim2 )
{
    for ( int i1 = 0; i1 < adim1; i1++ )
    {
        for ( int i2 = 0; i2 < adim2; i2++ )
        {
            r[i1][i2] = (float) a[i1][i2] * powf( 2.f, a_e[i2] - 31 );
        }
    }
}

static void matrixProduct(
    float r[][MAX_MATRIX],
    const float a[][MAX_MATRIX],
    const float b[][MAX_MATRIX],
    const int adim1,
    const int adim2,
    const int bdim1,
    const int bdim2 )
{
    assert( adim2 == bdim1 );

    for ( int i1 = 0; i1 < adim1; i1++ )
    {
        for ( int i2 = 0; i2 < bdim2; i2++ )
        {
            r[i1][i2] = 0.f;
            for ( int i3 = 0; i3 < bdim1; i3++ )
            {
                r[i1][i2] += a[i1][i3] * b[i3][i2];
            }
        }
    }
}

static void matrixTranspose(
    float r[][MAX_MATRIX],
    const float a[][MAX_MATRIX],
    const int adim1,
    const int adim2 )
{
    for ( int i1 = 0; i1 < adim1; i1++ )
    {
        for ( int i2 = 0; i2 < adim2; i2++ )
        {
            r[i2][i1] = a[i1][i2];
        }
    }
}

static void matrixDiagonal(
    float r[][MAX_MATRIX],
    const float a[MAX_MATRIX],
    const int dim )
{
    for ( int i1 = 0; i1 < dim; i1++ )
    {
        for ( int i2 = 0; i2 < dim; i2++ )
        {
            r[i1][i2] = 0;
        }
        r[i1][i1] = a[i1];
    }
}

static float matrixDifference(
    const float a[][MAX_MATRIX],
    const float b[][MAX_MATRIX],
    const int dim1,
    const int dim2 )
{
    float r = 0.f;

    for ( int i1 = 0; i1 < dim1; i1++ )
    {
        for ( int i2 = 0; i2 < dim2; i2++ )
        {
            if ( a[i1][i2] != 0.f )
            {
                r += fabsf( ( b[i1][i2] - a[i1][i2] ) / a[i1][i2] );
            }
            else
            {
                r += fabsf( b[i1][i2] - a[i1][i2] );
            }
        }
    }

    return r / (float) ( dim1 * dim2 );
}

static void matrixPrint(
    const float a[][MAX_MATRIX],
    const int dim1,
    const int dim2,
    const char *name )
{
    printf( "Matrix %s[%d][%d] = \n", name, dim1, dim2 );
    for ( int i1 = 0; i1 < dim1; i1++ )
    {
        for ( int i2 = 0; i2 < dim2; i2++ )
        {
            printf( "%.10e, ", a[i1][i2] );
        }
        printf( "\n" );
    }
}

static float matrixTestIdentity(
    const float a[][MAX_MATRIX],
    const int dim )
{
    float r = 0.f;

    for ( int i1 = 0; i1 < dim; i1++ )
    {
        for ( int i2 = 0; i2 < dim; i2++ )
        {
            if ( i1 == i2 )
            {
                r += fabsf( 1.f - a[i1][i2] );
            }
            else
            {
                r += fabsf( 0.f - a[i1][i2] );
            }
        }
    }

    return r;
}

#define PROBLEMATIC_THRESHOLD 0.5f
static void svd_accuracy_test_fx(
    Word32 InputMatrixFx[][MAX_OUTPUT_CHANNELS], /* i  : matrix to be decomposed (M)            InputMatrix_e*/
    Word16 InputMatrixFx_e,
    Word32 singularVectors_LeftFx[][MAX_OUTPUT_CHANNELS],  /* o  : left singular vectors (U)			Q31 */
    Word32 singularValuesFx[MAX_OUTPUT_CHANNELS],          /* o  : singular values vector (S)         singularValues_fx_e*/
    Word32 singularVectors_RightFx[][MAX_OUTPUT_CHANNELS], /* o  : right singular vectors (V)			Q31 */
    Word16 singularValuesFx_e[MAX_OUTPUT_CHANNELS],
    const Word16 nChannelsL, /* i  : number of rows in the matrix to be decomposed		Q0*/
    const Word16 nChannelsC  /* i  : number of columns in the matrix to be decomposed	Q0*/
)
{
    float tmp1[MAX_MATRIX][MAX_MATRIX];
    float tmp2[MAX_MATRIX][MAX_MATRIX];
    float tmp3[MAX_MATRIX][MAX_MATRIX];
    float InputMatrix[MAX_MATRIX][MAX_MATRIX];

    Word16 singularValuesFx2_e[MAX_OUTPUT_CHANNELS];

    float singularVectors_Left[MAX_INPUT_CHANNELS][MAX_OUTPUT_CHANNELS];
    float singularValues[MAX_MATRIX];
    float singularValuesMatrix[MAX_MATRIX][MAX_MATRIX];
    float singularVectors_Right[MAX_INPUT_CHANNELS][MAX_OUTPUT_CHANNELS];
    float result;
    int dimSingular;
    int problematic = 0;

    /* Convert to float and Create singular values matrix from signular values vector */
    for ( int x = 0; x < MAX_MATRIX; x++ )
        singularValuesFx2_e[x] = InputMatrixFx_e;
    matrixFx2Fl( InputMatrix, InputMatrixFx, singularValuesFx2_e, nChannelsL, nChannelsC );
    dimSingular = min( nChannelsL, nChannelsC );
    matrixFx2Fl( &singularValues, (Word32( * )[MAX_MATRIX]) singularValuesFx, singularValuesFx_e, 1, nChannelsC );
    for ( int x = 0; x < MAX_MATRIX; x++ )
        singularValuesFx2_e[x] = 0;
    matrixFx2Fl( singularVectors_Left, singularVectors_LeftFx, singularValuesFx2_e, nChannelsL, nChannelsC );
    matrixFx2Fl( singularVectors_Right, singularVectors_RightFx, singularValuesFx2_e, nChannelsC, nChannelsC );
    matrixDiagonal( singularValuesMatrix, singularValues, dimSingular ); /* CxC */

#ifdef MORE_DEBUG
    matrixPrint( InputMatrix, nChannelsL, nChannelsC, "A" );
    printf( "Result of svd() \n" );
    matrixPrint( singularVectors_Left, nChannelsL, nChannelsC, "U" );
    matrixPrint( singularValuesMatrix, nChannelsC, nChannelsC, "S" );
    matrixPrint( singularVectors_Right, nChannelsC, nChannelsC, "V" );
#endif

    printf( "\nResult quality tests\n\n" );

    /* Test U' * U == I */
    matrixTranspose( tmp1, singularVectors_Left, nChannelsL, nChannelsC );                             /* CxL */
    matrixProduct( tmp2, tmp1, singularVectors_Left, nChannelsC, nChannelsL, nChannelsL, nChannelsC ); /* CxC */
    result = matrixTestIdentity( tmp2, nChannelsC );
    if ( result >= PROBLEMATIC_THRESHOLD )
    {
        problematic = 1;
    }
#ifdef MORE_DEBUG
    matrixPrint( tmp2, nChannelsC, nChannelsC, "U\'*U" );
#endif
    printf( "U' * U difference to I is %f\n", result );

    /* Test V * V' == I */
    matrixTranspose( tmp1, singularVectors_Right, nChannelsC, nChannelsC );                             /* CxC */
    matrixProduct( tmp2, singularVectors_Right, tmp1, nChannelsC, nChannelsC, nChannelsC, nChannelsC ); /* CxC */
    result = matrixTestIdentity( tmp2, nChannelsC );
    if ( result >= PROBLEMATIC_THRESHOLD )
    {
        problematic = 1;
    }
#ifdef MORE_DEBUG
    matrixPrint( tmp2, nChannelsC, nChannelsC, "V*V\'" );
#endif
    printf( "V * V' difference to I is %f\n", result );

    /* Test InputMatrix == U * S * V' */
    matrixProduct( tmp1, singularVectors_Left, singularValuesMatrix, nChannelsL, nChannelsC, dimSingular, dimSingular ); /* LxC */
    matrixTranspose( tmp3, singularVectors_Right, nChannelsC, nChannelsC );                                              /* CxC */
    matrixProduct( tmp2, tmp1, tmp3, nChannelsL, dimSingular, nChannelsC, nChannelsC );                                  /* LxC */
    result = matrixDifference( tmp2, InputMatrix, nChannelsL, nChannelsC );
    if ( result >= PROBLEMATIC_THRESHOLD )
    {
        problematic = 1;
    }
#ifdef MORE_DEBUG
    matrixPrint( tmp2, nChannelsL, nChannelsC, "U*S*V\'" );
#endif
    printf( "U * S * V' difference to M is %f\n", result );

    if ( problematic )
    {
        matrixPrint( InputMatrix, nChannelsL, nChannelsC, "Problematic Input" );
    }
}
#endif

/*-------------------------------------------------------------------------
 * svd()
 *
@@ -615,24 +331,9 @@ Word16 svd_fx(
    Word16 temp_fx_e;
    push_wmops( "svd_fx" );

#ifdef MORE_DEBUG2
    {
        float input[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS];
        Word16 exp_matrix[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS];

        for ( int ii = 0; ii < MAX_OUTPUT_CHANNELS; ii++ )
            for ( int iii = 0; iii < MAX_OUTPUT_CHANNELS; iii++ )
                exp_matrix[ii][iii] = InputMatrix_e;

        matrixFx2Fl2( input, InputMatrix, exp_matrix, nChannelsL, nChannelsC );
        matrixPrint2( input, nChannelsL, nChannelsC, "  input  " );
    }
#endif

#ifndef FIX_1010_OPT_SINGLE_RESCALE
    set32_fx( secDiag_fx, 0, MAX_OUTPUT_CHANNELS );
    set16_fx( secDiag_fx_e, 0, MAX_OUTPUT_CHANNELS );

    set16_fx( singularValues_fx_e, 0, MAX_OUTPUT_CHANNELS );
#endif

@@ -714,17 +415,6 @@ Word16 svd_fx(
    WHILE( EQ_16( condition, 1 ) );

    pop_wmops();
#ifdef DEBUG_SVD_PRECISION
    svd_accuracy_test_fx(
        InputMatrix,
        InputMatrix_e,
        singularVectors_Left_fx,
        singularValues_fx,
        singularVectors_Right_fx,
        singularValues_fx_e,
        nChannelsL,
        nChannelsC );
#endif
    return ( errorMessage );
}

@@ -1323,18 +1013,6 @@ static void HouseholderReduction_fx(
        }
    }

#ifdef MORE_DEBUG2
    {
        float singularVectors_Left[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS];
        float secDiag[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS];

        matrixFx2Fl2( singularVectors_Left, singularVectors_Left_fx, singularVectors_Left_fx_e, nChannelsL, nChannelsC );
        matrixFx2Fl2( secDiag, (Word32( * )[MAX_OUTPUT_CHANNELS]) secDiag_fx, (Word16( * )[MAX_OUTPUT_CHANNELS]) secDiag_fx_e, 1, nChannelsC );
        matrixPrint2( singularVectors_Left, nChannelsL, nChannelsC, "left" );
        matrixPrint2( secDiag, 1, nChannelsC, "secDiag" );
    }
#endif

    /* SingularVecotr Accumulation */
#ifndef FIX_1010_OPT_SINGLE_RESCALE
    singularVectorsAccumulationRight_fx( singularVectors_Left_fx, singularVectors_Right_fx, secDiag_fx, singularVectors_Left_e, *secDiag_fx_e, nChannelsC );
@@ -1346,36 +1024,10 @@ static void HouseholderReduction_fx(
    singularVectorsAccumulationRight_fx( singularVectors_Left_fx, singularVectors_Right_fx, secDiag_fx, singularVectors_Left_fx_e, secDiag_fx_e, nChannelsC );
#endif

#ifdef MORE_DEBUG2
    {
        float singularVectors_Right[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS];
        Word16 singularVectors_Left_fx_e[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS];

        for ( int ii = 0; ii < MAX_OUTPUT_CHANNELS; ii++ )
            for ( int iii = 0; iii < MAX_OUTPUT_CHANNELS; iii++ )
                singularVectors_Left_fx_e[ii][iii] = 0;

        matrixFx2Fl2( singularVectors_Right, singularVectors_Right_fx, singularVectors_Left_fx_e, nChannelsC, nChannelsC );
        matrixPrint2( singularVectors_Right, nChannelsC, nChannelsC, "right2" );
    }
#endif

    singularVectorsAccumulationLeft_fx( singularVectors_Left_fx, singularValues_fx, singularVectors_Left_fx_e, singularValues_fx_e, nChannelsL, nChannelsC );
#endif

#ifdef MORE_DEBUG2
    {
        float singularVectors_Left[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS];
        Word16 singularVectors_Left_fx_e[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS];

        for ( int ii = 0; ii < MAX_OUTPUT_CHANNELS; ii++ )
            for ( int iii = 0; iii < MAX_OUTPUT_CHANNELS; iii++ )
                singularVectors_Left_fx_e[ii][iii] = 0;

        matrixFx2Fl2( singularVectors_Left, singularVectors_Left_fx, singularVectors_Left_fx_e, nChannelsL, nChannelsC );
        matrixPrint2( singularVectors_Left, nChannelsL, nChannelsC, "left2" );
    }
#endif
    return;
}

@@ -1385,41 +1037,6 @@ static void HouseholderReduction_fx(
 *
 *-------------------------------------------------------------------------*/

#ifdef FIX_1010_OPT_INV_USING_INVSQRT
static Word32 BASOP_Util_Inv32( Word32 x, Word16 *px_e )
{
    Word16 sign, shift, shift2;

    sign = 0;
    move16();
    if ( x < 0 )
    {
        sign = 1;
    }
    if ( sign )
    {
        x = L_negate( x );
    }

    shift = norm_l( x );
    x = L_shl( x, shift );
    *px_e = 0;
    move16();
    x = ISqrt32norm( x, px_e );
    x = Mpy_32_32( x, x );
    shift2 = norm_l( x );
    x = L_shl( x, shift2 );
    *px_e = add( shl( *px_e, 1 ), sub( shift, shift2 ) );
    move16();

    if ( sign )
    {
        x = L_negate( x );
    }
    return x;
}
#endif

static void biDiagonalReductionLeft_fx(
    Word32 singularVectors[][MAX_OUTPUT_CHANNELS], /* exp(singularVectors_e) */
    Word32 singularValues[MAX_OUTPUT_CHANNELS],    /* exp(singularValues_e) */
@@ -1477,6 +1094,7 @@ ELSE IF( LT_16( *sig_x_e, *secDiag_e ) )
    secDiag_e[currChannel] = *sig_x_e;
    move16();
#endif

/* Setting values to 0 */
( *sig_x ) = 0;
move32();
@@ -1502,11 +1120,7 @@ IF( LT_16( currChannel, nChannelsL ) ) /* i <= m */
#ifdef FIX_1010_OPT_DIV
        Word16 invVal_e;
        Word32 invVal;
#ifdef FIX_1010_OPT_INV_USING_INVSQRT
        invVal = BASOP_Util_Inv32( maxWithSign_fx( *sig_x ), &invVal_e );
#else
        invVal = BASOP_Util_Divide3232_Scale_cadence( MAXVAL_WORD32, maxWithSign_fx( *sig_x ), &invVal_e );
#endif
#endif
        norm_x = 0;
        move32();
@@ -1576,11 +1190,7 @@ IF( LT_16( currChannel, nChannelsL ) ) /* i <= m */
        move32();

#ifdef FIX_1010_OPT_DIV
#ifdef FIX_1010_OPT_INV_USING_INVSQRT
        invVal = BASOP_Util_Inv32( maxWithSign_fx( r ), &invVal_e );
#else
        invVal = BASOP_Util_Divide3232_Scale_cadence( MAXVAL_WORD32, maxWithSign_fx( r ), &invVal_e );
#endif
#endif

        FOR( iCh = currChannel + 1; iCh < nChannelsC; iCh++ ) /* nChannelsC */
@@ -1673,7 +1283,7 @@ return;

static void biDiagonalReductionRight_fx(
    Word32 singularVectors[][MAX_OUTPUT_CHANNELS], /* exp(singularVectors_e) */
    Word32 secDiag[MAX_OUTPUT_CHANNELS],           /* exp(secDiag_e) */
    Word32 secDiag[MAX_OUTPUT_CHANNELS],           /* exp(secDiag_exp[]) */
#ifndef FIX_1010_OPT_SINGLE_RESCALE
    Word16 *singularVectors_e,
#else
@@ -1741,11 +1351,7 @@ static void biDiagonalReductionRight_fx(
#ifdef FIX_1010_OPT_DIV
            Word16 invVal_e, temp_e;
            Word32 invVal;
#ifdef FIX_1010_OPT_INV_USING_INVSQRT
            invVal = BASOP_Util_Inv32( maxWithSign_fx( *sig_x ), &invVal_e );
#else
            invVal = BASOP_Util_Divide3232_Scale_cadence( MAXVAL_WORD32, maxWithSign_fx( *sig_x ), &invVal_e );
#endif
#endif
            FOR( jCh = idx; jCh < nChannelsC; jCh++ ) /*nChannelsC */
            {
@@ -1810,11 +1416,7 @@ static void biDiagonalReductionRight_fx(
            move32();

#ifdef FIX_1010_OPT_DIV
#ifdef FIX_1010_OPT_INV_USING_INVSQRT
            invVal = BASOP_Util_Inv32( maxWithSign_fx( r ), &invVal_e );
#else
            invVal = BASOP_Util_Divide3232_Scale_cadence( MAXVAL_WORD32, maxWithSign_fx( r ), &invVal_e );
#endif
#endif

            FOR( jCh = idx; jCh < nChannelsC; jCh++ ) /* nChannelsC */
@@ -1898,6 +1500,8 @@ static void biDiagonalReductionRight_fx(
            *secDiag_e = exp_max;
            move16();
#endif


#ifndef FIX_1010_OPT_SINGLE_RESCALE
            exp_max = *singularVectors_e;
            move16();
@@ -1976,11 +1580,7 @@ static void singularVectorsAccumulationLeft_fx(
        IF( t_ii ) /*if (fabsf(t_ii) > EPSILON *fabsf(t_ii)) {*/
        {
#ifdef FIX_1010_OPT_DIV
#ifdef FIX_1010_OPT_INV_USING_INVSQRT
            t_ii = BASOP_Util_Inv32( maxWithSign_fx( t_ii ), &temp_exp );
#else
            t_ii = BASOP_Util_Divide3232_Scale_cadence( MAXVAL_WORD32, maxWithSign_fx( t_ii ), &temp_exp );
#endif
            t_ii_e = sub( temp_exp, t_ii_e );
#else
            t_ii = BASOP_Util_Divide3232_Scale_cadence( ONE_IN_Q30, maxWithSign_fx( t_ii ), &temp_exp ); /* exp(1 + (temp_exp + tii_e)) */
@@ -2001,22 +1601,11 @@ static void singularVectorsAccumulationLeft_fx(
                    norm_y = BASOP_Util_Add_Mant32Exp( norm_y, norm_y_e, Mpy_32_32( singularVectors_Left[k][nCh], singularVectors_Left[k][iCh] ), add( singularVectors_Left_e[k][nCh], singularVectors_Left_e[k][iCh] ), &norm_y_e ); /* exp(norm_y_e) */
#endif
                }
#ifdef FIX_1010_OPT_INV_USING_INVSQRT
                Word16 temp_e = norm_l( singularVectors_Left[nCh][nCh] );
                t_jj = BASOP_Util_Inv32( maxWithSign_fx( L_shl( singularVectors_Left[nCh][nCh], temp_e ) ), &temp_exp );
                t_jj = Mpy_32_32( Mpy_32_32( t_ii, norm_y ), t_jj );
#ifndef FIX_1010_OPT_SINGLE_RESCALE
                t_jj_e = add( add( temp_exp, temp_e ), sub( add( t_ii_e, norm_y_e ), sing_exp2[nCh][nCh] ) );
#else
                t_jj_e = add( add( temp_exp, temp_e ), sub( add( t_ii_e, norm_y_e ), singularVectors_Left_e[nCh][nCh] ) );
#endif
#else
                t_jj = BASOP_Util_Divide3232_Scale_cadence( Mpy_32_32( t_ii, norm_y ), maxWithSign_fx( singularVectors_Left[nCh][nCh] ), &temp_exp );                                                                                 // t_ii_e+norm_y_e-*singularVectors_e,
#ifndef FIX_1010_OPT_SINGLE_RESCALE
                t_jj_e = add( temp_exp, sub( add( t_ii_e, norm_y_e ), sing_exp2[nCh][nCh] ) );
#else
                t_jj_e = add( temp_exp, sub( add( t_ii_e, norm_y_e ), singularVectors_Left_e[nCh][nCh] ) );
#endif
#endif
                FOR( k = nCh; k < nChannelsL; k++ ) /* nChannelsL */
                {
@@ -2187,59 +1776,6 @@ static void singularVectorsAccumulationRight_fx(
 *
 *-------------------------------------------------------------------------*/

#ifdef FIX_1010_OPT_GIVENS_AMAX_BMIN
#ifndef M_PI
#define M_PI 3.141592653589793
#endif
#define NUM_REGIONS 128
static Word32 alphaBeta[NUM_REGIONS][2];
static void get_alpha_beta( Word32 p, Word16 p_e, Word32 q, Word16 q_e, Word32 *alpha, Word32 *beta )
{
    static int init = 0;

    if ( init == 0 )
    {
        for ( int i = 0; i < NUM_REGIONS; i++ )
        {
            double thetaS, thetaE, thetaM;

            thetaS = M_PI / 4. * (double) i / (double) NUM_REGIONS;
            thetaE = M_PI / 4. * (double) ( i + 1 ) / (double) NUM_REGIONS;
            thetaM = M_PI / 4. * ( (double) i + 0.5 ) / (double) NUM_REGIONS;
            // alphaBeta[i][0] = FL2WORD32(1./(sin(thetaM)*tan((thetaS+thetaE)/2.)+cos(thetaM)));
            // alphaBeta[i][1] = FL2WORD32(1./(sin(thetaM)*tan((thetaS+thetaE)/2.)+cos(thetaM)) * tan((thetaS+thetaE)/2.));
            alphaBeta[i][0] = FL2WORD32( 2. / ( ( ( sin( thetaM ) + sin( thetaS ) ) * tan( ( thetaS + thetaE ) / 2. ) ) + cos( thetaM ) + cos( thetaS ) ) );
            alphaBeta[i][1] = FL2WORD32( 2. / ( ( ( sin( thetaM ) + sin( thetaS ) ) * tan( ( thetaS + thetaE ) / 2. ) ) + cos( thetaM ) + cos( thetaS ) ) * tan( ( thetaS + thetaE ) / 2. ) );
        }
        init = 1;
    }
    Word16 r, shift;
#if 0
    float pf, qf;
    pf = (float)p * powf(2.f, p_e-31);
    qf = (float)q * powf(2.f, q_e-31);
    r = floor((double)NUM_REGIONS * 4. * atan2f(qf, pf)/M_PI);
    if (r >= NUM_REGIONS) {
        r =  NUM_REGIONS-1;
    }
#elif 1
    shift = sub( norm_l( q ), 1 );
    q = L_shl( q, shift );
    q_e = sub( q_e, shift );
    shift = norm_l( p );
    p = L_shl( p, shift );
    p_e = sub( p_e, shift );
    shift = sub( q_e, p_e );
    r = shl_sat( div_s( extract_h( q ), s_max( 1, extract_h( p ) ) ), shift );
    /* Second order polyfit of atan(r)/(pi/4) for r=0..1 */
    r = add( add( mult( mult( r, r ), FL2WORD16_SCALE( -3.672563685340096e-01, 3 ) ), mult( r, FL2WORD16_SCALE( 1.375369641423651e+00, 3 ) ) ), FL2WORD16_SCALE( -6.529424378422714e-03, 3 ) );
    r = s_min( s_max( 0, shr( r, WORD16_BITS - 1 - 7 - 3 ) ), NUM_REGIONS - 1 );
#endif
    assert( ( r >= 0 ) && ( r < NUM_REGIONS ) );
    *alpha = alphaBeta[r][0];
    *beta = alphaBeta[r][1];
}
#endif

#ifdef FIX_1010_OPT_GIVENS_INV
static void GivensRotation2_fx(
@@ -2253,35 +1789,6 @@ static void GivensRotation2_fx(
    Word16 *outInv_e )
{
    Word32 r;
#ifdef FIX_1010_OPT_GIVENS_AMAX_BMIN
    Word32 az, ax, a, b;

    ax = L_abs( x );
    az = L_abs( z );
    IF( BASOP_Util_Cmp_Mant32Exp( ax, x_e, az, z_e ) > 0 )
    {
        get_alpha_beta( ax, x_e, az, z_e, &a, &b );
        r = BASOP_Util_Add_Mant32Exp( Mpy_32_32( ax, a ), x_e, Mpy_32_32( az, b ), z_e, out_e );
    }
    ELSE
    {
        get_alpha_beta( az, z_e, ax, x_e, &a, &b );
        r = BASOP_Util_Add_Mant32Exp( Mpy_32_32( az, a ), z_e, Mpy_32_32( ax, b ), x_e, out_e );
    }
    *result = r;
    move32();
#if 1
    *outInv_e = shl( *out_e, 1 );
    *resultInv = ISqrt32( L_max( 1, Mpy_32_32( r, r ) ), outInv_e );
    move32();
#else
    *resultInv = L_deposit_h( BASOP_Util_Divide3232_Scale( MAX_32, r, outInv_e ) );
    move32();
    *outInv_e = sub( *outInv_e, *out_e );
    move16();
#endif

#else
    r = BASOP_Util_Add_Mant32Exp( Mpy_32_32( z, z ), shl( z_e, 1 ), Mpy_32_32( x, x ), shl( x_e, 1 ), out_e );
    r = L_max( r, 1 );
    *outInv_e = *out_e;
@@ -2291,7 +1798,6 @@ static void GivensRotation2_fx(

    *resultInv = ISqrt32( r, outInv_e );
    move32();
#endif
}
#endif

@@ -2312,25 +1818,8 @@ static Word32 GivensRotation_fx(
#endif

#ifdef FIX_1010_OPT_GIVENS
#ifdef FIX_1010_OPT_GIVENS_AMAX_BMIN
    Word32 az, ax, a, b;

    ax = L_abs( x );
    az = L_abs( z );
    IF( BASOP_Util_Cmp_Mant32Exp( ax, x_e, az, z_e ) > 0 )
    {
        get_alpha_beta( ax, x_e, az, z_e, &a, &b );
        r = BASOP_Util_Add_Mant32Exp( Mpy_32_32( ax, a ), x_e, Mpy_32_32( az, b ), z_e, out_e );
    }
    ELSE
    {
        get_alpha_beta( az, z_e, ax, x_e, &a, &b );
        r = BASOP_Util_Add_Mant32Exp( Mpy_32_32( az, a ), z_e, Mpy_32_32( ax, b ), x_e, out_e );
    }
#else
    r = BASOP_Util_Add_Mant32Exp( Mpy_32_32( z, z ), shl( z_e, 1 ), Mpy_32_32( x, x ), shl( x_e, 1 ), out_e );
    r = Sqrt32( r, out_e );
#endif
#else
    x_abs = L_abs( x );
    z_abs = L_abs( z );