Commit 5aaf2643 authored by Mohammadreza Naghibzadeh's avatar Mohammadreza Naghibzadeh
Browse files

Rewrite biDiagonalReductionLeft_fx() and biDiagonalReductionRight_fx()...

Rewrite biDiagonalReductionLeft_fx() and biDiagonalReductionRight_fx() according to optimized float version.
parent 6ccf3303
Loading
Loading
Loading
Loading
Loading
+161 −189
Original line number Diff line number Diff line
@@ -65,16 +65,19 @@ static void HouseholderReduction_fx(
    Word32 *eps_x_fx,        /* exp(eps_x_fx_e) */
    Word16 *eps_x_fx_e );
#ifdef MERGE_REQUEST_1926_SPEEDUP_ivas_svd_dec_fx_NONBE
static void biDiagonalReductionLeft_64(
    Word64 singularVectors_Left_64[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS],
    const Word16 nChannelsL,  /* Q0 */

static void biDiagonalReductionLeft_fx(
    Word32 singularVectors[][MAX_OUTPUT_CHANNELS],  /* exp(singularVectors_e) */
    Word16 singularValues_e[][MAX_OUTPUT_CHANNELS], /* Q0 */
    const Word16 nChannelsL,
    const Word16 nChannelsC,  /* Q0 */
    const Word16 currChannel, /* Q0 */
    Word32 *g,                /* Q31 */
    Word32 *g,
    Word16 *g_e );

static void biDiagonalReductionRight_64(
    Word64 singularVectors_Left_64[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS],
static void biDiagonalReductionRight_fx(
    Word32 singularVectors[][MAX_OUTPUT_CHANNELS], /* exp(singularVectors_e) */
    Word16 singularVectors_e[][MAX_OUTPUT_CHANNELS],
    const Word16 nChannelsL,  /* Q0 */
    const Word16 nChannelsC,  /* Q0 */
    const Word16 currChannel, /* Q0 */
@@ -840,7 +843,6 @@ static void HouseholderReduction_fx(
    Word16 nCh;
#ifdef MERGE_REQUEST_1926_SPEEDUP_ivas_svd_dec_fx_NONBE

    Word64 singularVectors_Left_64[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS];
    Word32 g_left_fx = 0;
    Word16 g_left_e = 0;
    move32();
@@ -862,40 +864,48 @@ static void HouseholderReduction_fx(

    Word16 iCh, jCh;
    Word16 singularVectors_Left_fx_e[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS];

#ifdef MERGE_REQUEST_1926_SPEEDUP_ivas_svd_dec_fx_NONBE

    FOR( jCh = 0; jCh < nChannelsL; jCh++ )
    {
        FOR( iCh = 0; iCh < nChannelsC; iCh++ )
        {
            singularVectors_Left_64[jCh][iCh] = W_shr( W_deposit32_h( singularVectors_Left_fx[jCh][iCh] ), 32 );
            singularVectors_Left_fx_e[jCh][iCh] = singularVectors_Left_e;
            move16();
        }
    }
    
    FOR( nCh = 0; nCh < nChannelsC; nCh++ )
    {
        biDiagonalReductionLeft_64(
            singularVectors_Left_64,
        biDiagonalReductionLeft_fx(
            singularVectors_Left_fx,
            singularVectors_Left_fx_e,
            nChannelsL,
            nChannelsC,
            nCh,
            &g_left_fx,
            &g_left_e );

        singularValues_fx[nCh] = g_left_fx;
        move32();
        singularValues_fx_e[nCh] = add( singularVectors_Left_e, g_left_e );
        singularValues_fx_e[nCh] = g_left_e;

        secDiag_fx[nCh] = g_right_fx; /* from the previous channel */
        move32();
        secDiag_fx_e[nCh] = add( singularVectors_Left_e, g_right_e );
        biDiagonalReductionRight_64(
            singularVectors_Left_64,
        secDiag_fx_e[nCh] = g_right_e;

        biDiagonalReductionRight_fx(
            singularVectors_Left_fx,
            singularVectors_Left_fx_e,
            nChannelsL,
            nChannelsC,
            nCh,
            &g_right_fx,
            &g_right_e );
        {

        Word16 L_temp_e;
            Word32 L_temp;
            L_temp = BASOP_Util_Add_Mant32Exp( L_abs( singularValues_fx[nCh] ), singularValues_fx_e[nCh], L_abs( secDiag_fx[nCh] ), secDiag_fx_e[nCh], &L_temp_e ); /* exp(L_temp_e) */
        Word32 L_temp = BASOP_Util_Add_Mant32Exp( L_abs( singularValues_fx[nCh] ), singularValues_fx_e[nCh], L_abs( secDiag_fx[nCh] ), secDiag_fx_e[nCh], &L_temp_e ); /* exp(L_temp_e) */
        IF( EQ_16( BASOP_Util_Cmp_Mant32Exp( L_temp, L_temp_e, *eps_x_fx, *eps_x_fx_e ), 1 ) )
        {
            *eps_x_fx = L_temp; /* exp(L_temp_e) */
@@ -904,20 +914,7 @@ static void HouseholderReduction_fx(
            move32();
        }
    }
    }
    {
        int i, j;
        for ( j = 0; j < nChannelsL; j++ )
        {
            for ( i = 0; i < nChannelsC; i++ )
            {
                Word16 n;
                n = W_norm( singularVectors_Left_64[j][i] );
                singularVectors_Left_fx[j][i] = W_extract_h( W_shl( singularVectors_Left_64[j][i], n ) );
                singularVectors_Left_fx_e[j][i] = sub( add( 32, singularVectors_Left_e ), n );
            }
        }
    }

#else

    FOR( jCh = 0; jCh < nChannelsL; jCh++ )
@@ -962,222 +959,197 @@ static void HouseholderReduction_fx(
 *
 *
 *-------------------------------------------------------------------------*/

static void biDiagonalReductionLeft_64(
    Word64 singularVectors_Left_64[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS], // q(sing)	exp(sing)
    const Word16 nChannelsL,                                                  /* Q0 */
static void biDiagonalReductionLeft_fx(
    Word32 singularVectors[][MAX_OUTPUT_CHANNELS],                            /* exp(singularVectors_e) */
    Word16 singularVectors_e[][MAX_OUTPUT_CHANNELS], /* Q0 */
    const Word16 nChannelsL,
    const Word16 nChannelsC,                                                  /* Q0 */
    const Word16 currChannel,                                                 /* Q0 */
    Word32 *g,
    Word16 *g_e )
{
/* TODO: For some reason, this is optimal. But why? why not ( 32 - 2 * MAGIC_HEADROOM_1 - norm_x_e0 + 1 ) , for example? */
#define MAGIC_HEADROOM_1 2
//#define MAGIC_HEADROOM_2 ( sub( 16, shr( norm_x_e0, 2 ) ) )
//#define MAGIC_HEADROOM_3 ( sub( 16, shr( norm_x_e0, 2 ) ) )
//#define MAGIC_HEADROOM_4 ( sub( 16, shr( norm_x_e0, 2 ) ) )
#define MAGIC_HEADROOM_2 magic_headroom
#define MAGIC_HEADROOM_3 magic_headroom
#define MAGIC_HEADROOM_4 magic_headroom

    Word16 iCh, jCh;
    Word32 norm_x;
    Word16 norm_x_e;
    Word64 norm_64;
    Word16 magic_headroom;
    Word32 norm_x, f, r;
    Word16 norm_x_e, f_e, r_e;
    Word32 L_temp;
    Word16 L_temp_e;

    /* Setting values to 0 */
    ( *g ) = 0;
    ( *g_e ) = 0;
    move32();
    move16();
    norm_x = 0;
    move32();

    IF( LT_16( currChannel, nChannelsL ) ) /* i <= m */
    {

        Word32 tmp;
        norm_64 = 0;
        Word64 temp = 0;
        move64();
        norm_x = 0;
        norm_x_e = 0;
        Word16 max_e = MIN_16;
        move16();
        FOR( jCh = currChannel; jCh < nChannelsL; jCh++ ) /* nChannelsL */
        {
            tmp = W_extract_l( W_shr( singularVectors_Left_64[jCh][currChannel], MAGIC_HEADROOM_1 ) ); // q(sing)-H1			// exp(sing)+H1
            norm_64 = W_add( norm_64, W_mult0_32_32( tmp, tmp ) );                                     // q(norm)=2*q(sing)-2*H1	// exp(norm)=2*exp(sing)+2*H1
        }
        norm_x_e = W_norm( norm_64 );
        magic_headroom = sub( 16, shr( norm_x_e, 2 ) );
        norm_x = W_extract_h( W_shl( norm_64, norm_x_e ) ); // q(norm_x)=32-exp(norm_x)	exp(norm_x)=exp(norm)-32
            max_e = s_max( max_e, singularVectors_e[jCh][currChannel] );
        }
    IF( norm_x )

        FOR( jCh = currChannel; jCh < nChannelsL; jCh++ ) /* nChannelsL */
        {
        Word32 factor2;
        Word16 tmp_e;
        Word64 tmpmul;
            temp = W_add( temp, L_shr( Mpy_32_32( singularVectors[jCh][currChannel], singularVectors[jCh][currChannel] ), shl( sub( max_e, singularVectors_e[jCh][currChannel] ), 1 ) ) );
        }

        Word64 r_64;
        Word32 r, invVal;
        Word16 r_e, invVal_e;
        Word16 nrm = W_norm( temp );
        nrm = sub( nrm, 32 );
        norm_x = W_shl_sat_l( temp, nrm );
        norm_x_e = sub( add( max_e, max_e ), nrm );

        ( *g_e ) = add( sub( add( MAGIC_HEADROOM_1, MAGIC_HEADROOM_1 ), norm_x_e ), 1 ); // exp(g)=(2*H1-exp(norm_x)+1)
        IF( ( norm_x ) ) /*(fabsf(*sig_x) > EPSILON * fabsf(*sig_x)) { */
        {
            Word16 invVal_e;
            Word32 invVal;

            L_temp_e = norm_x_e;
            move16();
        ( *g ) = Sqrt32( norm_x, g_e ); // --> exp(g)=((2*H1-exp(norm_x)+1)/2)
        IF( GE_64( singularVectors_Left_64[currChannel][currChannel], 0 ) )
            if (0)
                L_temp = ISqrt32( norm_x, &L_temp_e );
            else
                L_temp = Sqrt32( norm_x, &L_temp_e );
            //( *g ) = L_negate( GE_32( singularVectors[currChannel][idx], 0 ) ? L_temp : L_negate( L_temp ) );
            if ( singularVectors[currChannel][currChannel] >= 0 )
            {
            ( *g ) = L_negate( *g );
                L_temp = L_negate( L_temp );
            }
        factor2 = W_extract_l( W_shr( singularVectors_Left_64[currChannel][currChannel], MAGIC_HEADROOM_2 ) ); // q(factor2)=q(sing)-H2	exp(factor2)=exp(qsing)+H2
        tmp_e = sub( 2 * MAGIC_HEADROOM_1 - MAGIC_HEADROOM_2, ( *g_e ) );
        tmpmul = W_mult0_32_32( ( *g ), factor2 ); // q(tmpmul)=q(g)+q(factor2) --> q(tmpmul) ~= q(norm)
        tmpmul = W_shr( tmpmul, tmp_e );           // --> q(tmpmul)=q(g)+q(factor2)-(2*H1-H2-q(g))
        r_64 = W_sub( tmpmul, norm_64 );           // q(r_64)=max(q(tmpmul),q(norm))
        r_e = W_norm( r_64 );
        r = W_extract_h( W_shl( r_64, r_e ) );
            ( *g ) = L_temp;
            move32();
            *g_e = L_temp_e;
            r = BASOP_Util_Add_Mant32Exp( Mpy_32_32( ( *g ), singularVectors[currChannel][currChannel] ), singularVectors_e[currChannel][currChannel] + L_temp_e, -norm_x, norm_x_e, &r_e );                                          /* exp(r_e) */
            singularVectors[currChannel][currChannel] = BASOP_Util_Add_Mant32Exp( singularVectors[currChannel][currChannel], singularVectors_e[currChannel][currChannel], -( *g ), *g_e, &singularVectors_e[currChannel][currChannel] ); /* sing_exp */
            move32();
            invVal = BASOP_Util_Divide3232_Scale_newton( MAXVAL_WORD32, maxWithSign_fx( r ), &invVal_e );

        invVal_e = 0;
            FOR( iCh = currChannel + 1; iCh < nChannelsC; iCh++ ) /* nChannelsC */
            {
                Word16 max2_e = MIN_16;
                max_e = MIN_16;
                move16();
        invVal = BASOP_Util_Divide3232_Scale_newton( MAXVAL_WORD32, maxWithSign_fx( r ), &invVal_e ); // invVal=1/r --> q(invVal)=-q(r)


        tmp_e = sub( 32, *g_e );
        singularVectors_Left_64[currChannel][currChannel] = W_sub( singularVectors_Left_64[currChannel][currChannel], W_shr( W_deposit32_h( *g ), tmp_e ) ); // q(sing)=max(q(sing),q(r)-(2*H1-H2-exp(r))
                move16();
                temp = 0;
                move64();

        FOR( iCh = add( currChannel, 1 ); iCh < nChannelsC; iCh++ )
                FOR( jCh = currChannel; jCh < nChannelsL; jCh++ ) /* nChannelsL */
                {
            Word32 factor1;
            Word32 factor2;
            Word32 f; // = norm / r
            Word16 magic_shift;
                    max_e = s_max( max_e, singularVectors_e[jCh][currChannel] ); /* exp(norm_x_e) */
                    max2_e = s_max( max2_e, singularVectors_e[jCh][iCh] );       /* exp(norm_x_e) */
                }
                max_e = add( max_e, max2_e );

            norm_64 = 0;
            for ( jCh = currChannel; jCh < nChannelsL; jCh++ )
                FOR( jCh = currChannel; jCh < nChannelsL; jCh++ ) /* nChannelsL */
                {
                factor1 = W_extract_h( W_shl( singularVectors_Left_64[jCh][currChannel], sub( 32, MAGIC_HEADROOM_3 ) ) ); // q(factor1) = q(sing)-H3
                factor2 = W_extract_h( W_shl( singularVectors_Left_64[jCh][iCh], sub( 32, MAGIC_HEADROOM_3 ) ) );         // q(factor2) = q(sing)-H3
                norm_64 = W_add( norm_64, W_mult0_32_32( factor1, factor2 ) );                                            // q(norm)=2*q(sing)-2*H3
                    temp = W_add( temp, L_shr( Mpy_32_32( singularVectors[jCh][currChannel], singularVectors[jCh][iCh] ), sub( max_e, add( singularVectors_e[jCh][currChannel], singularVectors_e[jCh][iCh] ) ) ) );
                }
            norm_x_e = W_norm( norm_64 );
            norm_x = W_extract_h( W_shl( norm_64, norm_x_e ) ); // Note: different norm
            f = Mpy_32_32( norm_x, invVal );                    // q(f)=q(norm_x)-q(invVal)
                                                                //            magic_shift = ( norm_x_e - 2 * MAGIC_HEADROOM_3 ) - ( r_e - 2 * MAGIC_HEADROOM_1 ) + ( 32 - MAGIC_HEADROOM_4 ) - 2 * invVal_e;
            magic_shift = sub( norm_x_e, shl( MAGIC_HEADROOM_3, 1 ) );
            magic_shift = sub( magic_shift, sub( r_e, ( shl( MAGIC_HEADROOM_1, 1 ) ) ) );
            magic_shift = add( magic_shift, sub( 32, MAGIC_HEADROOM_4 ) );
            magic_shift = sub( magic_shift, shl( invVal_e, 1 ) );
            FOR( jCh = currChannel; jCh < nChannelsL; jCh++ )
                Word16 nrm = W_norm( temp );
                nrm = sub( nrm, 32 );
                norm_x = W_shl_sat_l( temp, nrm );
                norm_x_e = sub( max_e, nrm );

                f = Mpy_32_32( norm_x, invVal ); /* invVal_e + (norm_x_e - r_e) */
                f_e = add( invVal_e, sub( norm_x_e, r_e ) );

                FOR( jCh = currChannel; jCh < nChannelsL; jCh++ ) /* nChannelsL */
                {
                factor1 = W_extract_h( W_shl( singularVectors_Left_64[jCh][currChannel], sub( 32, MAGIC_HEADROOM_4 ) ) );
                singularVectors_Left_64[jCh][iCh] = W_add( singularVectors_Left_64[jCh][iCh], W_shr( W_mult0_32_32( f, factor1 ), magic_shift ) );
                    singularVectors[jCh][iCh] = BASOP_Util_Add_Mant32Exp( singularVectors[jCh][iCh], singularVectors_e[jCh][iCh], Mpy_32_32( f, singularVectors[jCh][currChannel] ), add( f_e, singularVectors_e[jCh][currChannel] ), &singularVectors_e[jCh][iCh] );
                    move32();
                }
            }
        }
    }
    return;
}

/*-------------------------------------------------------------------------
 * biDiagonalReductionRight()
 *
 *
 *-------------------------------------------------------------------------*/

static void biDiagonalReductionRight_64(
    Word64 singularVectors_Left_64[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS],
static void biDiagonalReductionRight_fx(
    Word32 singularVectors[][MAX_OUTPUT_CHANNELS], /* exp(singularVectors_e) */
    Word16 singularVectors_e[][MAX_OUTPUT_CHANNELS],
    const Word16 nChannelsL,  /* Q0 */
    const Word16 nChannelsC,  /* Q0 */
    const Word16 currChannel, /* Q0 */
    Word32 *g,                /* Q31 */
    Word16 *g_e )
    Word16 *g_e
)
{
    Word16 iCh, jCh;
    Word32 norm_x;
    Word16 norm_x_e;
    Word64 norm_64;
    Word16 idx;
    Word16 magic_headroom;

    Word16 iCh, jCh, idx;
    Word32 norm_x, r;
    Word16 norm_x_e, r_e;
    Word32 L_temp;
    Word16 L_temp_e;

    /* Setting values to 0 */
    ( *g ) = 0;
    ( *g_e ) = 0;
    move32();
    move16();
    IF( LT_16( currChannel, nChannelsL ) && NE_16( currChannel, sub( nChannelsC, 1 ) ) ) /* i <=m && i !=n */
    {
        norm_64 = 0;
        move64();
        idx = add( currChannel, 1 );
        FOR( jCh = idx; jCh < nChannelsC; jCh++ )
        idx = add( currChannel, 1 ); /* Q0 */

        norm_x = 0;
        move32();
        norm_x_e = 0;
        move16();
        FOR( jCh = idx; jCh < nChannelsC; jCh++ ) /*nChannelsC */
        {
            Word32 tmp;
            tmp = W_extract_l( W_shr( singularVectors_Left_64[currChannel][jCh], MAGIC_HEADROOM_1 ) ); // q(sing)-H1			// exp(sing)+H1
            norm_64 = W_add( norm_64, W_mult0_32_32( tmp, tmp ) );                                     // q(norm)=2*q(sing)-2*H1	// exp(norm)=2*exp(sing)+2*H1
            norm_x = BASOP_Util_Add_Mant32Exp( norm_x, norm_x_e, Mpy_32_32( singularVectors[currChannel][jCh], singularVectors[currChannel][jCh] ), shl( singularVectors_e[currChannel][jCh], 1 ), &norm_x_e ); /* exp(norm_x_e) */
        }
        norm_x_e = W_norm( norm_64 );
        magic_headroom = sub( 16, shr( norm_x_e, 2 ) );
        norm_x = W_extract_h( W_shl( norm_64, norm_x_e ) ); // q(norm_x)=32-exp(norm_x)	exp(norm_x)=exp(norm)-32
        move16();

        IF( norm_x )
        IF( ( norm_x ) ) /*(fabsf(*sig_x) > EPSILON * fabsf(*sig_x)) { */
        {
            Word32 factor1;
            Word32 factor2;
            Word16 tmp_e;
            Word64 tmpmul;
            Word16 magic_shift;
            Word64 r_64;
            Word32 r;
            Word16 r_e;
            Word32 f;
            Word16 invVal_e, temp_e;
            Word32 invVal;
            Word16 invVal_e;

            ( *g_e ) = add( sub( ( MAGIC_HEADROOM_1 + MAGIC_HEADROOM_1 ), norm_x_e ), 1 ); // exp(g)=(2*H1-exp(norm_x)+1)
            L_temp_e = norm_x_e;
            move16();
            ( *g ) = Sqrt32( norm_x, g_e ); // --> exp(g)=((2*H1-exp(norm_x)+1)/2)
            IF( GE_64( singularVectors_Left_64[currChannel][idx], 0 ) )
            L_temp = Sqrt32( norm_x, &L_temp_e );
            //L_temp = L_shl_r( L_temp, L_temp_e ); // Q31
            IF( singularVectors[currChannel][idx] >= 0 )
            {
                ( *g ) = L_negate( *g );
                ( *g ) = L_negate( L_temp ); /* exp(L_temp_e) */
                move32();
            }
            ELSE
            {
                ( *g ) = L_negate( L_negate( L_temp ) ); /* exp(L_temp_e) */
                move32();
            }
            *g_e = L_temp_e;
            r = BASOP_Util_Add_Mant32Exp( Mpy_32_32( ( *g ), singularVectors[currChannel][idx] ), singularVectors_e[currChannel][idx] + (*g_e), -norm_x, norm_x_e, &r_e );                                  /* exp(r_e) */
            singularVectors[currChannel][idx] = BASOP_Util_Add_Mant32Exp( singularVectors[currChannel][idx], singularVectors_e[currChannel][idx], -( *g ), *g_e, &singularVectors_e[currChannel][idx] ); /* exp(sing_exp) */
            move32();
            move16();
            factor2 = W_extract_l( W_shr( singularVectors_Left_64[currChannel][idx], MAGIC_HEADROOM_2 ) ); // q(factor2)=q(sing)-H2	exp(factor2)=exp(qsing)+H2
            tmp_e = sub( 2 * MAGIC_HEADROOM_1 - MAGIC_HEADROOM_2, *g_e );
            tmpmul = W_mult0_32_32( *g, factor2 ); // q(tmpmul)=q(g)+q(factor2)
            tmpmul = W_shr( tmpmul, tmp_e );       // --> q(tmpmul)=q(g)+q(factor2)-(2*H1-H2-q(g))
            r_64 = W_sub( tmpmul, norm_64 );       // q(r_64)=max(q(tmpmul),q(norm))
            r_e = W_norm( r_64 );
            r = W_extract_h( W_shl( r_64, r_e ) );

            invVal_e = 0;
            move16();
            invVal = BASOP_Util_Divide3232_Scale_newton( MAXVAL_WORD32, maxWithSign_fx( r ), &invVal_e ); // invVal=1/r --> q(invVal)=-q(r)
            tmp_e = sub( 32, *g_e );
            singularVectors_Left_64[currChannel][idx] = W_sub( singularVectors_Left_64[currChannel][idx], W_shr( W_deposit32_h( *g ), tmp_e ) ); // q(sing)=max(q(sing),q(r)-(2*H1-H2-exp(r)))
            invVal = BASOP_Util_Divide3232_Scale_newton( MAXVAL_WORD32, maxWithSign_fx( r ), &invVal_e );

            FOR( iCh = idx; iCh < nChannelsL; iCh++ )
            FOR( iCh = currChannel + 1; iCh < nChannelsL; iCh++ ) /*  nChannelsL */
            {

                norm_64 = 0;
                move64();
                FOR( jCh = idx; jCh < nChannelsC; jCh++ )
                norm_x = 0;
                move32();
                norm_x_e = 0;
                move16();
                FOR( jCh = idx; jCh < nChannelsC; jCh++ ) /* nChannelsC */
                {
                    factor1 = W_extract_h( W_shl( singularVectors_Left_64[iCh][jCh], sub( 32, MAGIC_HEADROOM_3 ) ) );         // q(factor1) = q(sing)-H3
                    factor2 = W_extract_h( W_shl( singularVectors_Left_64[currChannel][jCh], sub( 32, MAGIC_HEADROOM_3 ) ) ); // q(factor2) = q(sing)-H3
                    norm_64 = W_add( norm_64, W_mult0_32_32( factor1, factor2 ) );                                            // q(norm)=2*q(sing)-2*H3
                    norm_x = BASOP_Util_Add_Mant32Exp( norm_x, norm_x_e, Mpy_32_32( singularVectors[iCh][jCh], singularVectors[currChannel][jCh] ), add( singularVectors_e[iCh][jCh], singularVectors_e[currChannel][jCh] ), &norm_x_e ); /* exp(norm_x_e) */
                }

                norm_x_e = W_norm( norm_64 );
                norm_x = W_extract_h( W_shl( norm_64, norm_x_e ) ); // Note: different norm
                f = Mpy_32_32( norm_x, invVal );                    // q(f)=q(norm_x)-q(invVal)
                // magic_shift = ( norm_x_e - 2 * MAGIC_HEADROOM_3 ) - ( r_e - 2 * MAGIC_HEADROOM_1 ) + ( 32 - MAGIC_HEADROOM_4 ) - 2 * invVal_e;
                magic_shift = sub( norm_x_e, shl( MAGIC_HEADROOM_3, 1 ) );
                magic_shift = sub( magic_shift, sub( r_e, ( shl( MAGIC_HEADROOM_1, 1 ) ) ) );
                magic_shift = add( magic_shift, sub( 32, MAGIC_HEADROOM_4 ) );
                magic_shift = sub( magic_shift, shl( invVal_e, 1 ) );
                norm_x = Mpy_32_32( norm_x, invVal ); /* invVal_e + (norm_x_e - r_e) */
                norm_x_e = add( invVal_e, sub( norm_x_e, r_e ) );

                FOR( jCh = idx; jCh < nChannelsC; jCh++ )
                FOR( jCh = idx; jCh < nChannelsC; jCh++ ) /*  nChannelsC */
                {
                    factor2 = W_extract_h( W_shl( singularVectors_Left_64[currChannel][jCh], sub( 32, MAGIC_HEADROOM_4 ) ) );
                    singularVectors_Left_64[iCh][jCh] = W_add( singularVectors_Left_64[iCh][jCh], W_shr( W_mult0_32_32( f, factor2 ), magic_shift ) );
                    singularVectors[iCh][jCh] = BASOP_Util_Add_Mant32Exp( singularVectors[iCh][jCh], singularVectors_e[iCh][jCh], Mpy_32_32( norm_x, singularVectors[currChannel][jCh] ), add( norm_x_e, singularVectors_e[currChannel][jCh] ), &singularVectors_e[iCh][jCh] ); /* exp(sing_exp2) */
                    move32();
                }
            }
        }
    }

    return;
}
#else
/*-------------------------------------------------------------------------