Commit a1cac2d7 authored by thomas dettbarn
Browse files

Brought experiment 5 to the same state as experiment 6.

parent 35dc4d77
Loading
Loading
Loading
Loading
+191 −196
Original line number Diff line number Diff line
@@ -79,16 +79,13 @@ static void biDiagonalReductionLeft_64(
static void biDiagonalReductionRight_64(
    Word64 singularVectors_Left_64[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS],
    Word16 bitwindow,
    Word32 secDiag[MAX_OUTPUT_CHANNELS],    /* exp(singularValues_e) */
    Word16 secDiag_e[MAX_OUTPUT_CHANNELS],
    const Word16 nChannelsL,  /* Q0 */
    const Word16 nChannelsC,  /* Q0 */
    const Word16 currChannel, /* Q0 */
    Word32 *g, /* Q31 */
    Word16 *g_e
);

#endif
#else
static void biDiagonalReductionLeft_fx(
    Word32 singularVectors[][MAX_OUTPUT_CHANNELS], /* exp(singularVectors_e) */
    Word32 singularValues[MAX_OUTPUT_CHANNELS],    /* exp(singularValues_e) */
@@ -116,6 +113,7 @@ static void biDiagonalReductionRight_fx(
    Word16 *sig_x_e,
    Word32 *g /* Q31 */
);            // Q31
#endif

static void singularVectorsAccumulationLeft_fx(
    Word32 singularVectors_Left[][MAX_OUTPUT_CHANNELS], /* exp(singularVectors_e) as Input, Q31 as output */
@@ -345,7 +343,11 @@ Word16 svd_fx(
        move16();
        FOR( iCh = 0; iCh < lengthSingularValues - 1; iCh++ )
        {
#ifdef OPT_MCH_DEC_V1_NBE
            IF( LT_32( L_shl_sat( singularValues_fx[iCh], sub( singularValues_fx_e[iCh], singularValues_fx_e[iCh + 1] ) ), singularValues_fx[iCh + 1] ) )
#else  /* OPT_MCH_DEC_V1_NBE */
            IF( BASOP_Util_Cmp_Mant32Exp( singularValues_fx[iCh], singularValues_fx_e[iCh], singularValues_fx[iCh + 1], singularValues_fx_e[iCh + 1] ) < 0 )
#endif /* OPT_MCH_DEC_V1_NBE */
            {
                condition = 1;
                move16();
@@ -450,14 +452,24 @@ static Word16 BidagonalDiagonalisation_fx(

            FOR( jCh = iCh; jCh >= 0; jCh-- )
            {
                split = sub( jCh, 1 );                                                                                                                         /* Q0 */
#ifdef OPT_MCH_DEC_V1_NBE
                Word16 com_e = s_max( secDiag_new_e[jCh], eps_x_e );
                IF( LE_32( L_shr( L_abs( secDiag_fx[jCh] ), sub( com_e, secDiag_new_e[jCh] ) ), L_shr( Mpy_32_32( CONVERGENCE_FACTOR_FX, eps_x ), sub( com_e, eps_x_e ) ) ) ) /* is secDiag[ch] vanishing compared to eps_x */
#else
                split = sub( jCh, 1 ); /* Q0 */                                                                                                                                  /* OPT_MCH_DEC_V1_NBE */
                IF( LE_16( BASOP_Util_Cmp_Mant32Exp( L_abs( secDiag_fx[jCh] ), secDiag_new_e[jCh], Mpy_32_32( CONVERGENCE_FACTOR_FX, eps_x ), eps_x_e ), 0 ) )                   /* is secDiag[ch] vanishing compared to eps_x */
#endif /* OPT_MCH_DEC_V1_NBE */
                {
                    found_split = 0;
                    move16();
                    BREAK;
                }
#ifdef OPT_MCH_DEC_V1_NBE
                com_e = s_max( singularValues_new_e[jCh - 1], eps_x_e );
                IF( LE_32( L_shr( L_abs( singularValues_fx[jCh - 1] ), sub( com_e, singularValues_new_e[jCh - 1] ) ), L_shr( Mpy_32_32( CONVERGENCE_FACTOR_FX, eps_x ), sub( com_e, eps_x_e ) ) ) ) /* is singularValues[jCh - 1] vanishing compared to eps_x */
#else                                                                                                                                                                                               /* OPT_MCH_DEC_V1_NBE */
                IF( LE_16( BASOP_Util_Cmp_Mant32Exp( L_abs( singularValues_fx[split] ), singularValues_new_e[split], Mpy_32_32( CONVERGENCE_FACTOR_FX, eps_x ), eps_x_e ), 0 ) ) /* is singularValues[split] vanishing compared to eps_x */
#endif                                                                                                                                                                                              /* OPT_MCH_DEC_V1_NBE */
                {
                    BREAK;
                }
@@ -485,14 +497,21 @@ static Word16 BidagonalDiagonalisation_fx(
                move32();
                c_e = 0;
                move16();

#ifdef OPT_MCH_DEC_V1_NBE
                split = sub( jCh, 1 ); /* Q0 */
#endif                                 /* OPT_MCH_DEC_V1_NBE */
                FOR( kCh = jCh; kCh <= iCh; kCh++ )
                {
                    g = Mpy_32_32( s, secDiag_fx[kCh] ); /* exp(s_e + secDiag_new_e) */
                    g_e = add( s_e, secDiag_new_e[kCh] );
                    secDiag_fx[kCh] = Mpy_32_32( c, secDiag_fx[kCh] ); /* exp(c_e + secDiag_new_e) */
                    secDiag_new_e[kCh] = add( c_e, secDiag_new_e[kCh] );
#ifdef OPT_MCH_DEC_V1_NBE
                    Word16 com_e = s_max( g_e, eps_x_e );
                    IF( LE_32( L_shr( L_abs( g ), sub( com_e, g_e ) ), L_shr( Mpy_32_32( CONVERGENCE_FACTOR_FX, eps_x ), sub( com_e, eps_x_e ) ) ) )
#else  /* OPT_MCH_DEC_V1_NBE */
                    IF( LE_16( BASOP_Util_Cmp_Mant32Exp( L_abs( g ), g_e, Mpy_32_32( CONVERGENCE_FACTOR_FX, eps_x ), eps_x_e ), 0 ) )                                            /* is singularValues[split] vanishing compared to eps_x */
#endif /* OPT_MCH_DEC_V1_NBE */
                    {
                        BREAK;
                    }
@@ -857,16 +876,13 @@ static void HouseholderReduction_fx(
    Word16 nCh;
	push_wmops("HouseholderReduction_fx");
#ifdef	MYCHANGES

	Word64 singularVectors_Left_64[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS];
    Word32 g_fx = 0;
    Word16 g_e = 0;
    move32();
    move16();

    Word32 sig_x_fx = 0;
    Word16 sig_x_fx_e = 0;
    move32();
    move16();
#else

    // float g = 0.0f, sig_x = 0.0f;// to be removed
@@ -879,7 +895,6 @@ static void HouseholderReduction_fx(

    Word16 iCh, jCh;
    Word16 singularVectors_Left_fx_e[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS];
	printf("\n");
#ifdef	 MYCHANGES
	push_wmops("HouseholderReduction_fx 64");
    FOR( jCh = 0; jCh < nChannelsL; jCh++ )
@@ -900,52 +915,47 @@ static void HouseholderReduction_fx(
			    nChannelsC,
			    nCh
			    );
		{
			int i,j;
			printf("\ncompare%d, (%d)\x1b[1;32mstart\x1b[0m\n",nCh,singularVectors_Left_e);
			for (i=0;i<nChannelsL;i++)
			{
				printf("compare%d,%d: \x1b[1;32m",nCh,i);
				for (j=0;j<nChannelsC;j++)
				{
					Word16 n;
					n=W_norm(singularVectors_Left_64[i][j]);
					printf("%08X ",W_extract_h(W_shl(singularVectors_Left_64[i][j],n)));
				}
				printf("\x1b[0m\n");
			}
			printf("compare%d \x1b[1;32mend\x1b[0m\n",nCh);
		}

//                bitwindow=nCh+1;
	    singularValues_fx_e[nCh]=add(singularVectors_Left_e,singularValues_fx_e[nCh]);
	    secDiag_fx[nCh]=g_fx;
	    move32();
	    secDiag_fx_e[nCh]=add(singularVectors_Left_e,g_e);
	    bitwindow=2;
	    biDiagonalReductionRight_64(
			    singularVectors_Left_64,bitwindow,
			secDiag_fx,secDiag_fx_e,
			    nChannelsL,
			    nChannelsC,
			    nCh,
			    &g_fx,
			    &g_e	
			    );
		{
        		Word16 L_temp_e;
		        Word32 L_temp;
			L_temp = BASOP_Util_Add_Mant32Exp( L_abs( singularValues_fx[nCh] ), singularValues_fx_e[nCh], L_abs( secDiag_fx[nCh] ), secDiag_fx_e[nCh], &L_temp_e ); /* exp(L_temp_e) */
			IF( EQ_16( BASOP_Util_Cmp_Mant32Exp( L_temp, L_temp_e, *eps_x_fx, *eps_x_fx_e ), 1 ) )
			{
				*eps_x_fx = L_temp; /* exp(L_temp_e) */
				move32();
				*eps_x_fx_e = L_temp_e;
				move32();
			}
		}
    }	
    {
        int i,j;
			printf("\nCOMPARE%d, (%d)\x1b[1;32mstart\x1b[0m\n",nCh,singularVectors_Left_e);
			for (i=0;i<nChannelsL;i++)
        for (j=0;j<nChannelsL;j++)
        {
				printf("COMPARE%d,%d: \x1b[1;32m",nCh,i);
				for (j=0;j<nChannelsC;j++)
            for (i=0;i<nChannelsC;i++)
            {
                Word16 n;
					n=W_norm(singularVectors_Left_64[i][j]);
					printf("%08X<%3d ",W_extract_h(W_shl(singularVectors_Left_64[i][j],n)),n);
				}
				printf("\x1b[0m\n");
                n=W_norm(singularVectors_Left_64[j][i]);
                singularVectors_Left_fx[j][i]=W_extract_h(W_shl(singularVectors_Left_64[j][i],n));
                singularVectors_Left_fx_e[j][i]=sub(add(32,singularVectors_Left_e),n);
	    }
			printf("COMPARE%d \x1b[1;32mend\x1b[0m\n",nCh);
        }
    }
	pop_wmops();
#endif
#else
	push_wmops("HouseholderReduction_fx 32");
    FOR( jCh = 0; jCh < nChannelsL; jCh++ )
    {
@@ -960,35 +970,7 @@ static void HouseholderReduction_fx(
    FOR( nCh = 0; nCh < nChannelsC; nCh++ ) /* nChannelsC */
    {
        biDiagonalReductionLeft_fx( singularVectors_Left_fx, singularValues_fx, secDiag_fx, singularVectors_Left_fx_e, singularValues_fx_e, secDiag_fx_e, nChannelsL, nChannelsC, nCh, &sig_x_fx, &sig_x_fx_e, &g_fx );
   		{
			int i,j;
			printf("compare%d, start\n",nCh);
			for (i=0;i<nChannelsL;i++)
			{
				printf("compare%d,%d: ",nCh,i);
				for (j=0;j<nChannelsC;j++)
				{
					printf("%08X ",singularVectors_Left_fx[i][j]);
				}
				printf("\n");
			}
			printf("compare%d, end\n",nCh);
		}
        biDiagonalReductionRight_fx( singularVectors_Left_fx, secDiag_fx, singularVectors_Left_fx_e, secDiag_fx_e, nChannelsL, nChannelsC, nCh, &sig_x_fx, &sig_x_fx_e, &g_fx );
		{
			int i,j;
			printf("COMPARE%d, start\n",nCh);
			for (i=0;i<nChannelsL;i++)
			{
				printf("COMPARE%d,%d: ",nCh,i);
				for (j=0;j<nChannelsC;j++)
				{
					printf("%08X<%3d ",singularVectors_Left_fx[i][j],singularVectors_Left_fx_e[i][j]);
				}
				printf("\n");
			}
			printf("COMPARE%d, end\n",nCh);
		}

        Word16 L_temp_e;
        Word32 L_temp = BASOP_Util_Add_Mant32Exp( L_abs( singularValues_fx[nCh] ), singularValues_fx_e[nCh], L_abs( secDiag_fx[nCh] ), secDiag_fx_e[nCh], &L_temp_e ); /* exp(L_temp_e) */
@@ -1000,57 +982,10 @@ static void HouseholderReduction_fx(
            move32();
        }
    }





    pop_wmops();
    {
	    int i,j;
	    static int replacecnt=0;
	    static int bettercnt=0;
	    static int totalcnt=0;

	    for (i=0;i<nChannelsL;i++)
	    {
			printf("EXPONENT%02d: IN%3d ",i,singularVectors_Left_e);
		    for (j=0;j<nChannelsC;j++)
		    {
			    Word16 n;
			    Word32 tmp;
			    Word32 minemant;
			    unsigned int x,y;
				int mine,theirs;
			    n=norm_l(singularVectors_Left_fx[i][j]);
			    minemant=singularVectors_Left_fx[i][j]<<n;
                            x=(unsigned int)minemant;
				printf("[(%2d)",singularVectors_Left_fx_e[i][j]-n);
				theirs=singularVectors_Left_fx_e[i][j]-n;
#endif

			    n=W_norm(singularVectors_Left_64[i][j]);
			    tmp=W_extract_h(W_shl(singularVectors_Left_64[i][j],n));
			    y=(unsigned int)tmp;

				n=32+singularVectors_Left_e-n;
				mine=n;
				printf("MINE:%2d]",n);
				if (!((x^y)&0xff000000)) bettercnt++;
				if (mine==theirs || minemant==singularVectors_Left_fx[i][j])
				{
					singularVectors_Left_fx[i][j]=x;
					singularVectors_Left_fx_e[i][j]=mine;
					replacecnt++;
					printf("\x1b[0;32m");
				}
				printf("%08X/%08X ",x,y);
				printf("\x1b[0m");
			    totalcnt++;
		    }
			printf("\n");
	    }
	    printf("\nbetter %d replace:%d  /%d\n",bettercnt,replacecnt,totalcnt);
    }

    /* Singular vector accumulation */
    singularVectorsAccumulationRight_fx( singularVectors_Left_fx, singularVectors_Right_fx, secDiag_fx, singularVectors_Left_fx_e, secDiag_fx_e, nChannelsC );
@@ -1084,6 +1019,10 @@ static void biDiagonalReductionLeft_64(
    Word32 norm_x, g;
    Word16 norm_x_e, g_e;
    Word64 norm_64;
    g=0;
    g_e=0;
    move32();
    move16();
    norm_x=0;
    move32();
    IF( LT_16( currChannel, nChannelsL ) ) /* i <= m */
@@ -1161,18 +1100,16 @@ static void biDiagonalReductionLeft_64(
                 singularVectors_Left_64[jCh][iCh] = W_add( singularVectors_Left_64[jCh][iCh], W_shr(W_mult0_32_32( f, factor1 ), magic_shift ) );
            }
        }
    }
    singularValues[currChannel] = g;
    singularValues_e[currChannel] = g_e;
    move32();
    move16();
}
}

static void biDiagonalReductionRight_64(
    Word64 singularVectors_Left_64[MAX_OUTPUT_CHANNELS][MAX_OUTPUT_CHANNELS],
    Word16 bitwindow,
    Word32 secDiag[MAX_OUTPUT_CHANNELS],    /* exp(singularValues_e) */
    Word16 secDiag_e[MAX_OUTPUT_CHANNELS],
    const Word16 nChannelsL,  /* Q0 */
    const Word16 nChannelsC,  /* Q0 */
    const Word16 currChannel, /* Q0 */
@@ -1184,19 +1121,10 @@ static void biDiagonalReductionRight_64(
    Word32 norm_x;
    Word16 norm_x_e;
    Word64 norm_64;
    Word32 abs_x;
    Word16 abs_x_e;
    Word64 abs_64;
    Word16 idx;
    Word16 bitwindow0;



    secDiag[currChannel] = ( *g );
    secDiag_e[currChannel] = ( *g_e );
    move32();
    move16();
    bitwindow0=bitwindow;

    ( *g ) =0;
    ( *g_e ) = 0;
@@ -1205,24 +1133,18 @@ static void biDiagonalReductionRight_64(
    IF ( LT_16( currChannel, nChannelsL ) && NE_16( currChannel, sub( nChannelsC, 1 ) ) ) /* i <=m && i !=n */
    {
        norm_64=0;
        abs_64=0;
        move64();
        move64();
        idx = add( currChannel, 1);
        FOR ( jCh = idx; jCh < nChannelsC; jCh++ )
        {
            Word32 tmp;
            tmp = W_extract_l( W_shr( singularVectors_Left_64[currChannel][jCh], bitwindow+1) );
            tmp = W_extract_l( W_shr( singularVectors_Left_64[currChannel][jCh], bitwindow) );
            norm_64 = W_add( norm_64, W_mult0_32_32( tmp, tmp) );
            abs_64 = W_add( abs_64, W_abs( singularVectors_Left_64[currChannel][jCh]) );
        }
        norm_x_e = W_norm( norm_64);
        norm_x = W_extract_h( W_shl( norm_64, norm_x_e) );
        norm_x_e = add( sub( shl( bitwindow+1, 1), norm_x_e), 1);
        norm_x_e = add( sub( shl( bitwindow, 1), norm_x_e), 1);
        move16();
        abs_x_e = W_norm( abs_64);
        abs_x = W_extract_h( W_shl( abs_64, abs_x_e) );
        abs_x_e = add( sub( add( bitwindow, bitwindow), abs_x_e), 1);

        IF ( norm_x )
        {
@@ -1237,7 +1159,6 @@ static void biDiagonalReductionRight_64(
            Word32 r;
            Word16 r_e;
            Word32 f;
            Word16 f_e;
            Word32 invVal;
            Word16 invVal_e;

@@ -1252,17 +1173,15 @@ static void biDiagonalReductionRight_64(
            *g_e = tmp_g_e;
            move32();
            move16();
            factor2=W_extract_l( W_shr( singularVectors_Left_64[currChannel][idx], bitwindow+1) );
            tmp_e = sub( tmp_g_e, bitwindow+1);
            factor2=W_extract_l( W_shr( singularVectors_Left_64[currChannel][idx], bitwindow) );
            tmp_e = sub( tmp_g_e, bitwindow);
            tmpmul = W_mult0_32_32( tmp_g, factor2);
            tmpmul = W_shl(tmpmul, tmp_e);
            r_64 = W_sub( tmpmul, norm_64 );
            r_e = W_norm( r_64);
            r = W_extract_h( W_shl( r_64, r_e) );
            r_e = sub( add( shl( bitwindow, 1), 1), r_e );
//	r_e=2*bitwindow+1-r_e;

            invVal_e = r_e;
            invVal_e = 0;
            move16();
            invVal = BASOP_Util_Divide3232_Scale_newton( MAXVAL_WORD32, maxWithSign_fx( r ), &invVal_e);

@@ -1285,8 +1204,7 @@ static void biDiagonalReductionRight_64(
                norm_x_e = W_norm( norm_64);
                norm_x = W_extract_h( W_shl( norm_64, norm_x_e) );
                f = Mpy_32_32( norm_x, invVal);
                f_e = add( invVal_e, sub( norm_x_e, r_e) );
                magic_shift = 22-2*norm_x_e+4*r_e+3*f_e;		// FIXME: HOW IS THIS WORKING?????!?!?!?!?!?!?!?!?!?
		magic_shift = 25+norm_x_e-  r_e ;	// FIXME: Why does this work?
		
                FOR( jCh = idx; jCh < nChannelsC; jCh++ )
                {
@@ -1294,25 +1212,10 @@ static void biDiagonalReductionRight_64(
                    singularVectors_Left_64[iCh][jCh] = W_add( singularVectors_Left_64[iCh][jCh], W_shr( W_mult0_32_32( f, factor2), magic_shift) );
                }
            }
            // FIXME BEGIN: The following code has not yet been tested
            invVal_e = 0;
            move16();
            invVal = BASOP_Util_Divide3232_Scale_newton( abs_x, maxWithSign_fx( r ), &invVal_e);
            invVal_e = add( invVal_e, sub( abs_x_e, r_e) );
            bitwindow = bitwindow0;
            move16();
            FOR ( jCh = idx; jCh < nChannelsL ; jCh++ )
            {
                secDiag[jCh] = Mpy_32_32( W_extract_l( W_shr( singularVectors_Left_64[currChannel][jCh], bitwindow) ), invVal );
                move32();
                secDiag_e[jCh] = add(invVal_e, bitwindow);
                move16();
            }
            // FIXME END
        }
    }
}
#endif
#else
static void biDiagonalReductionLeft_fx(
    Word32 singularVectors[][MAX_OUTPUT_CHANNELS], /* exp(singularVectors_e) */
    Word32 singularValues[MAX_OUTPUT_CHANNELS],    /* exp(singularValues_e) */
@@ -1360,9 +1263,15 @@ static void biDiagonalReductionLeft_fx(
            Word16 invVal_e;
            Word32 invVal;
            invVal = BASOP_Util_Divide3232_Scale_newton( MAXVAL_WORD32, maxWithSign_fx( *sig_x ), &invVal_e );
#ifdef OPT_MCH_DEC_V1_NBE
            Word64 temp = 0;
            move64();
            Word16 max_e = MIN_16;
#else  /* OPT_MCH_DEC_V1_NBE */
            norm_x = 0;
            move32();
            norm_x_e = 0;
#endif /* OPT_MCH_DEC_V1_NBE */
            move16();

            FOR( jCh = idx; jCh < nChannelsL; jCh++ ) /* nChannelsL */
@@ -1372,8 +1281,25 @@ static void biDiagonalReductionLeft_fx(
                move32();
                singularVectors2_e[jCh][currChannel] = sub( add( invVal_e, sub( singularVectors2_e[jCh][currChannel], *sig_x_e ) ), temp_e );
                move16();
#ifdef OPT_MCH_DEC_V1_NBE
                max_e = s_max( max_e, singularVectors2_e[jCh][currChannel] );
#else  /* OPT_MCH_DEC_V1_NBE */
                norm_x = BASOP_Util_Add_Mant32Exp( norm_x, norm_x_e, Mpy_32_32( singularVectors[jCh][currChannel], singularVectors[jCh][currChannel] ), shl( singularVectors2_e[jCh][currChannel], 1 ), &norm_x_e ); /* exp(norm_x_e) */
#endif /* OPT_MCH_DEC_V1_NBE */
            }

#ifdef OPT_MCH_DEC_V1_NBE
            FOR( jCh = idx; jCh < nChannelsL; jCh++ ) /* nChannelsL */
            {
                temp = W_add( temp, L_shr( Mpy_32_32( singularVectors[jCh][currChannel], singularVectors[jCh][currChannel] ), shl( sub( max_e, singularVectors2_e[jCh][currChannel] ), 1 ) ) );
            }

            Word16 nrm = W_norm( temp );
            nrm = sub( nrm, 32 );
            norm_x = W_shl_sat_l( temp, nrm );
            norm_x_e = sub( add( max_e, max_e ), nrm );
#endif /* OPT_MCH_DEC_V1_NBE */

            IF( GT_16( norm_x_e, 0 ) )
            {
                norm_x = MAX_32;
@@ -1401,6 +1327,30 @@ static void biDiagonalReductionLeft_fx(

            FOR( iCh = currChannel + 1; iCh < nChannelsC; iCh++ ) /* nChannelsC */
            {
#ifdef OPT_MCH_DEC_V1_NBE
                Word16 max2_e = MIN_16;
                max_e = MIN_16;
                move16();
                move16();
                temp = 0;
                move64();

                FOR( jCh = idx; jCh < nChannelsL; jCh++ ) /* nChannelsL */
                {
                    max_e = s_max( max_e, singularVectors2_e[jCh][currChannel] ); /* exp(norm_x_e) */
                    max2_e = s_max( max2_e, singularVectors2_e[jCh][iCh] );       /* exp(norm_x_e) */
                }
                max_e = add( max_e, max2_e );

                FOR( jCh = idx; jCh < nChannelsL; jCh++ ) /* nChannelsL */
                {
                    temp = W_add( temp, L_shr( Mpy_32_32( singularVectors[jCh][currChannel], singularVectors[jCh][iCh] ), sub( max_e, add( singularVectors2_e[jCh][currChannel], singularVectors2_e[jCh][iCh] ) ) ) );
                }
                nrm = W_norm( temp );
                nrm = sub( nrm, 32 );
                norm_x = W_shl_sat_l( temp, nrm );
                norm_x_e = sub( max_e, nrm );
#else  /* OPT_MCH_DEC_V1_NBE */
                norm_x = 0;
                move32();
                norm_x_e = 0;
@@ -1409,6 +1359,7 @@ static void biDiagonalReductionLeft_fx(
                {
                    norm_x = BASOP_Util_Add_Mant32Exp( norm_x, norm_x_e, Mpy_32_32( singularVectors[jCh][currChannel], singularVectors[jCh][iCh] ), add( singularVectors2_e[jCh][currChannel], singularVectors2_e[jCh][iCh] ), &norm_x_e ); /* exp(norm_x_e) */
                }
#endif /* OPT_MCH_DEC_V1_NBE */

                f = Mpy_32_32( norm_x, invVal ); /* invVal_e + (norm_x_e - r_e) */
                f_e = add( invVal_e, sub( norm_x_e, r_e ) );
@@ -1565,6 +1516,7 @@ static void biDiagonalReductionRight_fx(

    return;
}
#endif

/*-------------------------------------------------------------------------
 * singularVectorsAccumulationLeft()
@@ -1659,8 +1611,16 @@ static void singularVectorsAccumulationLeft_fx(
                move32();
            }
        }
#ifdef OPT_MCH_DEC_V1_NBE
        Word16 exp = s_max( singularVectors_Left_e[nCh][nCh], 1 );
        singularVectors_Left[nCh][nCh] = L_sub( L_shr( singularVectors_Left[nCh][nCh], sub( exp, singularVectors_Left_e[nCh][nCh] ) ), L_shr( MINUS_ONE_IN_Q31, exp ) ); /* exp(sing_exp2) */
        move32();
        singularVectors_Left_e[nCh][nCh] = exp;
        move16();
#else  /* OPT_MCH_DEC_V1_NBE */
        singularVectors_Left[nCh][nCh] = BASOP_Util_Add_Mant32Exp( singularVectors_Left[nCh][nCh], singularVectors_Left_e[nCh][nCh], ONE_IN_Q30, 1, &singularVectors_Left_e[nCh][nCh] ); /* exp(sing_exp2) */
        move32();
#endif /* OPT_MCH_DEC_V1_NBE */
    }
    // fclose(fp);
    FOR( nCh = 0; nCh < nChannelsL; nCh++ )
@@ -1723,21 +1683,56 @@ static void singularVectorsAccumulationRight_fx(

                FOR( iCh = nCh + 1; iCh < nChannelsC; iCh++ ) /* nChannelsC */
                {
#ifdef OPT_MCH_DEC_V1_NBE
                    Word64 norm_val = 0;
                    move64();
                    Word16 maxL_e = MIN_16;
                    Word16 maxR_e = MIN_16;
                    Word16 maxR2_e = MIN_16;
                    move16();
                    move16();
                    move16();
                    FOR( k = nCh + 1; k < nChannelsC; k++ ) /* nChannelsC */
                    {
                        maxL_e = s_max( maxL_e, singularVectors_Left_e[nCh][k] );
                        maxR_e = s_max( maxR_e, sing_right_exp[k][iCh] );
                        maxR2_e = s_max( maxR2_e, sing_right_exp[k][nCh] );
                    }
#else  /* OPT_MCH_DEC_V1_NBE */
                    norm_y = 0;
                    move32();
                    norm_y_e = 0;
                    move16();
#endif /* OPT_MCH_DEC_V1_NBE */

                    FOR( k = nCh + 1; k < nChannelsC; k++ ) /* nChannelsC */
                    {
#ifdef OPT_MCH_DEC_V1_NBE
                        norm_val = W_mac_32_32( norm_val, L_shr( singularVectors_Left[nCh][k], sub( maxL_e, singularVectors_Left_e[nCh][k] ) ), L_shr( singularVectors_Right[k][iCh], sub( maxR_e, sing_right_exp[k][iCh] ) ) );
#else  /* OPT_MCH_DEC_V1_NBE */
                        norm_y = BASOP_Util_Add_Mant32Exp( norm_y, norm_y_e, Mpy_32_32( singularVectors_Left[nCh][k], singularVectors_Right[k][iCh] ), add( singularVectors_Left_e[nCh][k], sing_right_exp[k][iCh] ), &norm_y_e );                               /* exp(norm_y_e) */
#endif /* OPT_MCH_DEC_V1_NBE */
                    }
#ifdef OPT_MCH_DEC_V1_NBE
                    norm_y_e = W_norm( norm_val );
                    norm_y = W_extract_h( W_shl( norm_val, norm_y_e ) );
                    norm_y_e = sub( add( maxL_e, maxR_e ), norm_y_e );

                    Word16 max_new = s_max( maxR_e, add( maxR2_e, norm_y_e ) );
#endif                                                      /* OPT_MCH_DEC_V1_NBE */
                    FOR( k = nCh + 1; k < nChannelsC; k++ ) /* nChannelsC */
                    {
#ifdef OPT_MCH_DEC_V1_NBE
                        Word32 temp = Mpy_32_32( norm_y, singularVectors_Right[k][nCh] );
                        Word32 op2 = L_shr( temp, sub( max_new, add( norm_y_e, sing_right_exp[k][nCh] ) ) );
                        singularVectors_Right[k][iCh] = L_add_sat( L_shr( singularVectors_Right[k][iCh], sub( max_new, sing_right_exp[k][iCh] ) ), op2 ); /* exp(sing_right_exp) */
                        move32();
                        singularVectors_Right[k][iCh] = L_shl_sat( singularVectors_Right[k][iCh], max_new ); /* Q31 */
#else                                                                                                        /* OPT_MCH_DEC_V1_NBE */
                        singularVectors_Right[k][iCh] = BASOP_Util_Add_Mant32Exp( singularVectors_Right[k][iCh], sing_right_exp[k][iCh], Mpy_32_32( norm_y, singularVectors_Right[k][nCh] ), add( norm_y_e, sing_right_exp[k][nCh] ), &sing_right_exp[k][iCh] ); /* exp(sing_right_exp) */
                        move32();
                        singularVectors_Right[k][iCh] = L_shl_sat( singularVectors_Right[k][iCh], sing_right_exp[k][iCh] ); /* Q31 */
#endif                                                                                                       /* OPT_MCH_DEC_V1_NBE */
                        move32();
                        sing_right_exp[k][iCh] = 0;
                        move16();