Commit 26535cc7 authored by Sandesh Venkatesh
Browse files

Stereo and MCH optimizations

parent b8a579eb
Loading
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -1213,7 +1213,7 @@ void cldfbSynthesis_ivas_fx(
            /*cplxMult(&iBuffer[2*i], &iBuffer[2*i+1],-imagBuffer[k][2*i], imagBuffer[k][M1-1-2*i], rot_vctr_re[i], rot_vctr_im[i]);*/
            iBuffer_fx[2 * i] = Msub_32_32( Mpy_32_32( ( L_negate( imagBuffer_fx[k][2 * i] ) ), rot_vctr_re_fx[i] ), imagBuffer_fx[k][( M1 - 1 ) - ( i * 2 )], rot_vctr_im_fx[i] ); // Qx
            move32();
            iBuffer_fx[2 * i + 1] = Madd_32_32( Mpy_32_32( ( L_negate( imagBuffer_fx[k][2 * i] ) ), rot_vctr_im_fx[i] ), imagBuffer_fx[k][( M1 - 1 ) - ( i * 2 )], rot_vctr_re_fx[i] ); // Qx
            iBuffer_fx[2 * i + 1] = Msub_32_32( Mpy_32_32( imagBuffer_fx[k][( M1 - 1 ) - ( i * 2 )], rot_vctr_re_fx[i] ), imagBuffer_fx[k][2 * i], rot_vctr_im_fx[i] ); // Qx
            move32();
        }

+24 −38
Original line number Diff line number Diff line
@@ -131,24 +131,23 @@ static void interpTargetChannel_fx(
    Word32 spread_factor2_fx;
    Word64 tempD1_fx, tempD2_fx;

    d = negate( sub( currShift, prevShift ) );
    IF( d >= 0 )
    d = sub( prevShift, currShift );

    IF( d == 0 )
    {
        /* this can happen in DFT->TD switching */
        return;
    }

    signShift = 1;
    move16();
    }
    ELSE

    if ( d < 0 )
    {
        signShift = -1;
        move16();
    }

    IF( d == 0 )
    {
        /* this can happen in DFT->TD switching */
        return;
    }

    N = L_shift_adapt;
    move16();
    Word32 *table_pointer = NULL;
@@ -207,7 +206,7 @@ static void interpTargetChannel_fx(

            FOR( j = lim1; j <= lim2; j++ )
            {
                ptr2_fx[i] = L_add( Mpy_32_32( win_fx[j * INTERP_FACTOR1 - i], ptr1_fx[j] ), ptr2_fx[i] ); // qsynth
                ptr2_fx[i] = Madd_32_32( ptr2_fx[i], win_fx[j * INTERP_FACTOR1 - i], ptr1_fx[j] ); // qsynth
                move32();
            }
        }
@@ -225,44 +224,31 @@ static void interpTargetChannel_fx(
    tempD1_fx = W_deposit32_l( table_D1_pointer[abs( d )] );    // Q35
    tempD2_fx = W_mult0_32_32( 3, table_D1_pointer[abs( d )] ); // Q35

    IF( EQ_16( signShift, 1 ) )
    {
        tempF1_fx = ONE_IN_Q12; // Q12
        move32();
    }
    ELSE
    {
    tempF1_fx = -ONE_IN_Q12; // Q12
    move32();
    }
    tempF1_fx = L_sub( imult3216( factor_fx, d ), tempF1_fx ); // Q12
    FOR( k = 0; k < sub( N, 1 ); k++ )
    {
        Word32 local = L_sub( W_extract_l( W_shr( W_mult0_32_32( tempF1_fx, spread_factor2_fx ), 31 ) ), ONE_IN_Q12 ); // Q12
        Word32 sign_local;
        IF( local > 0 )

    if ( EQ_16( signShift, 1 ) )
    {
            sign_local = 1;
        tempF1_fx = ONE_IN_Q12; // Q12
        move32();
    }
        ELSE

    tempF1_fx = L_sub( imult3216( factor_fx, d ), tempF1_fx ); // Q12
    FOR( k = 0; k < N - 1; k++ )
    {
            sign_local = -1;
            move32();
        }
        Word32 local_int = W_extract_l( W_shr( W_abs( local ), 12 ) ); // Q0
        Word32 local = Madd_32_32( -ONE_IN_Q12, tempF1_fx, spread_factor2_fx ); // Q12
        Word32 local_int = L_shr( local, 12 );                                  // Q0
        Word32 res_a1, res_a2, res_a3;
        Word32 res_b1, res_b2, res_b3;
        Word32 res_c1, res_c2, res_c3;
        Word32 res_d1, res_d2, res_d3;
        Word64 local_int_scaled;
        Word32 local_int_scaled;
        Word64 res_a, res_b, res_c, res_d;
        Word64 tempa, tempb;
        Word64 mult_a_D1, mult_b_D2;
        local_int = W_extract_l( W_mult0_32_32( sign_local, local_int ) ); // Q0
        local_int_scaled = W_deposit32_l( L_shl( local_int, 12 ) );        // Q12
        local_int_scaled = L_shl( local_int, 12 );  // Q12
        lim1 = extract_l( local_int );              // Q0
        IF( W_sub( local_int_scaled, local ) > 0 )                         // Q21
        if ( L_sub( local_int_scaled, local ) > 0 ) // Q12
        {
            lim1 = sub( lim1, 1 ); // Q0
        }
@@ -387,7 +373,7 @@ static void targetCh_AlignStereoDFT_fx(
        }
        FOR( i = 0; i < L_shift_adapt; i++ )
        {
            target_fx[i] = L_add( Mpy_32_32( alpha_fx, fadeInBuff_fx[i] ), Mpy_32_32( L_sub( ONE_IN_Q31, alpha_fx ), fadeOutBuff_fx[i] ) ); // qsynth
            target_fx[i] = Madd_32_32( Mpy_32_32( alpha_fx, fadeInBuff_fx[i] ), L_sub( ONE_IN_Q31, alpha_fx ), fadeOutBuff_fx[i] ); // qsynth
            move32();

            alpha_fx = L_add_sat( alpha_fx, winSlope_fx ); // Q31
+81 −43
Original line number Diff line number Diff line
@@ -942,6 +942,12 @@ Word16 matrix_product_mant_exp_fx(
    Word16 *Zp_fx_e = out_e;
    Word16 row, col;
    Word16 x_idx, y_idx;
    Word64 temp;
    Word16 temp_e;
    Word16 prod_e = add( X_fx_e, Y_fx_e );

    Word16 max_exp = -31;
    move16();

    /* Processing */
    test();
@@ -957,17 +963,28 @@ Word16 matrix_product_mant_exp_fx(
        {
            FOR( i = 0; i < colsX; ++i )
            {
                ( *Zp_fx ) = 0;
                move32();
                ( *Zp_fx_e ) = 0;
                move16();
                temp = 0;
                move64();

                FOR( k = 0; k < rowsX; ++k )
                {
                    x_idx = add( k, imult1616( i, rowsX ) );
                    y_idx = add( k, imult1616( j, rowsY ) );
                    ( *Zp_fx ) = BASOP_Util_Add_Mant32Exp( *Zp_fx, *Zp_fx_e, Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ), add( X_fx_e, Y_fx_e ), Zp_fx_e ); /*Q31 - Zp_fx_e*/
                    move32();
                    x_idx = k + i * rowsX;
                    y_idx = k + j * rowsY;
                    temp = W_mac_32_32( temp, X_fx[x_idx], Y_fx[y_idx] ); // X_fx_e + Y_fx_e
                }
                /* Maximize accumulated value to 32-bit */
                temp_e = W_norm( temp );
                temp = W_shl( temp, temp_e );
                if ( 0 == temp )
                {
                    temp_e = prod_e;
                    move16();
                }
                *Zp_fx_e = sub( prod_e, temp_e );
                move16();
                ( *Zp_fx ) = W_extract_h( temp );
                move32();
                max_exp = s_max( max_exp, *Zp_fx_e ); // Find the max exp
                Zp_fx++;
                Zp_fx_e++;
            }
@@ -987,17 +1004,27 @@ Word16 matrix_product_mant_exp_fx(
        {
            FOR( i = 0; i < rowsX; ++i )
            {
                ( *Zp_fx ) = 0;
                move32();
                ( *Zp_fx_e ) = 0;
                move16();
                temp = 0;
                move64();
                FOR( k = 0; k < colsX; ++k )
                {
                    x_idx = add( i, imult1616( k, rowsX ) );
                    y_idx = add( j, imult1616( k, rowsY ) );
                    ( *Zp_fx ) = BASOP_Util_Add_Mant32Exp( *Zp_fx, *Zp_fx_e, Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ), add( X_fx_e, Y_fx_e ), Zp_fx_e ); /*Q31 - Zp_fx_e*/
                    move32();
                    x_idx = i + k * rowsX;
                    y_idx = j + k * rowsY;
                    temp = W_mac_32_32( temp, X_fx[x_idx], Y_fx[y_idx] ); // X_fx_e + Y_fx_e
                }
                /* Maximize accumulated value to 32-bit */
                temp_e = W_norm( temp );
                temp = W_shl( temp, temp_e );
                if ( 0 == temp )
                {
                    temp_e = prod_e;
                    move16();
                }
                *Zp_fx_e = sub( prod_e, temp_e );
                move16();
                ( *Zp_fx ) = W_extract_h( temp );
                move32();
                max_exp = s_max( max_exp, *Zp_fx_e ); // Find the max exp
                Zp_fx++;
                Zp_fx_e++;
            }
@@ -1017,18 +1044,27 @@ Word16 matrix_product_mant_exp_fx(
        {
            FOR( i = 0; i < colsX; ++i )
            {
                ( *Zp_fx ) = 0;
                move32();
                ( *Zp_fx_e ) = 0;
                move16();
                temp = 0;
                move64();
                FOR( k = 0; k < colsX; ++k )
                {
                    x_idx = add( k, imult1616( i, rowsX ) );
                    y_idx = add( j, imult1616( k, rowsY ) );
                    ( *Zp_fx ) = BASOP_Util_Add_Mant32Exp( *Zp_fx, *Zp_fx_e, Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ), add( X_fx_e, Y_fx_e ), Zp_fx_e ); /*Q31 - Zp_fx_e*/
                    move32();
                    x_idx = k + i * rowsX;
                    y_idx = j + k * rowsY;
                    temp = W_mac_32_32( temp, X_fx[x_idx], Y_fx[y_idx] ); // X_fx_e + Y_fx_e
                }

                /* Maximize accumulated value to 32-bit */
                temp_e = W_norm( temp );
                temp = W_shl( temp, temp_e );
                if ( 0 == temp )
                {
                    temp_e = prod_e;
                    move16();
                }
                *Zp_fx_e = sub( prod_e, temp_e );
                move16();
                ( *Zp_fx ) = W_extract_h( temp );
                move32();
                max_exp = s_max( max_exp, *Zp_fx_e ); // Find the max exp
                Zp_fx++;
                Zp_fx_e++;
            }
@@ -1049,17 +1085,26 @@ Word16 matrix_product_mant_exp_fx(
        {
            FOR( i = 0; i < rowsX; ++i )
            {
                ( *Zp_fx ) = 0;
                move32();
                ( *Zp_fx_e ) = 0;
                move16();
                temp = 0;
                move64();
                FOR( k = 0; k < colsX; ++k )
                {
                    x_idx = add( i, imult1616( k, rowsX ) );
                    y_idx = add( k, imult1616( j, rowsY ) );
                    ( *Zp_fx ) = BASOP_Util_Add_Mant32Exp( *Zp_fx, *Zp_fx_e, Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ), add( X_fx_e, Y_fx_e ), Zp_fx_e ); /*Q31 - Zp_fx_e*/
                    move32();
                    x_idx = i + k * rowsX;
                    y_idx = k + j * rowsY;
                    temp = W_mac_32_32( temp, X_fx[x_idx], Y_fx[y_idx] ); // X_fx_e + Y_fx_e
                }
                /* Maximize accumulated value to 32-bit */
                temp_e = W_norm( temp );
                temp = W_shl( temp, temp_e );
                if ( 0 == temp )
                {
                    temp_e = prod_e;
                }
                *Zp_fx_e = sub( prod_e, temp_e );
                move16();
                ( *Zp_fx ) = W_extract_h( temp );
                move32();
                max_exp = s_max( max_exp, *Zp_fx_e ); // Find the max exp
                Zp_fx++;
                Zp_fx_e++;
            }
@@ -1070,18 +1115,11 @@ Word16 matrix_product_mant_exp_fx(
        move16();
    }
    Zp_fx = Z_fx; /*Q31 - Zp_fx_e*/

    Zp_fx_e = out_e;
    Word16 max_exp = -31;
    move16();
    FOR( j = 0; j < row; ++j )
    {
        FOR( i = 0; i < col; ++i )
        {
            max_exp = s_max( max_exp, *Zp_fx_e );
            Zp_fx_e++;
        }
    }
    Zp_fx_e = out_e;


    *Z_fx_e = max_exp;
    move16();
    FOR( j = 0; j < row; ++j )
+2 −1
Original line number Diff line number Diff line
@@ -152,5 +152,6 @@
#define FIX_ISSUE_1214                          /* Ittiam: Fix for issue 1214: Energy leakage in IGF tiles for MDCT-stereo @64kbps SWB*/
#define FIX_881_HILBERT_FILTER                  /* VA: improve the precision of the Hilbert filter to remove 2kHz unwanted tone */
#define FIX_ISSUE_1245                          /* Ittiam: Fix for issue 1245: Basop Encoder: Audible noise for silent Stereo input DTX on @24.4 kbps, @32 kbps*/
#endif
#define FIX_MINOR_SVD_WMOPS_MR1010X             /* FhG: Minor WMOPS tuning, bit-exact to previous version, saves about 8.2 WMOPS for MR1010 */
#define SVD_WMOPS_OPT                           /* Ittiam : SVD related optimizations */
#endif
+23 −12
Original line number Diff line number Diff line
@@ -237,6 +237,7 @@ void decoder_tcx_imdct_fx(
    Word16 q_a_itf = 15;
    Word16 x_e = sub( 31, q_x );
    move16();
    Word16 shift_q = sub( q_x, q_win );

    /*-----------------------------------------------------------------*
     * Initializations
@@ -364,9 +365,10 @@ void decoder_tcx_imdct_fx(

    IF( EQ_16( st->element_mode, IVAS_CPE_MDCT ) )
    {
        Word16 copy_len = s_min( L_FRAME48k, s_max( L_spec, s_max( L_frame, L_frameTCX ) ) );
        set32_fx( x_tmp_fx, 0, L_FRAME_PLUS );
        Copy32( x_fx, x_tmp_fx, s_min( L_FRAME48k, s_max( L_spec, s_max( L_frame, L_frameTCX ) ) ) );    // q_x
        Copy32( x_fx, xn_bufFB_fx, s_min( L_FRAME48k, s_max( L_spec, s_max( L_frame, L_frameTCX ) ) ) ); // q_x
        Copy32( x_fx, x_tmp_fx, copy_len );    // q_x
        Copy32( x_fx, xn_bufFB_fx, copy_len ); // q_x
    }
    ELSE IF( ( st->element_mode == EVS_MONO ) )
    {
@@ -374,8 +376,9 @@ void decoder_tcx_imdct_fx(
    }
    ELSE
    {
        Copy32( x_fx, x_tmp_fx, s_max( L_spec, s_max( L_frame, L_frameTCX ) ) );    // q_x
        Copy32( x_fx, xn_bufFB_fx, s_max( L_spec, s_max( L_frame, L_frameTCX ) ) ); // q_x
        Word16 copy_len = s_max( L_spec, s_max( L_frame, L_frameTCX ) );
        Copy32( x_fx, x_tmp_fx, copy_len );    // q_x
        Copy32( x_fx, xn_bufFB_fx, copy_len ); // q_x
    }

    IF( ( st->igf != 0 ) )
@@ -416,24 +419,29 @@ void decoder_tcx_imdct_fx(

    FOR( Word16 ind = 0; ind < L_MDCT_OVLP_MAX + L_FRAME_PLUS + L_MDCT_OVLP_MAX; ind++ )
    {
        xn_bufFB_fx_16[ind] = extract_l( L_shr( xn_bufFB_fx[ind], sub( q_x, q_win ) ) ); // q_x
        xn_bufFB_fx_16[ind] = extract_l( L_shr( xn_bufFB_fx[ind], shift_q ) ); // q_x
        move16();
    }

    Word16 ratio_e;
    Word16 ratio = BASOP_Util_Divide1616_Scale( L_frameTCX_glob, L_frame_glob, &ratio_e ); // Q = 15-ratio_e. * FSCALE_DENOM is (1 << 9)
    ratio = shr( ratio, sub( 6, ratio_e ) );

    IF( st->element_mode != EVS_MONO )
    {
        IMDCT_ivas_fx( x_tmp_fx, q_x, hTcxDec->syn_OverlFB, hTcxDec->syn_Overl_TDACFB, xn_bufFB_fx_16, hTcxCfg->tcx_aldo_window_1_FB, hTcxCfg->tcx_aldo_window_1_FB_trunc, hTcxCfg->tcx_aldo_window_2_FB,
                       hTcxCfg->tcx_mdct_window_halfFB, hTcxCfg->tcx_mdct_window_minimumFB, hTcxCfg->tcx_mdct_window_transFB, hTcxCfg->tcx_mdct_window_half_lengthFB, hTcxCfg->tcx_mdct_window_min_lengthFB, index,
                       kernelType, left_rect, tcx_offsetFB, overlapFB, L_frameTCX, L_frameTCX, max( L_frameTCX, L_spec ) >> 1, L_frameTCX_glob, frame_cnt, bfi, st->hHQ_core->old_out_fx, 1, st, FSCALE_DENOM * L_frameTCX_glob / L_frame_glob, acelp_zir_fx, q_win );
                       kernelType, left_rect, tcx_offsetFB, overlapFB, L_frameTCX, L_frameTCX, shr( max( L_frameTCX, L_spec ), 1 ), L_frameTCX_glob, frame_cnt, bfi, st->hHQ_core->old_out_fx, 1, st, ratio, acelp_zir_fx, q_win );
    }
    ELSE
    {

        IMDCT_ivas_fx( x_fx, q_x, hTcxDec->syn_OverlFB, hTcxDec->syn_Overl_TDACFB, xn_bufFB_fx_16, hTcxCfg->tcx_aldo_window_1_FB, hTcxCfg->tcx_aldo_window_1_FB_trunc, hTcxCfg->tcx_aldo_window_2_FB, hTcxCfg->tcx_mdct_window_halfFB, hTcxCfg->tcx_mdct_window_minimumFB, hTcxCfg->tcx_mdct_window_transFB, hTcxCfg->tcx_mdct_window_half_lengthFB, hTcxCfg->tcx_mdct_window_min_lengthFB, index,
                       kernelType, left_rect, tcx_offsetFB, overlapFB, L_frameTCX, L_frameTCX, shr( s_max( L_frameTCX, L_spec ), 1 ), L_frameTCX_glob, frame_cnt, bfi, st->hHQ_core->old_out_fx, 1, st, FSCALE_DENOM * L_frameTCX_glob / L_frame_glob, acelp_zir_fx, q_win );
                       kernelType, left_rect, tcx_offsetFB, overlapFB, L_frameTCX, L_frameTCX, shr( s_max( L_frameTCX, L_spec ), 1 ), L_frameTCX_glob, frame_cnt, bfi, st->hHQ_core->old_out_fx, 1, st, ratio, acelp_zir_fx, q_win );
    }
    FOR( Word16 ind = 0; ind < L_MDCT_OVLP_MAX + L_FRAME_PLUS + L_MDCT_OVLP_MAX; ind++ )
    {
        xn_bufFB_fx[ind] = L_shl( xn_bufFB_fx_16[ind], sub( q_x, q_win ) ); // Q_x
        xn_bufFB_fx[ind] = L_shl( L_deposit_l( xn_bufFB_fx_16[ind] ), shift_q ); // Q_x
    }

    IF( ( bfi == 0 ) )
@@ -453,19 +461,22 @@ void decoder_tcx_imdct_fx(

        IF( EQ_16( st->element_mode, IVAS_CPE_MDCT ) )
        {
            res_m = BASOP_Util_Divide1616_Scale( L_frame_glob, L_FRAME, &res_e );
            st->old_fpitch = L_shl( Mpy_32_16_1( st->old_fpitch, res_m ), res_e );
            // Using sat as a single instruction shifts and extracts
            st->old_fpitch = W_shl_sat_l( W_mult0_32_32( st->old_fpitch, L_frame_glob ), -8 ); // Divide by 256 ==> SHR by 8
            move32();
        }

        IF( GT_16( st->element_mode, EVS_MONO ) )
        {
            res_m = BASOP_Util_Divide1616_Scale( L_frameTCX_glob, L_frame_glob, &res_e );
            st->old_fpitchFB = L_shl( Mpy_32_16_1( st->old_fpitch, res_m ), res_e );
            move32();
        }
        ELSE
        {
            res_m = BASOP_Util_Divide1616_Scale( L_frameTCX, L_frame, &res_e );
            st->old_fpitchFB = L_shl( Mpy_32_16_1( st->old_fpitch, res_m ), res_e );
            move32();
        }
    }

@@ -475,7 +486,7 @@ void decoder_tcx_imdct_fx(
        Copy( xn_buf_fx + L_frame, hTcxDec->syn_Overl, overlap ); // Q(-2)
        FOR( Word16 ind = 0; ind < overlapFB; ind++ )
        {
            hTcxDec->syn_OverlFB[ind] = (Word16) L_shr( xn_bufFB_fx[( ind + L_frameTCX )], sub( q_x, q_win ) ); // q_x
            hTcxDec->syn_OverlFB[ind] = extract_l( L_shr( xn_bufFB_fx[( ind + L_frameTCX )], shift_q ) ); // q_x
        }
    }

@@ -483,7 +494,7 @@ void decoder_tcx_imdct_fx(
    Copy( xn_buf_fx + sub( shr( overlap, 1 ), tcx_offset ), synth_fx, L_frame_glob ); // Q(-2)
    FOR( Word16 ind = 0; ind < L_frameTCX_glob; ind++ )
    {
        synthFB_fx[ind] = (Word16) L_shr( xn_bufFB_fx[( ind + ( ( overlapFB >> 1 ) - tcx_offsetFB ) )], sub( q_x, q_win ) ); // q_x
        synthFB_fx[ind] = extract_l( L_shr( xn_bufFB_fx[( ind + ( ( overlapFB >> 1 ) - tcx_offsetFB ) )], shift_q ) ); // q_x
    }


Loading