Commit 0ae7f52d authored by Sandesh Venkatesh's avatar Sandesh Venkatesh
Browse files

Merge branch '1519-improve-wmops-performance-of-ivas_param_mc_param_est_enc_fx-step-2' into 'main'

[allow-regression]Resolve "Improve WMOPS Performance of ivas_param_mc_param_est_enc_fx() step 2"

Closes #1519

See merge request !1472
parents 7225ebdc 4e3b6f90
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -86,6 +86,7 @@
//#define HARM_SCE_INIT
#define DIV32_OPT_NEWTON                               /* FhG: faster 32 by 32 bit division */ 
#define	MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE /* FhG: reduce WMOPS of Cy calculation in ivas_param_mc_param_est_enc_fx() by using 64-bit addition. Obsoletes IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_NONBE. */
#define MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE /* FhG: reduce WMOPS of dmx calculation in ivas_param_mc_param_est_enc_fx() by using 64-bit addition. Requires MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE. */
#define	MERGE_REQUEST_1564_SPEEDUP_ivas_dirac_dec_output_synthesis_cov_param_mc_synthesise_slot_fx_NONBE /* FhG: reduce WMOPS by inlining the matrix multiplications for the smoothing operation. */
#define FIX_1439_SPEEDUP_Copy_Scale_sig_16_32_no_sat            /* FhG: reduces WMOPS - bit-exact */
#define FIX_1439_SPEEDUP_stereo_icBWE_dec_fx                    /* FhG: reduces WMOPS - bit-exact */
+119 −36
Original line number Diff line number Diff line
@@ -30,6 +30,21 @@

*******************************************************************************************************/

// helper macros to convert the 64-bit accumulators into the 48-bit float format (Word32 mantissa + Word16 exponent)
/*
 * CONVERT_CY: left-normalise a 64-bit accumulator and split it into a
 * mantissa/exponent pair.
 *   x_64 : Word64 input; it is evaluated twice, so it must not have side effects
 *   y_fx : lvalue that receives W_extract_h() of the normalised value
 *   y_e  : lvalue that receives the exponent, i.e. sub62gb minus the W_norm() shift
 * A Word16 variable named sub62gb must be in scope where the macro is expanded.
 * The body is wrapped in do { } while ( 0 ) so that "CONVERT_CY( ... );" is
 * exactly one statement and stays valid inside an unbraced if/else.
 */
#define CONVERT_CY( x_64, y_fx, y_e )                                  \
    do                                                                 \
    {                                                                  \
        Word16 convert_cy_norm_;                                       \
        convert_cy_norm_ = W_norm( ( x_64 ) );                         \
        ( y_fx ) = W_extract_h( W_shl( ( x_64 ), convert_cy_norm_ ) ); \
        ( y_e ) = sub( sub62gb, convert_cy_norm_ );                    \
    } while ( 0 )
/*
 * CONVERT_DMX: left-normalise a 64-bit downmix accumulator and split it into a
 * mantissa/exponent pair.
 *   x_64 : Word64 input; it is evaluated twice, so it must not have side effects
 *   y_fx : lvalue that receives W_extract_h() of the normalised value
 *   y_e  : lvalue that receives the exponent, i.e. sub35gb minus the W_norm() shift
 * A Word16 variable named sub35gb must be in scope where the macro is expanded.
 * The body is wrapped in do { } while ( 0 ) so that "CONVERT_DMX( ... );" is
 * exactly one statement and stays valid inside an unbraced if/else.
 */
#define CONVERT_DMX( x_64, y_fx, y_e )                                  \
    do                                                                  \
    {                                                                   \
        Word16 convert_dmx_norm_;                                       \
        convert_dmx_norm_ = W_norm( ( x_64 ) );                         \
        ( y_fx ) = W_extract_h( W_shl( ( x_64 ), convert_dmx_norm_ ) ); \
        ( y_e ) = sub( sub35gb, convert_dmx_norm_ );                    \
    } while ( 0 )
#include <math.h>
#include <assert.h>
#include "options.h"
@@ -649,15 +664,22 @@ static void ivas_param_mc_param_est_enc_fx(
    Word32 *p_slot_frame_f_real_fx[MAX_CICP_CHANNELS];                     /* Output of the MDFT FB - real part */
    Word32 *p_slot_frame_f_imag_fx[MAX_CICP_CHANNELS];                     /* Output of the MDFT FB - imag part */

#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE
    Word64 dmx_real_64[PARAM_MC_MAX_TRANSPORT_CHANS];
    Word64 dmx_imag_64[PARAM_MC_MAX_TRANSPORT_CHANS];
#else
    Word32 dmx_real_fx[PARAM_MC_MAX_TRANSPORT_CHANS]; /* Downmix channel - Real Part */
    Word16 dmx_real_e[PARAM_MC_MAX_TRANSPORT_CHANS];  /* Downmix channel - Real Part */
    Word32 dmx_imag_fx[PARAM_MC_MAX_TRANSPORT_CHANS]; /* Downmix channel - Imag Part */
    Word16 dmx_imag_e[PARAM_MC_MAX_TRANSPORT_CHANS];  /* Downmix channel - Imag Part */
#endif
    Word32 a_fx, b_fx, c_fx, d_fx; /* Tmp complex values */
    Word16 a_e, b_e, c_e, d_e;     /* Tmp complex values */
#ifdef MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE
    Word64 Cy_sum_real_64[PARAM_MC_MAX_PARAMETER_BANDS][MAX_CICP_CHANNELS][MAX_CICP_CHANNELS];
    Word64 Cy_sum_imag_64[PARAM_MC_MAX_PARAM_BAND_ABS_COV_ENC][MAX_CICP_CHANNELS][MAX_CICP_CHANNELS];
    Word16 sub62gb;
    Word16 sub35gb;
#else
    Word32 Cy_sum_imag_fx[PARAM_MC_MAX_PARAM_BAND_ABS_COV_ENC][MAX_CICP_CHANNELS][MAX_CICP_CHANNELS];
    Word16 Cy_sum_imag_e[PARAM_MC_MAX_PARAM_BAND_ABS_COV_ENC][MAX_CICP_CHANNELS][MAX_CICP_CHANNELS];
@@ -740,9 +762,16 @@ static void ivas_param_mc_param_est_enc_fx(

#if defined( IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE ) || defined( IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_NONBE )
    Word16 gb = find_guarded_bits_fx( l_ts );
#ifndef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE
    Word16 add20gb = add( 20, gb );
#endif

#endif
#ifdef MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE
    sub35gb = sub( 35, find_guarded_bits_fx( l_ts ) );
    sub62gb = sub( 62, find_guarded_bits_fx( l_ts ) );
#endif

    FOR( ts = start_ts; ts < num_time_slots; ts++ )
    {
#if !defined( IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE ) && !defined( IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_NONBE )
@@ -774,6 +803,26 @@ static void ivas_param_mc_param_est_enc_fx(

                FOR( ch_idx1 = 0; ch_idx1 < nchan_transport; ++ch_idx1 )
                {
#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE
                    Word64 real_64;
                    Word64 imag_64;

                    real_64 = 0;
                    imag_64 = 0;
                    move64();
                    move64();
                    FOR( inp_ch = 0; inp_ch < nchan_input; inp_ch++ )
                    {
                        real_64 = W_add( real_64, W_mult0_32_32( slot_frame_f_real_fx[inp_ch][cur_cldfb_band], ( *p_dmx_fac_fx ) ) );
                        imag_64 = W_add( imag_64, W_mult0_32_32( slot_frame_f_imag_fx[inp_ch][cur_cldfb_band], ( *p_dmx_fac_fx ) ) );
                        p_dmx_fac_fx++;
                    }
                    dmx_real_64[ch_idx1] = real_64;
                    dmx_imag_64[ch_idx1] = imag_64;
                    move64();
                    move64();

#else
#ifndef IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE
                    dmx_real_fx[ch_idx1] = 0;
                    move32();
@@ -814,14 +863,24 @@ static void ivas_param_mc_param_est_enc_fx(
                    move16();
                    move32();
                    move16();
#endif

#endif
                }

                /* Cx for transport channels */
                FOR( ch_idx1 = 0; ch_idx1 < nchan_transport; ++ch_idx1 )
                {
#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE
                    CONVERT_DMX( dmx_real_64[ch_idx1], a_fx, a_e );
                    CONVERT_DMX( dmx_imag_64[ch_idx1], b_fx, b_e );
#endif
                    FOR( ch_idx2 = 0; ch_idx2 < nchan_transport; ++ch_idx2 )
                    {
#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE
                        CONVERT_DMX( dmx_real_64[ch_idx2], c_fx, c_e );
                        CONVERT_DMX( dmx_imag_64[ch_idx2], d_fx, d_e );
#else
                        a_fx = dmx_real_fx[ch_idx1];
                        move32();
                        a_e = dmx_real_e[ch_idx1];
@@ -838,6 +897,7 @@ static void ivas_param_mc_param_est_enc_fx(
                        move32();
                        d_e = dmx_imag_e[ch_idx2];
                        move16();
#endif

                        /* (a-ib)(c+id) = ac + bd + i(ad-bc) */
                        L_tmp = BASOP_Util_Add_Mant32Exp( Mpy_32_32( a_fx, c_fx ), add( a_e, c_e ), Mpy_32_32( b_fx, d_fx ), add( b_e, d_e ), &tmp_e );
@@ -858,7 +918,6 @@ static void ivas_param_mc_param_est_enc_fx(
                    move32();
                    FOR( ch_idx2 = ch_idx1; ch_idx2 < nchan_input; ++ch_idx2 )
                    {
                        Word16 norm;
                        c_fx = slot_frame_f_real_fx[ch_idx2][cur_cldfb_band];
                        d_fx = slot_frame_f_imag_fx[ch_idx2][cur_cldfb_band];
                        move32();
@@ -869,11 +928,6 @@ static void ivas_param_mc_param_est_enc_fx(
                        Cy_sum_imag_64[cur_param_band][ch_idx1][ch_idx2] = W_add( Cy_sum_imag_64[cur_param_band][ch_idx1][ch_idx2],
                                                                                  W_sub( W_mult0_32_32( a_fx, d_fx ), W_mult0_32_32( b_fx, c_fx ) ) );
                        move64();

                        // convert the 64 bit fixpoint back into the 48 bit float format
                        norm = W_norm( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2] );
                        Cy_sum_fx[cur_param_band][ch_idx1][ch_idx2] = W_extract_h( W_shl( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2], norm ) );
                        Cy_sum_e[cur_param_band][ch_idx1][ch_idx2] = sub( sub( 62, gb ), norm );
                    }
                }
#else
@@ -953,6 +1007,26 @@ static void ivas_param_mc_param_est_enc_fx(

                FOR( ch_idx1 = 0; ch_idx1 < nchan_transport; ++ch_idx1 )
                {
#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE
                    Word64 real_64;
                    Word64 imag_64;

                    real_64 = 0;
                    imag_64 = 0;
                    move64();
                    move64();

                    FOR( inp_ch = 0; inp_ch < nchan_input; inp_ch++ )
                    {
                        real_64 = W_add( real_64, W_mult0_32_32( slot_frame_f_real_fx[inp_ch][cur_cldfb_band], ( *p_dmx_fac_fx ) ) );
                        imag_64 = W_add( imag_64, W_mult0_32_32( slot_frame_f_imag_fx[inp_ch][cur_cldfb_band], ( *p_dmx_fac_fx ) ) );
                        p_dmx_fac_fx++;
                    }
                    dmx_real_64[ch_idx1] = real_64;
                    dmx_imag_64[ch_idx1] = imag_64;
                    move64();
                    move64();
#else
#ifndef IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE
                    dmx_real_fx[ch_idx1] = 0;
                    move32();
@@ -997,12 +1071,18 @@ static void ivas_param_mc_param_est_enc_fx(
                    move32();
                    dmx_imag_e[ch_idx1] = imag_e;
                    move16();
#endif

#endif
                }

                /* Cx for transport channels */
                FOR( ch_idx1 = 0; ch_idx1 < nchan_transport; ++ch_idx1 )
                {
#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE
                    CONVERT_DMX( dmx_real_64[ch_idx1], a_fx, a_e );
                    CONVERT_DMX( dmx_imag_64[ch_idx1], b_fx, b_e );
#else
#ifdef IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE
                    a_fx = dmx_real_fx[ch_idx1];
                    move32();
@@ -1012,9 +1092,20 @@ static void ivas_param_mc_param_est_enc_fx(
                    move32();
                    b_e = dmx_imag_e[ch_idx1];
                    move16();
#endif

#endif
                    FOR( ch_idx2 = 0; ch_idx2 < nchan_transport; ++ch_idx2 )
                    {
#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE
                        CONVERT_DMX( dmx_real_64[ch_idx2], c_fx, c_e );
                        CONVERT_DMX( dmx_imag_64[ch_idx2], d_fx, d_e );

                        /* (a-ib)(c+id) = ac + bd + i(ad-bc) */
                        L_tmp = BASOP_Util_Add_Mant32Exp( Mpy_32_32( a_fx, c_fx ), add( a_e, c_e ), Mpy_32_32( b_fx, d_fx ), add( b_e, d_e ), &tmp_e );
                        Cx_sum_fx[cur_param_band][ch_idx1][ch_idx2] = BASOP_Util_Add_Mant32Exp( Cx_sum_fx[cur_param_band][ch_idx1][ch_idx2], Cx_sum_e[cur_param_band][ch_idx1][ch_idx2], L_tmp, tmp_e,
                                                                                                &Cx_sum_e[cur_param_band][ch_idx1][ch_idx2] );
#else
#ifndef IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE
                        a_fx = dmx_real_fx[ch_idx1];
                        move32();
@@ -1042,6 +1133,7 @@ static void ivas_param_mc_param_est_enc_fx(
                        L_tmp = BASOP_Util_Add_Mant32Exp( Mpy_32_32( a_fx, dmx_real_fx[ch_idx2] ), add( a_e, dmx_real_e[ch_idx2] ), Mpy_32_32( b_fx, dmx_imag_fx[ch_idx2] ), add( b_e, dmx_imag_e[ch_idx2] ), &tmp_e );
                        Cx_sum_fx[cur_param_band][ch_idx1][ch_idx2] = BASOP_Util_Add_Mant32Exp( Cx_sum_fx[cur_param_band][ch_idx1][ch_idx2], Cx_sum_e[cur_param_band][ch_idx1][ch_idx2], L_tmp, tmp_e,
                                                                                                &Cx_sum_e[cur_param_band][ch_idx1][ch_idx2] );
#endif
#endif
                        move32();
                    }
@@ -1075,7 +1167,6 @@ static void ivas_param_mc_param_est_enc_fx(
                    move32();
                    FOR( ch_idx2 = ch_idx1; ch_idx2 < nchan_input; ++ch_idx2 )
                    {
                        Word16 norm;
                        c_fx = slot_frame_f_real_fx[ch_idx2][cur_cldfb_band];
                        d_fx = slot_frame_f_imag_fx[ch_idx2][cur_cldfb_band];
                        move32();
@@ -1084,10 +1175,6 @@ static void ivas_param_mc_param_est_enc_fx(
                        Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2] = W_add( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2],
                                                                                  W_add( W_mult0_32_32( a_fx, c_fx ), W_mult0_32_32( b_fx, d_fx ) ) );
                        move64();
                        // convert the 64 bit fixpoint back into the 48 bit float format
                        norm = W_norm( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2] );
                        Cy_sum_fx[cur_param_band][ch_idx1][ch_idx2] = W_extract_h( W_shl( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2], norm ) );
                        Cy_sum_e[cur_param_band][ch_idx1][ch_idx2] = sub( sub( 62, gb ), norm );
                    }
#else
                    FOR( ch_idx2 = ch_idx1; ch_idx2 < nchan_input; ++ch_idx2 )
@@ -1135,14 +1222,6 @@ static void ivas_param_mc_param_est_enc_fx(
        {
            FOR( ch_idx1 = 0; ch_idx1 < nchan_input; ++ch_idx1 )
            {
                Cy_sum_fx[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
                move32();
                Cy_sum_e[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
                move16();
                Cy_sum_fx[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0;
                move32();
                Cy_sum_e[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0;
                move16();
#ifdef MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE
                Cy_sum_real_64[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
                move64();
@@ -1153,6 +1232,14 @@ static void ivas_param_mc_param_est_enc_fx(
                Cy_sum_imag_64[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0;
                move64();
#else
                Cy_sum_fx[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
                move32();
                Cy_sum_e[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
                move16();
                Cy_sum_fx[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0;
                move32();
                Cy_sum_e[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0;
                move16();
                Cy_sum_imag_fx[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
                move32();
                Cy_sum_imag_e[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
@@ -1174,7 +1261,7 @@ static void ivas_param_mc_param_est_enc_fx(
                move64();
                Cy_sum_real_64[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0;
                move64();
#endif
#else
                Cy_sum_fx[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
                move32();
                Cy_sum_e[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
@@ -1183,6 +1270,7 @@ static void ivas_param_mc_param_est_enc_fx(
                move32();
                Cy_sum_e[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0;
                move16();
#endif
            }
        }
    }
@@ -1220,11 +1308,9 @@ static void ivas_param_mc_param_est_enc_fx(
                FOR( k = 0; k < nchan_input; ++k )
                {
#ifdef MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE
                    Word16 norm;
                    // convert the 64 bit fixpoint back into the 48 bit float format
                    norm = W_norm( Cy_sum_real_64[cur_param_band][k][k] );
                    Nrg_fx[k] = W_extract_h( W_shl( Cy_sum_real_64[cur_param_band][k][k], norm ) );
                    Nrg_e[k] = sub( sub( 62, gb ), norm );
                    CONVERT_CY( Cy_sum_real_64[cur_param_band][k][k], Nrg_fx[k], Nrg_e[k] );
                    move32();
                    move16();
#else
                    Nrg_fx[k] = Cy_sum_fx[cur_param_band][k][k];
                    move32();
@@ -1361,10 +1447,9 @@ static void ivas_param_mc_param_est_enc_fx(
            {
                FOR( ch_idx2 = 0; ch_idx2 < MAX_CICP_CHANNELS; ch_idx2++ )
                {
                    Word16 norm;
                    norm = W_norm( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2] );
                    Cy_sum_fx[cur_param_band][ch_idx1][ch_idx2] = W_extract_h( W_shl( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2], norm ) );
                    Cy_sum_e[cur_param_band][ch_idx1][ch_idx2] = sub( sub( 62, gb ), norm );
                    CONVERT_CY( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2], Cy_sum_fx[cur_param_band][ch_idx1][ch_idx2], Cy_sum_e[cur_param_band][ch_idx1][ch_idx2] );
                    move32();
                    move16();
                }
            }
        }
@@ -1407,11 +1492,9 @@ static void ivas_param_mc_param_est_enc_fx(
            FOR( ch_idx2 = ch_idx1; ch_idx2 < nchan_input; ch_idx2++ )
            {
#ifdef MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE
                Word16 norm;
                // convert the 64 bit fixpoint back into the 48 bit float format
                norm = W_norm( Cy_sum_imag_64[cur_param_band][ch_idx1][ch_idx2] );
                imag_part_fx = W_extract_h( W_shl( Cy_sum_imag_64[cur_param_band][ch_idx1][ch_idx2], norm ) );
                imag_part_e = sub( sub( 62, gb ), norm );
                CONVERT_CY( Cy_sum_imag_64[cur_param_band][ch_idx1][ch_idx2], imag_part_fx, imag_part_e );
                move32();
                move16();
#else
                imag_part_fx = Cy_sum_imag_fx[cur_param_band][ch_idx1][ch_idx2];
                move32();