Merge branch '1519-improve-wmops-performance-of-ivas_param_mc_param_est_enc_fx-step-2' into 'main' (0ae7f52d) · Commits · SA4 / Audio / IVAS BASOP

lib_com/options.h

+1 −0

Original line number	Diff line number	Diff line
		@@ -86,6 +86,7 @@
		//#define HARM_SCE_INIT
		#define DIV32_OPT_NEWTON /* FhG: faster 32 by 32 bit division */
		#define MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE /* FhG: reduce WMOPS of Cy calculation in ivas_param_mc_param_est_enc_fx() by using 64 Bit addition. Obsoletes IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_NONBE. */
		#define MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE /* FhG: reduce WMOPS of dmx calculation in ivas_param_mc_param_est_enc_fx() by using 64 Bit addition. Requires MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE. */
		#define MERGE_REQUEST_1564_SPEEDUP_ivas_dirac_dec_output_synthesis_cov_param_mc_synthesise_slot_fx_NONBE /* FhG: reduce WMOPS by inlining the matrix multiplications for the smoothing operation. */
		#define FIX_1439_SPEEDUP_Copy_Scale_sig_16_32_no_sat /* FhG: reduces WMOPS - bit-exact */
		#define FIX_1439_SPEEDUP_stereo_icBWE_dec_fx /* FhG: reduces WMOPS - bit-exact */

lib_enc/ivas_mc_param_enc_fx.c

+119 −36

Original line number	Diff line number	Diff line
		@@ -30,6 +30,21 @@

		*******************************************************************************************************/

		// helper macros to convert the 64 bit accumulators into the 48 bit float format
		/*
		 * CONVERT_CY( x_64, y_fx, y_e )
		 *
		 * Converts a 64 bit accumulator into a 32 bit mantissa / 16 bit exponent pair.
		 *
		 *   x_64 : (in)  Word64 accumulator; evaluated twice, so it must not have side effects
		 *   y_fx : (out) lvalue receiving the upper 32 bits of x_64 after it has been
		 *                shifted left by W_norm( x_64 )
		 *   y_e  : (out) lvalue receiving sub62gb minus that shift count
		 *
		 * The expansion site must have a Word16 variable named `sub62gb` in scope.
		 * The body is wrapped in do { } while ( 0 ) so that `CONVERT_CY( ... );` is a
		 * single statement and stays valid inside an unbraced if / else.
		 */
		#define CONVERT_CY( x_64, y_fx, y_e ) \
		do \
		{ \
		Word16 cy_norm_tmp; /* scratch name chosen to avoid shadowing a caller's `norm` */ \
		cy_norm_tmp = W_norm( ( x_64 ) ); \
		y_fx = W_extract_h( W_shl( ( x_64 ), cy_norm_tmp ) ); \
		y_e = sub( sub62gb, cy_norm_tmp ); \
		} while ( 0 )
		/*
		 * CONVERT_DMX( x_64, y_fx, y_e )
		 *
		 * Converts a 64 bit downmix accumulator into a 32 bit mantissa / 16 bit exponent pair.
		 *
		 *   x_64 : (in)  Word64 accumulator; evaluated twice, so it must not have side effects
		 *   y_fx : (out) lvalue receiving the upper 32 bits of x_64 after it has been
		 *                shifted left by W_norm( x_64 )
		 *   y_e  : (out) lvalue receiving sub35gb minus that shift count
		 *
		 * The expansion site must have a Word16 variable named `sub35gb` in scope.
		 * The body is wrapped in do { } while ( 0 ) so that `CONVERT_DMX( ... );` is a
		 * single statement and stays valid inside an unbraced if / else.
		 */
		#define CONVERT_DMX( x_64, y_fx, y_e ) \
		do \
		{ \
		Word16 dmx_norm_tmp; /* scratch name chosen to avoid shadowing a caller's `norm` */ \
		dmx_norm_tmp = W_norm( ( x_64 ) ); \
		y_fx = W_extract_h( W_shl( ( x_64 ), dmx_norm_tmp ) ); \
		y_e = sub( sub35gb, dmx_norm_tmp ); \
		} while ( 0 )
		#include <math.h>
		#include <assert.h>
		#include "options.h"
		@@ -649,15 +664,22 @@ static void ivas_param_mc_param_est_enc_fx(
		Word32 p_slot_frame_f_real_fx[MAX_CICP_CHANNELS]; /* Output of the MDFT FB - real part */
		Word32 p_slot_frame_f_imag_fx[MAX_CICP_CHANNELS]; /* Output of the MDFT FB - imag part */

		#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE
		Word64 dmx_real_64[PARAM_MC_MAX_TRANSPORT_CHANS];
		Word64 dmx_imag_64[PARAM_MC_MAX_TRANSPORT_CHANS];
		#else
		Word32 dmx_real_fx[PARAM_MC_MAX_TRANSPORT_CHANS]; /* Downmix channel - Real Part */
		Word16 dmx_real_e[PARAM_MC_MAX_TRANSPORT_CHANS]; /* Downmix channel - Real Part */
		Word32 dmx_imag_fx[PARAM_MC_MAX_TRANSPORT_CHANS]; /* Downmix channel - Imag Part */
		Word16 dmx_imag_e[PARAM_MC_MAX_TRANSPORT_CHANS]; /* Downmix channel - Imag Part */
		#endif
		Word32 a_fx, b_fx, c_fx, d_fx; /* Tmp complex values */
		Word16 a_e, b_e, c_e, d_e; /* Tmp complex values */
		#ifdef MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE
		Word64 Cy_sum_real_64[PARAM_MC_MAX_PARAMETER_BANDS][MAX_CICP_CHANNELS][MAX_CICP_CHANNELS];
		Word64 Cy_sum_imag_64[PARAM_MC_MAX_PARAM_BAND_ABS_COV_ENC][MAX_CICP_CHANNELS][MAX_CICP_CHANNELS];
		Word16 sub62gb;
		Word16 sub35gb;
		#else
		Word32 Cy_sum_imag_fx[PARAM_MC_MAX_PARAM_BAND_ABS_COV_ENC][MAX_CICP_CHANNELS][MAX_CICP_CHANNELS];
		Word16 Cy_sum_imag_e[PARAM_MC_MAX_PARAM_BAND_ABS_COV_ENC][MAX_CICP_CHANNELS][MAX_CICP_CHANNELS];
		@@ -740,9 +762,16 @@ static void ivas_param_mc_param_est_enc_fx(

		#if defined( IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE ) || defined( IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_NONBE )
		Word16 gb = find_guarded_bits_fx( l_ts );
		#ifndef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE
		Word16 add20gb = add( 20, gb );
		#endif

		#endif
		#ifdef MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE
		sub35gb = sub( 35, find_guarded_bits_fx( l_ts ) );
		sub62gb = sub( 62, find_guarded_bits_fx( l_ts ) );
		#endif

		FOR( ts = start_ts; ts < num_time_slots; ts++ )
		{
		#if !defined( IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE ) && !defined( IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_NONBE )
		@@ -774,6 +803,26 @@ static void ivas_param_mc_param_est_enc_fx(

		FOR( ch_idx1 = 0; ch_idx1 < nchan_transport; ++ch_idx1 )
		{
		#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE
		Word64 real_64;
		Word64 imag_64;

		real_64 = 0;
		imag_64 = 0;
		move64();
		move64();
		FOR( inp_ch = 0; inp_ch < nchan_input; inp_ch++ )
		{
		real_64 = W_add( real_64, W_mult0_32_32( slot_frame_f_real_fx[inp_ch][cur_cldfb_band], ( *p_dmx_fac_fx ) ) );
		imag_64 = W_add( imag_64, W_mult0_32_32( slot_frame_f_imag_fx[inp_ch][cur_cldfb_band], ( *p_dmx_fac_fx ) ) );
		p_dmx_fac_fx++;
		}
		dmx_real_64[ch_idx1] = real_64;
		dmx_imag_64[ch_idx1] = imag_64;
		move64();
		move64();

		#else
		#ifndef IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE
		dmx_real_fx[ch_idx1] = 0;
		move32();
		@@ -814,14 +863,24 @@ static void ivas_param_mc_param_est_enc_fx(
		move16();
		move32();
		move16();
		#endif

		#endif
		}

		/* Cx for transport channels */
		FOR( ch_idx1 = 0; ch_idx1 < nchan_transport; ++ch_idx1 )
		{
		#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE
		CONVERT_DMX( dmx_real_64[ch_idx1], a_fx, a_e );
		CONVERT_DMX( dmx_imag_64[ch_idx1], b_fx, b_e );
		#endif
		FOR( ch_idx2 = 0; ch_idx2 < nchan_transport; ++ch_idx2 )
		{
		#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE
		CONVERT_DMX( dmx_real_64[ch_idx2], c_fx, c_e );
		CONVERT_DMX( dmx_imag_64[ch_idx2], d_fx, d_e );
		#else
		a_fx = dmx_real_fx[ch_idx1];
		move32();
		a_e = dmx_real_e[ch_idx1];
		@@ -838,6 +897,7 @@ static void ivas_param_mc_param_est_enc_fx(
		move32();
		d_e = dmx_imag_e[ch_idx2];
		move16();
		#endif

		/* (a-ib)(c+id) = ac + bd + i(ad-bc) */
		L_tmp = BASOP_Util_Add_Mant32Exp( Mpy_32_32( a_fx, c_fx ), add( a_e, c_e ), Mpy_32_32( b_fx, d_fx ), add( b_e, d_e ), &tmp_e );
		@@ -858,7 +918,6 @@ static void ivas_param_mc_param_est_enc_fx(
		move32();
		FOR( ch_idx2 = ch_idx1; ch_idx2 < nchan_input; ++ch_idx2 )
		{
		Word16 norm;
		c_fx = slot_frame_f_real_fx[ch_idx2][cur_cldfb_band];
		d_fx = slot_frame_f_imag_fx[ch_idx2][cur_cldfb_band];
		move32();
		@@ -869,11 +928,6 @@ static void ivas_param_mc_param_est_enc_fx(
		Cy_sum_imag_64[cur_param_band][ch_idx1][ch_idx2] = W_add( Cy_sum_imag_64[cur_param_band][ch_idx1][ch_idx2],
		W_sub( W_mult0_32_32( a_fx, d_fx ), W_mult0_32_32( b_fx, c_fx ) ) );
		move64();

		// convert the 64 bit fixpoint back into the 48 bit float format
		norm = W_norm( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2] );
		Cy_sum_fx[cur_param_band][ch_idx1][ch_idx2] = W_extract_h( W_shl( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2], norm ) );
		Cy_sum_e[cur_param_band][ch_idx1][ch_idx2] = sub( sub( 62, gb ), norm );
		}
		}
		#else
		@@ -953,6 +1007,26 @@ static void ivas_param_mc_param_est_enc_fx(

		FOR( ch_idx1 = 0; ch_idx1 < nchan_transport; ++ch_idx1 )
		{
		#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE
		Word64 real_64;
		Word64 imag_64;

		real_64 = 0;
		imag_64 = 0;
		move64();
		move64();

		FOR( inp_ch = 0; inp_ch < nchan_input; inp_ch++ )
		{
		real_64 = W_add( real_64, W_mult0_32_32( slot_frame_f_real_fx[inp_ch][cur_cldfb_band], ( *p_dmx_fac_fx ) ) );
		imag_64 = W_add( imag_64, W_mult0_32_32( slot_frame_f_imag_fx[inp_ch][cur_cldfb_band], ( *p_dmx_fac_fx ) ) );
		p_dmx_fac_fx++;
		}
		dmx_real_64[ch_idx1] = real_64;
		dmx_imag_64[ch_idx1] = imag_64;
		move64();
		move64();
		#else
		#ifndef IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE
		dmx_real_fx[ch_idx1] = 0;
		move32();
		@@ -997,12 +1071,18 @@ static void ivas_param_mc_param_est_enc_fx(
		move32();
		dmx_imag_e[ch_idx1] = imag_e;
		move16();
		#endif

		#endif
		}

		/* Cx for transport channels */
		FOR( ch_idx1 = 0; ch_idx1 < nchan_transport; ++ch_idx1 )
		{
		#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE
		CONVERT_DMX( dmx_real_64[ch_idx1], a_fx, a_e );
		CONVERT_DMX( dmx_imag_64[ch_idx1], b_fx, b_e );
		#else
		#ifdef IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE
		a_fx = dmx_real_fx[ch_idx1];
		move32();
		@@ -1012,9 +1092,20 @@ static void ivas_param_mc_param_est_enc_fx(
		move32();
		b_e = dmx_imag_e[ch_idx1];
		move16();
		#endif

		#endif
		FOR( ch_idx2 = 0; ch_idx2 < nchan_transport; ++ch_idx2 )
		{
		#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE
		CONVERT_DMX( dmx_real_64[ch_idx2], c_fx, c_e );
		CONVERT_DMX( dmx_imag_64[ch_idx2], d_fx, d_e );

		/* (a-ib)(c+id) = ac + bd + i(ad-bc) */
		L_tmp = BASOP_Util_Add_Mant32Exp( Mpy_32_32( a_fx, c_fx ), add( a_e, c_e ), Mpy_32_32( b_fx, d_fx ), add( b_e, d_e ), &tmp_e );
		Cx_sum_fx[cur_param_band][ch_idx1][ch_idx2] = BASOP_Util_Add_Mant32Exp( Cx_sum_fx[cur_param_band][ch_idx1][ch_idx2], Cx_sum_e[cur_param_band][ch_idx1][ch_idx2], L_tmp, tmp_e,
		&Cx_sum_e[cur_param_band][ch_idx1][ch_idx2] );
		#else
		#ifndef IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE
		a_fx = dmx_real_fx[ch_idx1];
		move32();
		@@ -1042,6 +1133,7 @@ static void ivas_param_mc_param_est_enc_fx(
		L_tmp = BASOP_Util_Add_Mant32Exp( Mpy_32_32( a_fx, dmx_real_fx[ch_idx2] ), add( a_e, dmx_real_e[ch_idx2] ), Mpy_32_32( b_fx, dmx_imag_fx[ch_idx2] ), add( b_e, dmx_imag_e[ch_idx2] ), &tmp_e );
		Cx_sum_fx[cur_param_band][ch_idx1][ch_idx2] = BASOP_Util_Add_Mant32Exp( Cx_sum_fx[cur_param_band][ch_idx1][ch_idx2], Cx_sum_e[cur_param_band][ch_idx1][ch_idx2], L_tmp, tmp_e,
		&Cx_sum_e[cur_param_band][ch_idx1][ch_idx2] );
		#endif
		#endif
		move32();
		}
		@@ -1075,7 +1167,6 @@ static void ivas_param_mc_param_est_enc_fx(
		move32();
		FOR( ch_idx2 = ch_idx1; ch_idx2 < nchan_input; ++ch_idx2 )
		{
		Word16 norm;
		c_fx = slot_frame_f_real_fx[ch_idx2][cur_cldfb_band];
		d_fx = slot_frame_f_imag_fx[ch_idx2][cur_cldfb_band];
		move32();
		@@ -1084,10 +1175,6 @@ static void ivas_param_mc_param_est_enc_fx(
		Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2] = W_add( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2],
		W_add( W_mult0_32_32( a_fx, c_fx ), W_mult0_32_32( b_fx, d_fx ) ) );
		move64();
		// convert the 64 bit fixpoint back into the 48 bit float format
		norm = W_norm( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2] );
		Cy_sum_fx[cur_param_band][ch_idx1][ch_idx2] = W_extract_h( W_shl( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2], norm ) );
		Cy_sum_e[cur_param_band][ch_idx1][ch_idx2] = sub( sub( 62, gb ), norm );
		}
		#else
		FOR( ch_idx2 = ch_idx1; ch_idx2 < nchan_input; ++ch_idx2 )
		@@ -1135,14 +1222,6 @@ static void ivas_param_mc_param_est_enc_fx(
		{
		FOR( ch_idx1 = 0; ch_idx1 < nchan_input; ++ch_idx1 )
		{
		Cy_sum_fx[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
		move32();
		Cy_sum_e[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
		move16();
		Cy_sum_fx[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0;
		move32();
		Cy_sum_e[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0;
		move16();
		#ifdef MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE
		Cy_sum_real_64[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
		move64();
		@@ -1153,6 +1232,14 @@ static void ivas_param_mc_param_est_enc_fx(
		Cy_sum_imag_64[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0;
		move64();
		#else
		Cy_sum_fx[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
		move32();
		Cy_sum_e[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
		move16();
		Cy_sum_fx[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0;
		move32();
		Cy_sum_e[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0;
		move16();
		Cy_sum_imag_fx[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
		move32();
		Cy_sum_imag_e[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
		@@ -1174,7 +1261,7 @@ static void ivas_param_mc_param_est_enc_fx(
		move64();
		Cy_sum_real_64[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0;
		move64();
		#endif
		#else
		Cy_sum_fx[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
		move32();
		Cy_sum_e[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0;
		@@ -1183,6 +1270,7 @@ static void ivas_param_mc_param_est_enc_fx(
		move32();
		Cy_sum_e[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0;
		move16();
		#endif
		}
		}
		}
		@@ -1220,11 +1308,9 @@ static void ivas_param_mc_param_est_enc_fx(
		FOR( k = 0; k < nchan_input; ++k )
		{
		#ifdef MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE
		Word16 norm;
		// convert the 64 bit fixpoint back into the 48 bit float format
		norm = W_norm( Cy_sum_real_64[cur_param_band][k][k] );
		Nrg_fx[k] = W_extract_h( W_shl( Cy_sum_real_64[cur_param_band][k][k], norm ) );
		Nrg_e[k] = sub( sub( 62, gb ), norm );
		CONVERT_CY( Cy_sum_real_64[cur_param_band][k][k], Nrg_fx[k], Nrg_e[k] );
		move32();
		move16();
		#else
		Nrg_fx[k] = Cy_sum_fx[cur_param_band][k][k];
		move32();
		@@ -1361,10 +1447,9 @@ static void ivas_param_mc_param_est_enc_fx(
		{
		FOR( ch_idx2 = 0; ch_idx2 < MAX_CICP_CHANNELS; ch_idx2++ )
		{
		Word16 norm;
		norm = W_norm( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2] );
		Cy_sum_fx[cur_param_band][ch_idx1][ch_idx2] = W_extract_h( W_shl( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2], norm ) );
		Cy_sum_e[cur_param_band][ch_idx1][ch_idx2] = sub( sub( 62, gb ), norm );
		CONVERT_CY( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2], Cy_sum_fx[cur_param_band][ch_idx1][ch_idx2], Cy_sum_e[cur_param_band][ch_idx1][ch_idx2] );
		move32();
		move16();
		}
		}
		}
		@@ -1407,11 +1492,9 @@ static void ivas_param_mc_param_est_enc_fx(
		FOR( ch_idx2 = ch_idx1; ch_idx2 < nchan_input; ch_idx2++ )
		{
		#ifdef MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE
		Word16 norm;
		// convert the 64 bit fixpoint back into the 48 bit float format
		norm = W_norm( Cy_sum_imag_64[cur_param_band][ch_idx1][ch_idx2] );
		imag_part_fx = W_extract_h( W_shl( Cy_sum_imag_64[cur_param_band][ch_idx1][ch_idx2], norm ) );
		imag_part_e = sub( sub( 62, gb ), norm );
		CONVERT_CY( Cy_sum_imag_64[cur_param_band][ch_idx1][ch_idx2], imag_part_fx, imag_part_e );
		move32();
		move16();
		#else
		imag_part_fx = Cy_sum_imag_fx[cur_param_band][ch_idx1][ch_idx2];
		move32();