From a81a25447c601e70ae7152117d5fe5a924714c56 Mon Sep 17 00:00:00 2001 From: Sandesh Venkatesh Date: Tue, 1 Jul 2025 16:34:29 +0530 Subject: [PATCH] Multichannel decoder path optimizations - bit exact --- lib_com/options.h | 1 + lib_dec/ivas_dirac_output_synthesis_cov_fx.c | 30 ++++++++- lib_dec/ivas_svd_dec_fx.c | 19 ++++-- lib_rend/ivas_dirac_decorr_dec_fx.c | 69 +++++++++++++++++++- 4 files changed, 112 insertions(+), 7 deletions(-) diff --git a/lib_com/options.h b/lib_com/options.h index 450b94c16..0fda6f452 100644 --- a/lib_com/options.h +++ b/lib_com/options.h @@ -77,6 +77,7 @@ /* Note: each compile switch (FIX_1101_...) is independent from the other ones */ +#define OPT_MCH_DEC_V1_BE #define OPT_MCT_ENC_V2_NBE #define OPT_SBA_DEC_V2_NBE #define OPT_MCT_ENC_V1_NBE diff --git a/lib_dec/ivas_dirac_output_synthesis_cov_fx.c b/lib_dec/ivas_dirac_output_synthesis_cov_fx.c index 4869f8d1c..ea03e1ddb 100644 --- a/lib_dec/ivas_dirac_output_synthesis_cov_fx.c +++ b/lib_dec/ivas_dirac_output_synthesis_cov_fx.c @@ -895,9 +895,13 @@ Word16 computeMixingMatrices_fx( } } - L_tmp = Mpy_32_32( limit_fx, reg_Sx_fx ); limit_e = add( limit_e, reg_Sx_e ); +#ifdef OPT_MCH_DEC_V1_BE + limit_fx = Madd_32_32( EPSILON_FX, limit_fx, reg_Sx_fx ); +#else /* OPT_MCH_DEC_V1_BE */ + L_tmp = Mpy_32_32( limit_fx, reg_Sx_fx ); limit_fx = L_add( L_tmp, EPSILON_FX ); +#endif /* OPT_MCH_DEC_V1_BE */ FOR( i = 0; i < lengthCx; ++i ) { @@ -956,8 +960,12 @@ Word16 computeMixingMatrices_fx( move16(); } } +#ifdef OPT_MCH_DEC_V1_BE + limit_fx = Madd_32_32( EPSILON_FX, limit_fx, reg_ghat_fx ); // limit_e+ reg_ghat_e +#else /* OPT_MCH_DEC_V1_BE */ L_tmp = Mpy_32_32( limit_fx, reg_ghat_fx ); // limit_e+ reg_ghat_e limit_fx = L_add( L_tmp, EPSILON_FX ); +#endif /* OPT_MCH_DEC_V1_BE */ limit_e = add( limit_e, reg_ghat_e ); FOR( i = 0; i < lengthCy; ++i ) @@ -1436,8 +1444,12 @@ Word16 computeMixingMatricesResidual_fx( } } +#ifdef OPT_MCH_DEC_V1_BE + limit_fx = Madd_32_32( EPSILON_FX, limit_fx, reg_ghat_fx ); // Q(limit_e+reg_ghat_e) +#else /* OPT_MCH_DEC_V1_BE */ L_tmp = Mpy_32_32( limit_fx, reg_ghat_fx ); // Q(limit_e+reg_ghat_e) limit_fx = L_add( L_tmp, EPSILON_FX ); +#endif /* OPT_MCH_DEC_V1_BE */ limit_e = add( limit_e, reg_ghat_e ); /* Computing G_hat */ @@ -1468,8 +1480,12 @@ Word16 computeMixingMatricesResidual_fx( FOR( i = 0; i < num_outputs; i++ ) { +#ifdef OPT_MCH_DEC_V1_BE + Kx_fx[i] = Mpy_32_32( Kx_fx[i], G_hat_fx[i] ); // Q(31-(Kx_fx_e+G_hag_e)) +#else /* OPT_MCH_DEC_V1_BE */ L_tmp = Mpy_32_32( Kx_fx[i], G_hat_fx[i] ); // Q(31-(Kx_fx_e+G_hag_e)) Kx_fx[i] = L_tmp; +#endif /* OPT_MCH_DEC_V1_BE */ move32(); Kx_fx_e[i] = add( Kx_fx_e[i], G_hat_e[i] ); move16(); @@ -1483,11 +1499,18 @@ Word16 computeMixingMatricesResidual_fx( FOR( j = 0; j < num_outputs; j++ ) { +#ifdef OPT_MCH_DEC_V1_BE + mat_mult_buffer1_fx[i + j * num_outputs] = Mpy_32_32( Ky_fx[i + j * num_outputs], fac_fx ); // Q(31-(Ky_fx_e+Kx_fx_e)); + move32(); + mat_mult_buffer1_buff_e[i + j * num_outputs] = add( Ky_fx_e[i + j * num_outputs], Kx_fx_e[i] ); + move16(); +#else /* OPT_MCH_DEC_V1_BE */ L_tmp = Mpy_32_32( Ky_fx[i + j * num_outputs], fac_fx ); // Q(31-(Ky_fx_e+Kx_fx_e)) mat_mult_buffer1_fx[i + j * num_outputs] = L_tmp; move32(); mat_mult_buffer1_buff_e[i + j * num_outputs] = extract_l( L_add( Ky_fx_e[i + j * num_outputs], Kx_fx_e[i] ) ); move16(); +#endif /* OPT_MCH_DEC_V1_BE */ } } @@ -1543,9 +1566,14 @@ Word16 computeMixingMatricesResidual_fx( FOR( j = 0; j < num_outputs; j++ ) { +#ifdef OPT_MCH_DEC_V1_BE + mixing_matrix_fx[j + i * num_outputs] = Mpy_32_32( mat_mult_buffer1_fx[j + i * num_outputs], fac_fx ); // Q(31-mat_mult_buffer1_e+Kx_reg_inv_e); + move32(); +#else /* OPT_MCH_DEC_V1_BE */ L_tmp = Mpy_32_32( mat_mult_buffer1_fx[j + i * num_outputs], fac_fx ); // Q(31-mat_mult_buffer1_e+Kx_reg_inv_e) mixing_matrix_fx[j + i * num_outputs] = L_tmp; move32(); +#endif /* OPT_MCH_DEC_V1_BE */ mixing_matrix_fx_e[j + i * num_outputs] = add( mat_mult_buffer1_buff_e[j + i * num_outputs], Kx_reg_inv_e[i] ); move16(); } diff --git a/lib_dec/ivas_svd_dec_fx.c b/lib_dec/ivas_svd_dec_fx.c index 4b0522884..2afe48db6 100644 --- a/lib_dec/ivas_svd_dec_fx.c +++ b/lib_dec/ivas_svd_dec_fx.c @@ -779,6 +779,9 @@ static void ApplyRotation_fx( move16(); } op_e = add( op_e, 1 ); // 64 bit mac -> +1 +#ifdef OPT_MCH_DEC_V1_BE + op_e = negate( op_e ); +#endif /* OPT_MCH_DEC_V1_BE */ FOR( ch = 0; ch < nChannels; ch++ ) { @@ -788,13 +791,21 @@ static void ApplyRotation_fx( move32(); Word64 temp = W_mac_32_32( W_mult_32_32( op1, x11 ), op2, x12 ); // Q(singularVector) + op_e - temp = W_shr( temp, op_e ); // Q(singularVector) - singularVector[ch][currentIndex2] = W_sat_l( temp ); // Q(singularVector) +#ifdef OPT_MCH_DEC_V1_BE + singularVector[ch][currentIndex2] = W_shl_sat_l( temp, op_e ); // Q(singularVector) +#else /* OPT_MCH_DEC_V1_BE */ + temp = W_shr( temp, op_e ); // Q(singularVector) + singularVector[ch][currentIndex2] = W_sat_l( temp ); // Q(singularVector) +#endif /* OPT_MCH_DEC_V1_BE */ move32(); temp = W_mac_32_32( W_mult_32_32( op1, x12 ), L_negate( op2 ), x11 ); // Q(singularVector) + op_e - temp = W_shr( temp, op_e ); // Q(singularVector) - singularVector[ch][currentIndex1] = W_sat_l( temp ); // Q(singularVector) +#ifdef OPT_MCH_DEC_V1_BE + singularVector[ch][currentIndex1] = W_shl_sat_l( temp, op_e ); // Q(singularVector) +#else /* OPT_MCH_DEC_V1_BE */ + temp = W_shr( temp, op_e ); // Q(singularVector) + singularVector[ch][currentIndex1] = W_sat_l( temp ); // Q(singularVector) +#endif /* OPT_MCH_DEC_V1_BE */ move32(); } diff --git a/lib_rend/ivas_dirac_decorr_dec_fx.c b/lib_rend/ivas_dirac_decorr_dec_fx.c index aa4aed84c..b7c0ebbbc 100644 --- a/lib_rend/ivas_dirac_decorr_dec_fx.c +++ b/lib_rend/ivas_dirac_decorr_dec_fx.c @@ -485,11 +485,19 @@ void ivas_dirac_dec_decorr_process_fx( set32_fx( aux_buffer_fx, 0, 2 * MAX_OUTPUT_CHANNELS * CLDFB_NO_CHANNELS_MAX ); FOR( ch_idx = 0; ch_idx < num_protos_dir; ch_idx++ ) { - v_shr( &input_frame_fx[imult1616( 2, imult1616( ch_idx, num_freq_bands ) )], negate( q_shift ), &aux_buffer_fx[imult1616( 2, imult1616( ch_idx, num_freq_bands ) )], imult1616( 2, num_freq_bands ) ); // Q - q_shift +#ifdef OPT_MCH_DEC_V1_BE + v_shr( &input_frame_fx[2 * ch_idx * num_freq_bands], negate( q_shift ), &aux_buffer_fx[2 * ch_idx * num_freq_bands], imult1616( 2, num_freq_bands ) ); // Q - q_shift +#else /* OPT_MCH_DEC_V1_BE */ + v_shr( &input_frame_fx[imult1616( 2, imult1616( ch_idx, num_freq_bands ) )], negate( q_shift ), &aux_buffer_fx[imult1616( 2, imult1616( ch_idx, num_freq_bands ) )], imult1616( 2, num_freq_bands ) ); // Q - q_shift +#endif /* OPT_MCH_DEC_V1_BE */ } FOR( ch_idx = 0; ch_idx < num_protos_dir; ch_idx++ ) { +#ifdef OPT_MCH_DEC_V1_BE + v_mult_fixed( &aux_buffer_fx[2 * ch_idx * num_freq_bands], &aux_buffer_fx[2 * ch_idx * num_freq_bands], &aux_buffer_fx[2 * ch_idx * max_band_decorr_temp], imult1616( 2, max_band_decorr_temp ) ); // q_aux_buffer +#else /* OPT_MCH_DEC_V1_BE */ v_mult_fixed( &aux_buffer_fx[imult1616( 2, imult1616( ch_idx, num_freq_bands ) )], &aux_buffer_fx[imult1616( 2, imult1616( ch_idx, num_freq_bands ) )], &aux_buffer_fx[imult1616( 2, imult1616( ch_idx, max_band_decorr_temp ) )], imult1616( 2, max_band_decorr_temp ) ); // q_aux_buffer +#endif /* OPT_MCH_DEC_V1_BE */ } q_aux_buffer = sub( add( add( add( q_input_frame, q_input_frame ), q_shift ), q_shift ), 31 ); @@ -506,6 +514,10 @@ void ivas_dirac_dec_decorr_process_fx( max_band_decorr = h_freq_domain_decorr_ap_params->max_band_decorr; move16(); +#ifdef OPT_MCH_DEC_V1_BE + Word16 decorX2 = shl( max_band_decorr, 1 ); +#endif /* OPT_MCH_DEC_V1_BE */ + set32_fx( onset_filter_fx, ONE_IN_Q31, imult1616( num_protos_diff, num_freq_bands ) ); Word16 q_temp = s_min( q_onset_dec, q_aux_buffer ); @@ -678,7 +690,11 @@ void ivas_dirac_dec_decorr_process_fx( move32(); move32(); } +#ifdef OPT_MCH_DEC_V1_BE + decorr_buffer_ptr_fx = decorr_buffer_start_ptr_fx + ( pre_delay - 1 ) * decorr_buffer_step * 2; +#else /* OPT_MCH_DEC_V1_BE */ decorr_buffer_ptr_fx = decorr_buffer_start_ptr_fx + shl( imult1616( ( sub( pre_delay, 1 ) ), decorr_buffer_step ), 1 ); +#endif /* OPT_MCH_DEC_V1_BE */ /*add MA part to state */ decorr_buffer_ptr_fx[0] = L_add( decorr_buffer_ptr_fx[0], frame_ma_fx[0] ); @@ -723,7 +739,10 @@ void ivas_dirac_dec_decorr_process_fx( Word16 q_direct_energy; Word64 aux_64[2 * MAX_OUTPUT_CHANNELS * CLDFB_NO_CHANNELS_MAX]; Word16 e_reverb_energy_smooth, e_direct_energy_smooth; - Word16 offset1, offset2; + Word16 offset1; +#ifndef OPT_MCH_DEC_V1_BE + Word16 offset2; +#endif /* OPT_MCH_DEC_V1_BE */ Word16 norm = 63; move16(); e_reverb_energy_smooth = sub( 31, h_freq_domain_decorr_ap_state->q_reverb_energy_smooth ); @@ -732,6 +751,17 @@ void ivas_dirac_dec_decorr_process_fx( // scaling to get max precision for aux_buffer values// q_shift = Q31; move16(); +#ifdef OPT_MCH_DEC_V1_BE + FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) + { + q_shift = s_min( q_shift, + L_norm_arr( &frame_dec_fx[2 * ch_idx * num_freq_bands], decorX2 ) ); + } + FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) + { + scale_sig32( &frame_dec_fx[2 * ch_idx * num_freq_bands], decorX2, q_shift ); + } +#else /* OPT_MCH_DEC_V1_BE */ offset = shl( max_band_decorr, 1 ); FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) { @@ -742,6 +772,7 @@ void ivas_dirac_dec_decorr_process_fx( { scale_sig32( &frame_dec_fx[2 * ch_idx * num_freq_bands], offset, q_shift ); } +#endif /* OPT_MCH_DEC_V1_BE */ q_frame_f = add( q_frame_f, q_shift ); @@ -773,12 +804,18 @@ void ivas_dirac_dec_decorr_process_fx( Word32 *m32_frame_dec_fx = frame_dec_fx; move32(); offset1 = shl( num_freq_bands, 1 ); +#ifndef OPT_MCH_DEC_V1_BE offset2 = shl( max_band_decorr, 1 ); +#endif /* OPT_MCH_DEC_V1_BE */ FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) { +#ifdef OPT_MCH_DEC_V1_BE + FOR( Word16 i = 0; i < decorX2; i++ ) +#else /* OPT_MCH_DEC_V1_BE */ FOR( Word16 i = 0; i < offset2; i++ ) +#endif /* OPT_MCH_DEC_V1_BE */ { m64_aux[i] = W_mult0_32_32( m32_frame_dec_fx[i], m32_frame_dec_fx[i] ); move64(); @@ -788,19 +825,35 @@ void ivas_dirac_dec_decorr_process_fx( move64(); } } +#ifdef OPT_MCH_DEC_V1_BE + m64_aux += decorX2; +#else /* OPT_MCH_DEC_V1_BE */ m64_aux += offset2; +#endif /* OPT_MCH_DEC_V1_BE */ m32_frame_dec_fx += offset1; move64(); move32(); } norm = W_norm( min64 ); +#ifdef OPT_MCH_DEC_V1_BE + norm = sub( norm, 33 ); +#else /* OPT_MCH_DEC_V1_BE */ norm = sub( norm, 1 /*find_guarded_bits_fx( 2 )*/ ); +#endif /* OPT_MCH_DEC_V1_BE */ FOR( Word16 i = 0; i < 2 * num_channels * max_band_decorr; i++ ) { +#ifdef OPT_MCH_DEC_V1_BE + aux_buffer_fx[i] = W_shl_sat_l( aux_64[i], norm ); +#else /* OPT_MCH_DEC_V1_BE */ aux_buffer_fx[i] = W_extract_h( W_shl( aux_64[i], norm ) ); +#endif /* OPT_MCH_DEC_V1_BE */ move32(); } +#ifdef OPT_MCH_DEC_V1_BE + q_aux_buffer = add( shl( q_frame_f, 1 ), norm ); +#else /* OPT_MCH_DEC_V1_BE */ q_aux_buffer = add( shl( q_frame_f, 1 ), sub( norm, 32 ) ); +#endif /* OPT_MCH_DEC_V1_BE */ FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) @@ -954,7 +1007,11 @@ void ivas_dirac_dec_decorr_process_fx( Word16 sf = MAX_16; FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) { +#ifdef OPT_MCH_DEC_V1_BE + sf = s_min( sf, getScaleFactor32( &frame_dec_fx[2 * ch_idx * num_freq_bands], decorX2 ) ); +#else /* OPT_MCH_DEC_V1_BE */ sf = s_min( sf, getScaleFactor32( &frame_dec_fx[2 * ch_idx * num_freq_bands], shl( max_band_decorr, 1 ) ) ); +#endif /* OPT_MCH_DEC_V1_BE */ } sf = s_min( sub( sf, 1 ), q_shift ); q_if_local = sub( q_shift, sf ); @@ -963,7 +1020,11 @@ void ivas_dirac_dec_decorr_process_fx( // scaling it to sf FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) { +#ifdef OPT_MCH_DEC_V1_BE + scale_sig32( &frame_dec_fx[2 * ch_idx * num_freq_bands], decorX2, q_shift ); +#else /* OPT_MCH_DEC_V1_BE */ scale_sig32( &frame_dec_fx[2 * ch_idx * num_freq_bands], shl( max_band_decorr, 1 ), q_shift ); +#endif /* OPT_MCH_DEC_V1_BE */ } q_frame_f = add( q_frame_f, sf ); } @@ -972,7 +1033,11 @@ void ivas_dirac_dec_decorr_process_fx( // scaling it to input q FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) { +#ifdef OPT_MCH_DEC_V1_BE + scale_sig32( &frame_dec_fx[2 * ch_idx * num_freq_bands], decorX2, q_shift ); +#else /* OPT_MCH_DEC_V1_BE */ scale_sig32( &frame_dec_fx[2 * ch_idx * num_freq_bands], shl( max_band_decorr, 1 ), q_shift ); +#endif /* OPT_MCH_DEC_V1_BE */ } q_frame_f = q_input_frame; q_if_local = 0; -- GitLab