diff --git a/lib_com/options.h b/lib_com/options.h index af3a65d479ac3c15b3e0145c45db77d68e816a2d..737e0850164e52c4baec6b4bc0dd77b96308a2d5 100644 --- a/lib_com/options.h +++ b/lib_com/options.h @@ -86,6 +86,7 @@ //#define HARM_SCE_INIT #define DIV32_OPT_NEWTON /* FhG: faster 32 by 32 bit division */ #define MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE /* FhG: reduce WMOPS of Cy calculation in ivas_param_mc_param_est_enc_fx() by using 64 Bit addition. Obsoletes IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_NONBE. */ +#define MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE /* FhG: reduce WMOPS of dmx calculation in ivas_param_mc_param_est_enc_fx() by using 64 Bit addition. Requires MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE. */ #define MERGE_REQUEST_1564_SPEEDUP_ivas_dirac_dec_output_synthesis_cov_param_mc_synthesise_slot_fx_NONBE /* FhG: reduce WMOPS by inlining the matrix multiplications for the smoothing operation. */ #define FIX_1439_SPEEDUP_Copy_Scale_sig_16_32_no_sat /*FhG: reduces WMOPS - bit-exact*/ #define FIX_1439_SPEEDUP_stereo_icBWE_dec_fx /*FhG: reduces WMOPS - bit-exact*/ diff --git a/lib_enc/ivas_mc_param_enc_fx.c b/lib_enc/ivas_mc_param_enc_fx.c index 0c996b25a1355a5bb6a73ca6f5559d0a11b0fbce..3058f87f3a6869903b2a51bb2be9f035c2742a43 100644 --- a/lib_enc/ivas_mc_param_enc_fx.c +++ b/lib_enc/ivas_mc_param_enc_fx.c @@ -30,6 +30,21 @@ *******************************************************************************************************/ +// helper macros to convert the 64 bit accumulators into the 48 bit float format +#define CONVERT_CY( x_64, y_fx, y_e ) \ + { \ + Word16 norm; \ + norm = W_norm( x_64 ); \ + y_fx = W_extract_h( W_shl( x_64, norm ) ); \ + y_e = sub( sub62gb, norm ); \ + } +#define CONVERT_DMX( x_64, y_fx, y_e ) \ + { \ + Word16 norm; \ + norm = W_norm( x_64 ); \ + y_fx = W_extract_h( W_shl( x_64, norm ) ); \ + y_e = sub( sub35gb, norm ); \ + } #include #include #include "options.h" @@ -649,15 +664,22 @@ static void 
ivas_param_mc_param_est_enc_fx( Word32 *p_slot_frame_f_real_fx[MAX_CICP_CHANNELS]; /* Output of the MDFT FB - real part */ Word32 *p_slot_frame_f_imag_fx[MAX_CICP_CHANNELS]; /* Output of the MDFT FB - imag part */ +#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE + Word64 dmx_real_64[PARAM_MC_MAX_TRANSPORT_CHANS]; + Word64 dmx_imag_64[PARAM_MC_MAX_TRANSPORT_CHANS]; +#else Word32 dmx_real_fx[PARAM_MC_MAX_TRANSPORT_CHANS]; /* Downmix channel - Real Part */ Word16 dmx_real_e[PARAM_MC_MAX_TRANSPORT_CHANS]; /* Downmix channel - Real Part */ Word32 dmx_imag_fx[PARAM_MC_MAX_TRANSPORT_CHANS]; /* Downmix channel - Imag Part */ Word16 dmx_imag_e[PARAM_MC_MAX_TRANSPORT_CHANS]; /* Downmix channel - Imag Part */ - Word32 a_fx, b_fx, c_fx, d_fx; /* Tmp complex values */ - Word16 a_e, b_e, c_e, d_e; /* Tmp complex values */ +#endif + Word32 a_fx, b_fx, c_fx, d_fx; /* Tmp complex values */ + Word16 a_e, b_e, c_e, d_e; /* Tmp complex values */ #ifdef MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE Word64 Cy_sum_real_64[PARAM_MC_MAX_PARAMETER_BANDS][MAX_CICP_CHANNELS][MAX_CICP_CHANNELS]; Word64 Cy_sum_imag_64[PARAM_MC_MAX_PARAM_BAND_ABS_COV_ENC][MAX_CICP_CHANNELS][MAX_CICP_CHANNELS]; + Word16 sub62gb; + Word16 sub35gb; #else Word32 Cy_sum_imag_fx[PARAM_MC_MAX_PARAM_BAND_ABS_COV_ENC][MAX_CICP_CHANNELS][MAX_CICP_CHANNELS]; Word16 Cy_sum_imag_e[PARAM_MC_MAX_PARAM_BAND_ABS_COV_ENC][MAX_CICP_CHANNELS][MAX_CICP_CHANNELS]; @@ -740,9 +762,16 @@ static void ivas_param_mc_param_est_enc_fx( #if defined( IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE ) || defined( IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_NONBE ) Word16 gb = find_guarded_bits_fx( l_ts ); +#ifndef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE Word16 add20gb = add( 20, gb ); #endif +#endif +#ifdef MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE + sub35gb = sub( 35, find_guarded_bits_fx( l_ts ) ); + sub62gb = sub( 62, find_guarded_bits_fx( l_ts ) ); +#endif + FOR( ts = start_ts; ts < num_time_slots; 
ts++ ) { #if !defined( IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE ) && !defined( IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_NONBE ) @@ -774,6 +803,26 @@ static void ivas_param_mc_param_est_enc_fx( FOR( ch_idx1 = 0; ch_idx1 < nchan_transport; ++ch_idx1 ) { +#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE + Word64 real_64; + Word64 imag_64; + + real_64 = 0; + imag_64 = 0; + move64(); + move64(); + FOR( inp_ch = 0; inp_ch < nchan_input; inp_ch++ ) + { + real_64 = W_add( real_64, W_mult0_32_32( slot_frame_f_real_fx[inp_ch][cur_cldfb_band], ( *p_dmx_fac_fx ) ) ); + imag_64 = W_add( imag_64, W_mult0_32_32( slot_frame_f_imag_fx[inp_ch][cur_cldfb_band], ( *p_dmx_fac_fx ) ) ); + p_dmx_fac_fx++; + } + dmx_real_64[ch_idx1] = real_64; + dmx_imag_64[ch_idx1] = imag_64; + move64(); + move64(); + +#else #ifndef IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE dmx_real_fx[ch_idx1] = 0; move32(); @@ -814,14 +863,24 @@ static void ivas_param_mc_param_est_enc_fx( move16(); move32(); move16(); +#endif + #endif } /* Cx for transport channels */ FOR( ch_idx1 = 0; ch_idx1 < nchan_transport; ++ch_idx1 ) { +#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE + CONVERT_DMX( dmx_real_64[ch_idx1], a_fx, a_e ); + CONVERT_DMX( dmx_imag_64[ch_idx1], b_fx, b_e ); +#endif FOR( ch_idx2 = 0; ch_idx2 < nchan_transport; ++ch_idx2 ) { +#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE + CONVERT_DMX( dmx_real_64[ch_idx2], c_fx, c_e ); + CONVERT_DMX( dmx_imag_64[ch_idx2], d_fx, d_e ); +#else a_fx = dmx_real_fx[ch_idx1]; move32(); a_e = dmx_real_e[ch_idx1]; @@ -838,6 +897,7 @@ static void ivas_param_mc_param_est_enc_fx( move32(); d_e = dmx_imag_e[ch_idx2]; move16(); +#endif /* (a-ib)(c+id) = ac + bd + i(ad-bc) */ L_tmp = BASOP_Util_Add_Mant32Exp( Mpy_32_32( a_fx, c_fx ), add( a_e, c_e ), Mpy_32_32( b_fx, d_fx ), add( b_e, d_e ), &tmp_e ); @@ -858,7 +918,6 @@ static void ivas_param_mc_param_est_enc_fx( move32(); FOR( ch_idx2 = ch_idx1; ch_idx2 < nchan_input; ++ch_idx2 ) { - 
Word16 norm; c_fx = slot_frame_f_real_fx[ch_idx2][cur_cldfb_band]; d_fx = slot_frame_f_imag_fx[ch_idx2][cur_cldfb_band]; move32(); @@ -869,11 +928,6 @@ static void ivas_param_mc_param_est_enc_fx( Cy_sum_imag_64[cur_param_band][ch_idx1][ch_idx2] = W_add( Cy_sum_imag_64[cur_param_band][ch_idx1][ch_idx2], W_sub( W_mult0_32_32( a_fx, d_fx ), W_mult0_32_32( b_fx, c_fx ) ) ); move64(); - - // convert the 64 bit fixpoint back into the 48 bit float format - norm = W_norm( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2] ); - Cy_sum_fx[cur_param_band][ch_idx1][ch_idx2] = W_extract_h( W_shl( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2], norm ) ); - Cy_sum_e[cur_param_band][ch_idx1][ch_idx2] = sub( sub( 62, gb ), norm ); } } #else @@ -953,6 +1007,26 @@ static void ivas_param_mc_param_est_enc_fx( FOR( ch_idx1 = 0; ch_idx1 < nchan_transport; ++ch_idx1 ) { +#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE + Word64 real_64; + Word64 imag_64; + + real_64 = 0; + imag_64 = 0; + move64(); + move64(); + + FOR( inp_ch = 0; inp_ch < nchan_input; inp_ch++ ) + { + real_64 = W_add( real_64, W_mult0_32_32( slot_frame_f_real_fx[inp_ch][cur_cldfb_band], ( *p_dmx_fac_fx ) ) ); + imag_64 = W_add( imag_64, W_mult0_32_32( slot_frame_f_imag_fx[inp_ch][cur_cldfb_band], ( *p_dmx_fac_fx ) ) ); + p_dmx_fac_fx++; + } + dmx_real_64[ch_idx1] = real_64; + dmx_imag_64[ch_idx1] = imag_64; + move64(); + move64(); +#else #ifndef IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE dmx_real_fx[ch_idx1] = 0; move32(); @@ -997,12 +1071,18 @@ static void ivas_param_mc_param_est_enc_fx( move32(); dmx_imag_e[ch_idx1] = imag_e; move16(); +#endif + #endif } /* Cx for transport channels */ FOR( ch_idx1 = 0; ch_idx1 < nchan_transport; ++ch_idx1 ) { +#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE + CONVERT_DMX( dmx_real_64[ch_idx1], a_fx, a_e ); + CONVERT_DMX( dmx_imag_64[ch_idx1], b_fx, b_e ); +#else #ifdef IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE a_fx = dmx_real_fx[ch_idx1]; move32(); @@ 
-1012,9 +1092,20 @@ static void ivas_param_mc_param_est_enc_fx( move32(); b_e = dmx_imag_e[ch_idx1]; move16(); +#endif + #endif FOR( ch_idx2 = 0; ch_idx2 < nchan_transport; ++ch_idx2 ) { +#ifdef MERGE_REQUEST_1472_SPEEDUP_ivas_mc_param_enc_fx_NONBE + CONVERT_DMX( dmx_real_64[ch_idx2], c_fx, c_e ); + CONVERT_DMX( dmx_imag_64[ch_idx2], d_fx, d_e ); + + /* (a-ib)(c+id) = ac + bd + i(ad-bc) */ + L_tmp = BASOP_Util_Add_Mant32Exp( Mpy_32_32( a_fx, c_fx ), add( a_e, c_e ), Mpy_32_32( b_fx, d_fx ), add( b_e, d_e ), &tmp_e ); + Cx_sum_fx[cur_param_band][ch_idx1][ch_idx2] = BASOP_Util_Add_Mant32Exp( Cx_sum_fx[cur_param_band][ch_idx1][ch_idx2], Cx_sum_e[cur_param_band][ch_idx1][ch_idx2], L_tmp, tmp_e, + &Cx_sum_e[cur_param_band][ch_idx1][ch_idx2] ); +#else #ifndef IMPROVE_HIGH_COMPLEXITY_PARAM_MC_PRM_EST_BE a_fx = dmx_real_fx[ch_idx1]; move32(); @@ -1042,6 +1133,7 @@ static void ivas_param_mc_param_est_enc_fx( L_tmp = BASOP_Util_Add_Mant32Exp( Mpy_32_32( a_fx, dmx_real_fx[ch_idx2] ), add( a_e, dmx_real_e[ch_idx2] ), Mpy_32_32( b_fx, dmx_imag_fx[ch_idx2] ), add( b_e, dmx_imag_e[ch_idx2] ), &tmp_e ); Cx_sum_fx[cur_param_band][ch_idx1][ch_idx2] = BASOP_Util_Add_Mant32Exp( Cx_sum_fx[cur_param_band][ch_idx1][ch_idx2], Cx_sum_e[cur_param_band][ch_idx1][ch_idx2], L_tmp, tmp_e, &Cx_sum_e[cur_param_band][ch_idx1][ch_idx2] ); +#endif #endif move32(); } @@ -1075,7 +1167,6 @@ static void ivas_param_mc_param_est_enc_fx( move32(); FOR( ch_idx2 = ch_idx1; ch_idx2 < nchan_input; ++ch_idx2 ) { - Word16 norm; c_fx = slot_frame_f_real_fx[ch_idx2][cur_cldfb_band]; d_fx = slot_frame_f_imag_fx[ch_idx2][cur_cldfb_band]; move32(); @@ -1084,10 +1175,6 @@ static void ivas_param_mc_param_est_enc_fx( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2] = W_add( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2], W_add( W_mult0_32_32( a_fx, c_fx ), W_mult0_32_32( b_fx, d_fx ) ) ); move64(); - // convert the 64 bit fixpoint back into the 48 bit float format - norm = W_norm( 
Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2] ); - Cy_sum_fx[cur_param_band][ch_idx1][ch_idx2] = W_extract_h( W_shl( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2], norm ) ); - Cy_sum_e[cur_param_band][ch_idx1][ch_idx2] = sub( sub( 62, gb ), norm ); } #else FOR( ch_idx2 = ch_idx1; ch_idx2 < nchan_input; ++ch_idx2 ) @@ -1135,14 +1222,6 @@ static void ivas_param_mc_param_est_enc_fx( { FOR( ch_idx1 = 0; ch_idx1 < nchan_input; ++ch_idx1 ) { - Cy_sum_fx[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0; - move32(); - Cy_sum_e[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0; - move16(); - Cy_sum_fx[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0; - move32(); - Cy_sum_e[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0; - move16(); #ifdef MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE Cy_sum_real_64[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0; move64(); @@ -1153,6 +1232,14 @@ static void ivas_param_mc_param_est_enc_fx( Cy_sum_imag_64[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0; move64(); #else + Cy_sum_fx[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0; + move32(); + Cy_sum_e[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0; + move16(); + Cy_sum_fx[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0; + move32(); + Cy_sum_e[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0; + move16(); Cy_sum_imag_fx[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0; move32(); Cy_sum_imag_e[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0; @@ -1174,7 +1261,7 @@ static void ivas_param_mc_param_est_enc_fx( move64(); Cy_sum_real_64[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0; move64(); -#endif +#else Cy_sum_fx[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0; move32(); Cy_sum_e[cur_param_band][hParamMC->lfe_index][ch_idx1] = 0; @@ -1183,6 +1270,7 @@ static void ivas_param_mc_param_est_enc_fx( move32(); Cy_sum_e[cur_param_band][ch_idx1][hParamMC->lfe_index] = 0; move16(); +#endif } } } @@ -1220,11 +1308,9 @@ static void ivas_param_mc_param_est_enc_fx( FOR( 
k = 0; k < nchan_input; ++k ) { #ifdef MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE - Word16 norm; - // convert the 64 bit fixpoint back into the 48 bit float format - norm = W_norm( Cy_sum_real_64[cur_param_band][k][k] ); - Nrg_fx[k] = W_extract_h( W_shl( Cy_sum_real_64[cur_param_band][k][k], norm ) ); - Nrg_e[k] = sub( sub( 62, gb ), norm ); + CONVERT_CY( Cy_sum_real_64[cur_param_band][k][k], Nrg_fx[k], Nrg_e[k] ); + move32(); + move16(); #else Nrg_fx[k] = Cy_sum_fx[cur_param_band][k][k]; move32(); @@ -1361,10 +1447,9 @@ static void ivas_param_mc_param_est_enc_fx( { FOR( ch_idx2 = 0; ch_idx2 < MAX_CICP_CHANNELS; ch_idx2++ ) { - Word16 norm; - norm = W_norm( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2] ); - Cy_sum_fx[cur_param_band][ch_idx1][ch_idx2] = W_extract_h( W_shl( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2], norm ) ); - Cy_sum_e[cur_param_band][ch_idx1][ch_idx2] = sub( sub( 62, gb ), norm ); + CONVERT_CY( Cy_sum_real_64[cur_param_band][ch_idx1][ch_idx2], Cy_sum_fx[cur_param_band][ch_idx1][ch_idx2], Cy_sum_e[cur_param_band][ch_idx1][ch_idx2] ); + move32(); + move16(); } } } @@ -1407,11 +1492,9 @@ static void ivas_param_mc_param_est_enc_fx( FOR( ch_idx2 = ch_idx1; ch_idx2 < nchan_input; ch_idx2++ ) { #ifdef MERGE_REQUEST_1378_SPEEDUP_ivas_mc_param_enc_fx_NONBE - Word16 norm; - // convert the 64 bit fixpoint back into the 48 bit float format - norm = W_norm( Cy_sum_imag_64[cur_param_band][ch_idx1][ch_idx2] ); - imag_part_fx = W_extract_h( W_shl( Cy_sum_imag_64[cur_param_band][ch_idx1][ch_idx2], norm ) ); - imag_part_e = sub( sub( 62, gb ), norm ); + CONVERT_CY( Cy_sum_imag_64[cur_param_band][ch_idx1][ch_idx2], imag_part_fx, imag_part_e ); + move32(); + move16(); #else imag_part_fx = Cy_sum_imag_fx[cur_param_band][ch_idx1][ch_idx2]; move32();