From 8cdaa39df0c7b287efe951b8e5b72a52c172be9b Mon Sep 17 00:00:00 2001 From: Sandesh Venkatesh Date: Thu, 24 Apr 2025 14:42:03 +0530 Subject: [PATCH] optimizing vector add and subtract, hp20 optimizations --- lib_com/hp50_fx.c | 85 +++++-------------- lib_com/mslvq_com_fx.c | 35 ++++++-- lib_com/options.h | 3 +- lib_com/prot_fx.h | 18 ++++ lib_com/tools.c | 20 +++++ lib_com/tools_fx.c | 20 +++++ lib_dec/fd_cng_dec_fx.c | 4 + lib_dec/ivas_dirac_dec_fx.c | 4 + lib_dec/ivas_jbm_dec_fx.c | 4 + lib_dec/ivas_mc_param_dec_fx.c | 5 ++ lib_enc/ivas_qmetadata_enc_fx.c | 18 +++- lib_enc/ivas_sns_enc_fx.c | 4 + lib_enc/speech_music_classif_fx.c | 4 + lib_rend/ivas_dirac_ana_fx.c | 4 + lib_rend/ivas_dirac_output_synthesis_dec_fx.c | 7 ++ lib_rend/ivas_dirac_rend_fx.c | 12 ++- lib_rend/ivas_efap_fx.c | 29 ++++++- lib_rend/ivas_omasa_ana_fx.c | 5 ++ lib_rend/ivas_reverb_fx.c | 30 +++++++ lib_rend/lib_rend.c | 8 ++ 20 files changed, 238 insertions(+), 81 deletions(-) diff --git a/lib_com/hp50_fx.c b/lib_com/hp50_fx.c index f5e7105cf..f1864d7a7 100644 --- a/lib_com/hp50_fx.c +++ b/lib_com/hp50_fx.c @@ -458,14 +458,8 @@ void hp20_fx_32( { Word16 i; Word32 a1_fx, a2_fx, b1_fx, b2_fx; -#ifdef OPT_STEREO_32KBPS_V1 - Word16 Qy1, Qy2, Qmin; - Word64 y0_fx64, y1_fx64, y2_fx64; - Word32 x0, x1, x2; -#else /* OPT_STEREO_32KBPS_V1 */ Word16 Qx0, Qx1, Qx2, Qy1, Qprev_y1, Qy2, Qprev_y2, Qmin; Word64 x0_fx64, x1_fx64, x2_fx64, y0_fx64, y1_fx64, y2_fx64, R1, R2, R3, R4, R5; -#endif /* OPT_STEREO_32KBPS_V1 */ IF( EQ_32( Fs, 8000 ) ) { @@ -516,64 +510,15 @@ void hp20_fx_32( move32(); move32(); -#ifdef OPT_STEREO_32KBPS_V1 - y1_fx64 = W_add( W_deposit32_l( mem_fx[0] ), W_deposit32_h( mem_fx[1] ) ); - y2_fx64 = W_add( W_deposit32_l( mem_fx[2] ), W_deposit32_h( mem_fx[3] ) ); - - x0 = mem_fx[4]; - move32(); - x1 = mem_fx[5]; - move32(); -#else /* OPT_STEREO_32KBPS_V1 */ Qprev_y1 = extract_l( mem_fx[4] ); Qprev_y2 = extract_l( mem_fx[5] ); y1_fx64 = W_deposit32_l( mem_fx[0] ); y2_fx64 = W_deposit32_l( mem_fx[1] ); x0_fx64 = W_deposit32_l( mem_fx[2] ); x1_fx64 = W_deposit32_l( mem_fx[3] ); -#endif /* OPT_STEREO_32KBPS_V1 */ FOR( i = 0; i < lg; i++ ) { -#ifdef OPT_STEREO_32KBPS_V1 - x2 = x1; - move32(); - x1 = x0; - move32(); - x0 = signal_fx[i]; - move32(); - - Qy1 = W_norm( y1_fx64 ); - if ( y1_fx64 == 0 ) - { - Qy1 = 62; - move16(); - } - - Qy2 = W_norm( y2_fx64 ); - if ( y2_fx64 == 0 ) - { - Qy2 = 62; - move16(); - } - - Qmin = s_min( Qy1, Qy2 ); - - Qmin = sub( Qmin, 34 ); - - y0_fx64 = W_mac_32_32( W_mult_32_32( W_shl_sat_l( y1_fx64, Qmin ), a1_fx ), W_shl_sat_l( y2_fx64, Qmin ), a2_fx ); // Qmin + Q29 + Q30 + 1 - - Word64 temp = W_mac_32_32( W_mac_32_32( W_mult_32_32( x2, b2_fx ), x1, b1_fx ), x0, b2_fx ); // Q30 - Word64 y0_fx = W_shr( y0_fx64, add( Qmin, Q30 ) ); // Q30 - y0_fx64 = W_add( temp, y0_fx ); // Q30 - signal_fx[i] = W_shl_sat_l( y0_fx64, -Q30 ); - move32(); - - y2_fx64 = y1_fx64; - move64(); - y1_fx64 = y0_fx64; - move64(); -#else /* OPT_STEREO_32KBPS_V1 */ x2_fx64 = x1_fx64; move64(); x1_fx64 = x0_fx64; @@ -587,7 +532,11 @@ void hp20_fx_32( move16(); } Qy1 = sub( Qy1, 34 ); +#ifdef OPT_STEREO_32KBPS_V1 + R1 = W_mult0_32_32( W_shl_sat_l( y1_fx64, Qy1 ), a1_fx ); +#else /* OPT_STEREO_32KBPS_V1 */ R1 = W_mult0_32_32( W_extract_l( W_shl( y1_fx64, Qy1 ) ), a1_fx ); +#endif /* OPT_STEREO_32KBPS_V1 */ Qy1 = add( Qy1, Qprev_y1 ); Qy2 = W_norm( y2_fx64 ); @@ -597,7 +546,11 @@ void hp20_fx_32( move16(); } Qy2 = sub( Qy2, 34 ); +#ifdef OPT_STEREO_32KBPS_V1 + R2 = W_mult0_32_32( W_shl_sat_l( y2_fx64, Qy2 ), a2_fx ); +#else /* OPT_STEREO_32KBPS_V1 */ R2 = W_mult0_32_32( W_extract_l( W_shl( y2_fx64, Qy2 ) ), a2_fx ); +#endif /* OPT_STEREO_32KBPS_V1 */ Qy2 = add( Qy2, Qprev_y2 ); Qx0 = W_norm( x0_fx64 ); @@ -607,7 +560,11 @@ void hp20_fx_32( move16(); } Qx0 = sub( Qx0, 34 ); +#ifdef OPT_STEREO_32KBPS_V1 + R3 = W_mult0_32_32( W_shl_sat_l( x0_fx64, Qx0 ), b2_fx ); +#else /* OPT_STEREO_32KBPS_V1 */ R3 = W_mult0_32_32( W_extract_l( W_shl( x0_fx64, Qx0 ) ), b2_fx ); +#endif /* OPT_STEREO_32KBPS_V1 */ Qx1 = W_norm( x1_fx64 ); if ( x1_fx64 == 0 ) @@ -616,7 +573,11 @@ void hp20_fx_32( move16(); } Qx1 = sub( Qx1, 34 ); +#ifdef OPT_STEREO_32KBPS_V1 + R4 = W_mult0_32_32( W_shl_sat_l( x1_fx64, Qx1 ), b1_fx ); +#else /* OPT_STEREO_32KBPS_V1 */ R4 = W_mult0_32_32( W_extract_l( W_shl( x1_fx64, Qx1 ) ), b1_fx ); +#endif /* OPT_STEREO_32KBPS_V1 */ Qx2 = W_norm( x2_fx64 ); if ( x2_fx64 == 0 ) @@ -625,7 +586,11 @@ void hp20_fx_32( move16(); } Qx2 = sub( Qx2, 34 ); +#ifdef OPT_STEREO_32KBPS_V1 + R5 = W_mult0_32_32( W_shl_sat_l( x2_fx64, Qx2 ), b2_fx ); +#else /* OPT_STEREO_32KBPS_V1 */ R5 = W_mult0_32_32( W_extract_l( W_shl( x2_fx64, Qx2 ) ), b2_fx ); +#endif /* OPT_STEREO_32KBPS_V1 */ Qmin = s_min( Qy1, Qy2 ); @@ -655,17 +620,8 @@ void hp20_fx_32( move64(); move16(); move16(); -#endif /* OPT_STEREO_32KBPS_V1 */ } -#ifdef OPT_STEREO_32KBPS_V1 - mem_fx[0] = W_extract_l( y1_fx64 ); - mem_fx[1] = W_extract_h( y1_fx64 ); - mem_fx[2] = W_extract_l( y2_fx64 ); - mem_fx[3] = W_extract_h( y2_fx64 ); - mem_fx[4] = x0; - mem_fx[5] = x1; -#else /* OPT_STEREO_32KBPS_V1 */ Qy1 = W_norm( y1_fx64 ); test(); IF( y1_fx64 != 0 && LT_16( Qy1, 32 ) ) @@ -688,7 +644,6 @@ void hp20_fx_32( mem_fx[3] = W_extract_l( x1_fx64 ); mem_fx[4] = Qprev_y1; mem_fx[5] = Qprev_y2; -#endif /* OPT_STEREO_32KBPS_V1 */ move32(); move32(); diff --git a/lib_com/mslvq_com_fx.c b/lib_com/mslvq_com_fx.c index cf2ad8a8a..7fcf36bfc 100644 --- a/lib_com/mslvq_com_fx.c +++ b/lib_com/mslvq_com_fx.c @@ -128,21 +128,31 @@ void init_lvq_fx( FOR( i = 0; i < MAX_NO_MODES; i++ ) { #ifdef OPT_STEREO_32KBPS_V1 - FOR( ( j = 0, k = 0 ); j < MAX_NO_SCALES; ( j++, k++ ) ) + FOR( ( j = 0, k = 0 ); j < MAX_NO_SCALES; j++ ) { - if ( ( no_lead_fx[i][j] <= 0 ) ) + if ( no_lead_fx[i][j] > 0 ) { - j = MAX_NO_SCALES; + k = add( k, 1 ); + } + if ( no_lead_fx[i][j] <= 0 ) + { + j = MAX_NO_SCALES - 1; + move16(); } } no_scales[i][0] = k; move16(); - FOR( k = 0; j < MAX_NO_SCALES << 1; ( j++, k++ ) ) + FOR( k = 0; j < MAX_NO_SCALES << 1; j++ ) { + if ( no_lead_fx[i][j] > 0 ) + { + k = add( k, 1 ); + } if ( no_lead_fx[i][j] <= 0 ) { j = MAX_NO_SCALES << 1; + move16(); } } no_scales[i][1] = k; @@ -172,23 +182,32 @@ void init_lvq_fx( FOR( i = 0; i < MAX_NO_MODES_p; i++ ) { #ifdef OPT_STEREO_32KBPS_V1 - FOR( ( j = 0, k = 0 ); j < MAX_NO_SCALES; ( j++, k++ ) ) + FOR( ( j = 0, k = 0 ); j < MAX_NO_SCALES; j++ ) { - + if ( no_lead_p_fx[i][j] > 0 ) + { + k = add( k, 1 ); + } if ( ( no_lead_p_fx[i][j] <= 0 ) ) { - j = MAX_NO_SCALES; + j = MAX_NO_SCALES - 1; + move16(); } } no_scales_p[i][0] = k; move16(); - FOR( k = 0; j < MAX_NO_SCALES << 1; ( j++, k++ ) ) + FOR( k = 0; j < MAX_NO_SCALES << 1; j++ ) { + if ( no_lead_p_fx[i][j] > 0 ) + { + k = add( k, 1 ); + } if ( ( no_lead_p_fx[i][j] <= 0 ) ) { j = MAX_NO_SCALES << 1; + move16(); } } diff --git a/lib_com/options.h b/lib_com/options.h index 2f1e82de8..cbc77b6e7 100644 --- a/lib_com/options.h +++ b/lib_com/options.h @@ -74,7 +74,7 @@ #define FIX_1379_MASA_ANGLE_ROUND /* Note: each compile switch (FIX_1101_...) is independent from the other ones */ -//#define OPT_STEREO_32KBPS_V1 /* Optimization made in stereo decoding path for 32kbps decoding */ +#define OPT_STEREO_32KBPS_V1 /* Optimization made in stereo decoding path for 32kbps decoding */ #define OPT_AVOID_STATE_BUF_RESCALE /* Optimization made to avoid rescale of synth state buffer */ #define FIX_1310_SPEEDUP_ivas_dirac_dec_get_response_fx /*FhG: WMOPS tuning, nonbe*/ #define FIX_1310_SPEEDUP_ivas_dirac_dec_output_synthesis_process_slot /*FhG: WMOPS tuning, nonbe*/ @@ -93,6 +93,7 @@ #define FIX_1439_SPEEDUP_SIMPLIFY_elliptic_bpf_48k_generic /*FhG: reduces maintenance complexity & reduces WMOPS & prepares STAGE2 patch*/ #define FIX_1439_SPEEDUP_SIMPLIFY_elliptic_bpf_48k_generic_STAGE2 /*FhG: reduces WMOPS*/ #define FIX_1481_HARDCODE_DIV /* FhG: hardcode division results in stereo_dmx_evs_init_encoder_fx() */ +#define VEC_ARITH_OPT_v1 #define FIX_1486_IND_SHB_RES /* VA: Fix for issue 1486: align the usage of IND_SHB_RES_GS indices with float code */ #define TEST_HR diff --git a/lib_com/prot_fx.h b/lib_com/prot_fx.h index 41f68807e..86b5ec0b7 100644 --- a/lib_com/prot_fx.h +++ b/lib_com/prot_fx.h @@ -6184,6 +6184,15 @@ void v_add_fixed( const Word16 hdrm /* i : headroom for when subtraction result > 1 or < -1 */ ); +#ifdef VEC_ARITH_OPT_v1 +void v_add_fixed_no_hdrm( + const Word32 x1[], /* i : Input vector 1 */ + const Word32 x2[], /* i : Input vector 2 */ + Word32 y[], /* o : Output vector that contains vector 1 + vector 2 */ + const Word16 N /* i : Vector length */ +); +#endif /* VEC_ARITH_OPT_v1 */ + void v_add_fixed_me( const Word32 x1[], /* i : Input vector 1 */ const Word16 x1_e, /* i : Exponent for input vector 1 */ @@ -6218,6 +6227,15 @@ void v_sub_fixed( const Word16 hdrm /* i : headroom for when subtraction result > 1 or < -1 */ ); +#ifdef VEC_ARITH_OPT_v1 +void v_sub_fixed_no_hdrm( + const Word32 x1[], /* i : Input vector 1 */ + const Word32 x2[], /* i : Input vector 2 */ + Word32 y[], /* o : Output vector that contains vector 1 - vector 2 */ + const Word16 N /* i : Vector length */ +); +#endif /* VEC_ARITH_OPT_v1 */ + /*! r: dot product of x[] and y[] */ Word32 dotp_fixed( const Word32 x[], /* i : vector x[] */ diff --git a/lib_com/tools.c b/lib_com/tools.c index b1a9f4d13..f7303c6f3 100644 --- a/lib_com/tools.c +++ b/lib_com/tools.c @@ -888,6 +888,26 @@ void v_sub_fixed( return; } +#ifdef VEC_ARITH_OPT_v1 +void v_sub_fixed_no_hdrm( + const Word32 x1[], /* i : Input vector 1 */ + const Word32 x2[], /* i : Input vector 2 */ + Word32 y[], /* o : Output vector that contains vector 1 - vector 2 */ + const Word16 N /* i : Vector length */ +) +{ + Word16 i; + + FOR( i = 0; i < N; i++ ) + { + y[i] = L_sub( x1[i], x2[i] ); + move32(); + } + + return; +} +#endif /* VEC_ARITH_OPT_v1 */ + /*-------------------------------------------------------------------* * v_multc_fixed() * diff --git a/lib_com/tools_fx.c b/lib_com/tools_fx.c index 6dc1c45f2..44a86b0eb 100644 --- a/lib_com/tools_fx.c +++ b/lib_com/tools_fx.c @@ -4524,6 +4524,26 @@ void v_add_fixed( return; } +#ifdef VEC_ARITH_OPT_v1 +void v_add_fixed_no_hdrm( + const Word32 x1[], /* i : Input vector 1 */ + const Word32 x2[], /* i : Input vector 2 */ + Word32 y[], /* o : Output vector that contains vector 1 + vector 2 */ + const Word16 N /* i : Vector length */ +) +{ + Word16 i; + + FOR( i = 0; i < N; i++ ) + { + y[i] = L_add( x1[i], x2[i] ); + move32(); + } + + return; +} +#endif /* VEC_ARITH_OPT_v1 */ + void v_add_fixed_me( const Word32 x1[], /* i : Input vector 1 */ const Word16 x1_e, /* i : Exponent for input vector 1 */ diff --git a/lib_dec/fd_cng_dec_fx.c b/lib_dec/fd_cng_dec_fx.c index d0fd178f2..47a61a26e 100644 --- a/lib_dec/fd_cng_dec_fx.c +++ b/lib_dec/fd_cng_dec_fx.c @@ -5345,7 +5345,11 @@ void generate_masking_noise_ivas_fx( } ELSE { +#ifdef VEC_ARITH_OPT_v1 + v_add_fixed_no_hdrm( maskingNoise_fx, timeDomainBuffer, timeDomainBuffer, s_min( hFdCngCom->frameSize, length ) ); /*Q31 - *exp_out*/ +#else /* VEC_ARITH_OPT_v1 */ v_add_fixed( maskingNoise_fx, timeDomainBuffer, timeDomainBuffer, s_min( hFdCngCom->frameSize, length ), 0 ); /*Q31 - *exp_out*/ +#endif /* VEC_ARITH_OPT_v1 */ } return; diff --git a/lib_dec/ivas_dirac_dec_fx.c b/lib_dec/ivas_dirac_dec_fx.c index 9c319489e..37b0bf1ba 100644 --- a/lib_dec/ivas_dirac_dec_fx.c +++ b/lib_dec/ivas_dirac_dec_fx.c @@ -3100,7 +3100,11 @@ void ivas_dirac_dec_render_sf_fx( v_multc_fixed( onset_filter_fx, 536870912 /* 0.25f in Q31 */, onset_filter_fx, hSpatParamRendCom->num_freq_bands ); +#ifdef VEC_ARITH_OPT_v1 + v_add_fixed_no_hdrm( onset_filter_fx, onset_filter_subframe_fx, onset_filter_subframe_fx, hSpatParamRendCom->num_freq_bands ); /* Q31 */ +#else /* VEC_ARITH_OPT_v1 */ v_add_fixed( onset_filter_fx, onset_filter_subframe_fx, onset_filter_subframe_fx, hSpatParamRendCom->num_freq_bands, 0 ); /* Q31 */ +#endif /* VEC_ARITH_OPT_v1 */ p_onset_filter_fx = onset_filter_subframe_fx; } ELSE diff --git a/lib_dec/ivas_jbm_dec_fx.c b/lib_dec/ivas_jbm_dec_fx.c index 74af57bd5..3f67962dc 100644 --- a/lib_dec/ivas_jbm_dec_fx.c +++ b/lib_dec/ivas_jbm_dec_fx.c @@ -2137,7 +2137,11 @@ ivas_error ivas_jbm_dec_render_fx( /* add already rendered SBA part */ FOR( n = 0; n < nchan_out; n++ ) { +#ifdef VEC_ARITH_OPT_v1 + v_add_fixed_no_hdrm( p_output_fx[n], p_tc_fx[n + st_ivas->nchan_ism], p_output_fx[n], *nSamplesRendered ); +#else /* VEC_ARITH_OPT_v1 */ v_add_fixed( p_output_fx[n], p_tc_fx[n + st_ivas->nchan_ism], p_output_fx[n], *nSamplesRendered, 0 ); +#endif /* VEC_ARITH_OPT_v1 */ } } ELSE IF( EQ_32( st_ivas->renderer_type, RENDERER_OSBA_AMBI ) || EQ_32( st_ivas->renderer_type, RENDERER_OSBA_LS ) || EQ_32( st_ivas->renderer_type, RENDERER_BINAURAL_FASTCONV_ROOM ) ) diff --git a/lib_dec/ivas_mc_param_dec_fx.c b/lib_dec/ivas_mc_param_dec_fx.c index 5f93c9676..e0671eea0 100644 --- a/lib_dec/ivas_mc_param_dec_fx.c +++ b/lib_dec/ivas_mc_param_dec_fx.c @@ -2084,8 +2084,13 @@ void ivas_param_mc_dec_render_fx( { IF( hLsSetup.index_lfe[idx_lfe] != ch ) { +#ifdef VEC_ARITH_OPT_v1 + v_add_fixed_no_hdrm( Cldfb_RealBuffer_fx[ch][slot_idx], Cldfb_RealBuffer_fx[hLsSetup.index_lfe[idx_lfe]][slot_idx], Cldfb_RealBuffer_fx[ch][slot_idx], 1 ); + v_add_fixed_no_hdrm( Cldfb_ImagBuffer_fx[ch][slot_idx], Cldfb_ImagBuffer_fx[hLsSetup.index_lfe[idx_lfe]][slot_idx], Cldfb_ImagBuffer_fx[ch][slot_idx], 1 ); +#else /* VEC_ARITH_OPT_v1 */ v_add_fixed( Cldfb_RealBuffer_fx[ch][slot_idx], Cldfb_RealBuffer_fx[hLsSetup.index_lfe[idx_lfe]][slot_idx], Cldfb_RealBuffer_fx[ch][slot_idx], 1, 0 ); v_add_fixed( Cldfb_ImagBuffer_fx[ch][slot_idx], Cldfb_ImagBuffer_fx[hLsSetup.index_lfe[idx_lfe]][slot_idx], Cldfb_ImagBuffer_fx[ch][slot_idx], 1, 0 ); +#endif /* VEC_ARITH_OPT_v1 */ } } } diff --git a/lib_enc/ivas_qmetadata_enc_fx.c b/lib_enc/ivas_qmetadata_enc_fx.c index 79a4adee7..744d2c067 100644 --- a/lib_enc/ivas_qmetadata_enc_fx.c +++ b/lib_enc/ivas_qmetadata_enc_fx.c @@ -1069,8 +1069,12 @@ void ivas_qmetadata_enc_sid_encode_fx( { /*compute the average direction */ ivas_qmetadata_azimuth_elevation_to_direction_vector_fx( q_direction->band_data[b].azimuth_fx[m], q_direction->band_data[b].elevation_fx[m], direction_vector_fx ); - scale_sig32( direction_vector_fx, 3, Q22 - Q30 ); // Q30 -> Q22 + scale_sig32( direction_vector_fx, 3, Q22 - Q30 ); // Q30 -> Q22 +#ifdef VEC_ARITH_OPT_v1 + v_add_fixed_no_hdrm( avg_direction_vector_fx, direction_vector_fx, avg_direction_vector_fx, 3 ); // Q22 +#else /* VEC_ARITH_OPT_v1 */ v_add_fixed( avg_direction_vector_fx, direction_vector_fx, avg_direction_vector_fx, 3, 0 ); // Q22 +#endif /* VEC_ARITH_OPT_v1 */ } ivas_qmetadata_direction_vector_to_azimuth_elevation_fx( avg_direction_vector_fx, Q22, &avg_azimuth_fx[b], &avg_elevation_fx[b] ); @@ -2402,7 +2406,11 @@ static Word16 ivas_qmetadata_entropy_encode_dir_fx( /*compute the average direction */ ivas_qmetadata_azimuth_elevation_to_direction_vector_fx( q_direction->band_data[i].azimuth_fx[j], q_direction->band_data[i].elevation_fx[j], direction_vector ); scale_sig32( direction_vector, 3, -8 ); // Q30 -> Q22 +#ifdef VEC_ARITH_OPT_v1 + v_add_fixed_no_hdrm( avg_direction_vector, direction_vector, avg_direction_vector, 3 ); +#else /* VEC_ARITH_OPT_v1 */ v_add_fixed( avg_direction_vector, direction_vector, avg_direction_vector, 3, 0 ); +#endif /* VEC_ARITH_OPT_v1 */ } } } @@ -2665,7 +2673,11 @@ static Word16 ivas_qmetadata_entropy_encode_dir_fx( IF( LT_16( idx, 4 ) ) { +#ifdef VEC_ARITH_OPT_v1 + v_add_fixed_no_hdrm( avg_direction_vector, direction_vector, avg_direction_vector, 3 ); +#else /* VEC_ARITH_OPT_v1 */ v_add_fixed( avg_direction_vector, direction_vector, avg_direction_vector, 3, 0 ); +#endif /* VEC_ARITH_OPT_v1 */ } } /* project the quantized average azimuth angle to the same grid as the current sample */ @@ -2696,7 +2708,11 @@ static Word16 ivas_qmetadata_entropy_encode_dir_fx( ivas_qmetadata_azimuth_elevation_to_direction_vector_fx( q_direction->band_data[i].azimuth_fx[j], q_direction->band_data[i].elevation_fx[j], direction_vector ); scale_sig32( direction_vector, 3, -8 ); // Q30 -> Q22 +#ifdef VEC_ARITH_OPT_v1 + v_add_fixed_no_hdrm( avg_direction_vector, direction_vector, avg_direction_vector, 3 ); +#else /* VEC_ARITH_OPT_v1 */ v_add_fixed( avg_direction_vector, direction_vector, avg_direction_vector, 3, 0 ); +#endif /* VEC_ARITH_OPT_v1 */ ivas_qmetadata_direction_vector_to_azimuth_elevation_fx( avg_direction_vector, Q22, &avg_azimuth, &avg_elevation ); avg_azimuth_index_upd = quantize_phi_enc_fx( L_add( avg_azimuth, 180 << Q22 ), 0, &avg_azimuth, avg_azimuth_alphabet ); diff --git a/lib_enc/ivas_sns_enc_fx.c b/lib_enc/ivas_sns_enc_fx.c index b7de878a1..2eaefb483 100644 --- a/lib_enc/ivas_sns_enc_fx.c +++ b/lib_enc/ivas_sns_enc_fx.c @@ -672,7 +672,11 @@ Word16 quantize_sns_fx( Word32 ener_side_fx; Word16 ener_side_q; +#ifdef VEC_ARITH_OPT_v1 + v_sub_fixed_no_hdrm( snsQ_out_fx[0][k], snsQ_out_fx[1][k], side_fx, M ); +#else /* VEC_ARITH_OPT_v1 */ v_sub_fixed( snsQ_out_fx[0][k], snsQ_out_fx[1][k], side_fx, M, 0 ); +#endif /* VEC_ARITH_OPT_v1 */ Word64 L64_sum; L64_sum = 1; diff --git a/lib_enc/speech_music_classif_fx.c b/lib_enc/speech_music_classif_fx.c index cbd8ea19f..cf6ba543b 100644 --- a/lib_enc/speech_music_classif_fx.c +++ b/lib_enc/speech_music_classif_fx.c @@ -2181,7 +2181,11 @@ Word16 ivas_smc_gmm_fx( } /* PCA */ +#ifdef VEC_ARITH_OPT_v1 + v_sub_fixed_no_hdrm( FV_fx, pca_mean_fx, FV_fx, N_SMC_FEATURES ); +#else /* VEC_ARITH_OPT_v1 */ v_sub_fixed( FV_fx, pca_mean_fx, FV_fx, N_SMC_FEATURES, 0 ); +#endif /* VEC_ARITH_OPT_v1 */ v_mult_mat_fixed( FV_fx, FV_fx, pca_components_fx, N_SMC_FEATURES, N_PCA_COEF, 0 ); /*------------------------------------------------------------------* * Calculation of posterior probability diff --git a/lib_rend/ivas_dirac_ana_fx.c b/lib_rend/ivas_dirac_ana_fx.c index 8124c4967..c2430d008 100644 --- a/lib_rend/ivas_dirac_ana_fx.c +++ b/lib_rend/ivas_dirac_ana_fx.c @@ -578,7 +578,11 @@ static void ivas_dirac_dmx_fx( v_add_fx( data_in_fx[0], data_in_fx[1], data_out_fx[0], input_frame ); v_multc_fixed( data_out_fx[0], ONE_IN_Q30, data_out_fx[0], input_frame ); // ONE_IN_Q30 = 0.5* ONE_IN_Q31 +#ifdef VEC_ARITH_OPT_v1 + v_sub_fixed_no_hdrm( data_in_fx[0], data_in_fx[1], data_out_fx[1], input_frame ); +#else /* VEC_ARITH_OPT_v1 */ v_sub_fixed( data_in_fx[0], data_in_fx[1], data_out_fx[1], input_frame, 0 ); +#endif /* VEC_ARITH_OPT_v1 */ v_multc_fixed( data_out_fx[1], ONE_IN_Q30, data_out_fx[1], input_frame ); FOR( i = 0; i < nchan_transport; i++ ) diff --git a/lib_rend/ivas_dirac_output_synthesis_dec_fx.c b/lib_rend/ivas_dirac_output_synthesis_dec_fx.c index 0cdbe3a10..509eec6e2 100644 --- a/lib_rend/ivas_dirac_output_synthesis_dec_fx.c +++ b/lib_rend/ivas_dirac_output_synthesis_dec_fx.c @@ -1227,10 +1227,17 @@ void ivas_dirac_dec_output_synthesis_process_slot_fx( { Scale_sig32( aux_buf, num_freq_bands, sub( h_dirac_output_synthesis_state->q_cy_auto_diff_smooth, temp_q ) ); /*temp_q->(h_dirac_output_synthesis_state->q_cy_auto_diff_smooth)*/ } +#ifdef VEC_ARITH_OPT_v1 + v_add_fixed_no_hdrm( aux_buf, + &h_dirac_output_synthesis_state->cy_auto_diff_smooth_fx[ch_idx * num_freq_bands_diff], + &h_dirac_output_synthesis_state->cy_auto_diff_smooth_fx[ch_idx * num_freq_bands_diff], + num_freq_bands_diff ); /*h_dirac_output_synthesis_state->q_cy_auto_diff_smooth*/ +#else /* VEC_ARITH_OPT_v1 */ v_add_fixed( aux_buf, &h_dirac_output_synthesis_state->cy_auto_diff_smooth_fx[ch_idx * num_freq_bands_diff], &h_dirac_output_synthesis_state->cy_auto_diff_smooth_fx[ch_idx * num_freq_bands_diff], num_freq_bands_diff, 0 ); /*h_dirac_output_synthesis_state->q_cy_auto_diff_smooth*/ +#endif /* VEC_ARITH_OPT_v1 */ } return; diff --git a/lib_rend/ivas_dirac_rend_fx.c b/lib_rend/ivas_dirac_rend_fx.c index 2f748289a..2540426cc 100644 --- a/lib_rend/ivas_dirac_rend_fx.c +++ b/lib_rend/ivas_dirac_rend_fx.c @@ -4528,9 +4528,13 @@ static void ivas_masa_ext_dirac_render_sf_fx( hDirACRend->h_freq_domain_decorr_ap_params, hDirACRend->h_freq_domain_decorr_ap_state ); - v_multc_fixed( onset_filter_fx, 536870912 /* 0.25f in Q31 */, onset_filter_fx, hSpatParamRendCom->num_freq_bands ); /* Q31 */ + v_multc_fixed( onset_filter_fx, 536870912 /* 0.25f in Q31 */, onset_filter_fx, hSpatParamRendCom->num_freq_bands ); /* Q31 */ +#ifdef VEC_ARITH_OPT_v1 + v_add_fixed_no_hdrm( onset_filter_fx, onset_filter_subframe_fx, onset_filter_subframe_fx, hSpatParamRendCom->num_freq_bands ); /* Q31 */ +#else /* VEC_ARITH_OPT_v1 */ v_add_fixed( onset_filter_fx, onset_filter_subframe_fx, onset_filter_subframe_fx, hSpatParamRendCom->num_freq_bands, 0 ); /* Q31 */ - p_onset_filter_fx = onset_filter_subframe_fx; /*q31*/ +#endif /* VEC_ARITH_OPT_v1 */ + p_onset_filter_fx = onset_filter_subframe_fx; /*q31*/ } ELSE { @@ -4627,7 +4631,11 @@ static void ivas_masa_ext_dirac_render_sf_fx( DirAC_mem.reference_power_smooth_q = DirAC_mem.reference_power_q; move16(); #endif +#ifdef VEC_ARITH_OPT_v1 + v_add_fixed_no_hdrm( reference_power_fix, reference_power_smooth_fx, reference_power_smooth_fx, hSpatParamRendCom->num_freq_bands ); // DirAC_mem.reference_power_smooth_q +#else /* VEC_ARITH_OPT_v1 */ v_add_fixed( reference_power_fix, reference_power_smooth_fx, reference_power_smooth_fx, hSpatParamRendCom->num_freq_bands, 0 ); // DirAC_mem.reference_power_smooth_q +#endif /* VEC_ARITH_OPT_v1 */ } } /*Rescaling proto_direct_buffer_f*/ diff --git a/lib_rend/ivas_efap_fx.c b/lib_rend/ivas_efap_fx.c index 543decbdd..4fb88643b 100644 --- a/lib_rend/ivas_efap_fx.c +++ b/lib_rend/ivas_efap_fx.c @@ -1525,7 +1525,11 @@ static void get_poly_gains_fx( A[1] = elePoly[i - 1]; // q22 move32(); +#ifdef VEC_ARITH_OPT_v1 + v_sub_fixed_no_hdrm( P, A, P_minus_A, 2 ); /* Precalculate value of (P-A) q22*/ +#else /* VEC_ARITH_OPT_v1 */ v_sub_fixed( P, A, P_minus_A, 2, 0 ); /* Precalculate value of (P-A) q22*/ +#endif /* VEC_ARITH_OPT_v1 */ FOR( j = i; j < numChan - 2 + i; ++j ) { @@ -1578,7 +1582,11 @@ static Word32 get_tri_gain_fx( tmpN[1] = L_sub( C[0], B[0] ); // q22 move32(); - v_sub_fixed( B, A, tmpSub1, 2, 0 ); // tmpSub1 q22 +#ifdef VEC_ARITH_OPT_v1 + v_sub_fixed_no_hdrm( B, A, tmpSub1, 2 ); // tmpSub1 q22 +#else /* VEC_ARITH_OPT_v1 */ + v_sub_fixed( B, A, tmpSub1, 2, 0 ); // tmpSub1 q22 +#endif /* VEC_ARITH_OPT_v1 */ tmpDot1 = dotp_fixed( tmpN, tmpSub1, 2 ); // Q13 @@ -2237,7 +2245,11 @@ static void sort_channels_vertex_fx( move32(); } +#ifdef VEC_ARITH_OPT_v1 + v_sub_fixed_no_hdrm( tmpV1, tmpV2, tmpV3, 3 ); // tmpV3 Q30 +#else /* VEC_ARITH_OPT_v1 */ v_sub_fixed( tmpV1, tmpV2, tmpV3, 3, 0 ); // tmpV3 Q30 +#endif /* VEC_ARITH_OPT_v1 */ Word16 exp2 = 2; move16(); normV = ISqrt32( dotp_fixed( tmpV3, tmpV3, 3 ) /*q29*/, &exp2 ); // q=31-exp2 @@ -2419,7 +2431,11 @@ static Word16 in_poly_fx( /* Angles are in Q22 */ A[1] = poly.polyEle[0]; // q22 move32(); - v_sub_fixed( P, A, P_minus_A, 2, 0 ); /* Precalculate value of (P-A) q22*/ +#ifdef VEC_ARITH_OPT_v1 + v_sub_fixed_no_hdrm( P, A, P_minus_A, 2 ); /* Precalculate value of (P-A) q22*/ +#else /* VEC_ARITH_OPT_v1 */ + v_sub_fixed( P, A, P_minus_A, 2, 0 ); /* Precalculate value of (P-A) q22*/ +#endif /* VEC_ARITH_OPT_v1 */ FOR( n = 1; n < sub( numVertices, 1 ); ++n ) { @@ -2487,8 +2503,13 @@ static Word16 in_tri_fx( I'll just compute the determinant and if it's equal to 0, that means the two vectors are colinear */ - v_sub_fixed( B, A, tmpDot1, 2, 0 ); // tmpDot1 q22 - v_sub_fixed( C, A, tmpDot2, 2, 0 ); // tmpDot2 q22 +#ifdef VEC_ARITH_OPT_v1 + v_sub_fixed_no_hdrm( B, A, tmpDot1, 2 ); // tmpDot1 q22 + v_sub_fixed_no_hdrm( C, A, tmpDot2, 2 ); // tmpDot2 q22 +#else /* VEC_ARITH_OPT_v1 */ + v_sub_fixed( B, A, tmpDot1, 2, 0 ); // tmpDot1 q22 + v_sub_fixed( C, A, tmpDot2, 2, 0 ); // tmpDot2 q22 +#endif /* VEC_ARITH_OPT_v1 */ /* Verification of the non-colinearity : Q22 * Q22 = Q13 */ invFactor = L_sub( Mpy_32_32( tmpDot1[0], tmpDot2[1] ), Mpy_32_32( tmpDot1[1], tmpDot2[0] ) ); /*q22+q22-q31->q13*/ diff --git a/lib_rend/ivas_omasa_ana_fx.c b/lib_rend/ivas_omasa_ana_fx.c index ba632c6ba..7b7f64c84 100644 --- a/lib_rend/ivas_omasa_ana_fx.c +++ b/lib_rend/ivas_omasa_ana_fx.c @@ -538,8 +538,13 @@ static void ivas_omasa_param_est_ana_fx( FOR( i = 1; i < nchan_ism; i++ ) { +#ifdef VEC_ARITH_OPT_v1 + v_add_fixed_no_hdrm( Chnl_RealBuffer_fx[i], Foa_RealBuffer_fx[0], Foa_RealBuffer_fx[0], num_freq_bins ); // Q: Chnl_RealBuffer_q + v_add_fixed_no_hdrm( Chnl_ImagBuffer_fx[i], Foa_ImagBuffer_fx[0], Foa_ImagBuffer_fx[0], num_freq_bins ); // Q: Chnl_ImagBuffer_q +#else /* VEC_ARITH_OPT_v1 */ v_add_fixed( Chnl_RealBuffer_fx[i], Foa_RealBuffer_fx[0], Foa_RealBuffer_fx[0], num_freq_bins, 0 ); // Q: Chnl_RealBuffer_q v_add_fixed( Chnl_ImagBuffer_fx[i], Foa_ImagBuffer_fx[0], Foa_ImagBuffer_fx[0], num_freq_bins, 0 ); // Q: Chnl_ImagBuffer_q +#endif /* VEC_ARITH_OPT_v1 */ } /* Y */ diff --git a/lib_rend/ivas_reverb_fx.c b/lib_rend/ivas_reverb_fx.c index 9b149940c..12b94b1a2 100644 --- a/lib_rend/ivas_reverb_fx.c +++ b/lib_rend/ivas_reverb_fx.c @@ -2177,13 +2177,23 @@ void ivas_binaural_reverb_processSubframe_fx( { IF( s_and( ch, 1 ) ) { +#ifdef VEC_ARITH_OPT_v1 + v_add_fixed_no_hdrm( hReverb->preDelayBufferReal_fx[idx], inReal[ch][sample], hReverb->preDelayBufferReal_fx[idx], hReverb->numBins ); + v_add_fixed_no_hdrm( hReverb->preDelayBufferImag_fx[idx], inImag[ch][sample], hReverb->preDelayBufferImag_fx[idx], hReverb->numBins ); +#else /* VEC_ARITH_OPT_v1 */ v_add_fixed( hReverb->preDelayBufferReal_fx[idx], inReal[ch][sample], hReverb->preDelayBufferReal_fx[idx], hReverb->numBins, 0 ); v_add_fixed( hReverb->preDelayBufferImag_fx[idx], inImag[ch][sample], hReverb->preDelayBufferImag_fx[idx], hReverb->numBins, 0 ); +#endif /* VEC_ARITH_OPT_v1 */ } ELSE { +#ifdef VEC_ARITH_OPT_v1 + v_sub_fixed_no_hdrm( hReverb->preDelayBufferReal_fx[idx], inImag[ch][sample], hReverb->preDelayBufferReal_fx[idx], hReverb->numBins ); + v_add_fixed_no_hdrm( hReverb->preDelayBufferImag_fx[idx], inReal[ch][sample], hReverb->preDelayBufferImag_fx[idx], hReverb->numBins ); +#else /* VEC_ARITH_OPT_v1 */ v_sub_fixed( hReverb->preDelayBufferReal_fx[idx], inImag[ch][sample], hReverb->preDelayBufferReal_fx[idx], hReverb->numBins, 0 ); v_add_fixed( hReverb->preDelayBufferImag_fx[idx], inReal[ch][sample], hReverb->preDelayBufferImag_fx[idx], hReverb->numBins, 0 ); +#endif /* VEC_ARITH_OPT_v1 */ } } idx = add( idx, 1 ) % hReverb->preDelayBufferLength; @@ -2213,20 +2223,40 @@ void ivas_binaural_reverb_processSubframe_fx( SWITCH( phaseShiftTypePr[tapIdx] ) { case 0: /* 0 degrees phase */ +#ifdef VEC_ARITH_OPT_v1 + v_add_fixed_no_hdrm( hReverb->outputBufferReal_fx[bin][ch], tapRealPr_fx[tapIdx], hReverb->outputBufferReal_fx[bin][ch], numSlots ); + v_add_fixed_no_hdrm( hReverb->outputBufferImag_fx[bin][ch], tapImagPr_fx[tapIdx], hReverb->outputBufferImag_fx[bin][ch], numSlots ); +#else /* VEC_ARITH_OPT_v1 */ v_add_fixed( hReverb->outputBufferReal_fx[bin][ch], tapRealPr_fx[tapIdx], hReverb->outputBufferReal_fx[bin][ch], numSlots, 0 ); v_add_fixed( hReverb->outputBufferImag_fx[bin][ch], tapImagPr_fx[tapIdx], hReverb->outputBufferImag_fx[bin][ch], numSlots, 0 ); +#endif /* VEC_ARITH_OPT_v1 */ BREAK; case 1: /* 90 degrees phase */ +#ifdef VEC_ARITH_OPT_v1 + v_sub_fixed_no_hdrm( hReverb->outputBufferReal_fx[bin][ch], tapImagPr_fx[tapIdx], hReverb->outputBufferReal_fx[bin][ch], numSlots ); + v_add_fixed_no_hdrm( hReverb->outputBufferImag_fx[bin][ch], tapRealPr_fx[tapIdx], hReverb->outputBufferImag_fx[bin][ch], numSlots ); +#else /* VEC_ARITH_OPT_v1 */ v_sub_fixed( hReverb->outputBufferReal_fx[bin][ch], tapImagPr_fx[tapIdx], hReverb->outputBufferReal_fx[bin][ch], numSlots, 0 ); v_add_fixed( hReverb->outputBufferImag_fx[bin][ch], tapRealPr_fx[tapIdx], hReverb->outputBufferImag_fx[bin][ch], numSlots, 0 ); +#endif /* VEC_ARITH_OPT_v1 */ BREAK; case 2: /* 180 degrees phase */ +#ifdef VEC_ARITH_OPT_v1 + v_sub_fixed_no_hdrm( hReverb->outputBufferReal_fx[bin][ch], tapRealPr_fx[tapIdx], hReverb->outputBufferReal_fx[bin][ch], numSlots ); + v_sub_fixed_no_hdrm( hReverb->outputBufferImag_fx[bin][ch], tapImagPr_fx[tapIdx], hReverb->outputBufferImag_fx[bin][ch], numSlots ); +#else /* VEC_ARITH_OPT_v1 */ v_sub_fixed( hReverb->outputBufferReal_fx[bin][ch], tapRealPr_fx[tapIdx], hReverb->outputBufferReal_fx[bin][ch], numSlots, 0 ); v_sub_fixed( hReverb->outputBufferImag_fx[bin][ch], tapImagPr_fx[tapIdx], hReverb->outputBufferImag_fx[bin][ch], numSlots, 0 ); +#endif /* VEC_ARITH_OPT_v1 */ BREAK; default: /* 270 degrees phase */ +#ifdef VEC_ARITH_OPT_v1 + v_add_fixed_no_hdrm( hReverb->outputBufferReal_fx[bin][ch], tapImagPr_fx[tapIdx], hReverb->outputBufferReal_fx[bin][ch], numSlots ); + v_sub_fixed_no_hdrm( hReverb->outputBufferImag_fx[bin][ch], tapRealPr_fx[tapIdx], hReverb->outputBufferImag_fx[bin][ch], numSlots ); +#else /* VEC_ARITH_OPT_v1 */ v_add_fixed( hReverb->outputBufferReal_fx[bin][ch], tapImagPr_fx[tapIdx], hReverb->outputBufferReal_fx[bin][ch], numSlots, 0 ); v_sub_fixed( hReverb->outputBufferImag_fx[bin][ch], tapRealPr_fx[tapIdx], hReverb->outputBufferImag_fx[bin][ch], numSlots, 0 ); +#endif /* VEC_ARITH_OPT_v1 */ BREAK; } } diff --git a/lib_rend/lib_rend.c b/lib_rend/lib_rend.c index 859768784..ad84c92e9 100644 --- a/lib_rend/lib_rend.c +++ b/lib_rend/lib_rend.c @@ -5772,7 +5772,11 @@ static ivas_error renderLfeToBinaural_fx( { writePtr = getSmplPtr_fx( outAudio, ear_idx, 0 ); move32(); +#ifdef VEC_ARITH_OPT_v1 + v_add_fixed_no_hdrm( writePtr, tmpLfeBuffer, writePtr, frame_size ); /* Q(out_q) */ +#else /* VEC_ARITH_OPT_v1 */ v_add_fixed( writePtr, tmpLfeBuffer, writePtr, frame_size, 0 ); /* Q(out_q) */ +#endif /* VEC_ARITH_OPT_v1 */ } pop_wmops(); @@ -6793,7 +6797,11 @@ static void renderMasaToMasa( ELSE IF( EQ_16( masaInput->base.inputBuffer.config.numChannels, 2 ) && EQ_16( outAudio.config.numChannels, 1 ) ) { // v_add( tmpBuffer[0], tmpBuffer[1], tmpBuffer[0], masaInput->base.inputBuffer.config.numSamplesPerChannel ); +#ifdef VEC_ARITH_OPT_v1 + v_add_fixed_no_hdrm( tmpBuffer_fx[0], tmpBuffer_fx[1], tmpBuffer_fx[0], masaInput->base.inputBuffer.config.numSamplesPerChannel ); +#else /* VEC_ARITH_OPT_v1 */ v_add_fixed( tmpBuffer_fx[0], tmpBuffer_fx[1], tmpBuffer_fx[0], masaInput->base.inputBuffer.config.numSamplesPerChannel, 0 ); +#endif /* VEC_ARITH_OPT_v1 */ } /* Copy metadata */ -- GitLab