From bd88ebcc33968191ec4327a1179d95abab2d6883 Mon Sep 17 00:00:00 2001 From: Sandesh Venkatesh Date: Fri, 13 Jun 2025 19:19:36 +0530 Subject: [PATCH] Bit exact optimizations in SBA decoder renderer path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes are enabled with macro OPT_SBA_REND_V1_BE Functions and corresponding WMOPs gains: ivas_dirac_dec_output_synthesis_process_subframe_psd_ls_fx 1.5 vbap_gains 0.2 vector_matrix_multiply_3x3_fx 2.2 protoSignalComputation4_fx 1.8 in_tri_fx 0.1 Test case: ./IVAS_cod -sba 1 256000 48 scripts/testv/stvFOA48c.wav bit_sba ./IVAS_dec 7_1 48 bit_sba sba.wav  --- lib_com/options.h | 1 + .../ivas_dirac_dec_binaural_functions_fx.c | 165 ++++++++++++++++++ lib_rend/ivas_dirac_output_synthesis_dec_fx.c | 63 ++++++- lib_rend/ivas_dirac_rend_fx.c | 8 +- lib_rend/ivas_efap_fx.c | 16 +- lib_rend/ivas_vbap_fx.c | 38 +++- 6 files changed, 277 insertions(+), 14 deletions(-) diff --git a/lib_com/options.h b/lib_com/options.h index 9dc7e7fd1..7e3799ffc 100644 --- a/lib_com/options.h +++ b/lib_com/options.h @@ -75,6 +75,7 @@ #define FIX_1379_MASA_ANGLE_ROUND /* Note: each compile switch (FIX_1101_...) is independent from the other ones */ +#define OPT_SBA_REND_V1_BE #define OPT_SBA_ENC_V1_BE #define OPT_BIN_RENDERER_V1 #define OPT_BIN_RENDERER_V2 diff --git a/lib_rend/ivas_dirac_dec_binaural_functions_fx.c b/lib_rend/ivas_dirac_dec_binaural_functions_fx.c index 6dc701468..c92f800af 100644 --- a/lib_rend/ivas_dirac_dec_binaural_functions_fx.c +++ b/lib_rend/ivas_dirac_dec_binaural_functions_fx.c @@ -3415,6 +3415,169 @@ static void eig2x2_fx( move16(); move16(); +#ifdef OPT_SBA_REND_V1_BE + /* Eigenvectors */ + FOR( ch = 0; ch < BINAURAL_CHANNELS; ch++ ) + { + Word16 q_diff = sub( q_e, *q_D ); + IF( q_diff > 0 ) + { + tmp1 = L_sub( D_fx[ch], L_shr( e1, q_diff ) ); + tmp2 = L_sub( D_fx[ch], L_shr( e2, q_diff ) ); + q_tmp1 = *q_D; + move16(); + } + ELSE + { + tmp1 = L_sub( L_shl( D_fx[ch], q_diff ), e1 ); + tmp2 = L_sub( L_shl( D_fx[ch], q_diff ), e2 ); + q_tmp1 = q_e; + move16(); + } + + IF( GT_32( L_abs( tmp2 ), L_abs( tmp1 ) ) ) + { + s_fx = tmp2; + move32(); + exp = sub( norm_l( s_fx ), 1 ); + tmp2 = Mpy_32_32( s_fx, s_fx ); + q_tmp2 = sub( add( q_tmp1, q_tmp1 ), 31 ); + + tmp2 = BASOP_Util_Add_Mant32Exp( crossSquare_fx, sub( 31, q_crossSquare ), tmp2, sub( 31, q_tmp2 ), &q_tmp2 ); + q_tmp2 = sub( 31, q_tmp2 ); + + tmp3 = BASOP_Util_Add_Mant32Exp( tmp2, sub( 31, q_tmp2 ), epsilon_mant, epsilon_exp, &exp_tmp3 ); + + tmp2 = BASOP_Util_Divide3232_Scale_newton( ONE_IN_Q30, tmp3, &exp ); + exp = sub( exp, sub( Q30, sub( 31, exp_tmp3 ) ) ); + normVal_fx = Sqrt32( tmp2, &exp ); // q_tmp2 + q_tmp2 = sub( 31, exp ); + + q_diff = sub( q_c, q_tmp1 ); + IF( q_diff > 0 ) + { + c_re = L_shr( c_re, q_diff ); + c_im = L_shr( c_im, q_diff ); + q_c = q_tmp1; + move16(); + } + ELSE + { + s_fx = L_shl( s_fx, q_diff ); + q_tmp1 = q_c; + move16(); + } + + Ure_fx[0][ch] = Mpy_32_32( s_fx, normVal_fx ); + move32(); + Ure_fx[1][ch] = Mpy_32_32( c_re, normVal_fx ); + move32(); + Uim_fx[1][ch] = Mpy_32_32( c_im, normVal_fx ); + move32(); + q_U_1 = sub( add( q_tmp1, q_tmp2 ), 31 ); + + IF( q_U_2 != 0 ) + { + q_diff = sub( q_U_2, q_U_1 ); + IF( q_diff > 0 ) + { + Ure_fx[1][ch - 1] = L_shr( Ure_fx[1][ch - 1], q_diff ); + Ure_fx[0][ch - 1] = L_shr( Ure_fx[0][ch - 1], q_diff ); + Uim_fx[0][ch - 1] = L_shr( Uim_fx[0][ch - 1], q_diff ); + q_U_2 = q_U_1; + move32(); + move32(); + move32(); + move16(); + } + ELSE IF( GT_16( q_U_1, q_U_2 ) ) + { + Ure_fx[1][ch] = L_shl( Ure_fx[1][ch], q_diff ); + Ure_fx[0][ch] = L_shl( Ure_fx[0][ch], q_diff ); + Uim_fx[1][ch] = L_shl( Uim_fx[1][ch], q_diff ); + q_U_1 = q_U_2; + move32(); + move32(); + move32(); + move16(); + } + } + q_U_2 = q_U_1; + move16(); + } + ELSE + { + s_fx = tmp1; + move32(); + + exp = sub( norm_l( s_fx ), 1 ); + tmp2 = Mpy_32_32( s_fx, s_fx ); + q_tmp2 = sub( add( q_tmp1, q_tmp1 ), 31 ); + + tmp2 = BASOP_Util_Add_Mant32Exp( crossSquare_fx, sub( 31, q_crossSquare ), tmp2, sub( 31, q_tmp2 ), &q_tmp2 ); + q_tmp2 = sub( 31, q_tmp2 ); + + tmp3 = BASOP_Util_Add_Mant32Exp( tmp2, sub( 31, q_tmp2 ), epsilon_mant, epsilon_exp, &exp_tmp3 ); + + tmp2 = BASOP_Util_Divide3232_Scale_newton( ONE_IN_Q30, tmp3, &exp ); + exp = sub( exp, sub( Q30, sub( 31, exp_tmp3 ) ) ); + normVal_fx = Sqrt32( tmp2, &exp ); // q_tmp2 + q_tmp2 = sub( 31, exp ); + + q_diff = sub( q_c, q_tmp1 ); + IF( q_diff > 0 ) + { + c_re = L_shr( c_re, q_diff ); + c_im = L_shr( c_im, q_diff ); + q_c = q_tmp1; + move16(); + } + ELSE + { + s_fx = L_shl( s_fx, q_diff ); + q_tmp1 = q_c; + move16(); + } + + Ure_fx[1][ch] = Mpy_32_32( s_fx, normVal_fx ); + move32(); + Ure_fx[0][ch] = Mpy_32_32( c_re, normVal_fx ); + move32(); + Uim_fx[0][ch] = Mpy_32_32( L_negate( c_im ), normVal_fx ); + move32(); + q_U_2 = sub( add( q_tmp1, q_tmp2 ), 31 ); + + IF( q_U_1 != 0 ) + { + q_diff = sub( q_U_2, q_U_1 ); + IF( q_diff > 0 ) + { + Ure_fx[1][ch] = L_shr( Ure_fx[1][ch], q_diff ); + Ure_fx[0][ch] = L_shr( Ure_fx[0][ch], q_diff ); + Uim_fx[0][ch] = L_shr( Uim_fx[0][ch], q_diff ); + q_U_2 = q_U_1; + move32(); + move32(); + move32(); + move16(); + } + ELSE IF( GT_16( q_U_1, q_U_2 ) ) + { + Ure_fx[1][ch - 1] = L_shl( Ure_fx[1][ch - 1], q_diff ); + Ure_fx[0][ch - 1] = L_shl( Ure_fx[0][ch - 1], q_diff ); + Uim_fx[1][ch - 1] = L_shl( Uim_fx[1][ch - 1], q_diff ); + q_U_1 = q_U_2; + move32(); + move32(); + move32(); + move16(); + } + } + q_U_1 = q_U_2; + move16(); + } + } +#else /* OPT_SBA_REND_V1_BE */ /* Eigenvectors */ FOR( ch = 0; ch < BINAURAL_CHANNELS; ch++ ) { @@ -3571,6 +3734,8 @@ static void eig2x2_fx( move16(); } } +#endif /* OPT_SBA_REND_V1_BE */ + if ( q_U_1 != 0 ) { *q_U = q_U_1; diff --git a/lib_rend/ivas_dirac_output_synthesis_dec_fx.c b/lib_rend/ivas_dirac_output_synthesis_dec_fx.c index 872d89c34..ba0ab8953 100644 --- a/lib_rend/ivas_dirac_output_synthesis_dec_fx.c +++ b/lib_rend/ivas_dirac_output_synthesis_dec_fx.c @@ -2465,6 +2465,11 @@ void ivas_dirac_dec_output_synthesis_process_subframe_psd_ls_fx( move16(); move16(); +#ifdef OPT_SBA_REND_V1_BE + Word32 cmp = W_shl_sat_l( DIRAC_GAIN_LIMIT_Q26, sub( h_dirac_output_synthesis_state->gains_dir_prev_q, 26 ) ); + Word32 cmp2 = W_extract_h( W_shl( W_mult_32_32( DIRAC_GAIN_LIMIT_Q26, L_shl( 1, h_dirac_output_synthesis_state->gains_diff_prev_q ) ), Q5 ) ); +#endif /* OPT_SBA_REND_V1_BE */ + FOR( k = 0; k < nchan_out_woLFE; k++ ) { Word32 power_smooth_temp; @@ -2512,11 +2517,19 @@ void ivas_dirac_dec_output_synthesis_process_subframe_psd_ls_fx( *( p_gains_dir ) = 0; move32(); } +#ifdef OPT_SBA_REND_V1_BE + ELSE IF( GT_32( *( p_gains_dir ), cmp ) ) + { + *( p_gains_dir ) = cmp; /*26 + h_dirac_output_synthesis_state->gains_dir_prev_q + 1 + 5 - 32 -> h_dirac_output_synthesis_state->gains_dir_prev_q*/ + move32(); + } +#else /* OPT_SBA_REND_V1_BE */ ELSE IF( GT_32( *( p_gains_dir ), W_extract_h( W_shl( W_mult_32_32( DIRAC_GAIN_LIMIT_Q26, L_shl( 1, h_dirac_output_synthesis_state->gains_dir_prev_q ) ), Q5 ) ) ) ) { *( p_gains_dir ) = W_extract_h( W_shl( W_mult_32_32( DIRAC_GAIN_LIMIT_Q26, L_shl( 1, h_dirac_output_synthesis_state->gains_dir_prev_q ) ), Q5 ) ); /*26 + h_dirac_output_synthesis_state->gains_dir_prev_q + 1 + 5 - 32 -> h_dirac_output_synthesis_state->gains_dir_prev_q*/ move32(); } +#endif /* OPT_SBA_REND_V1_BE */ IF( *( p_cy_cross_dir_smooth_prev++ ) < 0 ) { @@ -2549,11 +2562,19 @@ void ivas_dirac_dec_output_synthesis_process_subframe_psd_ls_fx( *( p_gains_diff ) = 0; move32(); } +#ifdef OPT_SBA_REND_V1_BE + ELSE IF( GT_32( *( p_gains_diff ), cmp2 ) ) /*h_dirac_output_synthesis_state->gains_diff_prev_q*/ + { + *( p_gains_diff ) = cmp2; /*h_dirac_output_synthesis_state->gains_diff_prev_q*/ + move32(); + } +#else /* OPT_SBA_REND_V1_BE */ ELSE IF( GT_32( *( p_gains_diff ), W_extract_h( W_shl( W_mult_32_32( DIRAC_GAIN_LIMIT_Q26, L_shl( 1, h_dirac_output_synthesis_state->gains_diff_prev_q ) ), Q5 ) ) ) ) /*h_dirac_output_synthesis_state->gains_diff_prev_q*/ { *( p_gains_diff ) = W_extract_h( W_shl( W_mult_32_32( DIRAC_GAIN_LIMIT_Q26, L_shl( 1, h_dirac_output_synthesis_state->gains_diff_prev_q ) ), Q5 ) ); /*h_dirac_output_synthesis_state->gains_diff_prev_q*/ move32(); } +#endif /* OPT_SBA_REND_V1_BE */ p_gains_diff++; } @@ -2564,15 +2585,25 @@ void ivas_dirac_dec_output_synthesis_process_subframe_psd_ls_fx( g1 = alpha[l]; // Q31 move32(); g2 = L_sub( ONE_IN_Q31, g1 ); // Q31 +#ifdef OPT_SBA_REND_V1_BE + W_temp = W_mac_32_32( W_mult_32_32( g1, ( *( p_cy_auto_dir_smooth++ ) ) ), + g2, ( *( p_cy_auto_dir_smooth_prev ) ) ); /*32+q_cy_auto_dir_smooth_prev_local*/ +#else /* OPT_SBA_REND_V1_BE */ W_temp = W_add( W_mult_32_32( g1, ( *( p_cy_auto_dir_smooth++ ) ) ), W_mult_32_32( g2, ( *( p_cy_auto_dir_smooth_prev ) ) ) ); /*32+q_cy_auto_dir_smooth_prev_local*/ +#endif /* OPT_SBA_REND_V1_BE */ q_tmp = W_norm( W_temp ); L_tmp = W_extract_h( W_shl( W_temp, q_tmp ) ); // q_cy_auto_dir_smooth_prev_local + q_tmp *( p_cy_auto_dir_smooth_prev++ ) = L_shr_r( L_tmp, q_tmp ); // q_cy_auto_dir_smooth_prev_local move32(); +#ifdef OPT_SBA_REND_V1_BE + *( p_cy_cross_dir_smooth_prev ) = Madd_32_32( Mpy_32_32( g1, ( *( p_cy_cross_dir_smooth ) ) ), + g2, ( *( p_cy_cross_dir_smooth_prev ) ) ); // (Q31, q_cy_cross_dir_smooth_prev) -> q_cy_cross_dir_smooth_prev +#else /* OPT_SBA_REND_V1_BE */ *( p_cy_cross_dir_smooth_prev ) = L_add( Mpy_32_32( g1, ( *( p_cy_cross_dir_smooth ) ) ), Mpy_32_32( g2, ( *( p_cy_cross_dir_smooth_prev ) ) ) ); // (Q31, q_cy_cross_dir_smooth_prev) -> q_cy_cross_dir_smooth_prev +#endif /* OPT_SBA_REND_V1_BE */ move32(); test(); if ( *( p_cy_cross_dir_smooth_prev ) == 0 && ( *( p_cy_cross_dir_smooth ) != 0 ) ) @@ -2604,11 +2635,19 @@ void ivas_dirac_dec_output_synthesis_process_subframe_psd_ls_fx( *( p_gains_dir ) = 0; move32(); } +#ifdef OPT_SBA_REND_V1_BE + ELSE IF( GT_32( *( p_gains_dir ), cmp ) ) /*gains_dir_prev_q*/ + { + *( p_gains_dir ) = cmp; /*gains_dir_prev_q*/ + move32(); + } +#else /* OPT_SBA_REND_V1_BE */ ELSE IF( GT_32( *( p_gains_dir ), W_extract_h( W_shl( W_mult_32_32( DIRAC_GAIN_LIMIT_Q26, L_shl( 1, h_dirac_output_synthesis_state->gains_dir_prev_q ) ), Q5 ) ) ) ) /*gains_dir_prev_q*/ { *( p_gains_dir ) = W_extract_h( W_shl( W_mult_32_32( DIRAC_GAIN_LIMIT_Q26, L_shl( 1, h_dirac_output_synthesis_state->gains_dir_prev_q ) ), Q5 ) ); /*gains_dir_prev_q*/ move32(); } +#endif /* OPT_SBA_REND_V1_BE */ IF( *( p_cy_cross_dir_smooth_prev++ ) < 0 ) { @@ -2695,7 +2734,11 @@ void ivas_dirac_dec_output_synthesis_process_subframe_psd_ls_fx( shl( i_mult( proto_direct_index[k], num_freq_bands ), Q1 ); FOR( l = 0; l < num_freq_bands; l++ ) { - g = L_add( Mpy_32_32( g1, *( p_gain_1++ ) ), Mpy_32_32( g2, *( p_gain_2++ ) ) ); // (Q31, gains_dir_prev_q) -> gains_dir_prev_q +#ifdef OPT_SBA_REND_V1_BE + g = Madd_32_32( Mpy_32_32( g1, *( p_gain_1++ ) ), g2, *( p_gain_2++ ) ); // (Q31, gains_dir_prev_q) -> gains_dir_prev_q +#else /* OPT_SBA_REND_V1_BE */ + g = L_add( Mpy_32_32( g1, *( p_gain_1++ ) ), Mpy_32_32( g2, *( p_gain_2++ ) ) ); // (Q31, gains_dir_prev_q) -> gains_dir_prev_q +#endif /* OPT_SBA_REND_V1_BE */ Cldfb_RealBuffer64_fx[k][buf_idx][l] = W_mult0_32_32( g, ( *( p_power_smooth++ ) ) ); // (gains_dir_prev_q, q_proto_direct_buffer) -> gains_dir_prev_q + q_proto_direct_buffer move64(); @@ -2717,7 +2760,12 @@ void ivas_dirac_dec_output_synthesis_process_subframe_psd_ls_fx( { FOR( l = 0; l < h_dirac_output_synthesis_params->max_band_decorr; l++ ) { - g = L_add( Mpy_32_32( g1, *( p_gain_1++ ) ), Mpy_32_32( g2, *( p_gain_2++ ) ) ); // (Q31, gains_diff_prev_q) -> gains_diff_prev_q +#ifdef OPT_SBA_REND_V1_BE + g = Madd_32_32( Mpy_32_32( g1, *( p_gain_1++ ) ), g2, *( p_gain_2++ ) ); // (Q31, gains_diff_prev_q) -> gains_diff_prev_q +#else /* OPT_SBA_REND_V1_BE */ + g = L_add( Mpy_32_32( g1, *( p_gain_1++ ) ), Mpy_32_32( g2, *( p_gain_2++ ) ) ); // (Q31, gains_diff_prev_q) -> gains_diff_prev_q + +#endif /* OPT_SBA_REND_V1_BE */ Cldfb_RealBuffer64_fx[k][buf_idx][l] = W_add( Cldfb_RealBuffer64_fx[k][buf_idx][l], W_shr( W_mult0_32_32( g, ( *( p_power_smooth_diff++ ) ) ), negate( q_align ) ) ); // (gains_diff_prev_q, q_proto_direct_buffer) -> gains_diff_prev_q + q_proto_direct_buffer move64(); @@ -2766,16 +2814,27 @@ void ivas_dirac_dec_output_synthesis_process_subframe_psd_ls_fx( } } q_align = W_norm( W_temp ); +#ifdef OPT_SBA_REND_V1_BE + Word16 shift = sub( q_align, 32 ); +#endif /* OPT_SBA_REND_V1_BE */ + FOR( buf_idx = 0; buf_idx < nbslots; ++buf_idx ) { FOR( k = 0; k < nchan_out_woLFE; k++ ) { FOR( l = 0; l < num_freq_bands; l++ ) { +#ifdef OPT_SBA_REND_V1_BE + RealBuffer[k][buf_idx][l] = W_shl_sat_l( Cldfb_RealBuffer64_fx[k][buf_idx][l], shift ); /*( ( ( h_dirac_output_synthesis_state->proto_direct_buffer_f_q+h_dirac_output_synthesis_state->gains_dir_prev_q )+ q_align )- 32 )*/ + move32(); + ImagBuffer[k][buf_idx][l] = W_shl_sat_l( Cldfb_ImagBuffer64_fx[k][buf_idx][l], shift ); /*( ( ( h_dirac_output_synthesis_state->proto_direct_buffer_f_q+h_dirac_output_synthesis_state->gains_dir_prev_q )+ q_align )- 32 )*/ + move32(); +#else /* OPT_SBA_REND_V1_BE */ RealBuffer[k][buf_idx][l] = W_extract_h( W_shl( Cldfb_RealBuffer64_fx[k][buf_idx][l], q_align ) ); /*( ( ( h_dirac_output_synthesis_state->proto_direct_buffer_f_q+h_dirac_output_synthesis_state->gains_dir_prev_q )+ q_align )- 32 )*/ move32(); ImagBuffer[k][buf_idx][l] = W_extract_h( W_shl( Cldfb_ImagBuffer64_fx[k][buf_idx][l], q_align ) ); /*( ( ( h_dirac_output_synthesis_state->proto_direct_buffer_f_q+h_dirac_output_synthesis_state->gains_dir_prev_q )+ q_align )- 32 )*/ move32(); +#endif /* OPT_SBA_REND_V1_BE */ } } } diff --git a/lib_rend/ivas_dirac_rend_fx.c b/lib_rend/ivas_dirac_rend_fx.c index 99a84bf52..155524790 100644 --- a/lib_rend/ivas_dirac_rend_fx.c +++ b/lib_rend/ivas_dirac_rend_fx.c @@ -3155,7 +3155,11 @@ void protoSignalComputation4_fx( sq_tmp_fx = Madd_32_32( Mpy_32_32( proto_frame_f_fx[idx], proto_frame_f_fx[idx] ), proto_frame_f_fx[idx + 1], proto_frame_f_fx[idx + 1] ); // 2*(proto_frame_f_q)-31 sq_tmp_q = sub( add( *proto_frame_f_q, *proto_frame_f_q ), 31 ); +#ifdef OPT_SBA_REND_V1_BE + proto_power_smooth_fx_q = s_min( *proto_power_smooth_q, sq_tmp_q ); + proto_power_smooth_fx[l + ( k * num_freq_bands )] = L_add( L_shr( proto_power_smooth_fx[l + ( k * num_freq_bands )], sub( *proto_power_smooth_q, proto_power_smooth_fx_q ) ), L_shr( sq_tmp_fx, sub( sq_tmp_q, proto_power_smooth_fx_q ) ) ); // proto_power_smooth_fx_q +#else /* OPT_SBA_REND_V1_BE */ IF( LT_16( *proto_power_smooth_q, sq_tmp_q ) ) { proto_power_smooth_fx[l + ( k * num_freq_bands )] = L_add( proto_power_smooth_fx[l + ( k * num_freq_bands )], L_shr( sq_tmp_fx, sub( sq_tmp_q, *proto_power_smooth_q ) ) ); // proto_power_smooth_q @@ -3170,8 +3174,8 @@ void protoSignalComputation4_fx( proto_power_smooth_fx_q = sq_tmp_q; move16(); } - - p_proto_buffer_fx[idx] = proto_frame_f_fx[idx]; // proto_frame_f_q +#endif /* OPT_SBA_REND_V1_BE */ + p_proto_buffer_fx[idx] = proto_frame_f_fx[idx]; // proto_frame_f_q move32(); p_proto_buffer_fx[idx + 1] = proto_frame_f_fx[idx + 1]; // proto_frame_f_q move32(); diff --git a/lib_rend/ivas_efap_fx.c b/lib_rend/ivas_efap_fx.c index fbcfdfe77..2ac397e2c 100644 --- a/lib_rend/ivas_efap_fx.c +++ b/lib_rend/ivas_efap_fx.c @@ -1528,7 +1528,7 @@ static void get_poly_gains_fx( #ifdef VEC_ARITH_OPT_v1 v_sub_fixed_no_hdrm( P, A, P_minus_A, 2 ); /* Precalculate value of (P-A) q22*/ #else /* VEC_ARITH_OPT_v1 */ - v_sub_fixed( P, A, P_minus_A, 2, 0 ); /* Precalculate value of (P-A) q22*/ + v_sub_fixed( P, A, P_minus_A, 2, 0 ); /* Precalculate value of (P-A) q22*/ #endif /* VEC_ARITH_OPT_v1 */ FOR( j = i; j < numChan - 2 + i; ++j ) @@ -1585,7 +1585,7 @@ static Word32 get_tri_gain_fx( #ifdef VEC_ARITH_OPT_v1 v_sub_fixed_no_hdrm( B, A, tmpSub1, 2 ); // tmpSub1 q22 #else /* VEC_ARITH_OPT_v1 */ - v_sub_fixed( B, A, tmpSub1, 2, 0 ); // tmpSub1 q22 + v_sub_fixed( B, A, tmpSub1, 2, 0 ); // tmpSub1 q22 #endif /* VEC_ARITH_OPT_v1 */ tmpDot1 = dotp_fixed( tmpN, tmpSub1, 2 ); // Q13 @@ -2248,7 +2248,7 @@ static void sort_channels_vertex_fx( #ifdef VEC_ARITH_OPT_v1 v_sub_fixed_no_hdrm( tmpV1, tmpV2, tmpV3, 3 ); // tmpV3 Q30 #else /* VEC_ARITH_OPT_v1 */ - v_sub_fixed( tmpV1, tmpV2, tmpV3, 3, 0 ); // tmpV3 Q30 + v_sub_fixed( tmpV1, tmpV2, tmpV3, 3, 0 ); // tmpV3 Q30 #endif /* VEC_ARITH_OPT_v1 */ Word16 exp2 = 2; move16(); @@ -2434,7 +2434,7 @@ static Word16 in_poly_fx( /* Angles are in Q22 */ #ifdef VEC_ARITH_OPT_v1 v_sub_fixed_no_hdrm( P, A, P_minus_A, 2 ); /* Precalculate value of (P-A) q22*/ #else /* VEC_ARITH_OPT_v1 */ - v_sub_fixed( P, A, P_minus_A, 2, 0 ); /* Precalculate value of (P-A) q22*/ + v_sub_fixed( P, A, P_minus_A, 2, 0 ); /* Precalculate value of (P-A) q22*/ #endif /* VEC_ARITH_OPT_v1 */ FOR( n = 1; n < sub( numVertices, 1 ); ++n ) @@ -2508,12 +2508,16 @@ static Word16 in_tri_fx( v_sub_fixed_no_hdrm( B, A, tmpDot1, 2 ); // tmpDot1 q22 v_sub_fixed_no_hdrm( C, A, tmpDot2, 2 ); // tmpDot2 q22 #else /* VEC_ARITH_OPT_v1 */ - v_sub_fixed( B, A, tmpDot1, 2, 0 ); // tmpDot1 q22 - v_sub_fixed( C, A, tmpDot2, 2, 0 ); // tmpDot2 q22 + v_sub_fixed( B, A, tmpDot1, 2, 0 ); // tmpDot1 q22 + v_sub_fixed( C, A, tmpDot2, 2, 0 ); // tmpDot2 q22 #endif /* VEC_ARITH_OPT_v1 */ /* Verification of the non-colinearity : Q22 * Q22 = Q13 */ +#ifdef OPT_SBA_REND_V1_BE + invFactor = Msub_32_32( Mpy_32_32( tmpDot1[0], tmpDot2[1] ), tmpDot1[1], tmpDot2[0] ); /*q22+q22-q31->q13*/ +#else /* OPT_SBA_REND_V1_BE */ invFactor = L_sub( Mpy_32_32( tmpDot1[0], tmpDot2[1] ), Mpy_32_32( tmpDot1[1], tmpDot2[0] ) ); /*q22+q22-q31->q13*/ +#endif /* OPT_SBA_REND_V1_BE */ IF( invFactor == 0 ) { diff --git a/lib_rend/ivas_vbap_fx.c b/lib_rend/ivas_vbap_fx.c index 7495953e3..cfcbc6760 100644 --- a/lib_rend/ivas_vbap_fx.c +++ b/lib_rend/ivas_vbap_fx.c @@ -578,7 +578,11 @@ void vbap_determine_gains_fx( move32(); FOR( ch = 0; ch < 3; ch++ ) { +#ifdef OPT_SBA_REND_V1_BE + gain_ene_fx = Madd_32_32( gain_ene_fx, gain_triplet_fx[ch], gain_triplet_fx[ch] ); /* Q(2 * VBAP_VS_TRIPLET.q_inverse_matrix - 31) */ +#else /* OPT_SBA_REND_V1_BE */ gain_ene_fx = L_add( gain_ene_fx, Mpy_32_32( gain_triplet_fx[ch], gain_triplet_fx[ch] ) ); /* Q(2 * VBAP_VS_TRIPLET.q_inverse_matrix - 31) */ +#endif /* OPT_SBA_REND_V1_BE */ } norm_value_fx = Isqrt( L_shr( gain_ene_fx, 1 ) ); /* Q(31 - (2 * VBAP_VS_TRIPLET.q_inverse_matrix - 31 - 1) / 2 ) = Q(47 - VBAP_VS_TRIPLET.q_inverse_matrix) */ @@ -681,9 +685,35 @@ static UWord8 vector_matrix_multiply_3x3_fx( Word32 *result, /* o : output vector Q(q_matrix) */ Word16 q_matrix ) { - result[0] = Mpy_32_16_1( matrix[0][0], src_vector[0] ); /* Q(q_matrix) */ - result[0] = L_add( result[0], Mpy_32_16_1( matrix[1][0], src_vector[1] ) ); /* Q(q_matrix) */ - result[0] = L_add( result[0], Mpy_32_16_1( matrix[2][0], src_vector[2] ) ); /* Q(q_matrix) */ +#ifdef OPT_SBA_REND_V1_BE + Word32 pointzero_one = Mpy_32_16_1( L_lshl( 1, q_matrix ), -327 /* -0.01 in Q15 */ ); + result[0] = Madd_32_16( Madd_32_16( Mpy_32_16_1( matrix[0][0], src_vector[0] ), matrix[1][0], src_vector[1] ), matrix[2][0], src_vector[2] ); /* Q(q_matrix) */ + move32(); + + IF( LT_32( result[0], pointzero_one ) ) + { + return 0; + } + + result[1] = Madd_32_16( Madd_32_16( Mpy_32_16_1( matrix[0][1], src_vector[0] ), matrix[1][1], src_vector[1] ), matrix[2][1], src_vector[2] ); /* Q(q_matrix) */ + move32(); + + IF( LT_32( result[1], pointzero_one ) ) + { + return 0; + } + + result[2] = Madd_32_16( Madd_32_16( Mpy_32_16_1( matrix[0][2], src_vector[0] ), matrix[1][2], src_vector[1] ), matrix[2][2], src_vector[2] ); /* Q(q_matrix) */ + move32(); + + IF( LT_32( result[2], pointzero_one ) ) + { + return 0; + } +#else /* OPT_SBA_REND_V1_BE */ + result[0] = Mpy_32_16_1( matrix[0][0], src_vector[0] ); /* Q(q_matrix) */ + result[0] = L_add( result[0], Mpy_32_16_1( matrix[1][0], src_vector[1] ) ); /* Q(q_matrix) */ + result[0] = L_add( result[0], Mpy_32_16_1( matrix[2][0], src_vector[2] ) ); /* Q(q_matrix) */ move32(); move32(); move32(); @@ -716,7 +746,7 @@ static UWord8 vector_matrix_multiply_3x3_fx( { return 0; } - +#endif /* OPT_SBA_REND_V1_BE */ return 1; } -- GitLab