diff --git a/lib_com/cldfb.c b/lib_com/cldfb.c index 5a3d2e1a27b1fd60aba746579f8bc5cee346fe1e..b6eafd993ccf0ed5786ef9c075f171c8c45fad8f 100644 --- a/lib_com/cldfb.c +++ b/lib_com/cldfb.c @@ -1213,7 +1213,7 @@ void cldfbSynthesis_ivas_fx( /*cplxMult(&iBuffer[2*i], &iBuffer[2*i+1],-imagBuffer[k][2*i], imagBuffer[k][M1-1-2*i], rot_vctr_re[i], rot_vctr_im[i]);*/ iBuffer_fx[2 * i] = Msub_32_32( Mpy_32_32( ( L_negate( imagBuffer_fx[k][2 * i] ) ), rot_vctr_re_fx[i] ), imagBuffer_fx[k][( M1 - 1 ) - ( i * 2 )], rot_vctr_im_fx[i] ); // Qx move32(); - iBuffer_fx[2 * i + 1] = Madd_32_32( Mpy_32_32( ( L_negate( imagBuffer_fx[k][2 * i] ) ), rot_vctr_im_fx[i] ), imagBuffer_fx[k][( M1 - 1 ) - ( i * 2 )], rot_vctr_re_fx[i] ); // Qx + iBuffer_fx[2 * i + 1] = Msub_32_32( Mpy_32_32( imagBuffer_fx[k][( M1 - 1 ) - ( i * 2 )], rot_vctr_re_fx[i] ), imagBuffer_fx[k][2 * i], rot_vctr_im_fx[i] ); // Qx move32(); } diff --git a/lib_com/ivas_stereo_ica_com_fx.c b/lib_com/ivas_stereo_ica_com_fx.c index 3ebfd99c1e065ba93c7f0a8deeee608c1acb3ce0..be9f358683e12cb417e25c2e2a447f9220d4f958 100644 --- a/lib_com/ivas_stereo_ica_com_fx.c +++ b/lib_com/ivas_stereo_ica_com_fx.c @@ -131,17 +131,7 @@ static void interpTargetChannel_fx( Word32 spread_factor2_fx; Word64 tempD1_fx, tempD2_fx; - d = negate( sub( currShift, prevShift ) ); - IF( d >= 0 ) - { - signShift = 1; - move16(); - } - ELSE - { - signShift = -1; - move16(); - } + d = sub( prevShift, currShift ); IF( d == 0 ) { @@ -149,6 +139,15 @@ static void interpTargetChannel_fx( return; } + signShift = 1; + move16(); + + if ( d < 0 ) + { + signShift = -1; + move16(); + } + N = L_shift_adapt; move16(); Word32 *table_pointer = NULL; @@ -207,7 +206,7 @@ static void interpTargetChannel_fx( FOR( j = lim1; j <= lim2; j++ ) { - ptr2_fx[i] = L_add( Mpy_32_32( win_fx[j * INTERP_FACTOR1 - i], ptr1_fx[j] ), ptr2_fx[i] ); // qsynth + ptr2_fx[i] = Madd_32_32( ptr2_fx[i], win_fx[j * INTERP_FACTOR1 - i], ptr1_fx[j] ); // qsynth move32(); } } @@ -225,44 +224,31 @@ static void interpTargetChannel_fx( tempD1_fx = W_deposit32_l( table_D1_pointer[abs( d )] ); // Q35 tempD2_fx = W_mult0_32_32( 3, table_D1_pointer[abs( d )] ); // Q35 - IF( EQ_16( signShift, 1 ) ) + tempF1_fx = -ONE_IN_Q12; // Q12 + move32(); + + if ( EQ_16( signShift, 1 ) ) { tempF1_fx = ONE_IN_Q12; // Q12 move32(); } - ELSE - { - tempF1_fx = -ONE_IN_Q12; // Q12 - move32(); - } + tempF1_fx = L_sub( imult3216( factor_fx, d ), tempF1_fx ); // Q12 - FOR( k = 0; k < sub( N, 1 ); k++ ) + FOR( k = 0; k < N - 1; k++ ) { - Word32 local = L_sub( W_extract_l( W_shr( W_mult0_32_32( tempF1_fx, spread_factor2_fx ), 31 ) ), ONE_IN_Q12 ); // Q12 - Word32 sign_local; - IF( local > 0 ) - { - sign_local = 1; - move32(); - } - ELSE - { - sign_local = -1; - move32(); - } - Word32 local_int = W_extract_l( W_shr( W_abs( local ), 12 ) ); // Q0 + Word32 local = Madd_32_32( -ONE_IN_Q12, tempF1_fx, spread_factor2_fx ); // Q12 + Word32 local_int = L_shr( local, 12 ); // Q0 Word32 res_a1, res_a2, res_a3; Word32 res_b1, res_b2, res_b3; Word32 res_c1, res_c2, res_c3; Word32 res_d1, res_d2, res_d3; - Word64 local_int_scaled; + Word32 local_int_scaled; Word64 res_a, res_b, res_c, res_d; Word64 tempa, tempb; Word64 mult_a_D1, mult_b_D2; - local_int = W_extract_l( W_mult0_32_32( sign_local, local_int ) ); // Q0 - local_int_scaled = W_deposit32_l( L_shl( local_int, 12 ) ); // Q12 - lim1 = extract_l( local_int ); // Q0 - IF( W_sub( local_int_scaled, local ) > 0 ) // Q21 + local_int_scaled = L_shl( local_int, 12 ); // Q12 + lim1 = extract_l( local_int ); // Q0 + if ( L_sub( local_int_scaled, local ) > 0 ) // Q12 { lim1 = sub( lim1, 1 ); // Q0 } @@ -387,7 +373,7 @@ static void targetCh_AlignStereoDFT_fx( } FOR( i = 0; i < L_shift_adapt; i++ ) { - target_fx[i] = L_add( Mpy_32_32( alpha_fx, fadeInBuff_fx[i] ), Mpy_32_32( L_sub( ONE_IN_Q31, alpha_fx ), fadeOutBuff_fx[i] ) ); // qsynth + target_fx[i] = Madd_32_32( Mpy_32_32( alpha_fx, fadeInBuff_fx[i] ), L_sub( ONE_IN_Q31, alpha_fx ), fadeOutBuff_fx[i] ); // qsynth move32(); alpha_fx = L_add_sat( alpha_fx, winSlope_fx ); // Q31 diff --git a/lib_com/ivas_tools.c b/lib_com/ivas_tools.c index 8d486df04dd62d69e843008e92faa6cc86770729..d6210dfc7d4b3c050bd77310f010c93922678654 100644 --- a/lib_com/ivas_tools.c +++ b/lib_com/ivas_tools.c @@ -942,6 +942,12 @@ Word16 matrix_product_mant_exp_fx( Word16 *Zp_fx_e = out_e; Word16 row, col; Word16 x_idx, y_idx; + Word64 temp; + Word16 temp_e; + Word16 prod_e = add( X_fx_e, Y_fx_e ); + + Word16 max_exp = -31; + move16(); /* Processing */ test(); @@ -957,17 +963,28 @@ Word16 matrix_product_mant_exp_fx( { FOR( i = 0; i < colsX; ++i ) { - ( *Zp_fx ) = 0; - move32(); - ( *Zp_fx_e ) = 0; - move16(); + temp = 0; + move64(); + FOR( k = 0; k < rowsX; ++k ) { - x_idx = add( k, imult1616( i, rowsX ) ); - y_idx = add( k, imult1616( j, rowsY ) ); - ( *Zp_fx ) = BASOP_Util_Add_Mant32Exp( *Zp_fx, *Zp_fx_e, Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ), add( X_fx_e, Y_fx_e ), Zp_fx_e ); /*Q31 - Zp_fx_e*/ - move32(); + x_idx = k + i * rowsX; + y_idx = k + j * rowsY; + temp = W_mac_32_32( temp, X_fx[x_idx], Y_fx[y_idx] ); // X_fx_e + Y_fx_e } + /* Maximize accumulated value to 32-bit */ + temp_e = W_norm( temp ); + temp = W_shl( temp, temp_e ); + if ( 0 == temp ) + { + temp_e = prod_e; + move16(); + } + *Zp_fx_e = sub( prod_e, temp_e ); + move16(); + ( *Zp_fx ) = W_extract_h( temp ); + move32(); + max_exp = s_max( max_exp, *Zp_fx_e ); // Find the max exp Zp_fx++; Zp_fx_e++; } @@ -987,17 +1004,27 @@ Word16 matrix_product_mant_exp_fx( { FOR( i = 0; i < rowsX; ++i ) { - ( *Zp_fx ) = 0; - move32(); - ( *Zp_fx_e ) = 0; - move16(); + temp = 0; + move64(); FOR( k = 0; k < colsX; ++k ) { - x_idx = add( i, imult1616( k, rowsX ) ); - y_idx = add( j, imult1616( k, rowsY ) ); - ( *Zp_fx ) = BASOP_Util_Add_Mant32Exp( *Zp_fx, *Zp_fx_e, Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ), add( X_fx_e, Y_fx_e ), Zp_fx_e ); /*Q31 - Zp_fx_e*/ - move32(); + x_idx = i + k * rowsX; + y_idx = j + k * rowsY; + temp = W_mac_32_32( temp, X_fx[x_idx], Y_fx[y_idx] ); // X_fx_e + Y_fx_e + } + /* Maximize accumulated value to 32-bit */ + temp_e = W_norm( temp ); + temp = W_shl( temp, temp_e ); + if ( 0 == temp ) + { + temp_e = prod_e; + move16(); } + *Zp_fx_e = sub( prod_e, temp_e ); + move16(); + ( *Zp_fx ) = W_extract_h( temp ); + move32(); + max_exp = s_max( max_exp, *Zp_fx_e ); // Find the max exp Zp_fx++; Zp_fx_e++; } @@ -1017,18 +1044,27 @@ Word16 matrix_product_mant_exp_fx( { FOR( i = 0; i < colsX; ++i ) { - ( *Zp_fx ) = 0; - move32(); - ( *Zp_fx_e ) = 0; - move16(); + temp = 0; + move64(); FOR( k = 0; k < colsX; ++k ) { - x_idx = add( k, imult1616( i, rowsX ) ); - y_idx = add( j, imult1616( k, rowsY ) ); - ( *Zp_fx ) = BASOP_Util_Add_Mant32Exp( *Zp_fx, *Zp_fx_e, Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ), add( X_fx_e, Y_fx_e ), Zp_fx_e ); /*Q31 - Zp_fx_e*/ - move32(); + x_idx = k + i * rowsX; + y_idx = j + k * rowsY; + temp = W_mac_32_32( temp, X_fx[x_idx], Y_fx[y_idx] ); // X_fx_e + Y_fx_e } - + /* Maximize accumulated value to 32-bit */ + temp_e = W_norm( temp ); + temp = W_shl( temp, temp_e ); + if ( 0 == temp ) + { + temp_e = prod_e; + move16(); + } + *Zp_fx_e = sub( prod_e, temp_e ); + move16(); + ( *Zp_fx ) = W_extract_h( temp ); + move32(); + max_exp = s_max( max_exp, *Zp_fx_e ); // Find the max exp Zp_fx++; Zp_fx_e++; } @@ -1049,17 +1085,26 @@ Word16 matrix_product_mant_exp_fx( { FOR( i = 0; i < rowsX; ++i ) { - ( *Zp_fx ) = 0; - move32(); - ( *Zp_fx_e ) = 0; - move16(); + temp = 0; + move64(); FOR( k = 0; k < colsX; ++k ) { - x_idx = add( i, imult1616( k, rowsX ) ); - y_idx = add( k, imult1616( j, rowsY ) ); - ( *Zp_fx ) = BASOP_Util_Add_Mant32Exp( *Zp_fx, *Zp_fx_e, Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ), add( X_fx_e, Y_fx_e ), Zp_fx_e ); /*Q31 - Zp_fx_e*/ - move32(); + x_idx = i + k * rowsX; + y_idx = k + j * rowsY; + temp = W_mac_32_32( temp, X_fx[x_idx], Y_fx[y_idx] ); // X_fx_e + Y_fx_e + } + /* Maximize accumulated value to 32-bit */ + temp_e = W_norm( temp ); + temp = W_shl( temp, temp_e ); + if ( 0 == temp ) + { + temp_e = prod_e; } + *Zp_fx_e = sub( prod_e, temp_e ); + move16(); + ( *Zp_fx ) = W_extract_h( temp ); + move32(); + max_exp = s_max( max_exp, *Zp_fx_e ); // Find the max exp Zp_fx++; Zp_fx_e++; } @@ -1070,18 +1115,11 @@ Word16 matrix_product_mant_exp_fx( move16(); } Zp_fx = Z_fx; /*Q31 - Zp_fx_e*/ + Zp_fx_e = out_e; - Word16 max_exp = -31; move16(); - FOR( j = 0; j < row; ++j ) - { - FOR( i = 0; i < col; ++i ) - { - max_exp = s_max( max_exp, *Zp_fx_e ); - Zp_fx_e++; - } - } - Zp_fx_e = out_e; + + *Z_fx_e = max_exp; move16(); FOR( j = 0; j < row; ++j ) diff --git a/lib_com/options.h b/lib_com/options.h index ea2823ad7fc7c7836f27203b128295f34d17f16e..1b2c15f872080cbde13e6d7fa2f8cb912b9fe84b 100755 --- a/lib_com/options.h +++ b/lib_com/options.h @@ -152,5 +152,6 @@ #define FIX_ISSUE_1214 /* Ittiam: Fix for issue 1214: Energy leakage in IGF tiles for MDCT-stereo @64kbps SWB*/ #define FIX_881_HILBERT_FILTER /* VA: improve the precision of the Hilbert filter to remove 2kHz unwanted tone */ #define FIX_ISSUE_1245 /* Ittiam: Fix for issue 1245: Basop Encoder: Audible noise for silent Stereo input DTX on @24.4 kbps, @32 kbps*/ -#endif #define FIX_MINOR_SVD_WMOPS_MR1010X /* FhG: Minor WMOPS tuning, bit-exact to previous version, saves about 8.2 WMOPS for MR1010 */ +#define SVD_WMOPS_OPT /* Ittiam : SVD related optimizations */ +#endif diff --git a/lib_dec/dec_tcx.c b/lib_dec/dec_tcx.c index ae0474f3d987d452a00f59905bd09924038a8e21..708527df5fa21a97b807fd13f934b06c56d1cac1 100644 --- a/lib_dec/dec_tcx.c +++ b/lib_dec/dec_tcx.c @@ -237,6 +237,7 @@ void decoder_tcx_imdct_fx( Word16 q_a_itf = 15; Word16 x_e = sub( 31, q_x ); move16(); + Word16 shift_q = sub( q_x, q_win ); /*-----------------------------------------------------------------* * Initializations @@ -364,9 +365,10 @@ void decoder_tcx_imdct_fx( IF( EQ_16( st->element_mode, IVAS_CPE_MDCT ) ) { + Word16 copy_len = s_min( L_FRAME48k, s_max( L_spec, s_max( L_frame, L_frameTCX ) ) ); set32_fx( x_tmp_fx, 0, L_FRAME_PLUS ); - Copy32( x_fx, x_tmp_fx, s_min( L_FRAME48k, s_max( L_spec, s_max( L_frame, L_frameTCX ) ) ) ); // q_x - Copy32( x_fx, xn_bufFB_fx, s_min( L_FRAME48k, s_max( L_spec, s_max( L_frame, L_frameTCX ) ) ) ); // q_x + Copy32( x_fx, x_tmp_fx, copy_len ); // q_x + Copy32( x_fx, xn_bufFB_fx, copy_len ); // q_x } ELSE IF( ( st->element_mode == EVS_MONO ) ) { @@ -374,8 +376,9 @@ void decoder_tcx_imdct_fx( } ELSE { - Copy32( x_fx, x_tmp_fx, s_max( L_spec, s_max( L_frame, L_frameTCX ) ) ); // q_x - Copy32( x_fx, xn_bufFB_fx, s_max( L_spec, s_max( L_frame, L_frameTCX ) ) ); // q_x + Word16 copy_len = s_max( L_spec, s_max( L_frame, L_frameTCX ) ); + Copy32( x_fx, x_tmp_fx, copy_len ); // q_x + Copy32( x_fx, xn_bufFB_fx, copy_len ); // q_x } IF( ( st->igf != 0 ) ) @@ -416,24 +419,29 @@ void decoder_tcx_imdct_fx( FOR( Word16 ind = 0; ind < L_MDCT_OVLP_MAX + L_FRAME_PLUS + L_MDCT_OVLP_MAX; ind++ ) { - xn_bufFB_fx_16[ind] = extract_l( L_shr( xn_bufFB_fx[ind], sub( q_x, q_win ) ) ); // q_x + xn_bufFB_fx_16[ind] = extract_l( L_shr( xn_bufFB_fx[ind], shift_q ) ); // q_x move16(); } + + Word16 ratio_e; + Word16 ratio = BASOP_Util_Divide1616_Scale( L_frameTCX_glob, L_frame_glob, &ratio_e ); // Q = 15-ratio_e. * FSCALE_DENOM is (1 << 9) + ratio = shr( ratio, sub( 6, ratio_e ) ); + IF( st->element_mode != EVS_MONO ) { IMDCT_ivas_fx( x_tmp_fx, q_x, hTcxDec->syn_OverlFB, hTcxDec->syn_Overl_TDACFB, xn_bufFB_fx_16, hTcxCfg->tcx_aldo_window_1_FB, hTcxCfg->tcx_aldo_window_1_FB_trunc, hTcxCfg->tcx_aldo_window_2_FB, hTcxCfg->tcx_mdct_window_halfFB, hTcxCfg->tcx_mdct_window_minimumFB, hTcxCfg->tcx_mdct_window_transFB, hTcxCfg->tcx_mdct_window_half_lengthFB, hTcxCfg->tcx_mdct_window_min_lengthFB, index, - kernelType, left_rect, tcx_offsetFB, overlapFB, L_frameTCX, L_frameTCX, max( L_frameTCX, L_spec ) >> 1, L_frameTCX_glob, frame_cnt, bfi, st->hHQ_core->old_out_fx, 1, st, FSCALE_DENOM * L_frameTCX_glob / L_frame_glob, acelp_zir_fx, q_win ); + kernelType, left_rect, tcx_offsetFB, overlapFB, L_frameTCX, L_frameTCX, shr( max( L_frameTCX, L_spec ), 1 ), L_frameTCX_glob, frame_cnt, bfi, st->hHQ_core->old_out_fx, 1, st, ratio, acelp_zir_fx, q_win ); } ELSE { IMDCT_ivas_fx( x_fx, q_x, hTcxDec->syn_OverlFB, hTcxDec->syn_Overl_TDACFB, xn_bufFB_fx_16, hTcxCfg->tcx_aldo_window_1_FB, hTcxCfg->tcx_aldo_window_1_FB_trunc, hTcxCfg->tcx_aldo_window_2_FB, hTcxCfg->tcx_mdct_window_halfFB, hTcxCfg->tcx_mdct_window_minimumFB, hTcxCfg->tcx_mdct_window_transFB, hTcxCfg->tcx_mdct_window_half_lengthFB, hTcxCfg->tcx_mdct_window_min_lengthFB, index, - kernelType, left_rect, tcx_offsetFB, overlapFB, L_frameTCX, L_frameTCX, shr( s_max( L_frameTCX, L_spec ), 1 ), L_frameTCX_glob, frame_cnt, bfi, st->hHQ_core->old_out_fx, 1, st, FSCALE_DENOM * L_frameTCX_glob / L_frame_glob, acelp_zir_fx, q_win ); + kernelType, left_rect, tcx_offsetFB, overlapFB, L_frameTCX, L_frameTCX, shr( s_max( L_frameTCX, L_spec ), 1 ), L_frameTCX_glob, frame_cnt, bfi, st->hHQ_core->old_out_fx, 1, st, ratio, acelp_zir_fx, q_win ); } FOR( Word16 ind = 0; ind < L_MDCT_OVLP_MAX + L_FRAME_PLUS + L_MDCT_OVLP_MAX; ind++ ) { - xn_bufFB_fx[ind] = L_shl( xn_bufFB_fx_16[ind], sub( q_x, q_win ) ); // Q_x + xn_bufFB_fx[ind] = L_shl( L_deposit_l( xn_bufFB_fx_16[ind] ), shift_q ); // Q_x } IF( ( bfi == 0 ) ) @@ -453,19 +461,22 @@ void decoder_tcx_imdct_fx( IF( EQ_16( st->element_mode, IVAS_CPE_MDCT ) ) { - res_m = BASOP_Util_Divide1616_Scale( L_frame_glob, L_FRAME, &res_e ); - st->old_fpitch = L_shl( Mpy_32_16_1( st->old_fpitch, res_m ), res_e ); + // Using sat as a single instruction shifts and extracts + st->old_fpitch = W_shl_sat_l( W_mult0_32_32( st->old_fpitch, L_frame_glob ), -8 ); // Divide by 256 ==> SHR by 8 + move32(); } IF( GT_16( st->element_mode, EVS_MONO ) ) { res_m = BASOP_Util_Divide1616_Scale( L_frameTCX_glob, L_frame_glob, &res_e ); st->old_fpitchFB = L_shl( Mpy_32_16_1( st->old_fpitch, res_m ), res_e ); + move32(); } ELSE { res_m = BASOP_Util_Divide1616_Scale( L_frameTCX, L_frame, &res_e ); st->old_fpitchFB = L_shl( Mpy_32_16_1( st->old_fpitch, res_m ), res_e ); + move32(); } } @@ -475,7 +486,7 @@ void decoder_tcx_imdct_fx( Copy( xn_buf_fx + L_frame, hTcxDec->syn_Overl, overlap ); // Q(-2) FOR( Word16 ind = 0; ind < overlapFB; ind++ ) { - hTcxDec->syn_OverlFB[ind] = (Word16) L_shr( xn_bufFB_fx[( ind + L_frameTCX )], sub( q_x, q_win ) ); // q_x + hTcxDec->syn_OverlFB[ind] = extract_l( L_shr( xn_bufFB_fx[( ind + L_frameTCX )], shift_q ) ); // q_x } } @@ -483,7 +494,7 @@ void decoder_tcx_imdct_fx( Copy( xn_buf_fx + sub( shr( overlap, 1 ), tcx_offset ), synth_fx, L_frame_glob ); // Q(-2) FOR( Word16 ind = 0; ind < L_frameTCX_glob; ind++ ) { - synthFB_fx[ind] = (Word16) L_shr( xn_bufFB_fx[( ind + ( ( overlapFB >> 1 ) - tcx_offsetFB ) )], sub( q_x, q_win ) ); // q_x + synthFB_fx[ind] = extract_l( L_shr( xn_bufFB_fx[( ind + ( ( overlapFB >> 1 ) - tcx_offsetFB ) )], shift_q ) ); // q_x } diff --git a/lib_dec/ivas_binRenderer_internal.c b/lib_dec/ivas_binRenderer_internal.c index 36246f73923c7b643e2709a7b7fc6faa71ace490..f23c0b7106036dbd7f90b8539e4b9e28c027e5f5 100644 --- a/lib_dec/ivas_binRenderer_internal.c +++ b/lib_dec/ivas_binRenderer_internal.c @@ -70,6 +70,7 @@ static void ivas_binRenderer_filterModule_fx( Word32 *filterStatesLeftRealPtr_fx, *filterStatesLeftImagPtr_fx; Word16 *Q_filterStates; const Word32 *filterTapsLeftRealPtr_fx, *filterTapsLeftImagPtr_fx, *filterTapsRightRealPtr_fx, *filterTapsRightImagPtr_fx; + Word16 shift_q; FOR( bandIdx = 0; bandIdx < hBinRenderer->conv_band; bandIdx++ ) { @@ -87,11 +88,6 @@ static void ivas_binRenderer_filterModule_fx( FOR( k = 0; k < numTimeSlots; k++ ) { Word64 outRealLeft_fx = 0, outRealRight_fx = 0, outImagLeft_fx = 0, outImagRight_fx = 0; - Word64 W_sub1 = 0, W_add1 = 0, W_sub2 = 0, W_add2 = 0; - move64(); - move64(); - move64(); - move64(); move64(); move64(); move64(); @@ -104,31 +100,32 @@ static void ivas_binRenderer_filterModule_fx( filterStatesLeftImagPtr_fx[tapIdx] = filterStatesLeftImagPtr_fx[tapIdx - 1]; move32(); - W_sub1 = W_sub( W_mult0_32_32( filterStatesLeftRealPtr_fx[tapIdx], filterTapsLeftRealPtr_fx[tapIdx] ), - W_mult0_32_32( filterStatesLeftImagPtr_fx[tapIdx], filterTapsLeftImagPtr_fx[tapIdx] ) ); // Q29 + Q_filterStates[tapIdx - 1] - W_add1 = W_add( W_mult0_32_32( filterStatesLeftRealPtr_fx[tapIdx], filterTapsLeftImagPtr_fx[tapIdx] ), - W_mult0_32_32( filterStatesLeftImagPtr_fx[tapIdx], filterTapsLeftRealPtr_fx[tapIdx] ) ); // Q29 + Q_filterStates[tapIdx - 1] - W_sub2 = W_sub( W_mult0_32_32( filterStatesLeftRealPtr_fx[tapIdx], filterTapsRightRealPtr_fx[tapIdx] ), - W_mult0_32_32( filterStatesLeftImagPtr_fx[tapIdx], filterTapsRightImagPtr_fx[tapIdx] ) ); // Q29 + Q_filterStates[tapIdx - 1] - W_add2 = W_add( W_mult0_32_32( filterStatesLeftRealPtr_fx[tapIdx], filterTapsRightImagPtr_fx[tapIdx] ), - W_mult0_32_32( filterStatesLeftImagPtr_fx[tapIdx], filterTapsRightRealPtr_fx[tapIdx] ) ); // Q29 + Q_filterStates[tapIdx - 1] + shift_q = sub( Q_filterStates[tapIdx], Q_filterStates[tapIdx - 1] ); + outRealLeft_fx = W_shr( outRealLeft_fx, shift_q ); + outImagLeft_fx = W_shr( outImagLeft_fx, shift_q ); + outRealRight_fx = W_shr( outRealRight_fx, shift_q ); + outImagRight_fx = W_shr( outImagRight_fx, shift_q ); - outRealLeft_fx = W_shr( outRealLeft_fx, sub( Q_filterStates[tapIdx], Q_filterStates[tapIdx - 1] ) ); - outImagLeft_fx = W_shr( outImagLeft_fx, sub( Q_filterStates[tapIdx], Q_filterStates[tapIdx - 1] ) ); - outRealRight_fx = W_shr( outRealRight_fx, sub( Q_filterStates[tapIdx], Q_filterStates[tapIdx - 1] ) ); - outImagRight_fx = W_shr( outImagRight_fx, sub( Q_filterStates[tapIdx], Q_filterStates[tapIdx - 1] ) ); + outRealLeft_fx = W_mac_32_32( outRealLeft_fx, filterStatesLeftRealPtr_fx[tapIdx], filterTapsLeftRealPtr_fx[tapIdx] ); + outRealLeft_fx = W_mac_32_32( outRealLeft_fx, L_negate( filterStatesLeftImagPtr_fx[tapIdx] ), filterTapsLeftImagPtr_fx[tapIdx] ); // Q30 + Q_filterStates[tapIdx - 1] - Q_filterStates[tapIdx] = Q_filterStates[tapIdx - 1]; - move16(); + outImagLeft_fx = W_mac_32_32( outImagLeft_fx, filterStatesLeftRealPtr_fx[tapIdx], filterTapsLeftImagPtr_fx[tapIdx] ); + outImagLeft_fx = W_mac_32_32( outImagLeft_fx, filterStatesLeftImagPtr_fx[tapIdx], filterTapsLeftRealPtr_fx[tapIdx] ); + + outRealRight_fx = W_mac_32_32( outRealRight_fx, filterStatesLeftRealPtr_fx[tapIdx], filterTapsRightRealPtr_fx[tapIdx] ); + outRealRight_fx = W_mac_32_32( outRealRight_fx, L_negate( filterStatesLeftImagPtr_fx[tapIdx] ), filterTapsRightImagPtr_fx[tapIdx] ); - /* Left Real and Imag */ - outRealLeft_fx = W_add( outRealLeft_fx, W_sub1 ); // Q29 + Q_filterStates[1] - outImagLeft_fx = W_add( outImagLeft_fx, W_add1 ); // Q29 + Q_filterStates[1] + outImagRight_fx = W_mac_32_32( outImagRight_fx, filterStatesLeftRealPtr_fx[tapIdx], filterTapsRightImagPtr_fx[tapIdx] ); + outImagRight_fx = W_mac_32_32( outImagRight_fx, filterStatesLeftImagPtr_fx[tapIdx], filterTapsRightRealPtr_fx[tapIdx] ); - /* Right Real and Imag*/ - outRealRight_fx = W_add( outRealRight_fx, W_sub2 ); // Q29 + Q_filterStates[1] - outImagRight_fx = W_add( outImagRight_fx, W_add2 ); // Q29 + Q_filterStates[1] + Q_filterStates[tapIdx] = Q_filterStates[tapIdx - 1]; + move16(); } + shift_q = add( sub( Q_filterStates[1], Q_curr ), 1 ); + outRealLeft_fx = W_shr( outRealLeft_fx, shift_q ); + outImagLeft_fx = W_shr( outImagLeft_fx, shift_q ); + outRealRight_fx = W_shr( outRealRight_fx, shift_q ); + outImagRight_fx = W_shr( outImagRight_fx, shift_q ); filterStatesLeftRealPtr_fx[0] = CLDFB_real[chIdx][k][bandIdx]; move32(); @@ -141,27 +138,29 @@ static void ivas_binRenderer_filterModule_fx( /* Left Real and Imag */ // Q29 + Q_curr - out_Conv_CLDFB_real[0][k][bandIdx] = W_add( out_Conv_CLDFB_real[0][k][bandIdx], - W_add( W_shr( outRealLeft_fx, sub( Q_filterStates[1], Q_curr ) ), - W_sub( W_mult0_32_32( filterStatesLeftRealPtr_fx[0], filterTapsLeftRealPtr_fx[0] ), - W_mult0_32_32( filterStatesLeftImagPtr_fx[0], filterTapsLeftImagPtr_fx[0] ) ) ) ); // Q29 + Word32 temp1 = L_shr( filterStatesLeftRealPtr_fx[0], 1 ); + Word32 temp2 = L_shr( filterStatesLeftImagPtr_fx[0], 1 ); + + + outRealLeft_fx = W_mac_32_32( outRealLeft_fx, temp1, filterTapsLeftRealPtr_fx[0] ); + outRealLeft_fx = W_mac_32_32( outRealLeft_fx, L_negate( temp2 ), filterTapsLeftImagPtr_fx[0] ); + out_Conv_CLDFB_real[0][k][bandIdx] = W_add( out_Conv_CLDFB_real[0][k][bandIdx], outRealLeft_fx ); // Q29 move64(); - out_Conv_CLDFB_imag[0][k][bandIdx] = W_add( out_Conv_CLDFB_imag[0][k][bandIdx], - W_add( W_shr( outImagLeft_fx, sub( Q_filterStates[1], Q_curr ) ), - W_add( W_mult0_32_32( filterStatesLeftRealPtr_fx[0], filterTapsLeftImagPtr_fx[0] ), - W_mult0_32_32( filterStatesLeftImagPtr_fx[0], filterTapsLeftRealPtr_fx[0] ) ) ) ); // Q29 + + outImagLeft_fx = W_mac_32_32( outImagLeft_fx, temp1, filterTapsLeftImagPtr_fx[0] ); + outImagLeft_fx = W_mac_32_32( outImagLeft_fx, temp2, filterTapsLeftRealPtr_fx[0] ); + out_Conv_CLDFB_imag[0][k][bandIdx] = W_add( out_Conv_CLDFB_imag[0][k][bandIdx], outImagLeft_fx ); // Q29 move64(); /* Right Real and Imag */ - out_Conv_CLDFB_real[1][k][bandIdx] = W_add( out_Conv_CLDFB_real[1][k][bandIdx], - W_add( W_shr( outRealRight_fx, sub( Q_filterStates[1], Q_curr ) ), - W_sub( W_mult0_32_32( filterStatesLeftRealPtr_fx[0], filterTapsRightRealPtr_fx[0] ), - W_mult0_32_32( filterStatesLeftImagPtr_fx[0], filterTapsRightImagPtr_fx[0] ) ) ) ); // Q29 + outRealRight_fx = W_mac_32_32( outRealRight_fx, temp1, filterTapsRightRealPtr_fx[0] ); + outRealRight_fx = W_mac_32_32( outRealRight_fx, L_negate( temp2 ), filterTapsRightImagPtr_fx[0] ); + out_Conv_CLDFB_real[1][k][bandIdx] = W_add( out_Conv_CLDFB_real[1][k][bandIdx], outRealRight_fx ); // Q29 move64(); - out_Conv_CLDFB_imag[1][k][bandIdx] = W_add( out_Conv_CLDFB_imag[1][k][bandIdx], - W_add( W_shr( outImagRight_fx, sub( Q_filterStates[1], Q_curr ) ), - W_add( W_mult0_32_32( filterStatesLeftRealPtr_fx[0], filterTapsRightImagPtr_fx[0] ), - W_mult0_32_32( filterStatesLeftImagPtr_fx[0], filterTapsRightRealPtr_fx[0] ) ) ) ); // Q29 + + outImagRight_fx = W_mac_32_32( outImagRight_fx, temp1, filterTapsRightImagPtr_fx[0] ); + outImagRight_fx = W_mac_32_32( outImagRight_fx, temp2, filterTapsRightRealPtr_fx[0] ); + out_Conv_CLDFB_imag[1][k][bandIdx] = W_add( out_Conv_CLDFB_imag[1][k][bandIdx], outImagRight_fx ); // Q29 move64(); } } diff --git a/lib_dec/ivas_dirac_output_synthesis_cov.c b/lib_dec/ivas_dirac_output_synthesis_cov.c index 5aa649ecda5e1fe8388321724dac753b47b0d351..fd039fe965fd2a8f87867d037f31f73ec8267f3a 100644 --- a/lib_dec/ivas_dirac_output_synthesis_cov.c +++ b/lib_dec/ivas_dirac_output_synthesis_cov.c @@ -410,8 +410,6 @@ void ivas_dirac_dec_output_synthesis_cov_param_mc_collect_slot_fx( const Word16 nchan_in /* i : number of input channels */ ) { - Word16 cx_init_e; - Word16 cx_init_imag_e; Word16 band_idx, ch_idx; Word16 brange[2]; Word32 real_in_buffer_fx[PARAM_MC_MAX_BANDS_IN_PARAMETER_BAND * MAX_TRANSPORT_CHANNELS]; @@ -421,10 +419,9 @@ void ivas_dirac_dec_output_synthesis_cov_param_mc_collect_slot_fx( Word32 real_buffer_fx[PARAM_MC_MAX_TRANSPORT_CHANS * PARAM_MC_MAX_TRANSPORT_CHANS]; Word32 imag_buffer_fx[PARAM_MC_MAX_TRANSPORT_CHANS * PARAM_MC_MAX_TRANSPORT_CHANS]; Word16 output_e; - Word16 i, j, tmp1, tmp2, tmp1_e, tmp2_e, shift_imag, shift_real; - Word32 L_tmp; + Word16 tmp1_e, tmp2_e, shift_imag, shift_real; Word16 band, num_bands; - + Word16 cx_fx_norm, cx_imag_fx_norm; /* estimate input covariance */ /* Already stack here instead of in the process_subframe */ @@ -451,8 +448,11 @@ void ivas_dirac_dec_output_synthesis_cov_param_mc_collect_slot_fx( move16(); imag_in_e = ImagBuffer_e; move16(); - shift_real = sub( L_norm_arr( real_in_buffer_fx, imult1616( num_bands, nchan_in ) ), find_guarded_bits_fx( add( num_bands, 1 ) ) ); - shift_imag = sub( L_norm_arr( imag_in_buffer_fx, imult1616( num_bands, nchan_in ) ), find_guarded_bits_fx( add( num_bands, 1 ) ) ); + + Word16 buf_len = imult1616( num_bands, nchan_in ); + + shift_real = sub( L_norm_arr( real_in_buffer_fx, buf_len ), find_guarded_bits_fx( add( num_bands, 1 ) ) ); + shift_imag = sub( L_norm_arr( imag_in_buffer_fx, buf_len ), find_guarded_bits_fx( add( num_bands, 1 ) ) ); real_in_e = sub( real_in_e, shift_real ); imag_in_e = sub( imag_in_e, shift_imag ); @@ -460,50 +460,23 @@ void ivas_dirac_dec_output_synthesis_cov_param_mc_collect_slot_fx( output_e = s_max( real_in_e, imag_in_e ); - FOR( i = 0; i < num_bands * nchan_in; ++i ) - { - real_in_buffer_fx[i] = L_shr( real_in_buffer_fx[i], sub( output_e, RealBuffer_e ) ); // Q(31-output_e) - move32(); - imag_in_buffer_fx[i] = L_shr( imag_in_buffer_fx[i], sub( output_e, ImagBuffer_e ) ); // Q(31-output_e) - move32(); - } + scale_sig32( real_in_buffer_fx, buf_len, sub( RealBuffer_e, output_e ) ); + scale_sig32( imag_in_buffer_fx, buf_len, sub( ImagBuffer_e, output_e ) ); cmplx_matrix_square_fx( real_in_buffer_fx, imag_in_buffer_fx, num_bands, nchan_in, real_buffer_fx, imag_buffer_fx, output_e, &output_e ); v_add_fixed_me( cx_fx, *cx_e, real_buffer_fx, output_e, cx_fx, &tmp1_e, imult1616( nchan_in, nchan_in ), 1 ); v_add_fixed_me( cx_imag_fx, *cx_imag_e, imag_buffer_fx, output_e, cx_imag_fx, &tmp2_e, imult1616( nchan_in, nchan_in ), 1 ); - cx_init_e = tmp1_e; - move16(); - cx_init_imag_e = tmp2_e; - move16(); - // normalizing both the matrices to a common exponent for a better precision - tmp1 = 0; - move16(); - tmp2 = 0; - move16(); - - FOR( j = 0; j < PARAM_MC_MAX_TRANSPORT_CHANS * PARAM_MC_MAX_TRANSPORT_CHANS; j++ ) - { - L_tmp = BASOP_Util_Add_Mant32Exp( cx_fx[j], cx_init_e, 0, 0, &tmp1_e ); - L_tmp = BASOP_Util_Add_Mant32Exp( cx_imag_fx[j], cx_init_imag_e, 0, 0, &tmp2_e ); - tmp1 = s_max( tmp1, tmp1_e ); - tmp2 = s_max( tmp2, tmp2_e ); - } + cx_fx_norm = L_norm_arr( cx_fx, PARAM_MC_MAX_TRANSPORT_CHANS * PARAM_MC_MAX_TRANSPORT_CHANS ); + cx_imag_fx_norm = L_norm_arr( cx_imag_fx, PARAM_MC_MAX_TRANSPORT_CHANS * PARAM_MC_MAX_TRANSPORT_CHANS ); - FOR( j = 0; j < PARAM_MC_MAX_TRANSPORT_CHANS * PARAM_MC_MAX_TRANSPORT_CHANS; j++ ) - { - L_tmp = BASOP_Util_Add_Mant32Exp( cx_fx[j], cx_init_e, 0, 0, &tmp1_e ); - cx_fx[j] = L_shr( L_tmp, sub( tmp1, tmp1_e ) ); // Q(31-tmp1) - move32(); - L_tmp = BASOP_Util_Add_Mant32Exp( cx_imag_fx[j], cx_init_imag_e, 0, 0, &tmp2_e ); - cx_imag_fx[j] = L_shr( L_tmp, sub( tmp2, tmp2_e ) ); // Q(31-tmp2) - move32(); - } + scale_sig32( cx_fx, PARAM_MC_MAX_TRANSPORT_CHANS * PARAM_MC_MAX_TRANSPORT_CHANS, cx_fx_norm ); + scale_sig32( cx_imag_fx, PARAM_MC_MAX_TRANSPORT_CHANS * PARAM_MC_MAX_TRANSPORT_CHANS, cx_imag_fx_norm ); - *cx_e = tmp1; + *cx_e = sub( tmp1_e, cx_fx_norm ); move16(); - *cx_imag_e = tmp2; + *cx_imag_e = sub( tmp2_e, cx_imag_fx_norm ); move16(); return; diff --git a/lib_dec/ivas_svd_dec.c b/lib_dec/ivas_svd_dec.c index 942a2b5b0c484cf3a90b25c5dd5cc17a3da01a5e..c8778f99ccf301a493996b9489d1139c3a6074f9 100644 --- a/lib_dec/ivas_svd_dec.c +++ b/lib_dec/ivas_svd_dec.c @@ -912,13 +912,55 @@ static void ApplyRotation_fx( ) { Word16 ch; - Word16 temp_exp; *d = BASOP_Util_Add_Mant32Exp( Mpy_32_32( c, x11 ), add( c_e, x11_e ), Mpy_32_32( s, x12 ), add( s_e, x12_e ), d_e ); /* exp(d_e) */ move32(); *g = BASOP_Util_Add_Mant32Exp( Mpy_32_32( c, x12 ), add( c_e, x12_e ), Mpy_32_32( L_negate( s ), x11 ), add( s_e, x11_e ), g_e ); /* exp(g_e) */ move32(); +#ifdef SVD_WMOPS_OPT + Word16 c_q = sub( 31, c_e ); + Word16 s_q = sub( 31, s_e ); + Word32 op1, op2; + Word16 op_e; + + // Bring c and s to same Q + IF( GT_16( c_q, s_q ) ) + { + op1 = L_shr( c, sub( c_q, s_q ) ); + op2 = s; + move32(); + op_e = s_q; + move16(); + } + ELSE + { + op1 = c; + move32(); + op2 = L_shr( s, sub( s_q, c_q ) ); + op_e = c_q; + move16(); + } + op_e = add( op_e, 1 ); // 64 bit mac -> +1 + + FOR( ch = 0; ch < nChannels; ch++ ) + { + x11 = singularVector[ch][currentIndex2]; + move32(); + x12 = singularVector[ch][currentIndex1]; + move32(); + + Word64 temp = W_mac_32_32( W_mult_32_32( op1, x11 ), op2, x12 ); // Q(singularVector) + op_e + temp = W_shr( temp, op_e ); // Q(singularVector) + singularVector[ch][currentIndex2] = W_sat_l( temp ); // Q(singularVector) + move32(); + + temp = W_mac_32_32( W_mult_32_32( op1, x12 ), L_negate( op2 ), x11 ); // Q(singularVector) + op_e + temp = W_shr( temp, op_e ); // Q(singularVector) + singularVector[ch][currentIndex1] = W_sat_l( temp ); // Q(singularVector) + move32(); + } +#else #ifndef FIX_MINOR_SVD_WMOPS_MR1010X FOR( ch = 0; ch < nChannels; ch++ ) { @@ -952,6 +994,7 @@ static void ApplyRotation_fx( move32(); } +#endif #endif return; @@ -1605,26 +1648,43 @@ static void singularVectorsAccumulationLeft_fx( t_ii = BASOP_Util_Divide3232_Scale_cadence( ONE_IN_Q30, maxWithSign_fx( t_ii ), &temp_exp ); /* exp(1 + (temp_exp + tii_e)) */ t_ii_e = add( 1, sub( temp_exp, t_ii_e ) ); #endif + Word16 tempe; + Word32 temp = BASOP_Util_Divide3232_Scale_cadence( t_ii, maxWithSign_fx( singularVectors_Left[nCh][nCh] ), &tempe ); + tempe = add( tempe, sub( t_ii_e, singularVectors_Left_e[nCh][nCh] ) ); // fprintf( fp, "%e\n", me2f( t_ii, t_ii_e ) ); FOR( iCh = nCh + 1; iCh < nChannelsC; iCh++ ) /* nChannelsC */ { - norm_y = 0; - move32(); - norm_y_e = 0; + Word64 acc = 0; + move64(); + Word64 prod[16]; + Word16 prod_e[16]; + Word16 max_e = -31; move16(); FOR( k = nCh + 1; k < nChannelsL; k++ ) /* nChannelsL */ { #ifndef FIX_1010_OPT_SINGLE_RESCALE norm_y = BASOP_Util_Add_Mant32Exp( norm_y, norm_y_e, Mpy_32_32( singularVectors_Left[k][nCh], singularVectors_Left[k][iCh] ), add( sing_exp2[k][nCh], sing_exp2[k][iCh] ), &norm_y_e ); /* exp(norm_y_e) */ #else - norm_y = BASOP_Util_Add_Mant32Exp( norm_y, norm_y_e, Mpy_32_32( singularVectors_Left[k][nCh], singularVectors_Left[k][iCh] ), add( singularVectors_Left_e[k][nCh], singularVectors_Left_e[k][iCh] ), &norm_y_e ); /* exp(norm_y_e) */ + prod[k] = W_mult0_32_32( singularVectors_Left[k][nCh], singularVectors_Left[k][iCh] ); + prod_e[k] = add( singularVectors_Left_e[k][nCh], singularVectors_Left_e[k][iCh] ); + max_e = s_max( max_e, prod_e[k] ); #endif } - t_jj = BASOP_Util_Divide3232_Scale_cadence( Mpy_32_32( t_ii, norm_y ), maxWithSign_fx( singularVectors_Left[nCh][nCh] ), &temp_exp ); // t_ii_e+norm_y_e-*singularVectors_e, + + FOR( k = nCh + 1; k < nChannelsL; k++ ) /* nChannelsL */ + { + acc = W_add( acc, W_shr( prod[k], sub( max_e, prod_e[k] ) ) ); + } + Word16 acc_e = W_norm( acc ); + acc = W_shl( acc, acc_e ); + + norm_y = W_extract_h( acc ); + norm_y_e = add( sub( max_e, acc_e ), 1 ); + t_jj = Mpy_32_32( temp, norm_y ); #ifndef FIX_1010_OPT_SINGLE_RESCALE t_jj_e = add( temp_exp, sub( add( t_ii_e, norm_y_e ), sing_exp2[nCh][nCh] ) ); #else - t_jj_e = add( temp_exp, sub( add( t_ii_e, norm_y_e ), singularVectors_Left_e[nCh][nCh] ) ); + t_jj_e = add( tempe, norm_y_e ); #endif FOR( k = nCh; k < nChannelsL; k++ ) /* nChannelsL */ { diff --git a/lib_rend/ivas_dirac_decorr_dec.c b/lib_rend/ivas_dirac_decorr_dec.c index c50d690c5b80f4812770e4aab2d6bbb26aabc62f..1788536e4ba8ad65fbf2ff9fa777eaff7972aef4 100644 --- a/lib_rend/ivas_dirac_decorr_dec.c +++ b/lib_rend/ivas_dirac_decorr_dec.c @@ -409,7 +409,7 @@ void ivas_dirac_dec_decorr_process_fx( HANDLE_DIRAC_DECORR_STATE h_freq_domain_decorr_ap_state ) { - Word16 ch_idx, k, l, idx_in_out, max_band_decorr; + Word16 ch_idx, k, l, max_band_decorr; Word16 split_bands_idx, band_idx, decorr_buffer_len, time_idx; Word16 offset, idx_filter, incr_aux; Word16 k_1, k_2, num_bands, filter_length, pre_delay, decorr_buffer_step; @@ -506,22 +506,24 @@ void ivas_dirac_dec_decorr_process_fx( set32_fx( onset_filter_fx, ONE_IN_Q31, imult1616( num_protos_diff, num_freq_bands ) ); Word16 q_temp = s_min( q_onset_dec, q_aux_buffer ); + Word16 shift_q = sub( q_onset_dec, q_temp ); - IF( NE_16( q_temp, q_onset_dec ) ) + IF( shift_q != 0 ) { - FOR( Word16 i = 0; i < imult1616( num_protos_diff, max_band_decorr_temp ); i++ ) + FOR( Word16 i = 0; i < num_protos_diff * max_band_decorr_temp; i++ ) { - h_freq_domain_decorr_ap_state->h_onset_detection_power_state.onset_detector_2_fx[i] = L_shr( h_freq_domain_decorr_ap_state->h_onset_detection_power_state.onset_detector_2_fx[i], sub( q_onset_dec, q_temp ) ); // q_temp - h_freq_domain_decorr_ap_state->h_onset_detection_power_state.onset_detector_1_fx[i] = L_shr( h_freq_domain_decorr_ap_state->h_onset_detection_power_state.onset_detector_1_fx[i], sub( q_onset_dec, q_temp ) ); // q_temp + h_freq_domain_decorr_ap_state->h_onset_detection_power_state.onset_detector_2_fx[i] = L_shr( h_freq_domain_decorr_ap_state->h_onset_detection_power_state.onset_detector_2_fx[i], shift_q ); // q_temp + h_freq_domain_decorr_ap_state->h_onset_detection_power_state.onset_detector_1_fx[i] = L_shr( h_freq_domain_decorr_ap_state->h_onset_detection_power_state.onset_detector_1_fx[i], shift_q ); // q_temp move32(); move32(); } } - IF( NE_16( q_temp, q_aux_buffer ) ) + shift_q = sub( q_aux_buffer, q_temp ); + IF( shift_q != 0 ) { - FOR( Word16 i = 0; i < shl( imult1616( num_protos_diff, max_band_decorr_temp ), 1 ); i++ ) + FOR( Word16 i = 0; i < 2 * num_protos_diff * max_band_decorr_temp; i++ ) { - aux_buffer_fx[i] = L_shr( aux_buffer_fx[i], sub( q_aux_buffer, q_temp ) ); // q_temp + aux_buffer_fx[i] = L_shr( aux_buffer_fx[i], shift_q ); // q_temp move32(); } } @@ -566,9 +568,9 @@ void ivas_dirac_dec_decorr_process_fx( /* final phase rotation */ FOR( k = 0; k < max_band_decorr; k++ ) { - *p_frame_dec_fx = L_sub( Mpy_32_16_1( ( *decorr_buffer_fx ), ( *phase_coeff_real_fx ) ), Mpy_32_16_1( ( *( decorr_buffer_fx + 1 ) ), ( *phase_coeff_imag_fx ) ) ); // sub( q_decorr_buf, 1 ) + *p_frame_dec_fx = Msub_32_16( Mpy_32_16_1( *decorr_buffer_fx, ( *phase_coeff_real_fx ) ), *( decorr_buffer_fx + 1 ), ( *phase_coeff_imag_fx ) ); // sub( q_decorr_buf, 1 ) p_frame_dec_fx++; - *p_frame_dec_fx = L_add( Mpy_32_16_1( ( *decorr_buffer_fx ), ( *phase_coeff_imag_fx ) ), Mpy_32_16_1( ( *( decorr_buffer_fx + 1 ) ), ( *phase_coeff_real_fx ) ) ); // sub( q_decorr_buf, 1 ) + *p_frame_dec_fx = Madd_32_16( Mpy_32_16_1( *decorr_buffer_fx, ( *phase_coeff_imag_fx ) ), ( *( decorr_buffer_fx + 1 ) ), ( *phase_coeff_real_fx ) ); // sub( q_decorr_buf, 1 ) p_frame_dec_fx++; phase_coeff_imag_fx++; phase_coeff_real_fx++; @@ -620,8 +622,8 @@ void ivas_dirac_dec_decorr_process_fx( #endif q_shift = getScaleFactor32( aux_buffer_fx, imult1616( imult1616( 2, num_protos_dir ), max_band_decorr_temp ) ); - - FOR( Word16 j = 0; j < shl( imult1616( num_protos_dir, max_band_decorr_temp ), 1 ); j++ ) + Word16 buf_len = shl( imult1616( num_protos_dir, max_band_decorr_temp ), 1 ); + FOR( Word16 j = 0; j < buf_len; j++ ) { aux_buffer_fx[j] = L_shl( aux_buffer_fx[j], q_shift ); // add( q_aux_buffer, q_shift ) move32(); @@ -674,8 +676,9 @@ void ivas_dirac_dec_decorr_process_fx( filter_coeff_num_real_fx = &h_freq_domain_decorr_ap_params->filter_coeff_num_real_fx[idx_filter]; // Q12 filter_coeff_den_real_fx = &h_freq_domain_decorr_ap_params->filter_coeff_den_real_fx[idx_filter]; // Q12 decorr_buffer_start_ptr_fx = &h_freq_domain_decorr_ap_state->decorr_buffer_fx[2 * ( ch_idx * max_band_decorr + band_idx )]; - input_real_fx = aux_buffer_fx[shl( add( imult1616( proto_index_dir[ch_idx], max_band_decorr ), band_idx ), 1 )]; // q_aux - input_imag_fx = aux_buffer_fx[add( shl( add( imult1616( proto_index_dir[ch_idx], max_band_decorr ), band_idx ), 1 ), 1 )]; // q_aux + Word16 idx = shl( add( imult1616( proto_index_dir[ch_idx], max_band_decorr ), band_idx ), 1 ); + input_real_fx = aux_buffer_fx[idx]; // q_aux + input_imag_fx = aux_buffer_fx[idx + 1]; // q_aux /* MA part of filter impulse response */ FOR( l = 0; l < filter_length; l++ ) @@ -766,14 +769,15 @@ void ivas_dirac_dec_decorr_process_fx( #ifdef MSAN_FIX q_shift = Q31; move16(); + offset = shl( max_band_decorr, 1 ); FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) { q_shift = s_min( q_shift, - L_norm_arr( &frame_dec_fx[shl( imult1616( ch_idx, num_freq_bands ), 1 )], shl( max_band_decorr, 1 ) ) ); + L_norm_arr( &frame_dec_fx[2 * ch_idx * num_freq_bands], offset ) ); } FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) { - Scale_sig32( &frame_dec_fx[shl( imult1616( ch_idx, num_freq_bands ), 1 )], shl( max_band_decorr, 1 ), q_shift ); + scale_sig32( &frame_dec_fx[2 * ch_idx * num_freq_bands], offset, q_shift ); } #else q_shift = L_norm_arr( frame_dec_fx, ( 2 * max_band_decorr + incr_aux ) * num_channels ); @@ -784,13 +788,14 @@ void ivas_dirac_dec_decorr_process_fx( IF( h_freq_domain_decorr_ap_params->use_ducker ) { + Word16 len1 = shl( imult1616( max_band_decorr, num_protos_dir ), 1 ); /* compute direct power w/o onsets for the energy ratio, signal is still in the aux buffer */ - v_mult_fixed( aux_buffer_fx, aux_buffer_fx, aux_buffer_fx, shl( imult1616( max_band_decorr, num_protos_dir ), 1 ) ); // 2 *q_aux -31 + v_mult_fixed( aux_buffer_fx, aux_buffer_fx, aux_buffer_fx, len1 ); // 2 *q_aux -31 - q_aux_buffer = sub( imult1616( 2, q_aux_buffer ), 31 ); + q_aux_buffer = sub( shl( q_aux_buffer, 1 ), 31 ); // if this scaling is eliminated overflow is happening fot v_add_inc_fix - q_shift = sub( L_norm_arr( aux_buffer_fx, shl( imult1616( num_protos_dir, max_band_decorr ), 1 ) ), find_guarded_bits_fx( 2 ) ); + q_shift = sub( L_norm_arr( aux_buffer_fx, len1 ), 1 /*find_guarded_bits_fx( 2 )*/ ); Scale_sig32( aux_buffer_fx, shl( imult1616( num_protos_dir, max_band_decorr_temp ), 1 ), q_shift ); q_aux_buffer = add( q_aux_buffer, q_shift ); @@ -847,18 +852,18 @@ void ivas_dirac_dec_decorr_process_fx( } norm = W_norm( min64 ); #endif - - FOR( Word16 i = 0; i < shl( imult1616( num_channels, max_band_decorr ), 1 ); i++ ) + norm = sub( norm, 1 /*find_guarded_bits_fx( 2 )*/ ); + FOR( Word16 i = 0; i < 2 * num_channels * max_band_decorr; i++ ) { - aux_buffer_fx[i] = W_extract_h( W_shl( aux_64[i], sub( norm, find_guarded_bits_fx( 2 ) ) ) ); + aux_buffer_fx[i] = W_extract_h( W_shl( aux_64[i], norm ) ); move32(); } - q_aux_buffer = add( imult1616( 2, q_frame_f ), sub( sub( norm, 1 ), 32 ) ); + q_aux_buffer = add( shl( q_frame_f, 1 ), sub( norm, 32 ) ); FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) { - v_add_inc_fx( &aux_buffer_fx[shl( imult1616( ch_idx, max_band_decorr ), 1 )], 2, &aux_buffer_fx[add( shl( imult1616( ch_idx, max_band_decorr ), 1 ), 1 )], 2, &aux_buffer_fx[imult1616( ch_idx, max_band_decorr )], 1, max_band_decorr ); + v_add_inc_fx( &aux_buffer_fx[2 * ch_idx * max_band_decorr], 2, &aux_buffer_fx[2 * ch_idx * max_band_decorr + 1], 2, &aux_buffer_fx[ch_idx * max_band_decorr], 1, max_band_decorr ); } /* smooth energies */ @@ -959,7 +964,7 @@ void ivas_dirac_dec_decorr_process_fx( FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) { q_shift = s_min( q_shift, - sub( L_norm_arr( &frame_dec_fx[shl( imult1616( ch_idx, num_freq_bands ), 1 )], shl( max_band_decorr, 1 ) ), + sub( L_norm_arr( &frame_dec_fx[2 * ch_idx * num_freq_bands], shl( max_band_decorr, 1 ) ), Q2 ) ); } FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) @@ -974,7 +979,7 @@ void ivas_dirac_dec_decorr_process_fx( FOR( ch_idx = 0; ch_idx < num_channels; ch_idx++ ) { - Word32 *frame_dec_fx_ptr = &frame_dec_fx[shl( imult1616( ch_idx, num_freq_bands ), 1 )]; + Word32 *frame_dec_fx_ptr = &frame_dec_fx[2 * ch_idx * num_freq_bands]; Word16 cur_proto_index = imult1616( proto_index_dir[ch_idx], max_band_decorr ); Word16 cur_reverb_index = imult1616( ch_idx, max_band_decorr ); Word32 *reverb_energy_smooth_ptr = &h_freq_domain_decorr_ap_state->reverb_energy_smooth_fx[cur_reverb_index]; // q_aux @@ -1020,22 +1025,18 @@ void ivas_dirac_dec_decorr_process_fx( e_duck_gain = add( e_duck_gain, sub( e_direct_energy_smooth, add( e_reverb_energy_smooth, 1 ) ) ); duck_gain = Sqrt16( duck_gain, &e_duck_gain ); - Word16 comp_flag = BASOP_Util_Cmp_Mant32Exp( duck_gain, e_duck_gain, 16384, 2 ); - IF( EQ_16( comp_flag, 1 ) ) - { - duck_gain = 16384; // 2inQ13 - move16(); - } - ELSE + /* if ( duck_gain > 2.0f ) { - duck_gain = shl( duck_gain, sub( e_duck_gain, 2 ) ); // Q13 - } + duck_gain = 2.0f; + } */ + duck_gain = shl_sat( duck_gain, sub( e_duck_gain, 1 ) ); // Q14 + #ifndef FIX_1110_OPTIM_DIRAC_DECORR_PROC frame_dec_fx_ptr[2 * band_idx] = L_shl( Mpy_32_16_1( frame_dec_fx_ptr[2 * band_idx], duck_gain ), 2 ); // q_frame_dec frame_dec_fx_ptr[add( shl( band_idx, 1 ), 1 )] = L_shl( Mpy_32_16_1( frame_dec_fx_ptr[add( shl( band_idx, 1 ), 1 )], duck_gain ), 2 ); // q_frame_dec #else - frame_dec_fx_ptr[2 * band_idx] = L_shl( Mpy_32_16_1( frame_dec_fx_ptr[2 * band_idx], duck_gain ), 2 ); // q_frame_dec - frame_dec_fx_ptr[2 * band_idx + 1] = L_shl( Mpy_32_16_1( frame_dec_fx_ptr[2 * band_idx + 1], duck_gain ), 2 ); // q_frame_dec + frame_dec_fx_ptr[2 * band_idx] = L_shl( Mpy_32_16_1( frame_dec_fx_ptr[2 * band_idx], duck_gain ), 1 ); // q_frame_dec + frame_dec_fx_ptr[2 * band_idx + 1] = L_shl( Mpy_32_16_1( frame_dec_fx_ptr[2 * band_idx + 1], duck_gain ), 1 ); // q_frame_dec #endif move32(); move32(); @@ -1055,7 +1056,7 @@ void ivas_dirac_dec_decorr_process_fx( Word16 sf = MAX_16; FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) { - sf = s_min( sf, getScaleFactor32( &frame_dec_fx[shl( imult1616( ch_idx, num_freq_bands ), 1 )], shl( max_band_decorr, 1 ) ) ); + sf = s_min( sf, getScaleFactor32( &frame_dec_fx[2 * ch_idx * num_freq_bands], shl( max_band_decorr, 1 ) ) ); } sf = s_min( sub( sf, 1 ), q_shift ); q_if_local = sub( q_shift, sf ); @@ -1065,7 +1066,7 @@ void ivas_dirac_dec_decorr_process_fx( #ifdef MSAN_FIX FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) { - scale_sig32( &frame_dec_fx[shl( imult1616( ch_idx, num_freq_bands ), 1 )], shl( max_band_decorr, 1 ), q_shift ); + scale_sig32( &frame_dec_fx[2 * ch_idx * num_freq_bands], shl( max_band_decorr, 1 ), q_shift ); } #else Scale_sig32( frame_dec_fx, ( 2 * max_band_decorr + incr_aux ) * num_channels, q_shift ); // scaling it to input q @@ -1078,7 +1079,7 @@ void ivas_dirac_dec_decorr_process_fx( #ifdef MSAN_FIX FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) { - scale_sig32( &frame_dec_fx[shl( imult1616( ch_idx, num_freq_bands ), 1 )], shl( max_band_decorr, 1 ), q_shift ); + scale_sig32( &frame_dec_fx[2 * ch_idx * num_freq_bands], shl( max_band_decorr, 1 ), q_shift ); } #else Scale_sig32( frame_dec_fx, ( 2 * max_band_decorr + incr_aux ) * num_channels, q_shift ); // scaling it to input q @@ -1098,24 +1099,22 @@ void ivas_dirac_dec_decorr_process_fx( FOR( k = 0; k < max_band_decorr; ++k ) { - aux_buffer_fx[2 * k] = Mpy_32_32( L_shr_r( input_frame_fx[add( shl( offset, 1 ), shl( k, 1 ) )], q_if_local ), L_sub( ONE_IN_Q31, onset_filter_fx[add( offset, k )] ) ); - aux_buffer_fx[add( shl( k, 1 ), 1 )] = Mpy_32_32( L_shr_r( input_frame_fx[add( add( shl( offset, 1 ), shl( k, 1 ) ), 1 )], q_if_local ), L_sub( ONE_IN_Q31, onset_filter_fx[add( offset, k )] ) ); // q_frame_f + aux_buffer_fx[2 * k] = Mpy_32_32( L_shr_r( input_frame_fx[2 * ( offset + k )], q_if_local ), L_sub( ONE_IN_Q31, onset_filter_fx[offset + k] ) ); + aux_buffer_fx[add( shl( k, 1 ), 1 )] = Mpy_32_32( L_shr_r( input_frame_fx[2 * ( offset + k ) + 1], q_if_local ), L_sub( ONE_IN_Q31, onset_filter_fx[offset + k] ) ); // q_frame_f move32(); move32(); } - v_add_fx( &frame_dec_fx[imult1616( ch_idx, shl( num_freq_bands, 1 ) )], aux_buffer_fx, &frame_dec_fx[imult1616( ch_idx, shl( num_freq_bands, 1 ) )], shl( max_band_decorr, 1 ) ); + v_add_fx( &frame_dec_fx[2 * ch_idx * num_freq_bands], aux_buffer_fx, &frame_dec_fx[2 * ch_idx * num_freq_bands], shl( max_band_decorr, 1 ) ); } } /* avoid decorrelation above maximum frequency -> set to zero the remaining frequencies*/ + Word16 val = shl( sub( num_freq_bands, h_freq_domain_decorr_ap_params->max_band_decorr ), 1 ); FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) { - /* calc output indices */ - idx_in_out = shl( ( add( imult1616( ch_idx, num_freq_bands ), h_freq_domain_decorr_ap_params->max_band_decorr ) ), 1 ); - /* copy to output signal */ - set32_fx( &frame_dec_fx[idx_in_out], 0, shl( sub( num_freq_bands, h_freq_domain_decorr_ap_params->max_band_decorr ), 1 ) ); + set32_fx( &frame_dec_fx[2 * ( ch_idx * num_freq_bands + h_freq_domain_decorr_ap_params->max_band_decorr )], 0, val ); } *q_frame_dec = q_frame_f; move16();