diff --git a/lib_com/options.h b/lib_com/options.h index 945329baa13761a718b08ae3110452f363faa643..9ae9b3a4093663aa6e4a6f390c0d53155d2e326c 100644 --- a/lib_com/options.h +++ b/lib_com/options.h @@ -99,4 +99,5 @@ #define FIX_1107_VADDINC /* FhG: Optimize v_add_inc_fx() for most frequent case */ #define FIX_1009_OPT_PARAMMC_RENDER /* FhG: Optimize ivas_param_mc_dec_render_fx() */ #define FIX_1109_OPTIM_MCT_STEREO_IGF_DEC /* FhG: optimize mctStereoIGF_dec_fx() */ +#define FIX_1110_OPTIM_DIRAC_DECORR_PROC /* FhG: optimize ivas_dirac_dec_decorr_process() */ #endif diff --git a/lib_rend/ivas_dirac_decorr_dec.c b/lib_rend/ivas_dirac_decorr_dec.c index a043e97f93f2e6d5c505d3fa8c5659bcaea78ebd..c50d690c5b80f4812770e4aab2d6bbb26aabc62f 100644 --- a/lib_rend/ivas_dirac_decorr_dec.c +++ b/lib_rend/ivas_dirac_decorr_dec.c @@ -57,6 +57,12 @@ #define DIRAC_DUCK_ALPHA_FX 1717986944 /* Q31 */ #define ONE_M_DIRAC_DUCK_ALPHA 429496736 /* Q31 */ +#ifdef FIX_1110_OPTIM_DIRAC_DECORR_PROC +/* Maximal useful q-format, represents range of 2^-126 (float min) */ +#define MAX_Q_FX 157 + + +#endif /*------------------------------------------------------------------------- * Local function prototypes *------------------------------------------------------------------------*/ @@ -583,6 +589,8 @@ void ivas_dirac_dec_decorr_process_fx( Word16 decorr_buff_tot_len = imult1616( imult1616( shl( decorr_buffer_len, 1 ), max_band_decorr ), num_channels ); guarded_bits = 0; + +#ifndef FIX_1110_OPTIM_DIRAC_DECORR_PROC FOR( Word16 i = 0; i < decorr_buff_tot_len; i++ ) { IF( h_freq_domain_decorr_ap_state->decorr_buffer_fx[i] != 0 ) @@ -593,6 +601,23 @@ void ivas_dirac_dec_decorr_process_fx( q_shift = sub( getScaleFactor32( h_freq_domain_decorr_ap_state->decorr_buffer_fx, decorr_buff_tot_len ), guarded_bits ); Scale_sig32( h_freq_domain_decorr_ap_state->decorr_buffer_fx, decorr_buff_tot_len, q_shift ); q_decorr_buf = add( q_decorr_buf, q_shift ); +#else + Flag is_zero = is_zero_arr( 
h_freq_domain_decorr_ap_state->decorr_buffer_fx, decorr_buff_tot_len ); + if ( is_zero == 0 ) + { + guarded_bits = 3; + } + + IF( is_zero == 0 ) + { + q_shift = sub( getScaleFactor32( h_freq_domain_decorr_ap_state->decorr_buffer_fx, decorr_buff_tot_len ), guarded_bits ); + IF( q_shift != 0 ) + { + Scale_sig32( h_freq_domain_decorr_ap_state->decorr_buffer_fx, decorr_buff_tot_len, q_shift ); + q_decorr_buf = add( q_decorr_buf, q_shift ); + } + } +#endif q_shift = getScaleFactor32( aux_buffer_fx, imult1616( imult1616( 2, num_protos_dir ), max_band_decorr_temp ) ); @@ -655,10 +680,15 @@ void ivas_dirac_dec_decorr_process_fx( /* MA part of filter impulse response */ FOR( l = 0; l < filter_length; l++ ) { +#ifndef FIX_1110_OPTIM_DIRAC_DECORR_PROC frame_ma_fx[2 * l] = Mpy_32_16_1( input_real_fx, filter_coeff_num_real_fx[l] ); // Q_qux -3 = q_deorr // frame_ma_fx[2 * l] = L_shr(frame_ma_fx[2 * l],3); // scaling to q_decorr_buf frame_ma_fx[add( shl( l, 1 ), 1 )] = Mpy_32_16_1( input_imag_fx, filter_coeff_num_real_fx[l] ); // Q_qux - 3 = q_deorr // frame_ma_fx[2 * l + 1] = L_shr(frame_ma_fx[2 * l + 1], 3); // scaling to q_decorr_buf +#else + frame_ma_fx[2 * l] = Mpy_32_16_1( input_real_fx, filter_coeff_num_real_fx[l] ); // Q_qux -3 = q_decorr + frame_ma_fx[2 * l + 1] = Mpy_32_16_1( input_imag_fx, filter_coeff_num_real_fx[l] ); // Q_qux - 3 = q_decorr +#endif move32(); move32(); } @@ -672,13 +702,21 @@ void ivas_dirac_dec_decorr_process_fx( /*get values for AR part */ filter_frame_real_fx = decorr_buffer_ptr_fx[0]; // q_decorr - filter_frame_imag_fx = decorr_buffer_ptr_fx[1]; // q_deocrr + filter_frame_imag_fx = decorr_buffer_ptr_fx[1]; // q_decorr +#ifndef FIX_1110_OPTIM_DIRAC_DECORR_PROC decorr_buffer_ptr_fx += shl( decorr_buffer_step, 1 ); +#else + Word16 decorr_buffer_step2x = shl( decorr_buffer_step, 1 ); + + decorr_buffer_ptr_fx += decorr_buffer_step2x; + move16(); +#endif FOR( l = 1; l < filter_length; l++ ) { // q adjustment needed// +#ifndef 
FIX_1110_OPTIM_DIRAC_DECORR_PROC decorr_buffer_ptr_fx[0] = L_add( decorr_buffer_ptr_fx[0], frame_ma_fx[2 * l] ); // q_decorr Word32 temp_1 = Mpy_32_16_1( filter_frame_real_fx, filter_coeff_den_real_fx[l] ); // q_decorr - 3 temp_1 = L_shl( temp_1, 3 ); // q_decorr @@ -691,7 +729,20 @@ void ivas_dirac_dec_decorr_process_fx( move32(); move32(); move32(); +#else + Word32 temp_1 = Mpy_32_16_1( filter_frame_real_fx, filter_coeff_den_real_fx[l] ); // q_decorr - 3 + temp_1 = L_shl( temp_1, 3 ); // q_decorr + decorr_buffer_ptr_fx[0] = L_sub( L_add( decorr_buffer_ptr_fx[0], frame_ma_fx[2 * l] ), temp_1 ); // q_decorr move32(); + + Word32 temp_2 = Mpy_32_16_1( filter_frame_imag_fx, filter_coeff_den_real_fx[l] ); // q_decorr - 3 + temp_2 = L_shl( temp_2, 3 ); // q_decorr + decorr_buffer_ptr_fx[1] = L_sub( L_add( decorr_buffer_ptr_fx[1], frame_ma_fx[2 * l + 1] ), temp_2 ); // q_decorr + move32(); + + decorr_buffer_ptr_fx += decorr_buffer_step2x; + move16(); +#endif } } } @@ -748,6 +799,11 @@ void ivas_dirac_dec_decorr_process_fx( q_direct_energy = q_aux_buffer; move16(); +#ifdef FIX_1110_OPTIM_DIRAC_DECORR_PROC + /* Attention: this loop reports norm=0, whenever any data is 0. */ + /* Therefore, useful left-shifts are skipped, accuracy is lost. 
*/ +#endif +#ifndef FIX_1110_OPTIM_DIRAC_DECORR_PROC /* calculate the power of the decorrelated signal */ FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) { @@ -760,6 +816,37 @@ void ivas_dirac_dec_decorr_process_fx( norm = s_min( norm, W_norm( aux_64[add( offset2, i )] ) ); } } +#else + /* calculate the power of the decorrelated signal */ + Word64 *m64_aux = aux_64; + move32(); + Word64 min64 = (Word64) 0; + move64(); + Word32 *m32_frame_dec_fx = frame_dec_fx; + move32(); + offset1 = shl( num_freq_bands, 1 ); + offset2 = shl( max_band_decorr, 1 ); + + + FOR( ch_idx = 0; ch_idx < num_channels; ++ch_idx ) + { + FOR( Word16 i = 0; i < offset2; i++ ) + { + m64_aux[i] = W_mult0_32_32( m32_frame_dec_fx[i], m32_frame_dec_fx[i] ); + move64(); + if ( GT_64( m64_aux[i], min64 ) ) + { + min64 = m64_aux[i]; + move64(); + } + } + m64_aux += offset2; + m32_frame_dec_fx += offset1; + move64(); + move32(); + } + norm = W_norm( min64 ); +#endif FOR( Word16 i = 0; i < shl( imult1616( num_channels, max_band_decorr ), 1 ); i++ ) { @@ -775,6 +862,8 @@ void ivas_dirac_dec_decorr_process_fx( } /* smooth energies */ + +#ifndef FIX_1110_OPTIM_DIRAC_DECORR_PROC v_multc_fixed( aux_buffer_fx, ONE_M_DIRAC_DUCK_ALPHA, aux_buffer_fx, imult1616( num_channels, max_band_decorr ) ); // q_aux_buffer v_multc_fixed( h_freq_domain_decorr_ap_state->reverb_energy_smooth_fx, DIRAC_DUCK_ALPHA_FX, h_freq_domain_decorr_ap_state->reverb_energy_smooth_fx, imult1616( num_channels, max_band_decorr ) ); // same-q @@ -801,6 +890,64 @@ void ivas_dirac_dec_decorr_process_fx( Scale_sig32( h_freq_domain_decorr_ap_state->reverb_energy_smooth_fx, imult1616( num_channels, max_band_decorr ), q_shift ); h_freq_domain_decorr_ap_state->q_reverb_energy_smooth = add( h_freq_domain_decorr_ap_state->q_reverb_energy_smooth, q_shift ); move16(); +#else + Word16 len = imult1616( num_channels, max_band_decorr ); + Word16 aux_e = sub( 31, q_aux_buffer ); + Word16 max_e = s_max( aux_e, e_reverb_energy_smooth ); + Word16 shr_aux = 
sub( max_e, aux_e ); /* Note: headroom is zero */ + Word16 shr_res = sub( max_e, e_reverb_energy_smooth ); /* Note: headroom is zero */ + + /* Note: DIRAC_DUCK_ALPHA_FX and ONE_M_DIRAC_DUCK_ALPHA are both in Q31 (e=0) */ + /* => a multiplication with these values does not change the q/e value. */ + + FOR( Word16 i = 0; i < len; i++ ) + { + h_freq_domain_decorr_ap_state->reverb_energy_smooth_fx[i] = L_add( + L_shr( Mpy_32_32( aux_buffer_fx[i], ONE_M_DIRAC_DUCK_ALPHA ), shr_aux ), + L_shr( Mpy_32_32( h_freq_domain_decorr_ap_state->reverb_energy_smooth_fx[i], DIRAC_DUCK_ALPHA_FX ), shr_res ) ); + move32(); + } + e_reverb_energy_smooth = max_e; + move16(); + h_freq_domain_decorr_ap_state->q_reverb_energy_smooth = sub( 31, e_reverb_energy_smooth ); + move16(); + + len = imult1616( num_protos_dir, max_band_decorr ); + Word16 den_e = sub( 31, q_direct_energy ); + Word16 max_x = s_max( den_e, e_direct_energy_smooth ); + Word16 shr_den = sub( max_x, den_e ); /* Note: headroom is zero */ + Word16 shr_des = sub( max_x, e_direct_energy_smooth ); /* Note: headroom is zero */ + + FOR( Word16 i = 0; i < len; i++ ) + { + h_freq_domain_decorr_ap_state->direct_energy_smooth_fx[i] = L_add( + L_shr( Mpy_32_32( direct_energy_fx[i], ONE_M_DIRAC_DUCK_ALPHA ), shr_den ), + L_shr( Mpy_32_32( h_freq_domain_decorr_ap_state->direct_energy_smooth_fx[i], DIRAC_DUCK_ALPHA_FX ), shr_des ) ); + move32(); + } + e_direct_energy_smooth = max_x; + move16(); + h_freq_domain_decorr_ap_state->q_direct_energy_smooth = sub( 31, e_direct_energy_smooth ); + move16(); + + // scaling energy buffers for better precision for higher values// + q_shift = L_norm_arr( h_freq_domain_decorr_ap_state->direct_energy_smooth_fx, imult1616( num_protos_dir, max_band_decorr ) ); + IF( q_shift != 0 ) + { + Scale_sig32( h_freq_domain_decorr_ap_state->direct_energy_smooth_fx, imult1616( num_protos_dir, max_band_decorr ), q_shift ); + h_freq_domain_decorr_ap_state->q_direct_energy_smooth = add( 
h_freq_domain_decorr_ap_state->q_direct_energy_smooth, q_shift ); + move16(); + } + q_shift = L_norm_arr( h_freq_domain_decorr_ap_state->reverb_energy_smooth_fx, imult1616( num_channels, max_band_decorr ) ); + IF( q_shift != 0 ) + { + Scale_sig32( h_freq_domain_decorr_ap_state->reverb_energy_smooth_fx, imult1616( num_channels, max_band_decorr ), q_shift ); + h_freq_domain_decorr_ap_state->q_reverb_energy_smooth = add( h_freq_domain_decorr_ap_state->q_reverb_energy_smooth, q_shift ); + move16(); + } + h_freq_domain_decorr_ap_state->q_reverb_energy_smooth = min( MAX_Q_FX, h_freq_domain_decorr_ap_state->q_reverb_energy_smooth ); + h_freq_domain_decorr_ap_state->q_direct_energy_smooth = min( MAX_Q_FX, h_freq_domain_decorr_ap_state->q_direct_energy_smooth ); +#endif e_reverb_energy_smooth = sub( 31, h_freq_domain_decorr_ap_state->q_reverb_energy_smooth ); e_direct_energy_smooth = sub( 31, h_freq_domain_decorr_ap_state->q_direct_energy_smooth ); @@ -856,8 +1003,13 @@ void ivas_dirac_dec_decorr_process_fx( duck_gain = shl( duck_gain, sub( e_duck_gain, 1 ) ); // Q14 +#ifndef FIX_1110_OPTIM_DIRAC_DECORR_PROC frame_dec_fx_ptr[2 * band_idx] = L_shl( Mpy_32_16_1( frame_dec_fx_ptr[2 * band_idx], duck_gain ), 1 ); // q_frame_f frame_dec_fx_ptr[add( shl( band_idx, 1 ), 1 )] = L_shl( Mpy_32_16_1( frame_dec_fx_ptr[add( shl( band_idx, 1 ), 1 )], duck_gain ), 1 ); // q_frame_f +#else + frame_dec_fx_ptr[2 * band_idx] = L_shl( Mpy_32_16_1( frame_dec_fx_ptr[2 * band_idx], duck_gain ), 1 ); // q_frame_f + frame_dec_fx_ptr[2 * band_idx + 1] = L_shl( Mpy_32_16_1( frame_dec_fx_ptr[2 * band_idx + 1], duck_gain ), 1 ); // q_frame_f +#endif move32(); move32(); } @@ -878,8 +1030,13 @@ void ivas_dirac_dec_decorr_process_fx( { duck_gain = shl( duck_gain, sub( e_duck_gain, 2 ) ); // Q13 } +#ifndef FIX_1110_OPTIM_DIRAC_DECORR_PROC frame_dec_fx_ptr[2 * band_idx] = L_shl( Mpy_32_16_1( frame_dec_fx_ptr[2 * band_idx], duck_gain ), 2 ); // q_frame_dec frame_dec_fx_ptr[add( shl( band_idx, 1 ), 1 )] = 
L_shl( Mpy_32_16_1( frame_dec_fx_ptr[add( shl( band_idx, 1 ), 1 )], duck_gain ), 2 ); // q_frame_dec +#else + frame_dec_fx_ptr[2 * band_idx] = L_shl( Mpy_32_16_1( frame_dec_fx_ptr[2 * band_idx], duck_gain ), 2 ); // q_frame_dec + frame_dec_fx_ptr[2 * band_idx + 1] = L_shl( Mpy_32_16_1( frame_dec_fx_ptr[2 * band_idx + 1], duck_gain ), 2 ); // q_frame_dec +#endif move32(); move32(); } @@ -911,7 +1068,7 @@ void ivas_dirac_dec_decorr_process_fx( scale_sig32( &frame_dec_fx[shl( imult1616( ch_idx, num_freq_bands ), 1 )], shl( max_band_decorr, 1 ), q_shift ); } #else - Scale_sig32( frame_dec_fx, ( 2 * max_band_decorr + incr_aux ) * num_channels, q_shift ); // scaling it to input q + Scale_sig32( frame_dec_fx, ( 2 * max_band_decorr + incr_aux ) * num_channels, q_shift ); // scaling it to input q #endif q_frame_f = add( q_frame_f, sf ); } @@ -924,7 +1081,7 @@ void ivas_dirac_dec_decorr_process_fx( scale_sig32( &frame_dec_fx[shl( imult1616( ch_idx, num_freq_bands ), 1 )], shl( max_band_decorr, 1 ), q_shift ); } #else - Scale_sig32( frame_dec_fx, ( 2 * max_band_decorr + incr_aux ) * num_channels, q_shift ); // scaling it to input q + Scale_sig32( frame_dec_fx, ( 2 * max_band_decorr + incr_aux ) * num_channels, q_shift ); // scaling it to input q #endif q_frame_f = q_input_frame; q_if_local = 0;