diff --git a/lib_com/ivas_prot_fx.h b/lib_com/ivas_prot_fx.h
index d87207a908def528694a9b3dc597f32dd6e43f2a..57f7226461242bed95799a94b545f4da33ca7d4a 100644
--- a/lib_com/ivas_prot_fx.h
+++ b/lib_com/ivas_prot_fx.h
@@ -3031,9 +3031,15 @@ void acelp_fast_fx(
     BSTR_ENC_HANDLE hBstr,         /* i/o: encoder bitstream handle */
     const Word16 cdk_index,        /* i : codebook index */
     const Word16 dn_orig[L_SUBFR], /* i : corr. between target and h[]. Q_dn */
+#ifdef OPT_2416_ACELP_FAST
+    const Word16 Q_dncn,           /* i : scaling factor of dn and cn */
+#else
     Word16 Q_dn,
+#endif
     const Word16 cn[L_SUBFR],      /* i : residual after long term prediction q_cn*/
+#ifndef OPT_2416_ACELP_FAST
     const Word16 q_cn,
+#endif
     const Word16 H[L_SUBFR],       /* i : impulse response of weighted synthesis filter e(norm_s(H[0])+1) */
     Word16 code[L_SUBFR],          /* o : algebraic (fixed) codebook excitation */
     Word16 y[],                    /* o : filtered fixed codebook excitation */
diff --git a/lib_com/options.h b/lib_com/options.h
index 2d6a4236946d22c6261a9a7b672c0f25eba38c07..52f629c55993b68724bbb514caed9238f8be7e71 100644
--- a/lib_com/options.h
+++ b/lib_com/options.h
@@ -95,6 +95,7 @@
 #define HARMONIZE_ACELP_ENC                       /* VA: basop issue 2400: Remove duplicated main ACELP encoder function */
 #define FIX_2392_MSAN_DESTROY_DEC                 /* VA: basop issue 2392: fix MSAN in ivas_destroy_dec_fx() */
 #define FIX_FLOAT_1522_LTV_MSAN_QMETADATA_ENC_EC3 /* Nokia: float issue 1522: fix uninit MSAN in EC3 of qmetadata encoding */
+#define OPT_2416_ACELP_FAST                       /* VA: basop issue 2426, optimisation of acelp_fast_fx ( reduc. compl. by 0.35 wmops ) */
 #define FIX_2410_HARM_MODIF_FS                    /* VA: basop issue 2410: Remove duplicated modif_Fs */
 #define HARM_LEV_DURBIN                           /* VA: basop issue 2423: harmonize levinson-Durbin algorithm */
diff --git a/lib_enc/cod4t64_fast_fx.c b/lib_enc/cod4t64_fast_fx.c
index 4bba582550f9f62581730bb163b60a4e43de336b..66378d261dde31d4ec7cbcdcff1967607bc1d3a2 100644
--- a/lib_enc/cod4t64_fast_fx.c
+++ b/lib_enc/cod4t64_fast_fx.c
@@ -42,11 +42,8 @@
  * Local constants
  *-------------------------------------------------------------------*/
 
-#define BETA_BN1 2.0f
-#define BETA_BN2 2.25f
-
-#define BETA_BN1_FX 2 // Q0
-#define BETA_BN2_FX 9 // Q2
+#define BETA_BN1_FX 2 // 2.0f in Q0
+#define BETA_BN2_FX 9 // 2.25f in Q2
 
 #define L_SUBFR_MAX 2 * L_SUBFR
 #define MAX_NUM_INTER 5
@@ -76,6 +73,8 @@ static Word16 quant_1p_N1_L_subfr_fx(
     return index;
 }
 
+
+
 static Word16 find_best_pulse_fx(
     const Word16 L_subfr,
     const Word16 nb_tracks,
@@ -111,6 +110,7 @@ static Word16 find_best_pulse_fx(
     return m;
 }
 
+
 /*-------------------------------------------------------------------*
  * Function acelp_fast()
  *
@@ -119,21 +119,22 @@ static Word16 find_best_pulse_fx(
  *-------------------------------------------------------------------*/
 void acelp_fast_fx(
-    BSTR_ENC_HANDLE hBstr, /* i/o: encoder bitstream handle */
-    const Word16 cdk_index, /* i : codebook index */
-    const Word16 dn_orig[L_SUBFR],
-    /* i : corr. between target and h[]. */ // Q_dn
+    BSTR_ENC_HANDLE hBstr,         /* i/o: encoder bitstream handle */
+    const Word16 cdk_index,        /* i : codebook index */
+    const Word16 dn_orig[L_SUBFR], /* i : corr. between target and h[]. Q_dncn */
+#ifdef OPT_2416_ACELP_FAST
+    const Word16 Q_dncn,           /* i : scaling factor of dn and cn */
+#else
     Word16 Q_dn,
-    const Word16 cn[L_SUBFR],
-    /* i : residual after long term prediction */ // q_cn
+#endif
+    const Word16 cn[L_SUBFR],      /* i : residual after long term prediction Q_dncn */
+#ifndef OPT_2416_ACELP_FAST
     const Word16 q_cn,
-    const Word16 H[L_SUBFR],
-    /* i : impulse response of weighted synthesis filter */ // e(norm_s(H[0])+1)
-    Word16 code[L_SUBFR],
-    /* o : algebraic (fixed) codebook excitation */ // Q0
-    Word16 y[],
-    /* o : filtered fixed codebook excitation */ // e(norm_s(H[0])+1)
-    const Word16 L_subfr /* i : subframe length */
+#endif
+    const Word16 H[L_SUBFR],       /* i : impulse response of weighted synthesis filter e(norm_s(H[0])+1) */
+    Word16 code[L_SUBFR],          /* o : algebraic (fixed) codebook excitation Q0 */
+    Word16 y[],                    /* o : filtered fixed codebook excitation e(norm_s(H[0])+1) */
+    const Word16 L_subfr           /* i : subframe length */
 )
 {
     Word16 i, j, q, bits, bits_track, nb_pos, nb_pulse, track, nb_iter, nb_tracks;
@@ -163,8 +164,11 @@ void acelp_fast_fx(
     Word16 flag = 0;
     move16();
     Word32 temp1, temp2, temp3, temp4, temp5, temp6;
+#ifndef OPT_2416_ACELP_FAST
     Word16 q_temp1, q_temp2;
+#endif
     Word16 scale_temp1, scale_temp2;
+
     /*-----------------------------------------------------------------*
      * Initialization
      *-----------------------------------------------------------------*/
@@ -223,7 +227,6 @@ void acelp_fast_fx(
     codetrackpos = -1; /* to avoid compilation warnings */
     move16();
 
-
     IF( EQ_16( cdk_index, 14 ) )
     {
         /* 14 bits, 2 pulses, 2 tracks: 11 (used all tracks) */
@@ -295,8 +298,9 @@ void acelp_fast_fx(
      * Find signal bn[] and sign pre-selection vector sign[].
      *-----------------------------------------------------------------*/
 
+#ifndef OPT_2416_ACELP_FAST
     exp = sub( Q31, shl( Q_dn, 1 ) );
-
+#endif
     s64 = 0;
     move64();
     FOR( i = 0; i < L_subfr; i++ )
@@ -311,10 +315,13 @@ void acelp_fast_fx(
     {
         Word16 new_exp1 = W_norm( s64 );
         dndn_fx = W_extract_h( W_shl( s64, new_exp1 ) ); // 2 * Q_dyn + exp1 - 31
+#ifndef OPT_2416_ACELP_FAST
        dndn_e = sub( 31, sub( add( add( shl( Q_dn, 1 ), 1 ), new_exp1 ), 32 ) );
+#else
+        dndn_e = sub( 31, sub( add( add( shl( Q_dncn, 1 ), 1 ), new_exp1 ), 32 ) );
+#endif
     }
 
-
     cncn_fx = 214748365 /* 0.1f in Q31 */;
     move32();
     cncn_e = 0;
@@ -337,7 +344,11 @@ void acelp_fast_fx(
         {
             Word16 new_exp1 = W_norm( s64 );
             cncn_track[q] = W_extract_h( W_shl( s64, new_exp1 ) ); // 2 * Q_dyn + exp1 - 31
+#ifndef OPT_2416_ACELP_FAST
             cncn_track_e[q] = sub( 31, sub( add( add( shl( q_cn, 1 ), 1 ), new_exp1 ), 32 ) );
+#else
+            cncn_track_e[q] = sub( 31, sub( add( add( shl( Q_dncn, 1 ), 1 ), new_exp1 ), 32 ) );
+#endif
         }
         cncn_fx = BASOP_Util_Add_Mant32Exp( cncn_fx, cncn_e, cncn_track[q], cncn_track_e[q], &cncn_e ); // Q(cncn_e)
     }
@@ -348,8 +359,14 @@ void acelp_fast_fx(
     tmp = add( tmp, sub( dndn_e, cncn_e ) );
     s_coef_fx = Sqrt16( s_coef_fx, &tmp ); // Q(15 - tmp)
 
+#ifdef OPT_2416_ACELP_FAST
+    scale_temp1 = sub( Q16, tmp );
+    /* Q_dn = q_cn and it doesn't matter */
+#else
     q_temp1 = add( add( sub( Q15, tmp ), q_cn ), Q1 );
     scale_temp1 = sub( q_temp1, Q_dn );
+#endif
+
     FOR( i = 0; i < L_subfr; i++ )
     {
         temp1 = L_mult( s_coef_fx, cn[i] ); // Q(15 - tmp)+q_cn+1
@@ -360,6 +377,15 @@ void acelp_fast_fx(
         bn_orig_fx[i] = L_add( temp1, temp2 ); // Q_dn
         move32();
 
+#ifdef OPT_2416_ACELP_FAST
+        sign_fx[i] = -1;
+        move16();
+        if ( bn_orig_fx[i] >= 0 )
+        {
+            sign_fx[i] = 1;
+            move16();
+        }
+#else
         IF( bn_orig_fx[i] >= 0 )
         {
             sign_fx[i] = 1;
@@ -369,6 +395,7 @@ void acelp_fast_fx(
            move16();
         }
         ELSE
         {
            sign_fx[i] = -1;
         }
         move16();
+#endif
     }
@@ -397,6 +424,7 @@ void acelp_fast_fx(
     /*-----------------------------------------------------------------*
      * Approximate FI[i][j] by alp[abs(i-j)] and compute buffer alp_buf[].
      *-----------------------------------------------------------------*/
+
     q_H = sub( 14, norm_s( H[0] ) );
     shift = sub( shl( q_H, 1 ), 6 );
 
@@ -458,6 +486,29 @@ void acelp_fast_fx(
     test();
     test();
     /* skip certain tracks if number of pulses is lower than number of tracks */
+#ifdef OPT_2416_ACELP_FAST /* Just need a negative number, it doesn't need to be scaled */
+    IF( EQ_16( nb_pulse, 2 ) && EQ_16( nb_tracks, NB_TRACK_FCB_4T ) )
+    {
+        max_track[NB_TRACK_FCB_4T - 3] = L_deposit_l( -1 );
+        move32();
+        max_track[NB_TRACK_FCB_4T - 1] = L_deposit_l( -1 );
+        move32();
+    }
+    ELSE IF( EQ_16( nb_pulse, 3 ) && EQ_16( codetrackpos, TRACKPOS_FIXED_FIRST ) )
+    {
+        max_track[NB_TRACK_FCB_4T - 1] = L_deposit_l( -1 );
+        move32();
+    }
+
+    FOR( q = 0; q < nb_tracks; q++ )
+    {
+        i = maximum_32_fx( max_track, nb_tracks, &L_tmp1 );
+        track_order[q] = i;
+        move16();
+        max_track[i] = L_deposit_l( -1 );
+        move32();
+    }
+#else
     IF( EQ_16( nb_pulse, 2 ) && EQ_16( nb_tracks, NB_TRACK_FCB_4T ) )
     {
         max_track[NB_TRACK_FCB_4T - 3] = L_shl( -1, Q_dn ); // Q_dn
@@ -479,6 +530,7 @@ void acelp_fast_fx(
         max_track[i] = L_shl( -1, Q_dn ); // Q_dn
         move32();
     }
+#endif
 
     track_order[4] = track_order[1]; // Q0
     move16();
@@ -617,9 +669,13 @@ void acelp_fast_fx(
         move64();
         FOR( i = track; i < L_subfr; i += nb_tracks )
         {
+#ifdef OPT_2416_ACELP_FAST
+            temp3 = L_msu0( L_mult0( Gd, dn_orig[i] ), G, *alp_pos0 );
+#else
             temp1 = L_mult0( Gd, dn_orig[i] );
             temp2 = L_mult0( G, *alp_pos0 );
             temp3 = L_sub( temp1, temp2 );
+#endif
             dn[i] = L_shr( temp3, 6 );
             move32();
             alp_pos0 += nb_tracks;
@@ -627,8 +683,11 @@ void acelp_fast_fx(
         }
         exp1 = W_norm( s64 );
         dndn_fx = W_extract_h( W_shl( s64, exp1 ) ); // 2 * Q_dyn + exp1 - 31
+#ifndef OPT_2416_ACELP_FAST
         dndn_e = sub( 31, sub( add( add( shl( Q_dn, 1 ), 1 ), exp1 ), 32 ) );
-
+#else
+        dndn_e = sub( 31, sub( add( add( shl( Q_dncn, 1 ), 1 ), exp1 ), 32 ) );
+#endif
         IF( dndn_fx == 0 )
         {
             dndn_fx = 214748365 /* 0.1f in Q31 */;
@@ -645,10 +704,16 @@ void acelp_fast_fx(
         move16();
         m[1] = track; // Q0
         move16();
+#ifdef OPT_2416_ACELP_FAST
+        scale_temp1 = sub( Q16, exp1 );
+        scale_temp2 = Q2;
+        move16();
+#else
         q_temp1 = add( add( sub( Q15, exp1 ), q_cn ), 1 );
         q_temp2 = add( Q_dn, Q2 );
         scale_temp1 = sub( q_temp1, Q_dn );
         scale_temp2 = sub( q_temp2, Q_dn );
+#endif
         FOR( i = track; i < L_subfr; i += nb_tracks )
         {
             temp1 = L_mult( s_coef_fx, cn[i] ); // Q(15 - tmp)+q_cn+1
@@ -683,8 +748,12 @@ void acelp_fast_fx(
         Gn = add( Gn, i_mult( s[1], dn_orig[m[1]] ) ); // Q_dn
         Gd32 = Gd;
         move16();
+#ifdef OPT_2416_ACELP_FAST
+        Gd32 = L_add( Gd32, L_mac0( alp[0], i_mult( shl( s[0], 1 ), s[1] ), alp[m[0] - m[1]] ) ); // Q6
+#else
         Gd32 = L_add( Gd32, L_add( alp[0], L_mult0( i_mult( shl( s[0], 1 ), s[1] ), alp[m[0] - m[1]] ) ) ); // Q6
-        G = Gn; // Q_dn
+#endif
+        G = Gn; // Q_dn
         move16();
         G1 = i_mult( G, s[1] ); // Q_dn
         G = i_mult( G, s[0] );  // Q_dn
@@ -697,10 +766,15 @@ void acelp_fast_fx(
         FOR( i = track; i < L_subfr; i += nb_tracks )
         {
             temp1 = imult3216( Gd32, dn_orig[i] );
+#ifdef OPT_2416_ACELP_FAST
+            temp4 = L_msu0( temp1, G, *alp_pos0 );
+            temp4 = L_msu0( temp4, G1, *alp_pos1 );
+#else
             temp2 = L_mult0( G, *alp_pos0 );
             temp3 = L_mult0( G1, *alp_pos1 );
             temp4 = L_sub( temp1, temp2 );
             temp4 = L_sub( temp4, temp3 );
+#endif
             dn[i] = L_shr( temp4, 6 );
             move32();
             alp_pos0 += nb_tracks;
@@ -720,10 +794,16 @@ void acelp_fast_fx(
         Gn = add( Gn, i_mult( s[2], dn_orig[m[2]] ) ); // Q_dn
         temp1 = alp[0];
         move32();
+#ifdef OPT_2416_ACELP_FAST
+        temp2 = L_mac0( temp1, i_mult( shl( s[0], 1 ), s[2] ), alp[m[0] - m[2]] );
+        temp3 = L_mac0( temp2, i_mult( shl( s[1], 1 ), s[2] ), alp[m[1] - m[2]] );
+        Gd32 = L_add( Gd32, temp3 ); // Q6
+#else
         temp2 = L_mult0( i_mult( shl( s[0], 1 ), s[2] ), alp[m[0] - m[2]] );
         temp3 = L_mult0( i_mult( shl( s[1], 1 ), s[2] ), alp[m[1] - m[2]] );
         Gd32 = L_add( Gd32, L_add( L_add( temp1, temp2 ), temp3 ) ); // Q6
-        G = Gn; // Q_dn
+#endif
+        G = Gn; // Q_dn
         move16();
         G1 = i_mult( G, s[1] ); // Q_dn
         G2 = i_mult( G, s[2] ); // Q_dn
@@ -739,12 +819,18 @@ void acelp_fast_fx(
         {
             temp1 = imult3216( Gd32, dn_orig[i] );
+#ifdef OPT_2416_ACELP_FAST
+            temp5 = L_msu0( temp1, G, *alp_pos0 );
+            temp5 = L_msu0( temp5, G1, *alp_pos1 );
+            temp5 = L_msu0( temp5, G2, *alp_pos2 );
+#else
             temp2 = L_mult0( G, *alp_pos0 );
             temp3 = L_mult0( G1, *alp_pos1 );
             temp4 = L_mult0( G2, *alp_pos2 );
             temp5 = L_sub( temp1, temp2 );
             temp5 = L_sub( temp5, temp3 );
             temp5 = L_sub( temp5, temp4 );
+#endif
             dn[i] = L_shr( temp5, 6 );
             move32();
             alp_pos0 += nb_tracks;
@@ -770,11 +856,19 @@ void acelp_fast_fx(
         Gn = add( Gn, i_mult( s[3], dn_orig[m[3]] ) ); // Q_dn
         temp1 = alp[0];
         move32();
+#ifdef OPT_2416_ACELP_FAST
+        temp2 = L_mac0( temp1, i_mult( shl( s[0], 1 ), s[3] ), alp[m[0] - m[3]] );
+        temp2 = L_mac0( temp2, i_mult( shl( s[1], 1 ), s[3] ), alp[m[1] - m[3]] );
+        temp2 = L_mac0( temp2, i_mult( shl( s[2], 1 ), s[3] ), alp[m[2] - m[3]] );
+
+        Gd32 = L_add( Gd32, temp2 ); // Q6
+#else
         temp2 = L_mult0( i_mult( shl( s[0], 1 ), s[3] ), alp[m[0] - m[3]] );
         temp3 = L_mult0( i_mult( shl( s[1], 1 ), s[3] ), alp[m[1] - m[3]] );
         temp4 = L_mult0( i_mult( shl( s[2], 1 ), s[3] ), alp[m[2] - m[3]] );
         Gd32 = L_add( Gd32, L_add( L_add( L_add( temp1, temp2 ), temp3 ), temp4 ) ); // Q6
+#endif
         G = Gn;
         move16();               // Q_dn
         G1 = i_mult( G, s[1] ); // Q_dn
@@ -795,6 +889,12 @@ void acelp_fast_fx(
         FOR( i = track; i < L_subfr; i += nb_tracks )
         {
             temp1 = imult3216( Gd32, dn_orig[i] );
+#ifdef OPT_2416_ACELP_FAST
+            temp6 = L_msu0( temp1, G, *alp_pos0 );
+            temp6 = L_msu0( temp6, G1, *alp_pos1 );
+            temp6 = L_msu0( temp6, G2, *alp_pos2 );
+            temp6 = L_msu0( temp6, G3, *alp_pos3 );
+#else
             temp2 = L_mult0( G, *alp_pos0 );
             temp3 = L_mult0( G1, *alp_pos1 );
             temp4 = L_mult0( G2, *alp_pos2 );
@@ -803,6 +903,7 @@ void acelp_fast_fx(
             temp6 = L_sub( temp6, temp3 );
             temp6 = L_sub( temp6, temp4 );
             temp6 = L_sub( temp6, temp5 );
+#endif
             dn[i] = L_shr( temp6, 6 );
             move32();
             alp_pos0 += nb_tracks;
@@ -824,6 +925,12 @@ void acelp_fast_fx(
     FOR( i = 0; i < L_subfr; i++ )
     {
         temp1 = imult3216( Gd32, dn_orig[i] );
+#ifdef OPT_2416_ACELP_FAST
+        temp6 = L_msu0( temp1, G, *alp_pos0 );
+        temp6 = L_msu0( temp6, G1, *alp_pos1 );
+        temp6 = L_msu0( temp6, G2, *alp_pos2 );
+        temp6 = L_msu0( temp6, G3, *alp_pos3 );
+#else
         temp2 = L_mult0( G, *alp_pos0 );
         temp3 = L_mult0( G1, *alp_pos1 );
         temp4 = L_mult0( G2, *alp_pos2 );
@@ -832,6 +939,7 @@ void acelp_fast_fx(
         temp6 = L_sub( temp6, temp3 );
         temp6 = L_sub( temp6, temp4 );
         temp6 = L_sub( temp6, temp5 );
+#endif
         dn[i] = L_shr( temp6, 6 );
         move16();
         alp_pos0++;
@@ -885,6 +993,13 @@ void acelp_fast_fx(
         FOR( j = 0; j < nb_pulse; j++ )
         {
+#ifdef OPT_2416_ACELP_FAST
+            p_hn = h_inv - m[j];
+            if ( s[j] > 0 )
+            {
+                p_hn = h - m[j];
+            }
+#else
             IF( s[j] > 0 )
             {
                 p_hn = h - m[j];
@@ -893,7 +1008,7 @@ void acelp_fast_fx(
             {
                 p_hn = h_inv - m[j];
             }
-
+#endif
             FOR( i = 0; i < L_subfr; i++ )
             {
                 y_tmp[i] = add_sat( y_tmp[i], *p_hn++ ); // q_H
@@ -906,8 +1021,11 @@ void acelp_fast_fx(
         s64 = W_mult0_32_32( crit_num, crit_num ); // 2*Q_dn
         exp = W_norm( s64 );
         crit_num = W_extract_h( W_shl( s64, exp ) ); // 2*Q_dn + exp - 32
+#ifndef OPT_2416_ACELP_FAST
         q_crit_num = add( shl( Q_dn, 1 ), sub( exp, 32 ) );
-
+#else
+        q_crit_num = add( shl( Q_dncn, 1 ), sub( exp, 32 ) );
+#endif
         // crit_den = sum2_fx( y_tmp, L_subfr ); // 2*q_H
         s64 = 0;
         move64();
@@ -926,6 +1044,15 @@ void acelp_fast_fx(
         IF( GT_16( exp, exp1 ) )
         {
+#ifdef OPT_2416_ACELP_FAST
+            flag = 0;
+            move16();
+            if ( GE_32( L_shr( L_tmp1, sub( exp, exp1 ) ), L_tmp2 ) )
+            {
+                flag = 1;
+                move16();
+            }
+#else
             IF( GE_32( L_shr( L_tmp1, sub( exp, exp1 ) ), L_tmp2 ) )
             {
                 flag = 1;
@@ -936,9 +1063,19 @@ void acelp_fast_fx(
                 flag = 0;
                 move16();
             }
+#endif
         }
         ELSE
         {
+#ifdef OPT_2416_ACELP_FAST
+            flag = 0;
+            move16();
+            if ( GE_32( L_tmp1, L_shr( L_tmp2, sub( exp1, exp ) ) ) )
+            {
+                flag = 1;
+                move16();
+            }
+#else
             IF( GE_32( L_tmp1, L_shr( L_tmp2, sub( exp1, exp ) ) ) )
             {
                 flag = 1;
@@ -949,9 +1086,9 @@ void acelp_fast_fx(
                 flag = 0;
                 move16();
             }
+#endif
         }
-
 
         IF( flag )
         {
             crit_num_max = crit_num;
diff --git a/lib_enc/inov_enc_fx.c b/lib_enc/inov_enc_fx.c
index 0efcf795b2369c00a81ab0d120095893cf4166db..da6719f36f1773951fdfef8e53660885eacdac18 100644
--- a/lib_enc/inov_enc_fx.c
+++ b/lib_enc/inov_enc_fx.c
@@ -369,7 +369,11 @@ Word16 inov_encode_fx(
         }
         ELSE
         {
+#ifdef OPT_2416_ACELP_FAST
+            acelp_fast_fx( hBstr, nBits, dn, Qdn, cn, h2, code, y2, L_subfr );
+#else
             acelp_fast_fx( hBstr, nBits, dn, Qdn, cn, Qcn, h2, code, y2, L_subfr );
+#endif
         }
     }
     ELSE IF( ( EQ_16( st_fx->idchan, 1 ) && LE_16( st_fx->acelp_cfg.fixed_cdk_index[idx2], 7 ) ) || ( st_fx->idchan == 0 && LE_16( st_fx->acelp_cfg.fixed_cdk_index[idx2], 3 ) ) )
@@ -380,7 +384,11 @@ Word16 inov_encode_fx(
         }
         ELSE
         {
+#ifdef OPT_2416_ACELP_FAST
+            acelp_fast_fx( hBstr, st_fx->acelp_cfg.fixed_cdk_index[idx2], dn, Qdn, cn, h2, code, y2, L_SUBFR );
+#else
             acelp_fast_fx( hBstr, st_fx->acelp_cfg.fixed_cdk_index[idx2], dn, Qdn, cn, Qcn, h2, code, y2, L_SUBFR );
+#endif
         }
     }
     ELSE
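
Reviewer note, not part of the patch: the recurring rewrite in cod4t64_fast_fx.c folds an L_mult0()/L_sub() (or L_mult0()/L_add()) pair into a single L_msu0() (or L_mac0()) call inside the correlation-update loops. The standalone sketch below illustrates the identity on made-up sample values standing in for Gd, dn_orig[i], G and *alp_pos0; the three helpers are simplified stand-ins for the ITU-T STL basic operators of the same names and reproduce only the arithmetic, not the saturation or the WMOPS instrumentation of the real basops.

/* Standalone sketch (illustration only): L_mult0/L_sub -> L_msu0 folding. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef int16_t Word16;
typedef int32_t Word32;

/* simplified stand-ins for the STL basops (no saturation, no WMOPS weights) */
static Word32 L_mult0( Word16 a, Word16 b ) { return (Word32) a * (Word32) b; }        /* a*b, no fractional shift */
static Word32 L_sub( Word32 a, Word32 b ) { return a - b; }                            /* 32-bit subtract          */
static Word32 L_msu0( Word32 acc, Word16 a, Word16 b ) { return acc - (Word32) a * b; } /* fused multiply-subtract  */

int main( void )
{
    /* hypothetical sample values for Gd, dn_orig[i], G and *alp_pos0 */
    Word16 Gd = 1234, dn = -567, G = 89, alp = 321;

    /* old form: separate product and subtraction, as in the #else branches */
    Word32 unfolded = L_sub( L_mult0( Gd, dn ), L_mult0( G, alp ) );

    /* new form: one fused basop, as in the OPT_2416_ACELP_FAST branches */
    Word32 folded = L_msu0( L_mult0( Gd, dn ), G, alp );

    assert( folded == unfolded );
    printf( "unfolded = %ld, folded = %ld\n", (long) unfolded, (long) folded );
    return 0;
}

Each folding removes one weighted operator per sample in loops that run over L_subfr positions for every track and iteration, which is consistent with the complexity reduction quoted for OPT_2416_ACELP_FAST in options.h.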
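A second note on the new Q_dncn parameter, again illustration only: the patch assumes that dn[] and cn[] reach acelp_fast_fx() with the same Q factor (see the "/* Q_dn = q_cn and it doesn't matter */" comment and the inov_encode_fx() call sites, which now pass Qdn alone). Under that assumption the old scale computations collapse to expressions that no longer depend on the Q factor, which is what allows q_cn to be dropped from the interface. The small self-contained check below verifies the arithmetic; Q15/Q16/Q1/Q2 mirror the constants used in the code, and the tested ranges for the shared Q value and the Sqrt16() exponent are arbitrary.

/* Standalone check (illustration only): scale factors with a shared Q_dncn. */
#include <assert.h>
#include <stdio.h>

int main( void )
{
    const int Q15 = 15, Q16 = 16, Q1 = 1, Q2 = 2;

    for ( int Q_dncn = 0; Q_dncn <= 15; Q_dncn++ ) /* common Q of dn[] and cn[] */
    {
        for ( int tmp = -8; tmp <= 8; tmp++ ) /* exponent returned by Sqrt16() */
        {
            int Q_dn = Q_dncn, q_cn = Q_dncn; /* assumption behind OPT_2416_ACELP_FAST */

            /* old computation (#else branch): q_temp1 = (Q15 - tmp) + q_cn + Q1 */
            int q_temp1 = ( Q15 - tmp ) + q_cn + Q1;
            int scale_old = q_temp1 - Q_dn;

            /* new computation: scale_temp1 = sub( Q16, tmp ) */
            int scale_new = Q16 - tmp;
            assert( scale_old == scale_new );

            /* likewise q_temp2 - Q_dn = (Q_dn + Q2) - Q_dn collapses to the constant Q2 */
            assert( ( Q_dn + Q2 ) - Q_dn == Q2 );
        }
    }
    printf( "scale factors match for all tested Q values\n" );
    return 0;
}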