diff --git a/lib_com/ivas_prot_fx.h b/lib_com/ivas_prot_fx.h index 80bdda3c2e6fd4ce43204716ae9aa789eded3be5..1bdc334391239759cc906a3afe73dc709543a87a 100644 --- a/lib_com/ivas_prot_fx.h +++ b/lib_com/ivas_prot_fx.h @@ -4706,6 +4706,8 @@ Word32 dot_product_cholesky_fx( const Word32 *A, /* i : Cholesky matrix A */ const Word16 N /* i : vector & matrix size */ ); + +#ifndef DOT_PROD_CHOLESKY_64BIT Word32 dot_product_cholesky_fixed( const Word32 *x, /* i : vector x */ const Word32 *A, /* i : Cholesky matrix A */ @@ -4713,6 +4715,13 @@ Word32 dot_product_cholesky_fixed( const Word16 exp_x, const Word16 exp_A, Word16 *exp_sum ); +#else +Word64 dot_product_cholesky_fixed( + const Word32 *x, /* i : vector x */ + const Word32 *A, /* i : Cholesky matrix A */ + const Word16 N /* i : vector & matrix size */ +); +#endif void v_mult_mat_fx( Word32 *y_fx, /* o : the product x*A */ diff --git a/lib_com/ivas_tools.c b/lib_com/ivas_tools.c index ec3c30723a14064891ab567e01b2f2dcdb4fdf9a..d557669289d85ac2da4113bda2f6fa5b210f5f23 100644 --- a/lib_com/ivas_tools.c +++ b/lib_com/ivas_tools.c @@ -606,6 +606,7 @@ void v_sub32_fx( * Therefore, S=A*A' where A is upper triangular matrix of size (m*m+m)/2 (zeros ommitted, column-wise) *---------------------------------------------------------------------*/ +#ifndef DOT_PROD_CHOLESKY_64BIT /*! r: the dot product x'*A*A'*x */ Word32 dot_product_cholesky_fixed( const Word32 *x, /* i : vector x Q31 - exp_x*/ @@ -642,6 +643,44 @@ Word32 dot_product_cholesky_fixed( return suma; } +#else +/*! r: the dot product x'*A*A'*x */ +Word64 dot_product_cholesky_fixed( + const Word32 *x, /* i : vector x Q31 - exp_x*/ + const Word32 *A, /* i : Cholesky matrix A Q31 - exp_A*/ + const Word16 N /* i : vector & matrix size Q0*/ +) +{ + Word16 i, j; + Word64 suma, tmp_sum; + Word32 mul; + Word32 tmp; + const Word32 *pt_x, *pt_A; + pt_A = A; + suma = 0; + move64(); + + FOR( i = 0; i < N; i++ ) + { + tmp_sum = 0; + move32(); + pt_x = x; + + FOR( j = 0; j <= i; j++ ) + { + mul = Mpy_32_32( *pt_x++, *pt_A++ ); + tmp_sum = W_add( tmp_sum, W_deposit32_l( mul ) ); + } + + tmp_sum = W_shr( tmp_sum, 4 ); // to make sure that the tmp_sum will not overflow + tmp = W_extract_l( tmp_sum ); + suma = W_mac_32_32( suma, tmp, tmp ); + } + + return suma; +} +#endif + void v_mult_mat_fixed( Word32 *y, /* o : the product x*A Qx - guardbits*/ const Word32 *x, /* i : vector x Qx*/ diff --git a/lib_com/options.h b/lib_com/options.h index d3570004d0ed950bbe6677bedf3af9376621fafa..59698f78e2d1313b399f15fb0b87c2761476ff07 100644 --- a/lib_com/options.h +++ b/lib_com/options.h @@ -173,4 +173,5 @@ #define NONBE_FIX_1277_EVS_DTX_HIGH_RATE_THRESHOLD /* VA/Eri: FLP issue 1277: Fix Mismatch in DTX high-rate threshold between EVS float and BASOP */ #define NONBE_FIX_708_OSBA_BR_SWITCHING_CRASH /* FhG: issue 708: fix crash in OSBA BR switching with long test vectors */ //#define OPT_STEREO_32KBPS_V1 /* Optimization made in stereo decoding path for 32kbps decoding */ +#define DOT_PROD_CHOLESKY_64BIT /* FhG: Issue 1323, optimized 64 bit implementation of dot_product_cholesky() */ #endif diff --git a/lib_enc/speech_music_classif_fx.c b/lib_enc/speech_music_classif_fx.c index 295cff8a1a889597baf9340d513c08348b248002..0188ba45c608f4d9cb87e9dacab0f9f122e04a75 100644 --- a/lib_enc/speech_music_classif_fx.c +++ b/lib_enc/speech_music_classif_fx.c @@ -1683,10 +1683,16 @@ Word16 ivas_smc_gmm_fx( Word16 flag_odv; Word32 lps_fx, lpm_fx, lpn_fx; Word32 ps_fx[N_SMC_MIXTURES], pm_fx[N_SMC_MIXTURES], pn_fx[N_SMC_MIXTURES]; +#ifndef DOT_PROD_CHOLESKY_64BIT Word32 lprob_fx; Word16 lprob_exp = 0; +#else + Word64 wprob_fx; +#endif Word32 fvm_fx[N_PCA_COEF]; +#ifndef DOT_PROD_CHOLESKY_64BIT Word16 fvm_exp = 0; +#endif Word32 sum_PS_fx, ps_diff_fx, ps_sta_fx; Word32 dlp_fx, wrelE_fx, wdrop_fx, wght_fx; Word32 wrise_fx; @@ -2273,23 +2279,38 @@ Word16 ivas_smc_gmm_fx( FOR( m = 0; m < N_SMC_MIXTURES; m++ ) { v_sub32_fx( FV_fx, &means_speech_fx[m * N_PCA_COEF], fvm_fx, N_PCA_COEF ); +#ifndef DOT_PROD_CHOLESKY_64BIT fvm_exp = sub( 31, Qfact_FV ); lprob_exp = 0; move16(); lprob_fx = dot_product_cholesky_fixed( fvm_fx, &prec_chol_speech_fx[m * ( N_PCA_COEF * N_PCA_COEF + N_PCA_COEF ) / 2], N_PCA_COEF, fvm_exp, 31 - 28, &lprob_exp ); ps_fx[m] = L_sub( L_sub( L_add( log_weights_speech_compute[m], log_det_chol_speech_fx[m] ), L_shl( lprob_fx, sub( Q18 - 1, sub( Q31, lprob_exp ) ) ) ), HALF_N_PCA_COEF_LOG_P12_Q18 ); // Q18 +#else + wprob_fx = dot_product_cholesky_fixed( fvm_fx, &prec_chol_speech_fx[m * ( N_PCA_COEF * N_PCA_COEF + N_PCA_COEF ) / 2], N_PCA_COEF ); // Q10 + ps_fx[m] = L_sub( L_sub( L_add( log_weights_speech_compute[m], log_det_chol_speech_fx[m] ), W_shr( wprob_fx, Q10 ) ), HALF_N_PCA_COEF_LOG_P12_Q18 ); // Q18 +#endif move32(); v_sub32_fx( FV_fx, &means_music_fx[m * N_PCA_COEF], fvm_fx, N_PCA_COEF ); +#ifndef DOT_PROD_CHOLESKY_64BIT lprob_exp = 0; move16(); lprob_fx = dot_product_cholesky_fixed( fvm_fx, &prec_chol_music_fx[m * ( N_PCA_COEF * N_PCA_COEF + N_PCA_COEF ) / 2], N_PCA_COEF, fvm_exp, 31 - 28, &lprob_exp ); pm_fx[m] = L_sub( L_sub( L_add( log_weights_music_compute[m], log_det_chol_music_fx[m] ), L_shl( lprob_fx, sub( Q18 - 1, sub( Q31, lprob_exp ) ) ) ), HALF_N_PCA_COEF_LOG_P12_Q18 ); // Q18 +#else + wprob_fx = dot_product_cholesky_fixed( fvm_fx, &prec_chol_music_fx[m * ( N_PCA_COEF * N_PCA_COEF + N_PCA_COEF ) / 2], N_PCA_COEF ); // Q10 + pm_fx[m] = L_sub( L_sub( L_add( log_weights_music_compute[m], log_det_chol_music_fx[m] ), W_shr( wprob_fx, Q10 ) ), HALF_N_PCA_COEF_LOG_P12_Q18 ); // Q18 +#endif move32(); v_sub32_fx( FV_fx, &means_noise_fx[m * N_PCA_COEF], fvm_fx, N_PCA_COEF ); +#ifndef DOT_PROD_CHOLESKY_64BIT lprob_exp = 0; move16(); lprob_fx = dot_product_cholesky_fixed( fvm_fx, &prec_chol_noise_fx[m * ( N_PCA_COEF * N_PCA_COEF + N_PCA_COEF ) / 2], N_PCA_COEF, fvm_exp, 31 - 28, &lprob_exp ); pn_fx[m] = L_sub( L_sub( L_add( log_weights_noise_compute[m], log_det_chol_noise_fx[m] ), L_shl( lprob_fx, sub( Q18 - 1, sub( Q31, lprob_exp ) ) ) ), HALF_N_PCA_COEF_LOG_P12_Q18 ); // Q18 +#else + wprob_fx = dot_product_cholesky_fixed( fvm_fx, &prec_chol_noise_fx[m * ( N_PCA_COEF * N_PCA_COEF + N_PCA_COEF ) / 2], N_PCA_COEF ); // Q10 + pn_fx[m] = L_sub( L_sub( L_add( log_weights_noise_compute[m], log_det_chol_noise_fx[m] ), W_shr( wprob_fx, Q10 ) ), HALF_N_PCA_COEF_LOG_P12_Q18 ); // Q18 +#endif move32(); } @@ -2311,7 +2332,7 @@ Word16 ivas_smc_gmm_fx( hSpMusClas->lpn_fx = extract_l( L_shr( lpn_fx, 11 ) ); // Q7 move16(); #else - hSpMusClas->lpm_fx = extract_h( L_shl_sat( lpm_fx, 16 - 11 ) ); // Q7 + hSpMusClas->lpm_fx = extract_h( L_shl_sat( lpm_fx, 16 - 11 ) ); // Q7 move16(); hSpMusClas->lps_fx = extract_h( L_shl_sat( lps_fx, 16 - 11 ) ); // Q7 move16();