From 726b69f070b295ddd2c65a28eeb3e76757de6606 Mon Sep 17 00:00:00 2001 From: Sandesh Venkatesh Date: Wed, 5 Mar 2025 21:03:44 +0530 Subject: [PATCH] Optimizations for multichannel file decoded with binaural rendering ivas_binRenderer_filterModule_fx - 3.96 WMOPS improvement matrix_product_mant_exp - 9.631 WMOPS improvement BASOP_Util_Add_Mant32Exp - 15.926 WMOPS improvement --- lib_com/ivas_prot_fx.h | 14 ++ lib_com/ivas_tools_fx.c | 238 +++++++++++++++++-- lib_com/options.h | 1 + lib_dec/ivas_binRenderer_internal_fx.c | 78 +++++- lib_dec/ivas_dirac_output_synthesis_cov_fx.c | 159 ++++++++++++- lib_dec/ivas_mc_param_dec_fx.c | 4 + lib_rend/ivas_stat_rend.h | 4 + 7 files changed, 466 insertions(+), 32 deletions(-) diff --git a/lib_com/ivas_prot_fx.h b/lib_com/ivas_prot_fx.h index 79ba75b81..3c01bebfd 100644 --- a/lib_com/ivas_prot_fx.h +++ b/lib_com/ivas_prot_fx.h @@ -1314,6 +1314,20 @@ Word16 matrix_diag_product_fx( Word32 *Z, /* o : resulting matrix after the matrix multiplication */ Word16 *Z_e ); +#ifdef OPT_BASOP_ADD_v1 +Word16 matrix_diag_product_fx_2( + const Word32 *X, /* i : left hand matrix Q31 - X_e*/ + const Word16 X_e, + const Word16 rowsX, /* i : number of rows of the left hand matrix Q0*/ + const Word16 colsX, /* i : number of columns of the left hand matrix Q0*/ + const Word16 transpX, /* i : flag indicating the transposition of the left hand matrix prior to the multiplication Q0*/ + const Word32 *Y, /* i : right hand diagonal matrix as vector containing the diagonal elements Q31 - Y_e*/ + const Word16 *Y_e, + const Word16 entriesY, /* i : number of entries in the diagonal Q0*/ + Word32 *Z, /* o : resulting matrix after the matrix multiplication Q31 - Z_e*/ + Word16 *Z_e ); +#endif /* OPT_BASOP_ADD_v1 */ + Word16 matrix_diag_product_fx_1( const Word32 *X, /* i : left hand matrix */ const Word16 *X_e, diff --git a/lib_com/ivas_tools_fx.c b/lib_com/ivas_tools_fx.c index d55766928..9cd1a063e 100644 --- a/lib_com/ivas_tools_fx.c +++ b/lib_com/ivas_tools_fx.c @@ -980,7 +980,9 @@ Word16 matrix_product_mant_exp_fx( Word16 out_e[MAX_OUTPUT_CHANNELS * MAX_OUTPUT_CHANNELS]; Word16 *Zp_fx_e = out_e; Word16 row, col; +#ifndef OPT_BASOP_ADD_v1 Word16 x_idx, y_idx; +#endif /* OPT_BASOP_ADD_v1 */ Word64 temp; Word16 temp_e; Word16 prod_e = add( X_fx_e, Y_fx_e ); @@ -1007,9 +1009,13 @@ Word16 matrix_product_mant_exp_fx( FOR( k = 0; k < rowsX; ++k ) { +#ifdef OPT_BASOP_ADD_v1 + temp = W_mac_32_32( temp, X_fx[k + i * rowsX], Y_fx[k + j * rowsY] ); // X_fx_e + Y_fx_e +#else /* OPT_BASOP_ADD_v1 */ x_idx = k + i * rowsX; y_idx = k + j * rowsY; temp = W_mac_32_32( temp, X_fx[x_idx], Y_fx[y_idx] ); // X_fx_e + Y_fx_e +#endif /* OPT_BASOP_ADD_v1 */ } /* Maximize accumulated value to 32-bit */ temp_e = W_norm( temp ); @@ -1047,9 +1053,13 @@ Word16 matrix_product_mant_exp_fx( move64(); FOR( k = 0; k < colsX; ++k ) { +#ifdef OPT_BASOP_ADD_v1 + temp = W_mac_32_32( temp, X_fx[i + k * rowsX], Y_fx[j + k * rowsY] ); // X_fx_e + Y_fx_e +#else /* OPT_BASOP_ADD_v1 */ x_idx = i + k * rowsX; y_idx = j + k * rowsY; temp = W_mac_32_32( temp, X_fx[x_idx], Y_fx[y_idx] ); // X_fx_e + Y_fx_e +#endif /* OPT_BASOP_ADD_v1 */ } /* Maximize accumulated value to 32-bit */ temp_e = W_norm( temp ); @@ -1087,9 +1097,13 @@ Word16 matrix_product_mant_exp_fx( move64(); FOR( k = 0; k < colsX; ++k ) { +#ifdef OPT_BASOP_ADD_v1 + temp = W_mac_32_32( temp, X_fx[k + i * rowsX], Y_fx[j + k * rowsY] ); // X_fx_e + Y_fx_e +#else /* OPT_BASOP_ADD_v1 */ x_idx = k + i * rowsX; y_idx = j + k * rowsY; temp = W_mac_32_32( temp, X_fx[x_idx], Y_fx[y_idx] ); // X_fx_e + Y_fx_e +#endif /* OPT_BASOP_ADD_v1 */ } /* Maximize accumulated value to 32-bit */ temp_e = W_norm( temp ); @@ -1128,9 +1142,13 @@ Word16 matrix_product_mant_exp_fx( move64(); FOR( k = 0; k < colsX; ++k ) { +#ifdef OPT_BASOP_ADD_v1 + temp = W_mac_32_32( temp, X_fx[i + k * rowsX], Y_fx[k + j * rowsY] ); // X_fx_e + Y_fx_e +#else /* OPT_BASOP_ADD_v1 */ x_idx = i + k * rowsX; y_idx = k + j * rowsY; - temp = W_mac_32_32( temp, X_fx[x_idx], Y_fx[y_idx] ); // X_fx_e + Y_fx_e + temp = W_mac_32_32( temp, X_fx[x_idx], Y_fx[y_idx] ); // X_fx_e + Y_fx_e +#endif /* OPT_BASOP_ADD_v1 */ } /* Maximize accumulated value to 32-bit */ temp_e = W_norm( temp ); @@ -1188,7 +1206,9 @@ Word16 matrix_product_fx( ) { Word16 i, j, k; +#ifndef OPT_BASOP_ADD_v1 Word16 x_idx, y_idx; +#endif /* OPT_BASOP_ADD_v1 */ Word32 *Zp_fx = Z_fx; /* Processing */ @@ -1209,9 +1229,13 @@ Word16 matrix_product_fx( move32(); FOR( k = 0; k < rowsX; ++k ) { - x_idx = add( k, imult1616( i, rowsX ) ); /*Q0*/ - y_idx = add( k, imult1616( j, rowsY ) ); /*Q0*/ - ( *Zp_fx ) = L_add( *Zp_fx, Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ) ); /*Qx + Qy - 31*/ +#ifdef OPT_BASOP_ADD_v1 + ( *Zp_fx ) = Madd_32_32( *Zp_fx, X_fx[k + i * rowsX], Y_fx[k + j * rowsY] ); /*Qx + Qy - 31*/ +#else /* OPT_BASOP_ADD_v1 */ + x_idx = add( k, imult1616( i, rowsX ) ); /*Q0*/ + y_idx = add( k, imult1616( j, rowsY ) ); /*Q0*/ + ( *Zp_fx ) = L_add( *Zp_fx, Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ) ); /*Qx + Qy - 31*/ +#endif /* OPT_BASOP_ADD_v1 */ move32(); } Zp_fx++; @@ -1232,9 +1256,13 @@ Word16 matrix_product_fx( move32(); FOR( k = 0; k < colsX; ++k ) { - x_idx = add( i, imult1616( k, rowsX ) ); /*Q0*/ - y_idx = add( j, imult1616( k, rowsY ) ); /*Q0*/ - ( *Zp_fx ) = L_add( *Zp_fx, Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ) ); /*Qx + Qy - 31*/ +#ifdef OPT_BASOP_ADD_v1 + ( *Zp_fx ) = Madd_32_32( *Zp_fx, X_fx[i + k * rowsX], Y_fx[j + k * rowsY] ); /*Qx + Qy - 31*/ +#else /* OPT_BASOP_ADD_v1 */ + x_idx = add( i, imult1616( k, rowsX ) ); /*Q0*/ + y_idx = add( j, imult1616( k, rowsY ) ); /*Q0*/ + ( *Zp_fx ) = L_add( *Zp_fx, Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ) ); /*Qx + Qy - 31*/ +#endif /* OPT_BASOP_ADD_v1 */ move32(); } Zp_fx++; @@ -1255,9 +1283,13 @@ Word16 matrix_product_fx( move32(); FOR( k = 0; k < colsX; ++k ) { - x_idx = add( k, imult1616( i, rowsX ) ); /*Q0*/ - y_idx = add( j, imult1616( k, rowsY ) ); /*Q0*/ - ( *Zp_fx ) = L_add( *Zp_fx, Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ) ); /*Qx + Qy - 31*/ +#ifdef OPT_BASOP_ADD_v1 + ( *Zp_fx ) = Madd_32_32( *Zp_fx, X_fx[k + i * rowsX], Y_fx[j + k * rowsY] ); /*Qx + Qy - 31*/ +#else /* OPT_BASOP_ADD_v1 */ + x_idx = add( k, imult1616( i, rowsX ) ); /*Q0*/ + y_idx = add( j, imult1616( k, rowsY ) ); /*Q0*/ + ( *Zp_fx ) = L_add( *Zp_fx, Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ) ); /*Qx + Qy - 31*/ +#endif /* OPT_BASOP_ADD_v1 */ move32(); } @@ -1280,9 +1312,13 @@ Word16 matrix_product_fx( move32(); FOR( k = 0; k < colsX; ++k ) { +#ifdef OPT_BASOP_ADD_v1 + ( *Zp_fx ) = L_add_sat( *Zp_fx, Mpy_32_32( X_fx[i + k * rowsX], Y_fx[k + j * rowsY] ) ); /*Qx + Qy - 31*/ +#else /* OPT_BASOP_ADD_v1 */ x_idx = add( i, imult1616( k, rowsX ) ); /*Q0*/ y_idx = add( k, imult1616( j, rowsY ) ); /*Q0*/ ( *Zp_fx ) = L_add_sat( *Zp_fx, Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ) ); /*Qx + Qy - 31*/ +#endif /* OPT_BASOP_ADD_v1 */ // TODO: overflow of Z_fx to be checked move32(); } @@ -1307,7 +1343,9 @@ Word16 matrix_product_q30_fx( ) { Word16 i, j, k; +#ifndef OPT_BASOP_ADD_v1 Word16 x_idx, y_idx; +#endif /* OPT_BASOP_ADD_v1 */ Word32 *Zp_fx = Z_fx; Word64 W_tmp; @@ -1330,10 +1368,14 @@ Word16 matrix_product_q30_fx( move64(); FOR( k = 0; k < rowsX; ++k ) { +#ifdef OPT_BASOP_ADD_v1 + W_tmp = W_add( W_tmp, W_mult0_32_32( X_fx[k + i * rowsX], Y_fx[k + j * rowsY] ) ); // Q56 +#else /* OPT_BASOP_ADD_v1 */ //( *Zp_fx ) = L_add( *Zp_fx, Mpy_32_32( X_fx[k + i * rowsX], Y_fx[k + j * rowsY] ) ); x_idx = add( k, imult1616( i, rowsX ) ); /*Q0*/ y_idx = add( k, imult1616( j, rowsY ) ); /*Q0*/ W_tmp = W_add( W_tmp, W_mult0_32_32( X_fx[x_idx], Y_fx[y_idx] ) ); // Q56 +#endif /* OPT_BASOP_ADD_v1 */ } W_tmp = W_shl( W_tmp, 6 ); /*Q62*/ ( *Zp_fx ) = W_round64_L( W_tmp ); /*Q30*/ @@ -1357,10 +1399,14 @@ Word16 matrix_product_q30_fx( move64(); FOR( k = 0; k < colsX; ++k ) { +#ifdef OPT_BASOP_ADD_v1 + W_tmp = W_add( W_tmp, W_mult0_32_32( X_fx[i + k * rowsX], Y_fx[j + k * rowsY] ) ); // Q56 +#else /* OPT_BASOP_ADD_v1 */ //( *Zp_fx ) = L_add( *Zp_fx, Mpy_32_32( X_fx[i + k * rowsX], Y_fx[j + k * rowsY] ) ); x_idx = add( i, imult1616( k, rowsX ) ); /*Q0*/ y_idx = add( j, imult1616( k, rowsY ) ); /*Q0*/ W_tmp = W_add( W_tmp, W_mult0_32_32( X_fx[x_idx], Y_fx[y_idx] ) ); // Q56 +#endif /* OPT_BASOP_ADD_v1 */ } W_tmp = W_shl( W_tmp, 6 ); /*Q62*/ ( *Zp_fx ) = W_round64_L( W_tmp ); /*Q30*/ @@ -1384,9 +1430,11 @@ Word16 matrix_product_q30_fx( move64(); FOR( k = 0; k < colsX; ++k ) { +#ifndef OPT_BASOP_ADD_v1 //( *Zp_fx ) = L_add( *Zp_fx, Mpy_32_32( X_fx[k + i * rowsX], Y_fx[j + k * rowsY] ) ); x_idx = add( k, imult1616( i, rowsX ) ); /*Q0*/ y_idx = add( j, imult1616( k, rowsY ) ); /*Q0*/ +#endif /* OPT_BASOP_ADD_v1 */ W_tmp = W_add( W_tmp, W_mult0_32_32( X_fx[k + i * rowsX], Y_fx[j + k * rowsY] ) ); // Q56 } @@ -1413,10 +1461,14 @@ Word16 matrix_product_q30_fx( move64(); FOR( k = 0; k < colsX; ++k ) { +#ifdef OPT_BASOP_ADD_v1 + W_tmp = W_add( W_tmp, W_mult0_32_32( X_fx[i + k * rowsX], Y_fx[k + j * rowsY] ) ); // Q56 +#else /* OPT_BASOP_ADD_v1 */ //( *Zp_fx ) = L_add( *Zp_fx, Mpy_32_32( X_fx[i + k * rowsX], Y_fx[k + j * rowsY] ) ); x_idx = add( i, imult1616( k, rowsX ) ); /*Q0*/ y_idx = add( k, imult1616( j, rowsY ) ); /*Q0*/ W_tmp = W_add( W_tmp, W_mult0_32_32( X_fx[x_idx], Y_fx[y_idx] ) ); // Q56 +#endif /* OPT_BASOP_ADD_v1 */ } W_tmp = W_shl( W_tmp, 6 ); /*Q62*/ ( *Zp_fx ) = W_round64_L( W_tmp ); /*Q30*/ @@ -1449,7 +1501,9 @@ Word16 matrix_product_mant_exp( Word16 *Zp_e = Z_e; Word32 L_tmp; Word16 tmp_e; +#ifndef OPT_BASOP_ADD_v1 Word16 x_idx, y_idx; +#endif /* OPT_BASOP_ADD_v1 */ /* Processing */ test(); @@ -1471,11 +1525,16 @@ Word16 matrix_product_mant_exp( move16(); FOR( k = 0; k < rowsX; ++k ) { - x_idx = add( k, imult1616( i, rowsX ) ); /*Q0*/ - y_idx = add( k, imult1616( j, rowsY ) ); /*Q0*/ +#ifdef OPT_BASOP_ADD_v1 + L_tmp = Mpy_32_32( X_fx[k + i * rowsX], Y_fx[k + j * rowsY] ); /*Q31 - (X_e + Y_e)*/ + tmp_e = add( X_e[k + i * rowsX], Y_e[k + j * rowsY] ); +#else /* OPT_BASOP_ADD_v1 */ + x_idx = add( k, imult1616( i, rowsX ) ); /*Q0*/ + y_idx = add( k, imult1616( j, rowsY ) ); /*Q0*/ //( *Zp ) += X[k + i * rowsX] * Y[k + j * rowsY]; L_tmp = Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ); /*Q31 - (X_e + Y_e)*/ tmp_e = add( X_e[x_idx], Y_e[y_idx] ); +#endif /* OPT_BASOP_ADD_v1 */ ( *Zp ) = BASOP_Util_Add_Mant32Exp( *Zp, *Zp_e, L_tmp, tmp_e, &tmp_e ); move32(); @@ -1503,11 +1562,16 @@ Word16 matrix_product_mant_exp( move16(); FOR( k = 0; k < colsX; ++k ) { +#ifdef OPT_BASOP_ADD_v1 + L_tmp = Mpy_32_32( X_fx[i + k * rowsX], Y_fx[j + k * rowsY] ); /*Q31 - (X_e + Y_e)*/ + tmp_e = add( X_e[i + k * rowsX], Y_e[j + k * rowsY] ); +#else /* OPT_BASOP_ADD_v1 */ x_idx = add( i, imult1616( k, rowsX ) ); /*Q0*/ y_idx = add( j, imult1616( k, rowsY ) ); /*Q0*/ //( *Zp ) += X_fx[i + k * rowsX] * Y_fx[j + k * rowsY]; L_tmp = Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ); /*Q31 - (X_e + Y_e)*/ tmp_e = add( X_e[x_idx], Y_e[y_idx] ); +#endif /* OPT_BASOP_ADD_v1 */ ( *Zp ) = BASOP_Util_Add_Mant32Exp( *Zp, *Zp_e, L_tmp, tmp_e, &tmp_e ); ( *Zp_e ) = tmp_e; @@ -1534,11 +1598,16 @@ Word16 matrix_product_mant_exp( move16(); FOR( k = 0; k < colsX; ++k ) { +#ifdef OPT_BASOP_ADD_v1 + L_tmp = Mpy_32_32( X_fx[k + i * rowsX], Y_fx[j + k * rowsY] ); /*Q31 - (X_e + Y_e)*/ + tmp_e = add( X_e[k + i * rowsX], Y_e[j + k * rowsY] ); +#else /* OPT_BASOP_ADD_v1 */ x_idx = add( k, imult1616( i, rowsX ) ); /*Q0*/ y_idx = add( j, imult1616( k, rowsY ) ); /*Q0*/ //( *Zp ) += X_fx[k + i * rowsX] * Y_fx[j + k * rowsY]; L_tmp = Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ); /*Q31 - (X_e + Y_e)*/ tmp_e = add( X_e[x_idx], Y_e[y_idx] ); +#endif /* OPT_BASOP_ADD_v1 */ ( *Zp ) = BASOP_Util_Add_Mant32Exp( *Zp, *Zp_e, L_tmp, tmp_e, &tmp_e ); move32(); @@ -1568,11 +1637,16 @@ Word16 matrix_product_mant_exp( move16(); FOR( k = 0; k < colsX; ++k ) { +#ifdef OPT_BASOP_ADD_v1 + L_tmp = Mpy_32_32( X_fx[i + k * rowsX], Y_fx[k + j * rowsY] ); /*Q31 - (X_e + Y_e)*/ + tmp_e = add( X_e[i + k * rowsX], Y_e[k + j * rowsY] ); +#else /* OPT_BASOP_ADD_v1 */ x_idx = add( i, imult1616( k, rowsX ) ); /*Q0*/ y_idx = add( k, imult1616( j, rowsY ) ); /*Q0*/ //( *Zp ) += X_fx[i + k * rowsX] * Y_fx[k + j * rowsY]; L_tmp = Mpy_32_32( X_fx[x_idx], Y_fx[y_idx] ); /*Q31 - (X_e + Y_e)*/ tmp_e = add( X_e[x_idx], Y_e[y_idx] ); +#endif /* OPT_BASOP_ADD_v1 */ ( *Zp ) = BASOP_Util_Add_Mant32Exp( *Zp, *Zp_e, L_tmp, tmp_e, &tmp_e ); move32(); @@ -1603,7 +1677,9 @@ Word16 matrix_diag_product_fx( { Word16 i, j; Word32 *Zp = Z; +#ifndef OPT_BASOP_ADD_v1 Word16 tmp; +#endif /* OPT_BASOP_ADD_v1 */ /* Processing */ IF( EQ_16( transpX, 1 ) ) /* We use X transpose */ @@ -1616,8 +1692,12 @@ Word16 matrix_diag_product_fx( { FOR( i = 0; i < colsX; ++i ) { +#ifdef OPT_BASOP_ADD_v1 + *( Zp ) = Mpy_32_32( X[j + i * rowsX], Y[j] ); /*Q31 - (X_e + Y_e)*/ +#else /* OPT_BASOP_ADD_v1 */ tmp = add( j, imult1616( i, rowsX ) ); - *( Zp ) = Mpy_32_32( X[tmp], Y[j] ); /*Q31 - (X_e + Y_e)*/ + *( Zp ) = Mpy_32_32( X[tmp], Y[j] ); /*Q31 - (X_e + Y_e)*/ +#endif /* OPT_BASOP_ADD_v1 */ move32(); Zp++; } @@ -1648,6 +1728,100 @@ Word16 matrix_diag_product_fx( return EXIT_SUCCESS; } +#ifdef OPT_BASOP_ADD_v1 +Word16 matrix_diag_product_fx_2( + const Word32 *X, /* i : left hand matrix Q31 - X_e*/ + const Word16 X_e, + const Word16 rowsX, /* i : number of rows of the left hand matrix Q0*/ + const Word16 colsX, /* i : number of columns of the left hand matrix Q0*/ + const Word16 transpX, /* i : flag indicating the transposition of the left hand matrix prior to the multiplication Q0*/ + const Word32 *Y, /* i : right hand diagonal matrix as vector containing the diagonal elements Q31 - Y_e*/ + const Word16 *Y_e, + const Word16 entriesY, /* i : number of entries in the diagonal Q0*/ + Word32 *Z, /* o : resulting matrix after the matrix multiplication Q31 - Z_e*/ + Word16 *Z_e ) +{ + Word16 i, j; + Word32 *Zp = Z; + Word16 *Z_ep = Z_e; + Word16 tmp; + Word16 max_exp = -31; + move16(); + + /* Processing */ + IF( EQ_16( transpX, 1 ) ) /* We use X transpose */ + { + IF( NE_16( rowsX, entriesY ) ) + { + return EXIT_FAILURE; + } + FOR( j = 0; j < entriesY; ++j ) + { + FOR( i = 0; i < colsX; ++i ) + { + tmp = j + i * rowsX; /*Q0*/ + *( Zp ) = Mpy_32_32( X[tmp], Y[j] ); /*Q31 - (X_e + Y_e)*/ + move32(); + Zp++; + *( Z_ep ) = add( X_e, Y_e[j] ); + move16(); + max_exp = s_max( max_exp, *Z_ep ); // Find the max exp + Z_ep++; + } + } + + Zp = Z; + Z_ep = Z_e; + FOR( j = 0; j < entriesY; ++j ) + { + FOR( i = 0; i < colsX; ++i ) + { + *Zp = L_shr( *Zp, sub( max_exp, *Z_ep ) ); + *Z_ep = max_exp; + Zp++; + Z_ep++; + } + } + } + ELSE /* Regular case */ + { + IF( NE_16( colsX, entriesY ) ) + { + return EXIT_FAILURE; + } + + FOR( j = 0; j < entriesY; ++j ) + { + FOR( i = 0; i < rowsX; ++i ) + { + *( Zp ) = Mpy_32_32( *( X ), Y[j] ); /*Q31 - (X_e + Y_e)*/ + move32(); + Zp++; + *( Z_ep ) = add( X_e, Y_e[j] ); + move16(); + max_exp = s_max( max_exp, *Z_ep ); // Find the max exp + Z_ep++; + X++; + } + } + Zp = Z; + Z_ep = Z_e; + FOR( j = 0; j < entriesY; ++j ) + { + FOR( i = 0; i < rowsX; ++i ) + { + *Zp = L_shr( *Zp, sub( max_exp, *Z_ep ) ); + *Z_ep = max_exp; + Zp++; + Z_ep++; + } + } + } + + return EXIT_SUCCESS; +} +#endif /* OPT_BASOP_ADD_v1 */ + Word16 matrix_diag_product_fx_1( const Word32 *X, /* i : left hand matrix Q31 - X_e*/ const Word16 *X_e, @@ -1663,7 +1837,9 @@ Word16 matrix_diag_product_fx_1( Word16 i, j; Word32 *Zp = Z; Word16 *Z_ep = Z_e; +#ifndef OPT_BASOP_ADD_v1 Word16 tmp; +#endif /* OPT_BASOP_ADD_v1 */ /* Processing */ IF( EQ_16( transpX, 1 ) ) /* We use X transpose */ @@ -1676,11 +1852,19 @@ Word16 matrix_diag_product_fx_1( { FOR( i = 0; i < colsX; ++i ) { +#ifdef OPT_BASOP_ADD_v1 + *( Zp ) = Mpy_32_32( X[j + i * rowsX], Y[j] ); /*Q31 - (X_e + Y_e)*/ +#else /* OPT_BASOP_ADD_v1 */ tmp = add( j, imult1616( i, rowsX ) ); /*Q0*/ *( Zp ) = Mpy_32_32( X[tmp], Y[j] ); /*Q31 - (X_e + Y_e)*/ +#endif /* OPT_BASOP_ADD_v1 */ move32(); Zp++; +#ifdef OPT_BASOP_ADD_v1 + *( Z_ep ) = add( X_e[j + i * rowsX], Y_e[j] ); +#else /* OPT_BASOP_ADD_v1 */ *( Z_ep ) = add( X_e[tmp], Y_e[j] ); +#endif /* OPT_BASOP_ADD_v1 */ move16(); Z_ep++; } @@ -1726,7 +1910,9 @@ Word16 diag_matrix_product_fx( { Word16 i, j; Word32 *Zp = Z; +#ifndef OPT_BASOP_ADD_v1 Word16 tmp; +#endif /* OPT_BASOP_ADD_v1 */ /* Processing */ IF( EQ_16( transpX, 1 ) ) /* We use X transpose */ @@ -1739,8 +1925,12 @@ Word16 diag_matrix_product_fx( { FOR( j = 0; j < entriesY; ++j ) { - tmp = add( i, imult1616( j, rowsX ) ); /*Q0*/ - *( Zp ) = Mpy_32_32( X[tmp], Y[j] ); /*Q31 - (X_e + Y_e)*/ +#ifdef OPT_BASOP_ADD_v1 + *( Zp ) = Mpy_32_32( X[i + j * rowsX], Y[j] ); /*Q31 - (X_e + Y_e)*/ +#else /* OPT_BASOP_ADD_v1 */ + tmp = add( i, imult1616( j, rowsX ) ); /*Q0*/ + *( Zp ) = Mpy_32_32( X[tmp], Y[j] ); /*Q31 - (X_e + Y_e)*/ +#endif /* OPT_BASOP_ADD_v1 */ move32(); Zp++; } @@ -1786,7 +1976,9 @@ Word16 matrix_product_diag_fx( { Word16 j, k; Word32 *Zp = Z; +#ifndef OPT_BASOP_ADD_v1 Word16 y_idx, x_idx; +#endif /* OPT_BASOP_ADD_v1 */ /* Processing */ test(); @@ -1805,9 +1997,13 @@ Word16 matrix_product_diag_fx( move32(); FOR( k = 0; k < rowsX; ++k ) { +#ifdef OPT_BASOP_ADD_v1 + ( *Zp ) = Madd_32_32( ( *Zp ), X[k + j * rowsX], Y[k + j * rowsY] ); /*Q31 - (X_e + Y_e)*/ +#else /* OPT_BASOP_ADD_v1 */ x_idx = add( k, imult1616( j, rowsX ) ); /*Q0*/ y_idx = add( k, imult1616( j, rowsY ) ); /*Q0*/ ( *Zp ) = L_add( ( *Zp ), Mpy_32_32( X[x_idx], Y[y_idx] ) ); /*Q31 - (X_e + Y_e)*/ +#endif /* OPT_BASOP_ADD_v1 */ move32(); } Zp++; @@ -1825,9 +2021,13 @@ Word16 matrix_product_diag_fx( move32(); FOR( k = 0; k < colsX; ++k ) { +#ifdef OPT_BASOP_ADD_v1 + ( *Zp ) = Madd_32_32( ( *Zp ), X[j + k * rowsX], Y[j + k * rowsY] ); /*Q31 - (X_e + Y_e)*/ +#else /* OPT_BASOP_ADD_v1 */ x_idx = add( j, imult1616( k, rowsX ) ); /*Q0*/ y_idx = add( j, imult1616( k, rowsY ) ); /*Q0*/ ( *Zp ) = L_add( ( *Zp ), Mpy_32_32( X[x_idx], Y[y_idx] ) ); /*Q31 - (X_e + Y_e)*/ +#endif /* OPT_BASOP_ADD_v1 */ move32(); } Zp++; @@ -1847,9 +2047,13 @@ Word16 matrix_product_diag_fx( move32(); FOR( k = 0; k < colsX; ++k ) { +#ifdef OPT_BASOP_ADD_v1 + ( *Zp ) = Madd_32_32( ( *Zp ), X[k + j * rowsX], Y[j + k * rowsY] ); /*Q31 - (X_e + Y_e)*/ +#else /* OPT_BASOP_ADD_v1 */ x_idx = add( k, imult1616( j, rowsX ) ); /*Q0*/ y_idx = add( j, imult1616( k, rowsY ) ); /*Q0*/ ( *Zp ) = L_add( ( *Zp ), Mpy_32_32( X[x_idx], Y[y_idx] ) ); /*Q31 - (X_e + Y_e)*/ +#endif /* OPT_BASOP_ADD_v1 */ move32(); } @@ -1869,9 +2073,13 @@ Word16 matrix_product_diag_fx( move32(); FOR( k = 0; k < colsX; ++k ) { +#ifdef OPT_BASOP_ADD_v1 + ( *Zp ) = Madd_32_32( ( *Zp ), X[j + k * rowsX], Y[k + j * rowsY] ); /*Q31 - (X_e + Y_e)*/ +#else /* OPT_BASOP_ADD_v1 */ x_idx = add( j, imult1616( k, rowsX ) ); /*Q0*/ y_idx = add( k, imult1616( j, rowsY ) ); /*Q0*/ ( *Zp ) = L_add( ( *Zp ), Mpy_32_32( X[x_idx], Y[y_idx] ) ); /*Q31 - (X_e + Y_e)*/ +#endif /* OPT_BASOP_ADD_v1 */ move32(); } Zp++; diff --git a/lib_com/options.h b/lib_com/options.h index 59698f78e..b2f44bfe9 100644 --- a/lib_com/options.h +++ b/lib_com/options.h @@ -174,4 +174,5 @@ #define NONBE_FIX_708_OSBA_BR_SWITCHING_CRASH /* FhG: issue 708: fix crash in OSBA BR switching with long test vectors */ //#define OPT_STEREO_32KBPS_V1 /* Optimization made in stereo decoding path for 32kbps decoding */ #define DOT_PROD_CHOLESKY_64BIT /* FhG: Issue 1323, optimized 64 bit implementation of dot_product_cholesky() */ +#define OPT_BASOP_ADD_v1 /* optimizations to avoid usage of BASOP_Util_Add_MantExp */ #endif diff --git a/lib_dec/ivas_binRenderer_internal_fx.c b/lib_dec/ivas_binRenderer_internal_fx.c index 748bdec38..a6e03eee3 100644 --- a/lib_dec/ivas_binRenderer_internal_fx.c +++ b/lib_dec/ivas_binRenderer_internal_fx.c @@ -68,9 +68,17 @@ static void ivas_binRenderer_filterModule_fx( { Word16 bandIdx, k, chIdx, tapIdx; Word32 *filterStatesLeftRealPtr_fx, *filterStatesLeftImagPtr_fx; +#ifdef OPT_BASOP_ADD_v1 + Word16 Q_filterStates; +#else /* OPT_BASOP_ADD_v1 */ Word16 *Q_filterStates; +#endif /* OPT_BASOP_ADD_v1 */ const Word32 *filterTapsLeftRealPtr_fx, *filterTapsLeftImagPtr_fx, *filterTapsRightRealPtr_fx, *filterTapsRightImagPtr_fx; Word16 shift_q; +#ifdef OPT_BASOP_ADD_v1 + Q_filterStates = hBinRenderer->hBinRenConvModule->Q_filterStatesLeft; + move16(); +#endif /* OPT_BASOP_ADD_v1 */ FOR( bandIdx = 0; bandIdx < hBinRenderer->conv_band; bandIdx++ ) { @@ -78,7 +86,9 @@ static void ivas_binRenderer_filterModule_fx( { filterStatesLeftRealPtr_fx = (Word32 *) &( hBinRenderer->hBinRenConvModule->filterStatesLeftReal_fx[bandIdx][chIdx][0] ); filterStatesLeftImagPtr_fx = (Word32 *) &( hBinRenderer->hBinRenConvModule->filterStatesLeftImag_fx[bandIdx][chIdx][0] ); +#ifndef OPT_BASOP_ADD_v1 Q_filterStates = (Word16 *) &( hBinRenderer->hBinRenConvModule->Q_filterStatesLeft[bandIdx][chIdx][0] ); +#endif /* OPT_BASOP_ADD_v1 */ filterTapsLeftRealPtr_fx = hBinRenderer->hBinRenConvModule->filterTapsLeftReal_fx[bandIdx][chIdx]; // Q29 filterTapsLeftImagPtr_fx = hBinRenderer->hBinRenConvModule->filterTapsLeftImag_fx[bandIdx][chIdx]; // Q29 @@ -100,39 +110,59 @@ static void ivas_binRenderer_filterModule_fx( filterStatesLeftImagPtr_fx[tapIdx] = filterStatesLeftImagPtr_fx[tapIdx - 1]; move32(); +#ifndef OPT_BASOP_ADD_v1 shift_q = sub( Q_filterStates[tapIdx], Q_filterStates[tapIdx - 1] ); outRealLeft_fx = W_shr( outRealLeft_fx, shift_q ); outImagLeft_fx = W_shr( outImagLeft_fx, shift_q ); outRealRight_fx = W_shr( outRealRight_fx, shift_q ); outImagRight_fx = W_shr( outImagRight_fx, shift_q ); +#endif /* OPT_BASOP_ADD_v1 */ - outRealLeft_fx = W_mac_32_32( outRealLeft_fx, filterStatesLeftRealPtr_fx[tapIdx], filterTapsLeftRealPtr_fx[tapIdx] ); - outRealLeft_fx = W_mac_32_32( outRealLeft_fx, L_negate( filterStatesLeftImagPtr_fx[tapIdx] ), filterTapsLeftImagPtr_fx[tapIdx] ); // Q30 + Q_filterStates[tapIdx - 1] - - outImagLeft_fx = W_mac_32_32( outImagLeft_fx, filterStatesLeftRealPtr_fx[tapIdx], filterTapsLeftImagPtr_fx[tapIdx] ); - outImagLeft_fx = W_mac_32_32( outImagLeft_fx, filterStatesLeftImagPtr_fx[tapIdx], filterTapsLeftRealPtr_fx[tapIdx] ); + outRealLeft_fx = W_mac_32_32( outRealLeft_fx, filterStatesLeftRealPtr_fx[tapIdx], filterTapsLeftRealPtr_fx[tapIdx] ); // Q30 + Q_filterStates + outRealLeft_fx = W_mac_32_32( outRealLeft_fx, L_negate( filterStatesLeftImagPtr_fx[tapIdx] ), filterTapsLeftImagPtr_fx[tapIdx] ); // Q30 + Q_filterStates - outRealRight_fx = W_mac_32_32( outRealRight_fx, filterStatesLeftRealPtr_fx[tapIdx], filterTapsRightRealPtr_fx[tapIdx] ); - outRealRight_fx = W_mac_32_32( outRealRight_fx, L_negate( filterStatesLeftImagPtr_fx[tapIdx] ), filterTapsRightImagPtr_fx[tapIdx] ); + outImagLeft_fx = W_mac_32_32( outImagLeft_fx, filterStatesLeftRealPtr_fx[tapIdx], filterTapsLeftImagPtr_fx[tapIdx] ); // Q30 + Q_filterStates + outImagLeft_fx = W_mac_32_32( outImagLeft_fx, filterStatesLeftImagPtr_fx[tapIdx], filterTapsLeftRealPtr_fx[tapIdx] ); // Q30 + Q_filterStates - outImagRight_fx = W_mac_32_32( outImagRight_fx, filterStatesLeftRealPtr_fx[tapIdx], filterTapsRightImagPtr_fx[tapIdx] ); - outImagRight_fx = W_mac_32_32( outImagRight_fx, filterStatesLeftImagPtr_fx[tapIdx], filterTapsRightRealPtr_fx[tapIdx] ); + outRealRight_fx = W_mac_32_32( outRealRight_fx, filterStatesLeftRealPtr_fx[tapIdx], filterTapsRightRealPtr_fx[tapIdx] ); // Q30 + Q_filterStates + outRealRight_fx = W_mac_32_32( outRealRight_fx, L_negate( filterStatesLeftImagPtr_fx[tapIdx] ), filterTapsRightImagPtr_fx[tapIdx] ); // Q30 + Q_filterStates + outImagRight_fx = W_mac_32_32( outImagRight_fx, filterStatesLeftRealPtr_fx[tapIdx], filterTapsRightImagPtr_fx[tapIdx] ); // Q30 + Q_filterStates + outImagRight_fx = W_mac_32_32( outImagRight_fx, filterStatesLeftImagPtr_fx[tapIdx], filterTapsRightRealPtr_fx[tapIdx] ); // Q30 + Q_filterStates +#ifndef OPT_BASOP_ADD_v1 Q_filterStates[tapIdx] = Q_filterStates[tapIdx - 1]; move16(); +#endif /* OPT_BASOP_ADD_v1 */ } + +#ifdef OPT_BASOP_ADD_v1 + shift_q = add( sub( Q_filterStates, Q_curr ), 1 ); +#else /* OPT_BASOP_ADD_v1 */ shift_q = add( sub( Q_filterStates[1], Q_curr ), 1 ); - outRealLeft_fx = W_shr( outRealLeft_fx, shift_q ); - outImagLeft_fx = W_shr( outImagLeft_fx, shift_q ); - outRealRight_fx = W_shr( outRealRight_fx, shift_q ); - outImagRight_fx = W_shr( outImagRight_fx, shift_q ); +#endif /* OPT_BASOP_ADD_v1 */ + +#ifdef OPT_BASOP_ADD_v1 + IF( shift_q != 0 ) + { +#endif /* OPT_BASOP_ADD_v1 */ + outRealLeft_fx = W_shr( outRealLeft_fx, shift_q ); // Q_curr + outImagLeft_fx = W_shr( outImagLeft_fx, shift_q ); // Q_curr + outRealRight_fx = W_shr( outRealRight_fx, shift_q ); // Q_curr + outImagRight_fx = W_shr( outImagRight_fx, shift_q ); // Q_curr +#ifdef OPT_BASOP_ADD_v1 + hBinRenderer->hBinRenConvModule->Q_filterStatesLeft = Q_curr; + move16(); + } +#endif /* OPT_BASOP_ADD_v1 */ filterStatesLeftRealPtr_fx[0] = CLDFB_real[chIdx][k][bandIdx]; move32(); filterStatesLeftImagPtr_fx[0] = CLDFB_imag[chIdx][k][bandIdx]; move32(); +#ifndef OPT_BASOP_ADD_v1 Q_filterStates[0] = Q_curr; move16(); +#endif /* OPT_BASOP_ADD_v1 */ /* Left Real and Imag */ @@ -318,10 +348,12 @@ static ivas_error ivas_binRenderer_convModuleOpen( return ( IVAS_ERROR( IVAS_ERR_FAILED_ALLOC, "Can not allocate memory for Convolution Module \n" ) ); } +#ifndef OPT_BASOP_ADD_v1 IF( ( hBinRenConvModule->Q_filterStatesLeft = (Word16 ***) malloc( hBinRenderer->conv_band * sizeof( Word16 ** ) ) ) == NULL ) { return ( IVAS_ERROR( IVAS_ERR_FAILED_ALLOC, "Can not allocate memory for Convolution Module \n" ) ); } +#endif /* OPT_BASOP_ADD_v1 */ FOR( bandIdx = 0; bandIdx < hBinRenderer->conv_band; bandIdx++ ) { @@ -335,10 +367,12 @@ static ivas_error ivas_binRenderer_convModuleOpen( return ( IVAS_ERROR( IVAS_ERR_FAILED_ALLOC, "Can not allocate memory for Convolution Module \n" ) ); } +#ifndef OPT_BASOP_ADD_v1 IF( ( hBinRenConvModule->Q_filterStatesLeft[bandIdx] = (Word16 **) malloc( hBinRenderer->nInChannels * sizeof( Word16 * ) ) ) == NULL ) { return ( IVAS_ERROR( IVAS_ERR_FAILED_ALLOC, "Can not allocate memory for Convolution Module \n" ) ); } +#endif /* OPT_BASOP_ADD_v1 */ FOR( chIdx = 0; chIdx < hBinRenderer->nInChannels; chIdx++ ) { @@ -352,10 +386,12 @@ static ivas_error ivas_binRenderer_convModuleOpen( return ( IVAS_ERROR( IVAS_ERR_FAILED_ALLOC, "Can not allocate memory for Convolution Module \n" ) ); } +#ifndef OPT_BASOP_ADD_v1 IF( ( hBinRenConvModule->Q_filterStatesLeft[bandIdx][chIdx] = (Word16 *) malloc( hBinRenConvModule->numTapsArray[bandIdx] * sizeof( Word16 ) ) ) == NULL ) { return ( IVAS_ERROR( IVAS_ERR_FAILED_ALLOC, "Can not allocate memory for Convolution Module \n" ) ); } +#endif /* OPT_BASOP_ADD_v1 */ } } /* set memories */ @@ -400,7 +436,12 @@ static ivas_error ivas_binRenderer_convModuleOpen( /* set the memories to zero */ set32_fx( hBinRenConvModule->filterStatesLeftReal_fx[bandIdx][chIdx], 0, hBinRenConvModule->numTapsArray[bandIdx] ); set32_fx( hBinRenConvModule->filterStatesLeftImag_fx[bandIdx][chIdx], 0, hBinRenConvModule->numTapsArray[bandIdx] ); +#ifdef OPT_BASOP_ADD_v1 + hBinRenConvModule->Q_filterStatesLeft = 31; + move16(); +#else /* OPT_BASOP_ADD_v1 */ set16_fx( hBinRenConvModule->Q_filterStatesLeft[bandIdx][chIdx], 31, hBinRenConvModule->numTapsArray[bandIdx] ); +#endif /* OPT_BASOP_ADD_v1 */ IF( isLoudspeaker ) { hBinRenConvModule->filterTapsLeftReal_fx[bandIdx][chIdx] = hHrtf->leftBRIRReal_fx[bandIdx][tmp]; @@ -414,7 +455,12 @@ static ivas_error ivas_binRenderer_convModuleOpen( /* set the memories to zero */ set32_fx( hBinRenConvModule->filterStatesLeftReal_fx[bandIdx][chIdx], 0, hBinRenConvModule->numTaps ); set32_fx( hBinRenConvModule->filterStatesLeftImag_fx[bandIdx][chIdx], 0, hBinRenConvModule->numTaps ); +#ifdef OPT_BASOP_ADD_v1 + hBinRenConvModule->Q_filterStatesLeft = 31; + move16(); +#else /* OPT_BASOP_ADD_v1 */ set16_fx( hBinRenConvModule->Q_filterStatesLeft[bandIdx][chIdx], 31, hBinRenConvModule->numTaps ); +#endif /* OPT_BASOP_ADD_v1 */ IF( isLoudspeaker ) { hBinRenConvModule->filterTapsLeftReal_fx[bandIdx][chIdx] = hHrtf->leftHRIRReal_fx[bandIdx][tmp]; @@ -1279,8 +1325,10 @@ static void ivas_binRenderer_convModuleClose_fx( free( hBinRenConvModule->filterStatesLeftImag_fx[bandIdx][chIdx] ); hBinRenConvModule->filterStatesLeftImag_fx[bandIdx][chIdx] = NULL; +#ifndef OPT_BASOP_ADD_v1 free( hBinRenConvModule->Q_filterStatesLeft[bandIdx][chIdx] ); hBinRenConvModule->Q_filterStatesLeft[bandIdx][chIdx] = NULL; +#endif /* OPT_BASOP_ADD_v1 */ } free( hBinRenConvModule->filterStatesLeftReal_fx[bandIdx] ); @@ -1289,8 +1337,10 @@ static void ivas_binRenderer_convModuleClose_fx( free( hBinRenConvModule->filterStatesLeftImag_fx[bandIdx] ); hBinRenConvModule->filterStatesLeftImag_fx[bandIdx] = NULL; +#ifndef OPT_BASOP_ADD_v1 free( hBinRenConvModule->Q_filterStatesLeft[bandIdx] ); hBinRenConvModule->Q_filterStatesLeft[bandIdx] = NULL; +#endif /* OPT_BASOP_ADD_v1 */ } free( hBinRenConvModule->filterStatesLeftReal_fx ); @@ -1299,8 +1349,10 @@ static void ivas_binRenderer_convModuleClose_fx( free( hBinRenConvModule->filterStatesLeftImag_fx ); hBinRenConvModule->filterStatesLeftImag_fx = NULL; +#ifndef OPT_BASOP_ADD_v1 free( hBinRenConvModule->Q_filterStatesLeft ); hBinRenConvModule->Q_filterStatesLeft = NULL; +#endif /* OPT_BASOP_ADD_v1 */ free( ( *hBinRenderer )->hBinRenConvModule ); ( *hBinRenderer )->hBinRenConvModule = NULL; diff --git a/lib_dec/ivas_dirac_output_synthesis_cov_fx.c b/lib_dec/ivas_dirac_output_synthesis_cov_fx.c index 13d8fc940..afbde5a60 100644 --- a/lib_dec/ivas_dirac_output_synthesis_cov_fx.c +++ b/lib_dec/ivas_dirac_output_synthesis_cov_fx.c @@ -725,7 +725,11 @@ Word16 computeMixingMatrices_fx( Word32 G_hat_fx[MAX_OUTPUT_CHANNELS]; Word16 G_hat_buff_e[MAX_OUTPUT_CHANNELS]; +#ifdef OPT_BASOP_ADD_v1 + Word16 mat_mult_buffer2_e, mat_mult_buffer3_e; +#else /* OPT_BASOP_ADD_v1 */ Word16 mat_mult_buffer1_e, mat_mult_buffer2_e, mat_mult_buffer3_e; +#endif /* OPT_BASOP_ADD_v1 */ Word32 mat_mult_buffer3_fx[MAX_OUTPUT_CHANNELS * MAX_OUTPUT_CHANNELS]; @@ -775,7 +779,9 @@ Word16 computeMixingMatrices_fx( mat2svdMat_fx( Cy_fx, svd_in_buffer_fx, lengthCy, lengthCy, 0 ); svd_fx( svd_in_buffer_fx, Cy_fx_e, svd_u_buffer_fx, svd_s_buffer_fx, svd_v_buffer_fx, svd_s_buffer_e, lengthCy, lengthCy ); - +#ifdef OPT_BASOP_ADD_v1 + Word16 max_e = -32; +#endif /* OPT_BASOP_ADD_v1 */ /* Computing Ky */ FOR( i = 0; i < lengthCy; ++i ) { @@ -788,8 +794,20 @@ Word16 computeMixingMatrices_fx( move32(); Ky_fx_e[i + ( j * lengthCy )] = tmp_e; move16(); +#ifdef OPT_BASOP_ADD_v1 + max_e = s_max( max_e, tmp_e ); +#endif /* OPT_BASOP_ADD_v1 */ } } +#ifdef OPT_BASOP_ADD_v1 + FOR( i = 0; i < lengthCy * lengthCy; ++i ) + { + Ky_fx[i] = L_shr( Ky_fx[i], sub( max_e, Ky_fx_e[i] ) ); + move32(); + Ky_fx_e[i] = max_e; + move16(); + } +#endif /* OPT_BASOP_ADD_v1 */ /*-----------------------------------------------------------------* * Decomposition of Cx @@ -800,7 +818,9 @@ Word16 computeMixingMatrices_fx( mat2svdMat_fx( Cx_fx, svd_in_buffer_fx, lengthCx, lengthCx, 0 ); svd_fx( svd_in_buffer_fx, Cx_fx_e, svd_u_buffer_fx, svd_s_buffer_fx, svd_v_buffer_fx, svd_s_buffer_e, lengthCx, lengthCx ); - +#ifdef OPT_BASOP_ADD_v1 + max_e = -32; +#endif /* OPT_BASOP_ADD_v1 */ FOR( i = 0; i < lengthCx; ++i ) { FOR( j = 0; j < lengthCx; ++j ) @@ -812,9 +832,20 @@ Word16 computeMixingMatrices_fx( move32(); Kx_fx_e[( i + ( j * lengthCx ) )] = tmp_e; move16(); +#ifdef OPT_BASOP_ADD_v1 + max_e = s_max( max_e, tmp_e ); +#endif /* OPT_BASOP_ADD_v1 */ } } - +#ifdef OPT_BASOP_ADD_v1 + FOR( i = 0; i < lengthCx * lengthCx; ++i ) + { + Kx_fx[i] = L_shr( Kx_fx[i], sub( max_e, Kx_fx_e[i] ) ); + move32(); + Kx_fx_e[i] = max_e; + move16(); + } +#endif /* OPT_BASOP_ADD_v1 */ FOR( i = 0; i < lengthCx; ++i ) { @@ -938,14 +969,25 @@ Word16 computeMixingMatrices_fx( /* Computing the input matrix Kx'*Q'*G_hat'*Ky */ +#ifdef OPT_BASOP_ADD_v1 + Word16 mat_mult_buffer1_fx_e; +#else /* OPT_BASOP_ADD_v1 */ Word16 mat_mult_buffer1_fx_e[MAX_OUTPUT_CHANNELS * MAX_OUTPUT_CHANNELS]; Word16 Q_e_arr[PARAM_MC_MAX_TRANSPORT_CHANS * MAX_CICP_CHANNELS]; set16_fx( Q_e_arr, Q_e, PARAM_MC_MAX_TRANSPORT_CHANS * MAX_CICP_CHANNELS ); matrix_product_mant_exp( Kx_fx, Kx_fx_e, lengthCx, lengthCx, 1, Q_fx, Q_e_arr, lengthCy, lengthCx, 1, mat_mult_buffer1_fx, mat_mult_buffer1_fx_e ); +#endif /* OPT_BASOP_ADD_v1 */ Word16 mat_mult_buffer2_fx_e[MAX_OUTPUT_CHANNELS * MAX_OUTPUT_CHANNELS]; +#ifdef OPT_BASOP_ADD_v1 + matrix_product_mant_exp_fx( Kx_fx, Kx_fx_e[0], lengthCx, lengthCx, 1, Q_fx, Q_e, lengthCy, lengthCx, 1, mat_mult_buffer1_fx, &mat_mult_buffer1_fx_e ); + + matrix_diag_product_fx_2( mat_mult_buffer1_fx, mat_mult_buffer1_fx_e, lengthCx, lengthCy, 0, G_hat_fx, G_hat_buff_e, lengthCy, mat_mult_buffer2_fx, mat_mult_buffer2_fx_e ); + + matrix_product_mant_exp_fx( mat_mult_buffer2_fx, mat_mult_buffer2_fx_e[0], lengthCx, lengthCy, 0, Ky_fx, Ky_fx_e[0], lengthCy, lengthCy, 0, mat_mult_buffer1_fx, &mat_mult_buffer1_fx_e ); +#else /* OPT_BASOP_ADD_v1 */ matrix_diag_product_fx_1( mat_mult_buffer1_fx, mat_mult_buffer1_fx_e, lengthCx, lengthCy, 0, G_hat_fx, G_hat_buff_e, lengthCy, mat_mult_buffer2_fx, mat_mult_buffer2_fx_e ); matrix_product_mant_exp( mat_mult_buffer2_fx, mat_mult_buffer2_fx_e, lengthCx, lengthCy, 0, Ky_fx, Ky_fx_e, lengthCy, lengthCy, 0, mat_mult_buffer1_fx, mat_mult_buffer1_fx_e ); @@ -969,6 +1011,7 @@ Word16 computeMixingMatrices_fx( mat_mult_buffer1_e = exp; move16(); +#endif /* OPT_BASOP_ADD_v1 */ IF( LT_16( lengthCx, lengthCy ) ) { @@ -977,7 +1020,11 @@ Word16 computeMixingMatrices_fx( move16(); nC = lengthCx; move16(); +#ifdef OPT_BASOP_ADD_v1 + svd_fx( svd_in_buffer_fx, mat_mult_buffer1_fx_e, svd_v_buffer_fx, svd_s_buffer_fx, svd_u_buffer_fx, svd_s_buffer_e, nL, nC ); +#else /* OPT_BASOP_ADD_v1 */ svd_fx( svd_in_buffer_fx, mat_mult_buffer1_e, svd_v_buffer_fx, svd_s_buffer_fx, svd_u_buffer_fx, svd_s_buffer_e, nL, nC ); +#endif /* OPT_BASOP_ADD_v1 */ } ELSE { @@ -986,7 +1033,11 @@ Word16 computeMixingMatrices_fx( move16(); nC = lengthCy; move16(); +#ifdef OPT_BASOP_ADD_v1 + svd_fx( svd_in_buffer_fx, mat_mult_buffer1_fx_e, svd_u_buffer_fx, svd_s_buffer_fx, svd_v_buffer_fx, svd_s_buffer_e, nL, nC ); +#else /* OPT_BASOP_ADD_v1 */ svd_fx( svd_in_buffer_fx, mat_mult_buffer1_e, svd_u_buffer_fx, svd_s_buffer_fx, svd_v_buffer_fx, svd_s_buffer_e, nL, nC ); +#endif /* OPT_BASOP_ADD_v1 */ } /* Actually Processing P */ @@ -997,25 +1048,46 @@ Word16 computeMixingMatrices_fx( svdMat2mat_fx( svd_v_buffer_fx, mat_mult_buffer1_fx, lengthCy, lengthCx ); svdMat2mat_fx( svd_u_buffer_fx, mat_mult_buffer2_fx, lengthCx, lengthCx ); +#ifdef OPT_BASOP_ADD_v1 + mat_mult_buffer1_fx_e = 0; +#else /* OPT_BASOP_ADD_v1 */ mat_mult_buffer1_e = 0; +#endif /* OPT_BASOP_ADD_v1 */ move16(); mat_mult_buffer2_e = 0; move16(); +#ifdef OPT_BASOP_ADD_v1 + matrix_product_mant_exp_fx( mat_mult_buffer1_fx, mat_mult_buffer1_fx_e, lengthCy, lengthCx, 0, + mat_mult_buffer2_fx, mat_mult_buffer2_e, lengthCx, lengthCx, 1, + mat_mult_buffer3_fx, &mat_mult_buffer3_e ); +#else /* OPT_BASOP_ADD_v1 */ matrix_product_mant_exp_fx( mat_mult_buffer1_fx, mat_mult_buffer1_e, lengthCy, lengthCx, 0, mat_mult_buffer2_fx, mat_mult_buffer2_e, lengthCx, lengthCx, 1, mat_mult_buffer3_fx, &mat_mult_buffer3_e ); +#endif /* OPT_BASOP_ADD_v1 */ /************************ Formulate M **********************/ +#ifdef OPT_BASOP_ADD_v1 + matrix_product_mant_exp_fx( Ky_fx, Ky_fx_e[0], lengthCy, lengthCy, 0, mat_mult_buffer3_fx, mat_mult_buffer3_e, lengthCy, lengthCx, 0, mat_mult_buffer1_fx, &mat_mult_buffer1_fx_e ); +#else /* OPT_BASOP_ADD_v1 */ Word16 mat_mult_buffer3_fx_e[MAX_OUTPUT_CHANNELS * MAX_OUTPUT_CHANNELS]; set16_fx( mat_mult_buffer3_fx_e, mat_mult_buffer3_e, MAX_OUTPUT_CHANNELS * MAX_OUTPUT_CHANNELS ); matrix_product_mant_exp( Ky_fx, Ky_fx_e, lengthCy, lengthCy, 0, mat_mult_buffer3_fx, mat_mult_buffer3_fx_e, lengthCy, lengthCx, 0, mat_mult_buffer1_fx, mat_mult_buffer1_fx_e ); +#endif /* OPT_BASOP_ADD_v1 */ Word16 mixing_matrix_fx_e[MAX_CICP_CHANNELS * PARAM_MC_MAX_TRANSPORT_CHANS]; +#ifdef OPT_BASOP_ADD_v1 + Word16 mat_mult_buffer1_fx_e1[MAX_OUTPUT_CHANNELS * MAX_OUTPUT_CHANNELS]; + set16_fx( mat_mult_buffer1_fx_e1, mat_mult_buffer1_fx_e, MAX_OUTPUT_CHANNELS * MAX_OUTPUT_CHANNELS ); + + matrix_product_mant_exp( mat_mult_buffer1_fx, mat_mult_buffer1_fx_e1, lengthCy, lengthCx, 0, Kx_reg_inv_fx, Kx_reg_inv_e, lengthCx, lengthCx, 0, mixing_matrix_fx, mixing_matrix_fx_e ); +#else /* OPT_BASOP_ADD_v1 */ matrix_product_mant_exp( mat_mult_buffer1_fx, mat_mult_buffer1_fx_e, lengthCy, lengthCx, 0, Kx_reg_inv_fx, Kx_reg_inv_e, lengthCx, lengthCx, 0, mixing_matrix_fx, mixing_matrix_fx_e ); +#endif /* OPT_BASOP_ADD_v1 */ /*-----------------------------------------------------------------* * Formulate Cr @@ -1026,9 +1098,15 @@ Word16 computeMixingMatrices_fx( Word16 Cx_e_arr[PARAM_MC_MAX_TRANSPORT_CHANS * PARAM_MC_MAX_TRANSPORT_CHANS]; set16_fx( Cx_e_arr, Cx_fx_e, PARAM_MC_MAX_TRANSPORT_CHANS * PARAM_MC_MAX_TRANSPORT_CHANS ); +#ifdef OPT_BASOP_ADD_v1 + matrix_product_mant_exp( mixing_matrix_fx, mixing_matrix_fx_e, lengthCy, lengthCx, 0, Cx_fx, Cx_e_arr, lengthCx, lengthCx, 0, mat_mult_buffer1_fx, mat_mult_buffer1_fx_e1 ); + + matrix_product_mant_exp( mat_mult_buffer1_fx, mat_mult_buffer1_fx_e1, lengthCy, lengthCx, 0, mixing_matrix_fx, mixing_matrix_fx_e, lengthCy, lengthCx, 1, mat_mult_buffer2_fx, mat_mult_buffer2_fx_e ); +#else /* OPT_BASOP_ADD_v1 */ matrix_product_mant_exp( mixing_matrix_fx, mixing_matrix_fx_e, lengthCy, lengthCx, 0, Cx_fx, Cx_e_arr, lengthCx, lengthCx, 0, mat_mult_buffer1_fx, mat_mult_buffer1_fx_e ); matrix_product_mant_exp( mat_mult_buffer1_fx, mat_mult_buffer1_fx_e, lengthCy, lengthCx, 0, mixing_matrix_fx, mixing_matrix_fx_e, lengthCy, lengthCx, 1, mat_mult_buffer2_fx, mat_mult_buffer2_fx_e ); +#endif /* OPT_BASOP_ADD_v1 */ exp = mixing_matrix_fx_e[0]; move16(); @@ -1067,7 +1145,11 @@ Word16 computeMixingMatrices_fx( } /* Avoid Meaningless negative main diagonal elements */ +#ifdef OPT_BASOP_ADD_v1 + IF( Cr_fx[i + ( i * lengthCy )] < 0 ) +#else /* OPT_BASOP_ADD_v1 */ IF( BASOP_Util_Cmp_Mant32Exp( Cr_fx[i + ( i * lengthCy )], exp, 0, 0 ) < 0 ) +#endif /* OPT_BASOP_ADD_v1 */ { Cr_fx[i + ( i * lengthCy )] = 0; move32(); @@ -1129,7 +1211,11 @@ Word16 computeMixingMatrices_fx( { /* Avoid correction for very small energies, main diagonal elements of Cy_tilde_p may be negative */ +#ifdef OPT_BASOP_ADD_v1 + IF( Cy_tilde_p_fx[i + ( i * lengthCy )] < 0 ) +#else /* OPT_BASOP_ADD_v1 */ IF( BASOP_Util_Cmp_Mant32Exp( Cy_tilde_p_fx[i + ( i * lengthCy )], mat_mult_buffer2_e, 0, 0 ) < 0 ) +#endif /* OPT_BASOP_ADD_v1 */ { adj_fx_p[i] = 1073741824; // 1.0f in Q30 move32(); @@ -1148,7 +1234,12 @@ Word16 computeMixingMatrices_fx( move16(); } +#ifdef OPT_BASOP_ADD_v1 + Word32 temp = W_shl_sat_l( W_deposit32_l( 4 ), sub( 31, adj_e[i] ) ); + IF( GT_32( adj_fx_p[i], temp ) ) +#else /* OPT_BASOP_ADD_v1 */ IF( BASOP_Util_Cmp_Mant32Exp( adj_fx_p[i], adj_e[i], 1073741824, 3 ) > 0 ) +#endif /* OPT_BASOP_ADD_v1 */ { adj_fx_p[i] = 1073741824; // 1.0f in Q30 move32(); @@ -1281,6 +1372,9 @@ Word16 computeMixingMatricesResidual_fx( svd_fx( svd_in_buffer_fx, Cy_fx_e, svd_u_buffer_fx, svd_s_buffer_fx, svd_v_buffer_fx, svd_s_buffer_e, lengthCy, lengthCy ); /* Computing Ky */ +#ifdef OPT_BASOP_ADD_v1 + Word16 max_e = -32; +#endif /* OPT_BASOP_ADD_v1 */ FOR( i = 0; i < lengthCy; ++i ) { FOR( j = 0; j < lengthCy; ++j ) @@ -1292,9 +1386,22 @@ Word16 computeMixingMatricesResidual_fx( move32(); Ky_fx_e[i + j * lengthCy] = tmp_e; move16(); +#ifdef OPT_BASOP_ADD_v1 + max_e = s_max( max_e, tmp_e ); +#endif /* OPT_BASOP_ADD_v1 */ } } +#ifdef OPT_BASOP_ADD_v1 + FOR( i = 0; i < lengthCy * lengthCy; ++i ) + { + Ky_fx[i] = L_shr( Ky_fx[i], sub( max_e, Ky_fx_e[i] ) ); + move32(); + Ky_fx_e[i] = max_e; + move16(); + } +#endif /* OPT_BASOP_ADD_v1 */ + /*-----------------------------------------------------------------* * Decomposition of Cx *-----------------------------------------------------------------*/ @@ -1305,7 +1412,9 @@ Word16 computeMixingMatricesResidual_fx( * square root of the diagonal of Cx */ /* Computing Kx */ - +#ifdef OPT_BASOP_ADD_v1 + max_e = -32; +#endif /* OPT_BASOP_ADD_v1 */ FOR( i = 0; i < lengthCx; ++i ) { exp = Cx_e; @@ -1314,7 +1423,20 @@ Word16 computeMixingMatricesResidual_fx( move32(); Kx_fx_e[i] = exp; move16(); +#ifdef OPT_BASOP_ADD_v1 + max_e = s_max( max_e, exp ); +#endif /* OPT_BASOP_ADD_v1 */ + } + +#ifdef OPT_BASOP_ADD_v1 + FOR( i = 0; i < lengthCx; ++i ) + { + Kx_fx[i] = L_shr( Kx_fx[i], sub( max_e, Kx_fx_e[i] ) ); + move32(); + Kx_fx_e[i] = max_e; + move16(); } +#endif /* OPT_BASOP_ADD_v1 */ /*-----------------------------------------------------------------* * Regularization of Sx @@ -1322,16 +1444,25 @@ Word16 computeMixingMatricesResidual_fx( limit_fx = Kx_fx[0]; move32(); +#ifndef OPT_BASOP_ADD_v1 limit_e = Kx_fx_e[0]; move16(); +#endif /* OPT_BASOP_ADD_v1 */ + FOR( i = 1; i < lengthCx; i++ ) { +#ifdef OPT_BASOP_ADD_v1 + IF( GT_32( Kx_fx[i], limit_fx ) ) +#else /* OPT_BASOP_ADD_v1 */ IF( BASOP_Util_Cmp_Mant32Exp( Kx_fx[i], Kx_fx_e[i], limit_fx, limit_e ) > 0 ) +#endif /* OPT_BASOP_ADD_v1 */ { limit_fx = Kx_fx[i]; move32(); +#ifndef OPT_BASOP_ADD_v1 limit_e = Kx_fx_e[i]; move16(); +#endif /* OPT_BASOP_ADD_v1 */ } } @@ -1339,7 +1470,11 @@ Word16 computeMixingMatricesResidual_fx( L_tmp = L_add( L_tmp, EPSILLON_FX ); limit_fx = L_tmp; move16(); +#ifdef OPT_BASOP_ADD_v1 + limit_e = add( Kx_fx_e[0], reg_Sx_e ); +#else /* OPT_BASOP_ADD_v1 */ limit_e = add( limit_e, reg_Sx_e ); +#endif /* OPT_BASOP_ADD_v1 */ FOR( i = 0; i < lengthCx; ++i ) { @@ -1488,10 +1623,16 @@ Word16 computeMixingMatricesResidual_fx( * Formulate M *-----------------------------------------------------------------*/ + +#ifdef OPT_BASOP_ADD_v1 + matrix_product_mant_exp_fx( Ky_fx, Ky_fx_e[0], lengthCy, lengthCy, 0, mat_mult_buffer3_fx, mat_mult_buffer3_e, lengthCy, lengthCx, 0, mat_mult_buffer1_fx, mat_mult_buffer1_buff_e ); + set16_fx( mat_mult_buffer1_buff_e, mat_mult_buffer1_buff_e[0], MAX_OUTPUT_CHANNELS * MAX_OUTPUT_CHANNELS ); +#else /* OPT_BASOP_ADD_v1 */ Word16 mat_mult_buffer3_fx_e[MAX_OUTPUT_CHANNELS * MAX_OUTPUT_CHANNELS]; set16_fx( mat_mult_buffer3_fx_e, mat_mult_buffer3_e, MAX_OUTPUT_CHANNELS * MAX_OUTPUT_CHANNELS ); matrix_product_mant_exp( Ky_fx, Ky_fx_e, lengthCy, lengthCy, 0, mat_mult_buffer3_fx, mat_mult_buffer3_fx_e, lengthCy, lengthCx, 0, mat_mult_buffer1_fx, mat_mult_buffer1_buff_e ); +#endif /* OPT_BASOP_ADD_v1 */ Word16 mixing_matrix_fx_e[MAX_CICP_CHANNELS * MAX_CICP_CHANNELS]; @@ -1576,7 +1717,12 @@ Word16 computeMixingMatricesResidual_fx( move32(); adj_buff_e[i] = scale; move16(); +#ifdef OPT_BASOP_ADD_v1 + Word32 temp = W_shl_sat_l( W_deposit32_l( 4 ), sub( 31, scale ) ); + IF( GT_32( adj_fx_p[i], temp ) ) // 1073741824 -> 1.0f in Q30 +#else /* OPT_BASOP_ADD_v1 */ IF( BASOP_Util_Cmp_Mant32Exp( adj_fx_p[i], scale, 1073741824, 3 ) > 0 ) // 1073741824 -> 1.0f in Q30 +#endif /* OPT_BASOP_ADD_v1 */ { adj_fx_p[i] = 1073741824; // 1.0f in Q30 move32(); @@ -1971,7 +2117,12 @@ Word16 computeMixingMatricesISM_fx( } } +#ifdef OPT_BASOP_ADD_v1 + Word32 temp = W_shl_sat_l( W_deposit32_l( 4 ), sub( 31, temp_e[i] ) ); + IF( GT_32( adj_fx[i], temp ) ) +#else /* OPT_BASOP_ADD_v1 */ IF( BASOP_Util_Cmp_Mant32Exp( adj_fx[i], temp_e[i], MAX_32, 2 ) > 0 ) +#endif /* OPT_BASOP_ADD_v1 */ { adj_fx[i] = MAX_32; move32(); diff --git a/lib_dec/ivas_mc_param_dec_fx.c b/lib_dec/ivas_mc_param_dec_fx.c index 5a948bdfa..4c1d5d190 100644 --- a/lib_dec/ivas_mc_param_dec_fx.c +++ b/lib_dec/ivas_mc_param_dec_fx.c @@ -2931,7 +2931,11 @@ static void ivas_param_mc_get_mixing_matrices_fx( FOR( ch_idx1 = 0; ch_idx1 < nY_band; ch_idx1++ ) { +#ifdef OPT_BASOP_ADD_v1 + if ( Cproto_diag_fx[ch_idx1] < 0 ) +#else /* OPT_BASOP_ADD_v1 */ if ( BASOP_Util_Cmp_Mant32Exp( Cproto_diag_fx[ch_idx1], Cproto_diag_e, 0, 0 ) < 0 ) +#endif /* OPT_BASOP_ADD_v1 */ { Cproto_diag_fx[ch_idx1] = 0; move16(); diff --git a/lib_rend/ivas_stat_rend.h b/lib_rend/ivas_stat_rend.h index 44207c79a..386644ab1 100644 --- a/lib_rend/ivas_stat_rend.h +++ b/lib_rend/ivas_stat_rend.h @@ -666,7 +666,11 @@ typedef struct ivas_binaural_rendering_conv_module_struct_fx Word32 ***filterStatesLeftReal_fx; Word32 ***filterStatesLeftImag_fx; +#ifdef OPT_BASOP_ADD_v1 + Word16 Q_filterStatesLeft; +#else /* OPT_BASOP_ADD_v1 */ Word16 ***Q_filterStatesLeft; +#endif /* OPT_BASOP_ADD_v1 */ Word16 numTapsArray[BINAURAL_CONVBANDS]; Word16 numTaps; -- GitLab