Commit a246ccfe authored by Sandesh Venkatesh
Browse files

Merge branch 'sba_enc_opt_2' into 'main'

Bit-exact optimization changes for the SBA encoder path

See merge request !1796
parents 366c8fbb 3d860400
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -70,9 +70,14 @@ Word32 Interpol_lc_fx( /* o : interpolated value
            c2 += up_samp; /* move16() not needed, since the coefficient can be rearrange in bit exact way */
            c1 += up_samp;
        }
#ifdef OPT_SBA_ENC_V2_BE
        L_sum = W_shl_sat_l( L_sum64, 1 ); /*Q15*/
    }
#else
        L_sum = W_sat_l( L_sum64 ); /*Q14*/
    }
    L_sum = L_shl_sat( L_sum, 1 ); /*Q15*/
#endif
    return L_sum;
}

+87 −1
Original line number Diff line number Diff line
@@ -2361,18 +2361,30 @@ static void ivas_calc_p_coeffs_per_band_enc_fx(
                Word32 re1, re2;

                W_tmp = W_mult0_32_32( pSparMd->band_coeffs[b_ts_idx].C_re_fx[0][0], cov_dd_re[0][0] ); /*q_cov_dd_re+ pSparMd->band_coeffs[b_ts_idx].q_C_re_fx*/
#ifdef OPT_SBA_ENC_V2_BE
                q_tmp1 = sub( W_norm( W_tmp ), 32 );
                re1 = W_shl_sat_l( W_tmp, q_tmp1 ); /*q_cov_dd_re+ q_C_re+q_tmp1*/
                q_tmp1 = add( add( q_C_re, q_tmp1 ), q_cov_dd_re );
#else
                q_tmp1 = W_norm( W_tmp );
                re1 = W_extract_h( W_shl( W_tmp, q_tmp1 ) ); /*q_cov_dd_re+ q_C_re+q_tmp1-32*/
                q_tmp1 = sub( add( add( q_C_re, q_tmp1 ), q_cov_dd_re ), 32 );
#endif
                if ( W_tmp == 0 )
                {
                    q_tmp1 = 31;
                    move16();
                }
                W_tmp = W_mult0_32_32( pSparMd->band_coeffs[b_ts_idx].C_re_fx[1][0], cov_dd_re[0][0] ); /*q_cov_dd_re+ q_C_re*/
#ifdef OPT_SBA_ENC_V2_BE
                q_tmp = sub( W_norm( W_tmp ), 32 );
                re2 = W_shl_sat_l( W_tmp, q_tmp ); /*q_cov_dd_re+ q_C_re+q_tmp*/
                q_tmp = add( add( q_C_re, q_tmp ), q_cov_dd_re );
#else
                q_tmp = W_norm( W_tmp );
                re2 = W_extract_h( W_shl( W_tmp, q_tmp ) ); /*q_cov_dd_re+ q_C_re+q_tmp-32*/
                q_tmp = sub( add( add( q_C_re, q_tmp ), q_cov_dd_re ), 32 );
#endif
                if ( W_tmp == 0 )
                {
                    q_tmp = 31;
@@ -2380,12 +2392,20 @@ static void ivas_calc_p_coeffs_per_band_enc_fx(
                }

                W_tmp = W_mult0_32_32( pSparMd->band_coeffs[b_ts_idx].C_re_fx[0][0], re1 ); // q_tmp1+q_C_re
#ifdef OPT_SBA_ENC_V2_BE
                q_factor = sub( W_norm( W_tmp ), 32 );
                recon_uu_re[0][0] = W_shl_sat_l( W_tmp, q_factor ); // q_tmp1+q_C_re+q_recon_uu_re[0][0]
                move32();
                q_recon_uu_re[0][0] = add( add( q_C_re, q_factor ), q_tmp1 );
                move16();
#else
                q_recon_uu_re[0][0] = W_norm( W_tmp );
                move16();
                recon_uu_re[0][0] = W_extract_h( W_shl( W_tmp, q_recon_uu_re[0][0] ) ); // q_tmp1+q_C_re+q_recon_uu_re[0][0]-32
                move32();
                q_recon_uu_re[0][0] = sub( add( add( q_C_re, q_recon_uu_re[0][0] ), q_tmp1 ), 32 );
                move16();
#endif
                if ( W_tmp == 0 )
                {
                    q_recon_uu_re[0][0] = 31;
@@ -2393,12 +2413,20 @@ static void ivas_calc_p_coeffs_per_band_enc_fx(
                }

                W_tmp = W_mult0_32_32( pSparMd->band_coeffs[b_ts_idx].C_re_fx[1][0], re1 ); // q_C_re+q_tmp1
#ifdef OPT_SBA_ENC_V2_BE
                q_factor = sub( W_norm( W_tmp ), 32 );
                recon_uu_re[0][1] = W_shl_sat_l( W_tmp, q_factor ); // q_C_re+q_tmp1+q_recon_uu_re[0][1]
                move32();
                q_recon_uu_re[0][1] = add( add( q_C_re, q_factor ), q_tmp1 );
                move16();
#else
                q_recon_uu_re[0][1] = W_norm( W_tmp );
                move16();
                recon_uu_re[0][1] = W_extract_h( W_shl( W_tmp, q_recon_uu_re[0][1] ) ); // q_C_re+q_tmp1+q_recon_uu_re[0][1]-32
                move32();
                q_recon_uu_re[0][1] = sub( add( add( q_C_re, q_recon_uu_re[0][1] ), q_tmp1 ), 32 );
                move16();
#endif
                if ( W_tmp == 0 )
                {
                    q_recon_uu_re[0][1] = 31;
@@ -2406,12 +2434,20 @@ static void ivas_calc_p_coeffs_per_band_enc_fx(
                }

                W_tmp = W_mult0_32_32( pSparMd->band_coeffs[b_ts_idx].C_re_fx[0][0], re2 ); // q_C_re+q_tmp
#ifdef OPT_SBA_ENC_V2_BE
                q_factor = sub( W_norm( W_tmp ), 32 );
                recon_uu_re[1][0] = W_shl_sat_l( W_tmp, q_factor ); // q_C_re+q_tmp+q_recon_uu_re[1][0]
                move32();
                q_recon_uu_re[1][0] = add( add( q_C_re, q_factor ), q_tmp );
                move16();
#else
                q_recon_uu_re[1][0] = W_norm( W_tmp );
                move16();
                recon_uu_re[1][0] = W_extract_h( W_shl( W_tmp, q_recon_uu_re[1][0] ) ); // q_C_re+q_tmp+q_recon_uu_re[1][0]-32
                move32();
                q_recon_uu_re[1][0] = sub( add( add( q_C_re, q_recon_uu_re[1][0] ), q_tmp ), 32 );
                move16();
#endif
                if ( W_tmp == 0 )
                {
                    q_recon_uu_re[1][0] = 31;
@@ -2419,12 +2455,20 @@ static void ivas_calc_p_coeffs_per_band_enc_fx(
                }

                W_tmp = W_mult0_32_32( pSparMd->band_coeffs[b_ts_idx].C_re_fx[1][0], re2 ); // q_C_re+q_tmp
#ifdef OPT_SBA_ENC_V2_BE
                q_factor = sub( W_norm( W_tmp ), 32 );
                recon_uu_re[1][1] = W_shl_sat_l( W_tmp, q_factor ); // q_C_re+q_tmp+q_recon_uu_re[1][1]
                move32();
                q_recon_uu_re[1][1] = add( add( q_C_re, q_factor ), q_tmp );
                move16();
#else
                q_recon_uu_re[1][1] = W_norm( W_tmp );
                move16();
                recon_uu_re[1][1] = W_extract_h( W_shl( W_tmp, q_recon_uu_re[1][1] ) ); // q_C_re+q_tmp+q_recon_uu_re[1][1]-32
                move32();
                q_recon_uu_re[1][1] = sub( add( add( q_C_re, q_recon_uu_re[1][1] ), q_tmp ), 32 );
                move16();
#endif
                if ( W_tmp == 0 )
                {
                    q_recon_uu_re[1][1] = 31;
@@ -2441,12 +2485,18 @@ static void ivas_calc_p_coeffs_per_band_enc_fx(
                    }
                }
                q_tmp = sub( s_min( q_tmp, q_cov_uu_re ), 1 );

#ifdef OPT_SBA_ENC_V2_BE
                q_factor = sub( q_cov_uu_re, q_tmp );
#endif
                FOR( i = 0; i < 2; i++ )
                {
                    FOR( j = 0; j < 2; j++ )
                    {
#ifdef OPT_SBA_ENC_V2_BE
                        cov_uu_re[i][j] = L_sub( L_shr( cov_uu_re[i][j], q_factor ), L_shr( recon_uu_re[i][j], sub( q_recon_uu_re[i][j], q_tmp ) ) ); // q_tmp
#else
                        cov_uu_re[i][j] = L_sub( L_shr( cov_uu_re[i][j], sub( q_cov_uu_re, q_tmp ) ), L_shr( recon_uu_re[i][j], sub( q_recon_uu_re[i][j], q_tmp ) ) ); // q_tmp
#endif
                        move32();
                    }
                }
@@ -2466,9 +2516,15 @@ static void ivas_calc_p_coeffs_per_band_enc_fx(
                    {
                        Word32 re;
                        W_tmp = W_mult0_32_32( pSparMd->band_coeffs[b_ts_idx].C_re_fx[0][k], cov_dd_re[k][j] ); // q_C_re+q_cov_dd_re
#ifdef OPT_SBA_ENC_V2_BE
                        q_tmp = sub( W_norm( W_tmp ), 33 );
                        re = W_shl_sat_l( W_tmp, q_tmp ); // q_C_re+q_cov_dd_re+q_tmp
                        q_tmp = add( add( q_C_re, q_tmp ), q_cov_dd_re );
#else
                        q_tmp = sub( W_norm( W_tmp ), 1 );
                        re = W_extract_h( W_shl( W_tmp, q_tmp ) ); // q_C_re+q_cov_dd_re+q_tmp-32
                        q_tmp = sub( add( add( q_C_re, q_tmp ), q_cov_dd_re ), 32 );
#endif
                        if ( W_tmp == 0 )
                        {
                            q_tmp = 31;
@@ -2492,9 +2548,15 @@ static void ivas_calc_p_coeffs_per_band_enc_fx(
                }

                W_tmp = W_mult0_32_32( pSparMd->band_coeffs[b_ts_idx].C_re_fx[0][0], re1[0] ); // q_C_re+q_re1[0]
#ifdef OPT_SBA_ENC_V2_BE
                q_tmp = sub( W_norm( W_tmp ), 33 );
                re2 = W_shl_sat_l( W_tmp, q_tmp ); // q_C_re+q_re1[0]+q_tmp
                q_tmp = add( add( q_C_re, q_tmp ), q_re1[0] );
#else
                q_tmp = sub( W_norm( W_tmp ), 1 );
                re2 = W_extract_h( W_shl( W_tmp, q_tmp ) ); // q_C_re+q_re1[0]+q_tmp-32
                q_tmp = sub( add( add( q_C_re, q_tmp ), q_re1[0] ), 32 );
#endif
                if ( W_tmp == 0 )
                {
                    q_tmp = 31;
@@ -2504,9 +2566,15 @@ static void ivas_calc_p_coeffs_per_band_enc_fx(
                move32();

                W_tmp = W_mult0_32_32( pSparMd->band_coeffs[b_ts_idx].C_re_fx[0][1], re1[1] ); // q_C_re+q_re1[1]
#ifdef OPT_SBA_ENC_V2_BE
                q_tmp1 = sub( W_norm( W_tmp ), 33 );
                re2 = W_shl_sat_l( W_tmp, q_tmp1 ); // q_C_re+q_re1[1]+q_tmp1
                q_tmp1 = add( add( q_C_re, q_tmp1 ), q_re1[1] );
#else
                q_tmp1 = sub( W_norm( W_tmp ), 1 );
                re2 = W_extract_h( W_shl( W_tmp, q_tmp1 ) ); // q_C_re+q_re1[1]+q_tmp1-32
                q_tmp1 = sub( add( add( q_C_re, q_tmp1 ), q_re1[1] ), 32 );
#endif
                if ( W_tmp == 0 )
                {
                    q_tmp1 = 31;
@@ -2585,9 +2653,15 @@ static void ivas_calc_p_coeffs_per_band_enc_fx(
                        FOR( k = 0; k < num_dmx - 1; k++ )
                        {
                            W_tmp = W_mult0_32_32( pSparMd->band_coeffs[b_ts_idx].C_re_fx[i][k], cov_dd_re[k][m] ); // q_C_re+q_cov_dd_re
#ifdef OPT_SBA_ENC_V2_BE
                            q_tmp = sub( W_norm( W_tmp ), 34 );
                            re = W_shl_sat_l( W_tmp, q_tmp ); // q_C_re+q_cov_dd_re+q_tmp
                            q_tmp = add( add( q_C_re, q_tmp ), q_cov_dd_re );
#else
                            q_tmp = sub( W_norm( W_tmp ), 2 );
                            re = W_extract_h( W_shl( W_tmp, q_tmp ) ); // q_C_re+q_cov_dd_re+q_tmp-32
                            q_tmp = sub( add( add( q_C_re, q_tmp ), q_cov_dd_re ), 32 );
#endif
                            if ( W_tmp == 0 )
                            {
                                q_tmp = 31;
@@ -2627,9 +2701,15 @@ static void ivas_calc_p_coeffs_per_band_enc_fx(
                        FOR( m = 0; m < num_dmx - 1; m++ )
                        {
                            W_tmp = W_mult0_32_32( pSparMd->band_coeffs[b_ts_idx].C_re_fx[j][m], re1[m] ); // q_C_re+q_re1[m]
#ifdef OPT_SBA_ENC_V2_BE
                            q_tmp = sub( W_norm( W_tmp ), 34 );
                            re = W_shl_sat_l( W_tmp, q_tmp ); // q_C_re+q_re1[m]+q_tmp
                            q_tmp = add( add( q_C_re, q_tmp ), q_re1[m] );
#else
                            q_tmp = sub( W_norm( W_tmp ), 2 );
                            re = W_extract_h( W_shl( W_tmp, q_tmp ) ); // q_C_re+q_re1[m]+q_tmp-32
                            q_tmp = sub( add( add( q_C_re, q_tmp ), q_re1[m] ), 32 );
#endif
                            if ( W_tmp == 0 )
                            {
                                q_tmp = 31;
@@ -2714,9 +2794,15 @@ static void ivas_calc_p_coeffs_per_band_enc_fx(
        move16();
        IF( trace != 0 )
        {
#ifdef OPT_SBA_ENC_V2_BE
            q_factor = sub( W_norm( trace ), 32 );
            tmp = Mpy_32_32( p_norm_scaling, W_shl_sat_l( trace, q_factor ) ); // q_cov_uu_re+q_factor
            q_factor = add( q_cov_uu_re, q_factor );
#else
            q_factor = W_norm( trace );
            tmp = Mpy_32_32( p_norm_scaling, W_extract_h( W_shl( trace, q_factor ) ) ); // q_cov_uu_re+q_factor-32
            q_factor = sub( add( q_cov_uu_re, q_factor ), 32 );
#endif
            IF( GT_16( q_factor, q_postpred_cov_re ) )
            {
                tmp = L_shr( tmp, sub( q_factor, q_postpred_cov_re ) ); // q_postpred_cov_re
+1 −0
Original line number Diff line number Diff line
@@ -80,6 +80,7 @@
#define OPT_SBA_REND_V1_BE
#define OPT_HEAD_ROT_REND_V1_BE
#define OPT_SBA_DEC_V2_BE
#define OPT_SBA_ENC_V2_BE
#define OPT_SBA_ENC_V1_BE
#define OPT_BIN_RENDERER_V1
#define OPT_BIN_RENDERER_V2
+169 −45
Original line number Diff line number Diff line
@@ -1249,6 +1249,11 @@ Word16 RCcontextMapping_encode2_estimate_no_mem_s17_LCS_fx(
    k = 1;
    move16();

#ifdef OPT_SBA_ENC_V2_BE
    Word16 round_bit_estimate_fx;
    Word32 target_Q15 = L_shl( target, Q15 ); // Q15
#endif

    WHILE( LT_16( k, nt / 2 ) )
    {
        bit_estimate_fx = W_add( bit_estimate_fx, MAKE_NUMBER_QX( 1, Q23 ) );
@@ -1334,7 +1339,7 @@ Word16 RCcontextMapping_encode2_estimate_no_mem_s17_LCS_fx(
            ctx = &c[L_or( p1, p2 )];

            t = (UWord16) L_add( *ctx, rateFlag );
            IF( LT_16( nt_half, idx ) )
            if ( LT_16( nt_half, idx ) )
            {
                t = add( t, ( 1 << NBITS_CONTEXT ) );
            }
@@ -1351,6 +1356,19 @@ Word16 RCcontextMapping_encode2_estimate_no_mem_s17_LCS_fx(

            /* check while condition */
            /* MSBs coding */
#ifdef OPT_SBA_ENC_V2_BE
            FOR( ; s_max( a1, b1 ) >= A_THRES; )
            {
                pki = lookup[lev1]; /* ESC symbol */

                bit_estimate_fx = W_add( bit_estimate_fx, ari_bit_estimate_s17_LC_fx[pki][VAL_ESC] );
                bit_estimate_fx = W_add( bit_estimate_fx, MAKE_VARIABLE_QX( 2, Q23 ) );
                a1 = shr( a1, 1 );
                b1 = shr( b1, 1 );

                lev1 = s_min( add( lev1, ( 1 << ( NBITS_CONTEXT + NBITS_RATEQ ) ) ), 2 << ( NBITS_CONTEXT + NBITS_RATEQ ) );
            }
#else
            WHILE( GE_16( s_max( a1, b1 ), A_THRES ) )
            {
                pki = lookup[lev1]; /* ESC symbol */
@@ -1364,14 +1382,18 @@ Word16 RCcontextMapping_encode2_estimate_no_mem_s17_LCS_fx(

                /* check while condition */
            }

#endif
            pki = lookup[lev1];

            symbol = add( a1, i_mult( A_THRES, b1 ) ); /* Q0 */
            bit_estimate_fx = W_add( bit_estimate_fx, ari_bit_estimate_s17_LC_fx[pki][symbol] );

#ifdef OPT_SBA_ENC_V2_BE
            IF( GT_32( W_shl_sat_l( bit_estimate_fx, -Q8 ), target_Q15 ) ) // Q15
#else
            /* Should we truncate? */
            IF( GT_32( W_extract_l( W_shr( bit_estimate_fx, Q8 ) ), L_shl( target, Q15 ) ) )
#endif
            {
                stop2 = 1;
                move16();
@@ -1393,6 +1415,13 @@ Word16 RCcontextMapping_encode2_estimate_no_mem_s17_LCS_fx(
            {
                lev1 = shr( lev1, NBITS_CONTEXT + NBITS_RATEQ );

#ifdef OPT_SBA_ENC_V2_BE
                t = add( 13, lev1 );
                IF( lev1 <= 0 )
                {
                    t = add( 1, i_mult( add( a1, b1 ), add( lev1, 2 ) ) );
                }
#else
                IF( lev1 <= 0 )
                {
                    t = add( 1, i_mult( add( a1, b1 ), add( lev1, 2 ) ) );
@@ -1401,6 +1430,7 @@ Word16 RCcontextMapping_encode2_estimate_no_mem_s17_LCS_fx(
                {
                    t = add( 13, lev1 );
                }
#endif

                *ctx = L_add( imult3216( L_and( *ctx, 0xf ), 16 ), t );
                move32();
@@ -1425,15 +1455,21 @@ Word16 RCcontextMapping_encode2_estimate_no_mem_s17_LCS_fx(
            }

        } /*end of the 2-tuples loop*/

#ifdef OPT_SBA_ENC_V2_BE
        total_output_bits = round_fx( W_shl_sat_l( bit_estimate_fx, -Q7 ) ); /* Q23 -> Q16 -> Q0 */
#else
        total_output_bits = round_fx( W_extract_l( W_shr( bit_estimate_fx, Q7 ) ) );   /* Q23 -> Q16 -> Q0 */

#endif
        IF( *stop )
        {
#ifdef OPT_SBA_ENC_V2_BE
            total_output_bits = round_fx( W_shl_sat_l( nbits2_fx, -Q7 ) ); /* Q23 -> Q16 -> Q0 */
#else
            total_output_bits = round_fx( W_extract_l( W_shr( nbits2_fx, Q7 ) ) );     /* Q23 -> Q16 -> Q0 */
#endif
        }

        IF( stop2 )
        if ( stop2 )
        {
            stop2 = total_output_bits; /* Q0 */
            move16();
@@ -1455,8 +1491,11 @@ Word16 RCcontextMapping_encode2_estimate_no_mem_s17_LCS_fx(
        hm_cfg->numPeakIndices = numPeakIndicesOrig; /* Q0 */
        move16();


#ifdef OPT_SBA_ENC_V2_BE
        return round_fx( L_add( W_shl_sat_l( nbits2_fx, -Q7 ), ONE_IN_Q14 ) ); /* Q0 */
#else
        return round_fx( L_add( W_extract_l( W_shr( nbits2_fx, Q7 ) ), ONE_IN_Q14 ) ); /* Q0 */
#endif
    }
    ELSE /* if (!hm_cfg) */
    {
@@ -1530,6 +1569,21 @@ Word16 RCcontextMapping_encode2_estimate_no_mem_s17_LCS_fx(

            /* check while condition */
            /* MSBs coding */
#ifdef OPT_SBA_ENC_V2_BE
            FOR( ; s_max( a1, b1 ) >= A_THRES; )
            {
                pki = lookup[( esc_nb << ( NBITS_CONTEXT + NBITS_RATEQ ) )]; /* Q0 */

                bit_estimate_fx = W_add( bit_estimate_fx, ari_bit_estimate_s17_LC_fx[pki][VAL_ESC] );
                bit_estimate_fx = W_add( bit_estimate_fx, MAKE_NUMBER_QX( 2, Q23 ) );

                a1 = shr( a1, 1 );
                b1 = shr( b1, 1 );

                lev1 = add( lev1, 1 );
                esc_nb = s_min( lev1, 3 );
            }
#else
            WHILE( GE_16( s_max( a1, b1 ), A_THRES ) )
            {
                pki = lookup[( esc_nb << ( NBITS_CONTEXT + NBITS_RATEQ ) )]; /* Q0 */
@@ -1546,15 +1600,18 @@ Word16 RCcontextMapping_encode2_estimate_no_mem_s17_LCS_fx(

                /* check while condition */
            }

#endif
            pki = lookup[( esc_nb << ( NBITS_CONTEXT + NBITS_RATEQ ) )]; /* Q0 */
            move16();

            symbol = add( a1, i_mult( A_THRES, b1 ) ); /* Q0 */
            bit_estimate_fx = W_add( bit_estimate_fx, ari_bit_estimate_s17_LC_fx[pki][symbol] );

            /* Should we truncate? */
#ifdef OPT_SBA_ENC_V2_BE
            IF( GT_32( W_shl_sat_l( bit_estimate_fx, -Q8 ), target_Q15 ) ) // Q15
#else
            IF( GT_32( W_extract_l( W_shr( bit_estimate_fx, Q8 ) ), L_shl( target, Q15 ) ) )
#endif
            {
                overflow_flag = 1;
                move16();
@@ -1570,6 +1627,14 @@ Word16 RCcontextMapping_encode2_estimate_no_mem_s17_LCS_fx(
                }
            }

#ifdef OPT_SBA_ENC_V2_BE
            /* Update context for next 2-tuple */
            cp = add( 1, i_mult( add( a1, b1 ), add( esc_nb, 1 ) ) ); /* Q0 */
            if ( GE_16( esc_nb, 2 ) )
            {
                cp = add( 12, esc_nb ); /* Q0 */
            }
#else
            /* Update context for next 2-tuple */
            IF( LT_16( esc_nb, 2 ) )
            {
@@ -1579,22 +1644,55 @@ Word16 RCcontextMapping_encode2_estimate_no_mem_s17_LCS_fx(
            {
                cp = add( 12, esc_nb ); /* Q0 */
            }
#endif
            /*shift old bits and replace last 4 bits*/
            s = (UWord16) L_add( L_shl( s, 4 ), cp );
            t = s_and( s, 0xFF );

        } /*end of the 2-tuples loop*/

#ifdef OPT_SBA_ENC_V2_BE
        tot_bits2 = round_fx( W_shl_sat_l( nbits2_fx, -Q7 ) );                   /* Q23 -> Q16 -> Q0 */
        round_bit_estimate_fx = round_fx( W_shl_sat_l( bit_estimate_fx, -Q7 ) ); /* Q23 -> Q16 -> Q0 */
#else
        tot_bits2 = round_fx( W_extract_l( W_shr( nbits2_fx, Q7 ) ) ); /* Q23 -> Q16 -> Q0 */
        IF( LT_16( lastnz2, lastnz ) )                                 /* Overflow occured because unable to code all tuples */
#endif
        if ( LT_16( lastnz2, lastnz ) ) /* Overflow occured because unable to code all tuples */
        {
            overflow_flag = 1;
            move16();
        }
#ifdef OPT_SBA_ENC_V2_BE
        if ( EQ_16( mode, -1 ) )
        {
            tot_bits2 = round_bit_estimate_fx;
            move16();
        }
#else
        IF( EQ_16( mode, -1 ) )
        {
            tot_bits2 = round_fx( W_extract_l( W_shr( bit_estimate_fx, Q7 ) ) ); /* Q23 -> Q16 -> Q0 */
            tot_bits2 = round_fx( W_shl_sat_l( bit_estimate_fx, -Q7 ) ); /* Q23 -> Q16 -> Q0 */
        }
#endif
#ifdef OPT_SBA_ENC_V2_BE
        if ( overflow_flag == 0 ) /* No overflow */
        {
            *stop = 0;
            move16();
        }
        IF( overflow_flag != 0 ) /* Overflow */
        {
            IF( *stop )
            {
                *stop = tot_bits2; /* Q0 */
                move16();
            }
            ELSE
            {
                *stop = round_bit_estimate_fx;
                move16();
            }
        }
#else
        IF( overflow_flag == 0 ) /* No overflow */
        {
            *stop = 0;
@@ -1602,7 +1700,8 @@ Word16 RCcontextMapping_encode2_estimate_no_mem_s17_LCS_fx(
        }
        ELSE /* Overflow */
        {
                IF( *stop ){
            IF( *stop )
            {
                *stop = tot_bits2; /* Q0 */
                move16();
            }
@@ -1612,6 +1711,7 @@ Word16 RCcontextMapping_encode2_estimate_no_mem_s17_LCS_fx(
                move16();
            }
        }
#endif

        *lastnz_out = lastnz; /* Q0 */
        move16();
@@ -1743,6 +1843,15 @@ Word16 RCcontextMapping_encode2_estimate_bandWise_fx(
        /* Get context */
        t = add( hContextMem->ctx, hContextMem->rateFlag ); /* Q0 */

#ifdef OPT_SBA_ENC_V2_BE
        tmp = ( 1 << NBITS_CONTEXT );
        move16();
        if ( GE_16( hContextMem->nt_half, idx ) )
        {
            tmp = 0;
            move16();
        }
#else
        IF( GE_16( hContextMem->nt_half, idx ) )
        {
            tmp = 0;
@@ -1751,6 +1860,7 @@ Word16 RCcontextMapping_encode2_estimate_bandWise_fx(
        {
            tmp = ( 1 << NBITS_CONTEXT );
        }
#endif

        t = add( t, tmp ); /* Q0 */

@@ -1771,15 +1881,18 @@ Word16 RCcontextMapping_encode2_estimate_bandWise_fx(

        /* check while condition */
        /* MSBs coding */
#ifdef OPT_SBA_ENC_V2_BE
        FOR( ; s_max( a1, b1 ) >= A_THRES; )
#else
        WHILE( GE_16( s_max( a1, b1 ), A_THRES ) )
#endif
        {
            pki = lookup[lev1]; /* Q0 */
            move16();

            hContextMem->bit_estimate_fx = W_add( hContextMem->bit_estimate_fx, ari_bit_estimate_s17_LC_fx[pki][VAL_ESC] );
            hContextMem->bit_estimate_fx = W_add( hContextMem->bit_estimate_fx, MAKE_NUMBER_QX( 2, Q23 ) );
            move32();
            move32();
            move64();
            move64();


            // hContextMem->bit_estimate = hContextMem->bit_estimate + ari_bit_estimate_s17_LC[pki][VAL_ESC];
@@ -1793,16 +1906,23 @@ Word16 RCcontextMapping_encode2_estimate_bandWise_fx(
        }

        pki = lookup[lev1]; /* Q0 */
        move16();

        symbol = add( a1, i_mult( A_THRES, b1 ) ); /* MSB symbol    Q0*/
        hContextMem->bit_estimate_fx = W_add( hContextMem->bit_estimate_fx, ari_bit_estimate_s17_LC_fx[pki][symbol] );

        move32();
        move64();
        // hContextMem->bit_estimate = hContextMem->bit_estimate + ari_bit_estimate_s17_LC[pki][symbol];

        /* Update context */
        lev1 = shr( lev1, NBITS_CONTEXT + NBITS_RATEQ );

#ifdef OPT_SBA_ENC_V2_BE
        t = add( 1, i_mult( add( a1, b1 ), add( lev1, 2 ) ) ); /* Q0 */
        if ( lev1 > 0 )
        {
            t = add( 13, lev1 ); /* Q0 */
        }
#else
        IF( lev1 <= 0 )
        {
            t = add( 1, i_mult( add( a1, b1 ), add( lev1, 2 ) ) ); /* Q0 */
@@ -1811,12 +1931,16 @@ Word16 RCcontextMapping_encode2_estimate_bandWise_fx(
        {
            t = add( 13, lev1 ); /* Q0 */
        }

#endif
        hContextMem->ctx = add( i_mult( s_and( hContextMem->ctx, 0xf ), 16 ), t ); /* Q0 */
        move16();

    } /*end of the 2-tuples loop*/
#ifdef OPT_SBA_ENC_V2_BE
    total_output_bits = round_fx( W_shl_sat_l( hContextMem->bit_estimate_fx, -Q7 ) ); /* Q0 */
#else
    total_output_bits = round_fx( W_extract_l( W_shr( hContextMem->bit_estimate_fx, Q7 ) ) ); /* Q0 */
#endif
    // total_output_bits = (Word16) ( hContextMem->bit_estimate + 0.5f );