Commit fe8d6785 authored by Sandesh Venkatesh
Browse files

Merge branch 'hp20_vect_add_sub_optimization' into 'main'

optimizing vector add and subtract, hp20 optimizations [allow regression]

See merge request !1457
parents 2fa586a8 8cdaa39d
Loading
Loading
Loading
Loading
Loading
+20 −65
Original line number Diff line number Diff line
@@ -458,14 +458,8 @@ void hp20_fx_32(
{
    Word16 i;
    Word32 a1_fx, a2_fx, b1_fx, b2_fx;
#ifdef OPT_STEREO_32KBPS_V1
    Word16 Qy1, Qy2, Qmin;
    Word64 y0_fx64, y1_fx64, y2_fx64;
    Word32 x0, x1, x2;
#else  /* OPT_STEREO_32KBPS_V1 */
    Word16 Qx0, Qx1, Qx2, Qy1, Qprev_y1, Qy2, Qprev_y2, Qmin;
    Word64 x0_fx64, x1_fx64, x2_fx64, y0_fx64, y1_fx64, y2_fx64, R1, R2, R3, R4, R5;
#endif /* OPT_STEREO_32KBPS_V1 */

    IF( EQ_32( Fs, 8000 ) )
    {
@@ -516,64 +510,15 @@ void hp20_fx_32(
    move32();
    move32();

#ifdef OPT_STEREO_32KBPS_V1
    y1_fx64 = W_add( W_deposit32_l( mem_fx[0] ), W_deposit32_h( mem_fx[1] ) );
    y2_fx64 = W_add( W_deposit32_l( mem_fx[2] ), W_deposit32_h( mem_fx[3] ) );

    x0 = mem_fx[4];
    move32();
    x1 = mem_fx[5];
    move32();
#else  /* OPT_STEREO_32KBPS_V1 */
    Qprev_y1 = extract_l( mem_fx[4] );
    Qprev_y2 = extract_l( mem_fx[5] );
    y1_fx64 = W_deposit32_l( mem_fx[0] );
    y2_fx64 = W_deposit32_l( mem_fx[1] );
    x0_fx64 = W_deposit32_l( mem_fx[2] );
    x1_fx64 = W_deposit32_l( mem_fx[3] );
#endif /* OPT_STEREO_32KBPS_V1 */

    FOR( i = 0; i < lg; i++ )
    {
#ifdef OPT_STEREO_32KBPS_V1
        x2 = x1;
        move32();
        x1 = x0;
        move32();
        x0 = signal_fx[i];
        move32();

        Qy1 = W_norm( y1_fx64 );
        if ( y1_fx64 == 0 )
        {
            Qy1 = 62;
            move16();
        }

        Qy2 = W_norm( y2_fx64 );
        if ( y2_fx64 == 0 )
        {
            Qy2 = 62;
            move16();
        }

        Qmin = s_min( Qy1, Qy2 );

        Qmin = sub( Qmin, 34 );

        y0_fx64 = W_mac_32_32( W_mult_32_32( W_shl_sat_l( y1_fx64, Qmin ), a1_fx ), W_shl_sat_l( y2_fx64, Qmin ), a2_fx ); // Qmin + Q29 + Q30 + 1

        Word64 temp = W_mac_32_32( W_mac_32_32( W_mult_32_32( x2, b2_fx ), x1, b1_fx ), x0, b2_fx ); // Q30
        Word64 y0_fx = W_shr( y0_fx64, add( Qmin, Q30 ) );                                           // Q30
        y0_fx64 = W_add( temp, y0_fx );                                                              // Q30
        signal_fx[i] = W_shl_sat_l( y0_fx64, -Q30 );
        move32();

        y2_fx64 = y1_fx64;
        move64();
        y1_fx64 = y0_fx64;
        move64();
#else  /* OPT_STEREO_32KBPS_V1 */
        x2_fx64 = x1_fx64;
        move64();
        x1_fx64 = x0_fx64;
@@ -587,7 +532,11 @@ void hp20_fx_32(
            move16();
        }
        Qy1 = sub( Qy1, 34 );
#ifdef OPT_STEREO_32KBPS_V1
        R1 = W_mult0_32_32( W_shl_sat_l( y1_fx64, Qy1 ), a1_fx );
#else  /* OPT_STEREO_32KBPS_V1 */
        R1 = W_mult0_32_32( W_extract_l( W_shl( y1_fx64, Qy1 ) ), a1_fx );
#endif /* OPT_STEREO_32KBPS_V1 */
        Qy1 = add( Qy1, Qprev_y1 );

        Qy2 = W_norm( y2_fx64 );
@@ -597,7 +546,11 @@ void hp20_fx_32(
            move16();
        }
        Qy2 = sub( Qy2, 34 );
#ifdef OPT_STEREO_32KBPS_V1
        R2 = W_mult0_32_32( W_shl_sat_l( y2_fx64, Qy2 ), a2_fx );
#else  /* OPT_STEREO_32KBPS_V1 */
        R2 = W_mult0_32_32( W_extract_l( W_shl( y2_fx64, Qy2 ) ), a2_fx );
#endif /* OPT_STEREO_32KBPS_V1 */
        Qy2 = add( Qy2, Qprev_y2 );

        Qx0 = W_norm( x0_fx64 );
@@ -607,7 +560,11 @@ void hp20_fx_32(
            move16();
        }
        Qx0 = sub( Qx0, 34 );
#ifdef OPT_STEREO_32KBPS_V1
        R3 = W_mult0_32_32( W_shl_sat_l( x0_fx64, Qx0 ), b2_fx );
#else  /* OPT_STEREO_32KBPS_V1 */
        R3 = W_mult0_32_32( W_extract_l( W_shl( x0_fx64, Qx0 ) ), b2_fx );
#endif /* OPT_STEREO_32KBPS_V1 */

        Qx1 = W_norm( x1_fx64 );
        if ( x1_fx64 == 0 )
@@ -616,7 +573,11 @@ void hp20_fx_32(
            move16();
        }
        Qx1 = sub( Qx1, 34 );
#ifdef OPT_STEREO_32KBPS_V1
        R4 = W_mult0_32_32( W_shl_sat_l( x1_fx64, Qx1 ), b1_fx );
#else  /* OPT_STEREO_32KBPS_V1 */
        R4 = W_mult0_32_32( W_extract_l( W_shl( x1_fx64, Qx1 ) ), b1_fx );
#endif /* OPT_STEREO_32KBPS_V1 */

        Qx2 = W_norm( x2_fx64 );
        if ( x2_fx64 == 0 )
@@ -625,7 +586,11 @@ void hp20_fx_32(
            move16();
        }
        Qx2 = sub( Qx2, 34 );
#ifdef OPT_STEREO_32KBPS_V1
        R5 = W_mult0_32_32( W_shl_sat_l( x2_fx64, Qx2 ), b2_fx );
#else  /* OPT_STEREO_32KBPS_V1 */
        R5 = W_mult0_32_32( W_extract_l( W_shl( x2_fx64, Qx2 ) ), b2_fx );
#endif /* OPT_STEREO_32KBPS_V1 */

        Qmin = s_min( Qy1, Qy2 );

@@ -655,17 +620,8 @@ void hp20_fx_32(
        move64();
        move16();
        move16();
#endif /* OPT_STEREO_32KBPS_V1 */
    }

#ifdef OPT_STEREO_32KBPS_V1
    mem_fx[0] = W_extract_l( y1_fx64 );
    mem_fx[1] = W_extract_h( y1_fx64 );
    mem_fx[2] = W_extract_l( y2_fx64 );
    mem_fx[3] = W_extract_h( y2_fx64 );
    mem_fx[4] = x0;
    mem_fx[5] = x1;
#else  /* OPT_STEREO_32KBPS_V1 */
    Qy1 = W_norm( y1_fx64 );
    test();
    IF( y1_fx64 != 0 && LT_16( Qy1, 32 ) )
@@ -688,7 +644,6 @@ void hp20_fx_32(
    mem_fx[3] = W_extract_l( x1_fx64 );
    mem_fx[4] = Qprev_y1;
    mem_fx[5] = Qprev_y2;
#endif /* OPT_STEREO_32KBPS_V1 */

    move32();
    move32();
+27 −8
Original line number Diff line number Diff line
@@ -128,21 +128,31 @@ void init_lvq_fx(
    FOR( i = 0; i < MAX_NO_MODES; i++ )
    {
#ifdef OPT_STEREO_32KBPS_V1
        FOR( ( j = 0, k = 0 ); j < MAX_NO_SCALES; ( j++, k++ ) )
        FOR( ( j = 0, k = 0 ); j < MAX_NO_SCALES; j++ )
        {
            if ( ( no_lead_fx[i][j] <= 0 ) )
            if ( no_lead_fx[i][j] > 0 )
            {
                j = MAX_NO_SCALES;
                k = add( k, 1 );
            }
            if ( no_lead_fx[i][j] <= 0 )
            {
                j = MAX_NO_SCALES - 1;
                move16();
            }
        }
        no_scales[i][0] = k;
        move16();

        FOR( k = 0; j < MAX_NO_SCALES << 1; ( j++, k++ ) )
        FOR( k = 0; j < MAX_NO_SCALES << 1; j++ )
        {
            if ( no_lead_fx[i][j] > 0 )
            {
                k = add( k, 1 );
            }
            if ( no_lead_fx[i][j] <= 0 )
            {
                j = MAX_NO_SCALES << 1;
                move16();
            }
        }
        no_scales[i][1] = k;
@@ -172,23 +182,32 @@ void init_lvq_fx(
    FOR( i = 0; i < MAX_NO_MODES_p; i++ )
    {
#ifdef OPT_STEREO_32KBPS_V1
        FOR( ( j = 0, k = 0 ); j < MAX_NO_SCALES; ( j++, k++ ) )
        FOR( ( j = 0, k = 0 ); j < MAX_NO_SCALES; j++ )
        {

            if ( no_lead_p_fx[i][j] > 0 )
            {
                k = add( k, 1 );
            }
            if ( ( no_lead_p_fx[i][j] <= 0 ) )
            {
                j = MAX_NO_SCALES;
                j = MAX_NO_SCALES - 1;
                move16();
            }
        }
        no_scales_p[i][0] = k;
        move16();

        FOR( k = 0; j < MAX_NO_SCALES << 1; ( j++, k++ ) )
        FOR( k = 0; j < MAX_NO_SCALES << 1; j++ )
        {
            if ( no_lead_p_fx[i][j] > 0 )
            {
                k = add( k, 1 );
            }

            if ( ( no_lead_p_fx[i][j] <= 0 ) )
            {
                j = MAX_NO_SCALES << 1;
                move16();
            }
        }

+2 −1
Original line number Diff line number Diff line
@@ -74,7 +74,7 @@
#define FIX_1379_MASA_ANGLE_ROUND

/* Note: each compile switch (FIX_1101_...) is independent from the other ones */
//#define OPT_STEREO_32KBPS_V1                    /* Optimization made in stereo decoding path for 32kbps decoding */
#define OPT_STEREO_32KBPS_V1                    /* Optimization made in stereo decoding path for 32kbps decoding */
#define OPT_AVOID_STATE_BUF_RESCALE             /* Optimization made to avoid rescale of synth state buffer */
#define FIX_1310_SPEEDUP_ivas_dirac_dec_get_response_fx                 /*FhG: WMOPS tuning, nonbe*/
#define FIX_1310_SPEEDUP_ivas_dirac_dec_output_synthesis_process_slot   /*FhG: WMOPS tuning, nonbe*/
@@ -93,6 +93,7 @@
#define FIX_1439_SPEEDUP_SIMPLIFY_elliptic_bpf_48k_generic              /*FhG: reduces maintenance complexity & reduces WMOPS & prepares STAGE2 patch*/
#define FIX_1439_SPEEDUP_SIMPLIFY_elliptic_bpf_48k_generic_STAGE2       /*FhG: reduces WMOPS*/
#define FIX_1481_HARDCODE_DIV                          /* FhG: hardcode division results in stereo_dmx_evs_init_encoder_fx() */
#define VEC_ARITH_OPT_v1                        /* Guards the v_add_fixed_no_hdrm() / v_sub_fixed_no_hdrm() vector add/subtract variants */
#define FIX_1486_IND_SHB_RES                   /* VA: Fix for issue 1486: align the usage of IND_SHB_RES_GS indices with float code */

#define TEST_HR
+18 −0
Original line number Diff line number Diff line
@@ -6184,6 +6184,15 @@ void v_add_fixed(
    const Word16 hdrm  /* i  : headroom for when addition result > 1 or < -1 */
);
#ifdef VEC_ARITH_OPT_v1
/* Vector addition variant that takes no hdrm (headroom) argument: writes x1 + x2 into y for N elements. */
/* NOTE(review): the definition is not part of this view - confirm overflow behaviour against the implementation. */
void v_add_fixed_no_hdrm(
    const Word32 x1[], /* i  : Input vector 1                                   */
    const Word32 x2[], /* i  : Input vector 2                                   */
    Word32 y[],        /* o  : Output vector that contains vector 1 + vector 2  */
    const Word16 N     /* i  : Vector length                                    */
);
#endif /* VEC_ARITH_OPT_v1 */
void v_add_fixed_me(
    const Word32 x1[], /* i  : Input vector 1                                   */
    const Word16 x1_e, /* i  : Exponent for input vector 1                      */
@@ -6218,6 +6227,15 @@ void v_sub_fixed(
    const Word16 hdrm  /* i  : headroom for when subtraction result > 1 or < -1 */
);
#ifdef VEC_ARITH_OPT_v1
/* Vector subtraction variant that takes no hdrm (headroom) argument: y[i] = L_sub( x1[i], x2[i] ) for i = 0 .. N-1. */
void v_sub_fixed_no_hdrm(
    const Word32 x1[], /* i  : Input vector 1                                   */
    const Word32 x2[], /* i  : Input vector 2                                   */
    Word32 y[],        /* o  : Output vector that contains vector 1 - vector 2  */
    const Word16 N     /* i  : Vector length                                    */
);
#endif /* VEC_ARITH_OPT_v1 */
/*! r: dot product of x[] and y[] */
Word32 dotp_fixed(
    const Word32 x[], /* i  : vector x[]                                      */
+20 −0
Original line number Diff line number Diff line
@@ -888,6 +888,26 @@ void v_sub_fixed(
    return;
}

#ifdef VEC_ARITH_OPT_v1
/*-------------------------------------------------------------------*
 * v_sub_fixed_no_hdrm()
 *
 * Element-wise vector subtraction: y[k] = L_sub( x1[k], x2[k] ) for
 * k = 0 .. N-1. The operands are handed to L_sub() unshifted; this
 * variant has no hdrm argument.
 *-------------------------------------------------------------------*/
void v_sub_fixed_no_hdrm(
    const Word32 x1[], /* i  : Minuend vector (vector 1)                        */
    const Word32 x2[], /* i  : Subtrahend vector (vector 2)                     */
    Word32 y[],        /* o  : Result vector, vector 1 - vector 2               */
    const Word16 N     /* i  : Number of elements to process                    */
)
{
    Word16 k;
    Word32 diff;

    /* Elements are processed in ascending index order */
    FOR( k = 0; k < N; k++ )
    {
        diff = L_sub( x1[k], x2[k] );
        y[k] = diff;
        move32();
    }

    return;
}
#endif /* VEC_ARITH_OPT_v1 */

/*-------------------------------------------------------------------*
 * v_multc_fixed()
 *
Loading