Probable porting mistake in protoSignalComputation_shd_fx

Basic info

Float reference:
- Decoder (float): dfed311fdc7c91f7f9c4204ba4f1c9b02441888a
Fixed point:
- Decoder (fixed): 496d928d

Bug description

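The rotation code in protoSignalComputation_shd_fx appears to contain a porting mistake: the fixed-point (BASOP) version does not compute the per-channel reference power the same way as the float reference. Compare the two code excerpts below: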

In float we have

        if ( p_Rmat != 0 )
        {
            assert( num_inputs == 4 && "This code block should never be run with num_inputs != 4!" );

            for ( l = 0; l < num_freq_bands; l++ )
            {
                *( p_k[0] ) = RealBuffer[0][0][l];
                reference_power[l + num_freq_bands] = *( p_k[0] ) * *( p_k[0] );
                p_k[0]++;
                *( p_k[0] ) = ImagBuffer[0][0][l];
                reference_power[l + num_freq_bands] += *( p_k[0] ) * *( p_k[0] );
                p_k[0]++;
                reference_power[l] = 0.5f * reference_power[l + num_freq_bands];

                for ( k = 1; k < 4; k++ )
                {
                    *( p_k[k] ) = p_Rmat[3 * Rmat_k[k] + 1] * RealBuffer[1][0][l] + p_Rmat[3 * Rmat_k[k] + 2] * RealBuffer[2][0][l] + p_Rmat[3 * Rmat_k[k] + 0] * RealBuffer[3][0][l];
                    reference_power[l + ( k + 1 ) * num_freq_bands] = *( p_k[k] ) * *( p_k[k] );
                    p_k[k]++;
                    *( p_k[k] ) = p_Rmat[3 * Rmat_k[k] + 1] * ImagBuffer[1][0][l] + p_Rmat[3 * Rmat_k[k] + 2] * ImagBuffer[2][0][l] + p_Rmat[3 * Rmat_k[k] + 0] * ImagBuffer[3][0][l];
                    reference_power[l + ( k + 1 ) * num_freq_bands] += *( p_k[k] ) * *( p_k[k] );
                    p_k[k]++;
                    reference_power[l] += 0.5f * ( reference_power[l + ( k + 1 ) * num_freq_bands] );
                }

                for ( k = 1; k < 4; k++ )
                {
                    RealBuffer[k][0][l] = p_proto_direct_buffer[k * 2 * num_freq_bands + 2 * l];
                    ImagBuffer[k][0][l] = p_proto_direct_buffer[k * 2 * num_freq_bands + 2 * l + 1];
                }
            }
        }

In BASOP, the corresponding part is this

        IF( p_Rmat_fx != 0 )
        {
            assert( EQ_16( num_inputs, 4 ) && "This code block should never be run with num_inputs != 4!" );

            FOR( l = 0; l < num_freq_bands; l++ )
            {
                *p_k_fx[0] = L_shl( RealBuffer_fx[0][0][l], min_q_shift ); /*Q(q_cldfb+min_q_shift)*/
                move32();
                reference_power_fx[l + num_freq_bands] = Mpy_32_32( *p_k_fx[0], *p_k_fx[0] ); /*2*Q(q_cldfb+min_q_shift)-31*/
                move32();
                p_k_fx[0]++;

                *p_k_fx[0] = L_shl( ImagBuffer_fx[0][0][l], min_q_shift ); /*Q(q_cldfb+min_q_shift)*/
                move32();
                reference_power_fx[l + num_freq_bands] = Madd_32_32( reference_power_fx[l + num_freq_bands], *p_k_fx[0], *p_k_fx[0] ); /*2*Q(q_cldfb+min_q_shift)-31*/
                move32();
                p_k_fx[0]++;

                reference_power_fx[l] = L_shr( reference_power_fx[l + num_freq_bands], 1 ); /*2*Q(q_cldfb+min_q_shift)-31-1*/
                move32();

                re1 = L_shl( RealBuffer_fx[1][0][l], min_q_shift ); /*Q(q_cldfb+min_q_shift)*/
                re2 = L_shl( RealBuffer_fx[2][0][l], min_q_shift ); /*Q(q_cldfb+min_q_shift)*/
                re3 = L_shl( RealBuffer_fx[3][0][l], min_q_shift ); /*Q(q_cldfb+min_q_shift)*/
                im1 = L_shl( ImagBuffer_fx[1][0][l], min_q_shift ); /*Q(q_cldfb+min_q_shift)*/
                im2 = L_shl( ImagBuffer_fx[2][0][l], min_q_shift ); /*Q(q_cldfb+min_q_shift)*/
                im3 = L_shl( ImagBuffer_fx[3][0][l], min_q_shift ); /*Q(q_cldfb+min_q_shift)*/

                FOR( k = 1; k < 4; k++ )
                {
                    idx = i_mult( 3, Rmat_k[k] );
                    idx1 = add( l, i_mult( add( k, 1 ), num_freq_bands ) );

                    *p_k_fx[k] = Madd_32_32( Madd_32_32( Mpy_32_32( p_Rmat_fx[idx + 1], re1 ), p_Rmat_fx[idx + 2], re2 ), p_Rmat_fx[idx], re3 ); /*Q(30 + q_cldfb+min_q_shift-31)=>Q(q_cldfb+min_q_shift-1)*/
                    move32();
                    *p_k_fx[k] = L_shl( *p_k_fx[k], Q1 ); // left shift is done to maintain constant Q factor for p_k_fx Q(q_cldfb+min_q_shift)
                    move32();
                    reference_power_fx[idx1] = Mpy_32_32( *p_k_fx[k], *p_k_fx[k] ); // Q(2*(q_cldfb + min_q_shift)-31)
                    move32();
                    p_k_fx[k]++;

                    *p_k_fx[k] = Madd_32_32( Madd_32_32( Mpy_32_32( p_Rmat_fx[idx + 1], im1 ), p_Rmat_fx[idx + 2], im2 ), p_Rmat_fx[idx], im3 ); /*Q(q_cldfb+min_q_shift-1)*/
                    move32();
                    *p_k_fx[k] = L_shl( *p_k_fx[k], Q1 ); // left shift is done to maintain constant Q factor Q(q_cldfb+min_q_shift)
                    move32();
BUG HERE? ->        reference_power_fx[idx1] = Mpy_32_32( *p_k_fx[k], *p_k_fx[k] ); // Q(2*(q_cldfb + min_q_shift)-31)
                    move32();
                    p_k_fx[k]++;

                    reference_power_fx[l] = L_add( reference_power_fx[l], L_shr( reference_power_fx[idx1], 1 ) ); /*2*Q(q_cldfb+min_q_shift)-31-1*/
                    move32();
                }

                *proto_direct_buffer_f_q = add( q_cldfb, min_q_shift );
                move16();
                Word16 qidx = s_min( 1, s_max( 0, sub( l, CLDFB_NO_CHANNELS_HALF - 1 ) ) );
                reference_power_q[qidx] = sub( add( *proto_direct_buffer_f_q, *proto_direct_buffer_f_q ), 31 );
                move16();

                Word16 shift = sub( *proto_direct_buffer_f_q, q_cldfb );
                FOR( k = 1; k < 4; k++ )
                {
                    RealBuffer_fx[k][0][l] = L_shr( p_proto_direct_buffer_fx[2 * ( k * num_freq_bands + l )], shift ); // proto_direct_buffer_f_q -> q_cldfb
                    move32();
                    ImagBuffer_fx[k][0][l] = L_shr( p_proto_direct_buffer_fx[2 * ( k * num_freq_bands + l ) + 1], shift ); // proto_direct_buffer_f_q -> q_cldfb
                    move32();
                }
            }
        }

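I have marked the potential bug above. In the float code, the power of the rotated channel is first set to the squared real part and then the squared imaginary part is accumulated with `+=`. In the BASOP code, the marked line assigns with Mpy_32_32 instead of accumulating, so the squared real part computed a few lines earlier is overwritten and only the imaginary part contributes to reference_power_fx[idx1] (and, through it, to the sum in reference_power_fx[l]). The marked line should most likely accumulate, in the same way as is already done for channel 0 in the same block:

        reference_power_fx[idx1] = Madd_32_32( reference_power_fx[idx1], *p_k_fx[k], *p_k_fx[k] ); // Q(2*(q_cldfb + min_q_shift)-31)

Expected effect: the reference power of the rotated channels is underestimated, which probably leads to incorrect rendering when rotating SHD for SBA.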