From bd8637d94fda7573c15f0e07b714299f3bb4ac7e Mon Sep 17 00:00:00 2001
From: Sandesh Venkatesh <sandesh.venkatesh@ittiam.com>
Date: Fri, 25 Apr 2025 14:55:08 +0530
Subject: [PATCH 1/2] Fix for 3GPP issue 1427: Basop Encoder Spectral Gaps in
 Stereo DTX 13.2 kbps Noisy Signal

Link #1427
---
 lib_enc/ivas_core_enc_fx.c      | 19 ++++++-------------
 lib_enc/ivas_core_pre_proc_fx.c | 14 ++++++++++----
 lib_enc/prot_fx_enc.h           |  1 +
 lib_enc/stat_enc.h              |  2 +-
 lib_enc/swb_bwe_enc_fx.c        | 17 +++++++++--------
 lib_enc/swb_pre_proc_fx.c       | 15 ++++++++++++++-
 6 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/lib_enc/ivas_core_enc_fx.c b/lib_enc/ivas_core_enc_fx.c
index f5d3c4d73..6fcb8abd0 100644
--- a/lib_enc/ivas_core_enc_fx.c
+++ b/lib_enc/ivas_core_enc_fx.c
@@ -675,12 +675,6 @@ ivas_error ivas_core_enc_fx(
             wb_bwe_enc_ivas_fx( st, new_inp_resamp16k_fx[n] );
         }
 
-        IF( st->hBWE_FD != NULL )
-        {
-            Scale_sig( st->hBWE_FD->L_old_wtda_swb_fx, L_FRAME48k, Q1 ); // Q-1 -> Q0
-            st->Q_old_wtda = add( st->Q_old_wtda, Q1 );
-            move16();
-        }
 
         /*---------------------------------------------------------------------*
          * SWB(FB) TBE encoding
@@ -725,8 +719,6 @@ ivas_error ivas_core_enc_fx(
             Scale_sig( st->hBWE_FD->L_old_wtda_swb_fx, L_FRAME48k, shift ); // st->Q_old_wtda
         }
 
-        Word16 q_new_swb_speech_buffer = getScaleFactor16( new_swb_speech_buffer_fx_16, L_FRAME48k + STEREO_DFT_OVL_MAX );
-        Scale_sig( new_swb_speech_buffer_fx_16, L_FRAME48k + STEREO_DFT_OVL_MAX, q_new_swb_speech_buffer ); // Q0->q_new_swb_speech_buffer
 
         /* SWB TBE encoder */
         test();
@@ -750,11 +742,9 @@ ivas_error ivas_core_enc_fx(
         }
         ELSE IF( EQ_16( st->extl, SWB_BWE ) || EQ_16( st->extl, FB_BWE ) )
         {
-            Copy_Scale_sig_32_16( shb_speech_fx32, shb_speech_fx, L_FRAME16k, -Q16 );                                     // Q_shb_spch - 16
-            Scale_sig( new_swb_speech_buffer_fx_16, L_FRAME48k + STEREO_DFT_OVL_MAX, negate( q_new_swb_speech_buffer ) ); // q_new_swb_speech_buffer -> Q0
+            Copy_Scale_sig_32_16( shb_speech_fx32, shb_speech_fx, L_FRAME16k, -Q16 ); // Q_shb_spch - 16
             /* SWB(FB) BWE encoder */
-            swb_bwe_enc_ivas_fx( st, last_element_mode, old_inp_12k8_fx[n], old_inp_16k_fx[n], old_syn_12k8_16k_fx[n], new_swb_speech_fx_16, shb_speech_fx, sub( Q_shb_spch, Q16 ), sub( Q_new[n], 1 ) );
-            Scale_sig( new_swb_speech_buffer_fx_16, L_FRAME48k + STEREO_DFT_OVL_MAX, q_new_swb_speech_buffer ); // Q0 -> q_new_swb_speech_buffer
+            swb_bwe_enc_ivas_fx( st, last_element_mode, old_inp_12k8_fx[n], old_inp_16k_fx[n], old_syn_12k8_16k_fx[n], new_swb_speech_fx_16, st->q_inp, shb_speech_fx, sub( Q_shb_spch, Q16 ), sub( Q_new[n], 1 ) );
         }
 
         Scale_sig( old_syn_12k8_16k_fx[n], L_FRAME16k, sub( Q1, Q_new[n] ) ); // Q0
@@ -776,6 +766,9 @@ ivas_error ivas_core_enc_fx(
          *  Inter-channel BWE encoding
          *-------------------------------------------------------------------*/
 
+        Word16 q_new_swb_speech_buffer = getScaleFactor16( new_swb_speech_buffer_fx_16, L_FRAME48k + STEREO_DFT_OVL_MAX );
+        Scale_sig( new_swb_speech_buffer_fx_16, L_FRAME48k + STEREO_DFT_OVL_MAX, q_new_swb_speech_buffer ); // st->q_inp+q_new_swb_speech_buffer
+        q_new_swb_speech_buffer = add( st->q_inp, q_new_swb_speech_buffer );
         test();
         test();
         IF( n == 0 && GE_32( input_Fs, 32000 ) && hStereoICBWE != NULL )
@@ -784,7 +777,7 @@ ivas_error ivas_core_enc_fx(
             stereo_icBWE_preproc_fx( hCPE, input_frame, new_swb_speech_buffer_fx_16 /*tmp buffer*/, q_new_swb_speech_buffer );
 
             q_new_swb_speech_buffer = add( q_new_swb_speech_buffer, 16 );
-            Copy_Scale_sig_16_32_no_sat( new_swb_speech_buffer_fx_16, new_swb_speech_buffer_fx, L_FRAME48k + STEREO_DFT_OVL_MAX, Q16 ); // q_new_swb_speech_buffer - 16 - > q_new_swb_speech_buffer
+            Copy_Scale_sig_16_32_no_sat( new_swb_speech_buffer_fx_16, new_swb_speech_buffer_fx, L_FRAME48k + STEREO_DFT_OVL_MAX, Q16 ); // q_new_swb_speech_buffer - 16 -> q_new_swb_speech_buffer (q_new_swb_speech_buffer already includes st->q_inp)
             Copy_Scale_sig_16_32_no_sat( voice_factors_fx[0], voice_factors_fx32[0], NB_SUBFR16k, Q16 );                                // Q31
 
             stereo_icBWE_enc_ivas_fx( hCPE, shb_speech_fx32, sub( Q31, Q_shb_spch ), new_swb_speech_buffer_fx, sub( Q31, q_new_swb_speech_buffer ), voice_factors_fx32[0] );
diff --git a/lib_enc/ivas_core_pre_proc_fx.c b/lib_enc/ivas_core_pre_proc_fx.c
index 6fe6b6720..48253aed6 100644
--- a/lib_enc/ivas_core_pre_proc_fx.c
+++ b/lib_enc/ivas_core_pre_proc_fx.c
@@ -886,8 +886,11 @@ ivas_error ivas_compute_core_buffers_fx(
 
             IF( EQ_16( st->bwidth, WB ) )
             {
-                Copy_Scale_sig( new_inp_16k_fx - delay, st->hBWE_FD->old_input_wb_fx, delay, negate( add( Q_old_inp_16k, 1 ) ) );                                        /* Scaling to Q(-1) */
-                Copy( new_inp_16k_fx - STEREO_DFT_OVL_16k, st->hBWE_FD->L_old_wtda_swb_fx + L_FRAME16k - STEREO_DFT_OVL_16k + delay, sub( STEREO_DFT_OVL_16k, delay ) ); /* Check Q here once. Q should be Q_old_wtda */
+                Copy_Scale_sig( new_inp_16k_fx - delay, st->hBWE_FD->old_input_wb_fx, delay, negate( add( Q_old_inp_16k, 1 ) ) ); /* Scaling to Q(-1) */
+                scale_sig( st->hBWE_FD->L_old_wtda_swb_fx, L_FRAME48k, sub( Q_old_inp_16k, st->Q_old_wtda ) );                    // st->Q_old_wtda -> Q_old_inp_16k
+                Copy( new_inp_16k_fx - STEREO_DFT_OVL_16k, st->hBWE_FD->L_old_wtda_swb_fx + L_FRAME16k - STEREO_DFT_OVL_16k + delay, sub( STEREO_DFT_OVL_16k, delay ) );
+                st->Q_old_wtda = Q_old_inp_16k;
+                move16();
             }
         }
         ELSE IF( EQ_16( element_mode, IVAS_CPE_TD ) )
@@ -898,8 +901,11 @@ ivas_error ivas_compute_core_buffers_fx(
             test();
             IF( EQ_16( st->bwidth, WB ) && st->hBWE_FD != NULL )
             {
-                Copy_Scale_sig( new_inp_16k_fx + L_FILT16k - delay, st->hBWE_FD->old_input_wb_fx, delay, negate( add( Q_old_inp_16k, 1 ) ) );                                              /* Scaling to Q(-1) */
-                Copy( new_inp_16k_fx - L_MEM_RECALC_16K, st->hBWE_FD->L_old_wtda_swb_fx + L_FRAME16k - L_MEM_RECALC_16K - L_FILT16k + delay, sub( L_MEM_RECALC_16K + L_FILT16k, delay ) ); /* Check Q here once. Q should be Q_old_wtda */
+                Copy_Scale_sig( new_inp_16k_fx + L_FILT16k - delay, st->hBWE_FD->old_input_wb_fx, delay, negate( add( Q_old_inp_16k, 1 ) ) ); /* Scaling to Q(-1) */
+                scale_sig( st->hBWE_FD->L_old_wtda_swb_fx, L_FRAME48k, sub( Q_old_inp_16k, st->Q_old_wtda ) );                                // st->Q_old_wtda -> Q_old_inp_16k
+                Copy( new_inp_16k_fx - L_MEM_RECALC_16K, st->hBWE_FD->L_old_wtda_swb_fx + L_FRAME16k - L_MEM_RECALC_16K - L_FILT16k + delay, sub( L_MEM_RECALC_16K + L_FILT16k, delay ) );
+                st->Q_old_wtda = Q_old_inp_16k;
+                move16();
             }
         }
         ELSE IF( element_mode == IVAS_SCE )
diff --git a/lib_enc/prot_fx_enc.h b/lib_enc/prot_fx_enc.h
index 4237aa9c3..11d6404b7 100644
--- a/lib_enc/prot_fx_enc.h
+++ b/lib_enc/prot_fx_enc.h
@@ -639,6 +639,7 @@ void swb_bwe_enc_ivas_fx(
     Word16 *old_input_16k_fx,          /* i  : input signal @16kHz for SWB BWE         */
     const Word16 *old_syn_12k8_16k_fx, /* i  : ACELP core synthesis at 12.8kHz or 16kHz */
     const Word16 *new_swb_speech_fx,   /* i  : original input signal at 32kHz           */
+    const Word16 Q_new_swb_speech,     /* i  : Q for new_swb_speech_fx                  */
     Word16 *shb_speech_fx,             /* i  : SHB target signal (6-14kHz) at 16kHz     */
     Word16 Q_shb_speech,
     Word16 Q_slb_speech );
diff --git a/lib_enc/stat_enc.h b/lib_enc/stat_enc.h
index 45ec01b31..4da0bff61 100644
--- a/lib_enc/stat_enc.h
+++ b/lib_enc/stat_enc.h
@@ -1013,7 +1013,7 @@ typedef struct fd_bwe_enc_structure
 {
     Word16 new_input_hp_fx[NS2SA( 16000, ACELP_LOOK_NS + DELAY_FD_BWE_ENC_NS + DELAY_FIR_RESAMPL_NS - DELAY_CLDFB_NS )]; // Q_new_input_hp
     Word16 Q_new_input_hp;
-    Word16 old_input_fx[NS2SA( 48000, DELAY_FD_BWE_ENC_NS + DELAY_FIR_RESAMPL_NS )]; // q0
+    Word16 old_input_fx[NS2SA( 48000, DELAY_FD_BWE_ENC_NS + DELAY_FIR_RESAMPL_NS )]; // st->q_inp
     Word16 old_input_wb_fx[NS2SA( 16000, DELAY_FD_BWE_ENC_NS )];                     /* Q(-1) */
     Word16 old_input_lp_fx[NS2SA( 16000, ACELP_LOOK_NS + DELAY_FD_BWE_ENC_NS )];     // st->hBWE_FD->prev_Q_input_lp
     Word16 old_syn_12k8_16k_fx[NS2SA( 16000, DELAY_FD_BWE_ENC_NS )];                 // st->Q_syn
diff --git a/lib_enc/swb_bwe_enc_fx.c b/lib_enc/swb_bwe_enc_fx.c
index 1ebef7404..6b309c334 100644
--- a/lib_enc/swb_bwe_enc_fx.c
+++ b/lib_enc/swb_bwe_enc_fx.c
@@ -270,6 +270,7 @@ void swb_bwe_enc_ivas_fx(
     Word16 *old_input_16k_fx,          /* i  : input signal @16kHz for SWB BWE         */
     const Word16 *old_syn_12k8_16k_fx, /* i  : ACELP core synthesis at 12.8kHz or 16kHz */
     const Word16 *new_swb_speech_fx,   /* i  : original input signal at 32kHz           */
+    const Word16 Q_new_swb_speech,     /* i  : Q for new_swb_speech_fx                  */
     Word16 *shb_speech_fx,             /* i  : SHB target signal (6-14kHz) at 16kHz     */
     Word16 Q_shb_speech,
     Word16 Q_slb_speech )
@@ -287,7 +288,7 @@ void swb_bwe_enc_ivas_fx(
     Word16 old_input_lp_fx[L_FRAME16k];
     Word16 new_input_hp_fx[L_FRAME16k];
     Word16 yorig_fx[L_FRAME48k];
-    Word16 scl, new_input_fx_exp;
+    Word16 scl, new_input_fx_q;
     Word16 max;
     Word16 Sample_Delay_SWB_BWE;
     Word16 Sample_Delay_HP;
@@ -412,7 +413,7 @@ void swb_bwe_enc_ivas_fx(
      * SWB BWE encoding
      * FB BWE encoding
      *---------------------------------------------------------------------*/
-    new_input_fx_exp = 0;
+    new_input_fx_q = Q_new_swb_speech;
     move16();
     test();
     IF( ( EQ_16( st_fx->idchan, 1 ) ) && ( EQ_16( last_element_mode, IVAS_CPE_DFT ) ) )
@@ -424,12 +425,12 @@ void swb_bwe_enc_ivas_fx(
         }
     }
     /* MDCT of the core synthesis signal */
-    wtda_fx( old_input_fx, &new_input_fx_exp, L_old_input_fx, hBWE_FD->L_old_wtda_swb_fx,
+    wtda_fx( old_input_fx, &new_input_fx_q, L_old_input_fx, hBWE_FD->L_old_wtda_swb_fx,
              &st_fx->Q_old_wtda, ALDO_WINDOW, ALDO_WINDOW, /* window overlap of current frame (0: full, 2: none, or 3: half) */
              inner_frame );
 
     /* DCT of the ACELP core synthesis */
-    direct_transform_fx( L_old_input_fx, yorig_32, 0, inner_frame, &new_input_fx_exp, st_fx->element_mode );
+    direct_transform_fx( L_old_input_fx, yorig_32, 0, inner_frame, &new_input_fx_q, st_fx->element_mode );
 
     /* high-band gain control in case of BWS */
     IF( st_fx->bwidth_sw_cnt > 0 )
@@ -438,7 +439,7 @@ void swb_bwe_enc_ivas_fx(
     }
 
     /* Convert to 16 Bits (Calc Shift Required to Stay within MAX_Q_NEW_INPUT) */
-    scl = sub( 16 + 8, new_input_fx_exp );
+    scl = sub( 16 + 8, new_input_fx_q );
     /* Possible to Upscale? */
     IF( scl > 0 )
     {
@@ -449,7 +450,7 @@ void swb_bwe_enc_ivas_fx(
         scl = s_min( Q_synth, scl );
     }
     Copy_Scale_sig32_16( yorig_32, yorig_fx, inner_frame, scl );
-    Q_synth = add( sub( new_input_fx_exp, 16 ), scl );
+    Q_synth = add( sub( new_input_fx_q, 16 ), scl );
     max = 0;
     move16();
     Q_synth_hf = 0;
@@ -558,12 +559,12 @@ void swb_bwe_enc_ivas_fx(
     IF( EQ_16( st_fx->L_frame, L_FRAME16k ) )
     {
         SWB_BWE_encoding_ivas_fx( st_fx, old_input_fx, old_input_lp_fx, new_input_hp_fx, old_syn_12k8_16k_fx, yorig_32,
-                                  SWB_fenv_fx, tilt_nb_fx, 80, Q_slb_speech, Q_shb, new_input_fx_exp, new_input_fx_exp );
+                                  SWB_fenv_fx, tilt_nb_fx, 80, Q_slb_speech, Q_shb, new_input_fx_q, new_input_fx_q );
     }
     ELSE
     {
         SWB_BWE_encoding_ivas_fx( st_fx, old_input_fx, old_input_lp_fx, new_input_hp_fx, old_syn_12k8_16k_fx, yorig_32,
-                                  SWB_fenv_fx, tilt_nb_fx, 6, Q_slb_speech, Q_shb, new_input_fx_exp, new_input_fx_exp );
+                                  SWB_fenv_fx, tilt_nb_fx, 6, Q_slb_speech, Q_shb, new_input_fx_q, new_input_fx_q );
     }
 
 
diff --git a/lib_enc/swb_pre_proc_fx.c b/lib_enc/swb_pre_proc_fx.c
index ba165cba4..30cfb4920 100644
--- a/lib_enc/swb_pre_proc_fx.c
+++ b/lib_enc/swb_pre_proc_fx.c
@@ -166,6 +166,8 @@ void wb_pre_proc_fx(
         Copy( hBWE_FD->old_input_wb_fx, old_input, Sample_Delay_WB_BWE );
         Copy( new_inp_resamp16k + L_FRAME16k - Sample_Delay_WB_BWE, hBWE_FD->old_input_wb_fx, Sample_Delay_WB_BWE );
         Copy( old_input, hBWE_FD->L_old_wtda_swb_fx, L_FRAME16k );
+        st_fx->Q_old_wtda = -1;
+        move16();
     }
     return;
 }
@@ -778,7 +780,7 @@ void swb_pre_proc_fx(
 /*full implementation pending*/
 void swb_pre_proc_ivas_fx(
     Encoder_State *st,         /* i/o: encoder state structure                  */
-    Word16 *new_swb_speech,    /* o  : original input signal at 32kHz - Q0      */
+    Word16 *new_swb_speech,    /* o  : original input signal at 32kHz - st->q_inp      */
     Word32 *new_swb_speech_fx, /* o  : original input signal at 32kHz - Q - q_reImBuffer */
     Word16 *shb_speech,        /* o  : SHB target signal (6-14kHz) at 16kHz- Q(Q_shb_spch) */
     Word16 *Q_shb_spch,
@@ -846,6 +848,9 @@ void swb_pre_proc_ivas_fx(
 
             test();
             test();
+            scale_sig( hBWE_FD->L_old_wtda_swb_fx, L_FRAME48k, sub( st->q_inp, st->Q_old_wtda ) ); // st->Q_old_wtda -> st->q_inp
+            st->Q_old_wtda = st->q_inp;
+            move16();
             IF( EQ_16( st->element_mode, IVAS_CPE_TD ) && GE_16( st->bwidth, SWB ) )
             {
                 Copy( st->input_fx - hCPE->hStereoTCA->lMemRecalc, hBWE_FD->L_old_wtda_swb_fx + L_FRAME32k - sub( hCPE->hStereoTCA->lMemRecalc, Sample_Delay_SWB_BWE ), sub( hCPE->hStereoTCA->lMemRecalc, Sample_Delay_SWB_BWE ) );
@@ -880,6 +885,8 @@ void swb_pre_proc_ivas_fx(
             IF( NE_16( st->extl, WB_BWE ) )
             {
                 Copy( old_input_fx, hBWE_FD->L_old_wtda_swb_fx, L_FRAME32k );
+                st->Q_old_wtda = st->q_inp;
+                move16();
             }
         }
 
@@ -929,6 +936,10 @@ void swb_pre_proc_ivas_fx(
 
                 test();
                 test();
+                scale_sig( hBWE_FD->L_old_wtda_swb_fx, L_FRAME48k, sub( st->q_inp, st->Q_old_wtda ) ); // st->Q_old_wtda -> st->q_inp
+                st->Q_old_wtda = st->q_inp;
+                move16();
+
                 IF( EQ_16( st->element_mode, IVAS_CPE_TD ) && GE_16( st->bwidth, SWB ) )
                 {
                     IF( EQ_16( st->bwidth, SWB ) )
@@ -1029,6 +1040,8 @@ void swb_pre_proc_ivas_fx(
                     set16_fx( old_input_fx, 0, Sample_Delay_SWB_BWE );
                     Copy( new_swb_speech + inner_frame - Sample_Delay_SWB_BWE, hBWE_FD->old_input_fx, Sample_Delay_SWB_BWE );
                     Copy( old_input_fx, hBWE_FD->L_old_wtda_swb_fx, inner_frame );
+                    st->Q_old_wtda = st->q_inp;
+                    move16();
                 }
 
                 /* resample 48 kHz to 32kHz */
-- 
GitLab


From 6e4399ee5756cb057a36cfd0e8a8b99367f53b03 Mon Sep 17 00:00:00 2001
From: Sandesh Venkatesh <sandesh.venkatesh@ittiam.com>
Date: Mon, 28 Apr 2025 08:46:47 +0530
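Subject: [PATCH 2/2] EVS bitexactness fix

Drop the `st_fx->Q_old_wtda = -1;` assignment (and its `move16()`) that
patch 1/2 added at the end of wb_pre_proc_fx(), so that function is left
unchanged by this series.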
---
 lib_enc/swb_pre_proc_fx.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lib_enc/swb_pre_proc_fx.c b/lib_enc/swb_pre_proc_fx.c
index 30cfb4920..8991696e6 100644
--- a/lib_enc/swb_pre_proc_fx.c
+++ b/lib_enc/swb_pre_proc_fx.c
@@ -166,8 +166,6 @@ void wb_pre_proc_fx(
         Copy( hBWE_FD->old_input_wb_fx, old_input, Sample_Delay_WB_BWE );
         Copy( new_inp_resamp16k + L_FRAME16k - Sample_Delay_WB_BWE, hBWE_FD->old_input_wb_fx, Sample_Delay_WB_BWE );
         Copy( old_input, hBWE_FD->L_old_wtda_swb_fx, L_FRAME16k );
-        st_fx->Q_old_wtda = -1;
-        move16();
     }
     return;
 }
-- 
GitLab