From 73a29eb997b576d4b1364b4092c48e0f67e11918 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Mon, 11 Aug 2025 16:22:45 +0200 Subject: [PATCH 1/5] fix the duration parameter - only trim if duration is exceeded --- .../generation/generate_ismN_items.py | 18 +++++------ .../generation/generate_masa_items.py | 30 +++++++++---------- .../generation/generate_mc_items.py | 22 +++++++------- .../generation/generate_omasa_items.py | 13 +++++++- .../generation/generate_osba_items.py | 20 ++++++------- .../generation/generate_sba_items.py | 22 +++++++------- .../generation/generate_stereo_items.py | 22 +++++++------- 7 files changed, 72 insertions(+), 75 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index 489dbea4..551058c7 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -443,18 +443,14 @@ def generate_ismN_scene( noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") y.audio += noise - # adjust the length of the output signal + # trim the output signal such if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y.audio) != duration: - metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) + # convert from seconds to samples (ensure multiple of 20ms) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) + + # check if the current length of the output signal exceeds the duration + if len(y.audio) > duration: + metadata.trim_meta(y, limits=[0, duration], samples=True) # adjust the loudness of 
the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_masa_items.py b/ivas_processing_scripts/generation/generate_masa_items.py index 7a425823..6b40da78 100644 --- a/ivas_processing_scripts/generation/generate_masa_items.py +++ b/ivas_processing_scripts/generation/generate_masa_items.py @@ -316,12 +316,12 @@ def generate_MASA_scene( x = reverb_hoa3(x, IR, mode=None) # adjust the level of the FOA/HOA2/HOA3 signal - if level is None: - # do not change the level of the audio source signal - logger.info("-- Level of the audio source signal is not changed") - elif np.isinf(level): + if np.isinf(level): # set all channels to zero x.audio = np.zeros_like(x.audio) + elif level is None: + # do not change the level of the audio source signal + logger.info("-- Level of the audio source signal is not changed") else: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") @@ -393,21 +393,19 @@ def generate_MASA_scene( y_int.audio, y_int.fs, limits=[-preamble, -postamble], samples=True ) - # adjust the length of the output signal + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y_int.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y_int.audio) != duration: - y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True + # convert from seconds to samples (ensure multiple of 20ms) + duration = int( + np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len ) + # check if the current length of the output signal exceeds the duration + if len(y_int.audio) > duration: + y_int.audio = audioarray.trim( + y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], 
samples=True + ) + # adjust the loudness of the output signal if "loudness" in cfg.__dict__: logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LKFS") diff --git a/ivas_processing_scripts/generation/generate_mc_items.py b/ivas_processing_scripts/generation/generate_mc_items.py index df1b1645..daacf2d8 100644 --- a/ivas_processing_scripts/generation/generate_mc_items.py +++ b/ivas_processing_scripts/generation/generate_mc_items.py @@ -393,21 +393,19 @@ def generate_MC_scene( y_int.audio, y_int.fs, limits=[-preamble, -postamble], samples=True ) - # adjust the length of the output signal + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y_int.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y_int.audio) != duration: - y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True + # convert from seconds to samples (ensure multiple of 20ms) + duration = int( + np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len ) + # check if the current length of the output signal exceeds the duration + if len(y_int.audio) > duration: + y_int.audio = audioarray.trim( + y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True + ) + # adjust the loudness of the output signal if "loudness" in cfg.__dict__: logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LKFS") diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 93f00305..2ca92195 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -472,8 +472,9 @@ def 
generate_OMASA_scene( noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") y_int.audio += noise - # adjust the length of the output signal + # trim the output signal such if the total duration exceeds X seconds if "duration" in cfg.__dict__: +<<<<<<< Updated upstream # trim the output signal such that the total duration is X seconds duration = int(cfg.duration * cfg.fs) # convert to samples else: @@ -484,6 +485,16 @@ def generate_OMASA_scene( ) # ensure multiple of 20ms if len(y_int.audio) != duration: metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True) +======= + # convert from seconds to samples (ensure multiple of 20ms) + duration = int( + np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len + ) + + # check if the current length of the output signal exceeds the duration + if len(y.audio) > duration: + metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) +>>>>>>> Stashed changes # adjust the loudness of the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index dd8f5b5d..67e493b5 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -458,18 +458,16 @@ def generate_OSBA_scene( noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") y.audio += noise - # adjust the length of the output signal + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y.audio) != duration: - metadata.trim_meta(y, limits=[0, 
len(y.audio) - duration], samples=True) + # convert from seconds to samples (ensure multiple of 20ms) + duration = int( + np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len + ) + + # check if the current length of the output signal exceeds the duration + if len(y.audio) > duration: + metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_sba_items.py b/ivas_processing_scripts/generation/generate_sba_items.py index 28fbabab..f40ba9d8 100644 --- a/ivas_processing_scripts/generation/generate_sba_items.py +++ b/ivas_processing_scripts/generation/generate_sba_items.py @@ -364,21 +364,19 @@ def generate_sba_scene( y.audio, y.fs, limits=[-preamble, -postamble], samples=True ) - # adjust the length of the output signal + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y.audio) != duration: - y.audio = audioarray.trim( - y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True + # convert from seconds to samples (ensure multiple of 20ms) + duration = int( + np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len ) + # check if the current length of the output signal exceeds the duration + if len(y.audio) > duration: + y.audio = audioarray.trim( + y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True + ) + # adjust the loudness of the output signal if "loudness" in cfg.__dict__: logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LKFS") diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py 
b/ivas_processing_scripts/generation/generate_stereo_items.py index a0d99f90..6bf1e95e 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -365,21 +365,19 @@ def generate_stereo_scene( y.audio, y.fs, limits=[-preamble, -postamble], samples=True ) - # adjust the length of the output signal + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y.audio) != duration: - y.audio = audioarray.trim( - y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True + # convert from seconds to samples (ensure multiple of 20ms) + duration = int( + np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len ) + # check if the current length of the output signal exceeds the duration + if len(y.audio) > duration: + y.audio = audioarray.trim( + y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True + ) + # adjust the loudness of the output signal if "loudness" in cfg.__dict__: logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LKFS") -- GitLab From 93d4dbfed845607d10b5aec936578119636e7142 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Tue, 12 Aug 2025 14:07:34 +0200 Subject: [PATCH 2/5] support for X(i_ref) notation to allow specifying overlap between items --- .../generation/generate_ismN_items.py | 46 +++++++++++---- .../generation/generate_masa_items.py | 57 +++++++++++++++---- .../generation/generate_mc_items.py | 36 +++++++++--- .../generation/generate_omasa_items.py | 53 ++++++++++------- .../generation/generate_osba_items.py | 36 +++++++++--- .../generation/generate_sba_items.py | 36 +++++++++--- 
.../generation/generate_stereo_items.py | 36 +++++++++--- 7 files changed, 228 insertions(+), 72 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index 551058c7..f57f58c3 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -30,6 +30,7 @@ # the United Nations Convention on Contracts on the International Sales of Goods. # import logging +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -196,6 +197,7 @@ def generate_ismN_scene( # repeat for all source files offset = 0 + end_position = [] for i in range(N_inputs): # read input filename source_file = ( @@ -232,16 +234,33 @@ def generate_ismN_scene( if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error(f"Unable to parse {scene_shift_str}. 
The specification of overlap or reference is incorrect!") + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -278,6 +297,9 @@ def generate_ismN_scene( x = audio.fromtype("ISM1") x.audio, x.fs = audiofile.read(input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -288,12 +310,12 @@ def generate_ismN_scene( x.fs = cfg.fs # adjust the level of the audio source file (need to convert to MONO first) - if level is None: - # do not change the level of the audio source signal - logger.info("-- Level of the audio source signal is not changed") - elif np.isinf(level): + if np.isinf(level): # set all channels to zero x.audio = np.zeros_like(x.audio) + elif level is None: + # do not change the level of the audio source signal + logger.info("-- Level of the audio source signal is not changed") else: x_temp = audio.ChannelBasedAudio( "MONO" @@ -391,21 +413,21 @@ def generate_ismN_scene( y.object_pos = x.object_pos.copy() y.fs = x.fs - if source_shift < 0: + if source_shift > 0: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(y, limits=[source_shift, 0], samples=True) + metadata.trim_meta(y, limits=[-source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal 
delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the previous ISM signal(s) to shift them right - metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True) + metadata.trim_meta(y, limits=[delta_offset, 0], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) + metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y.audio) @@ -443,14 +465,14 @@ def generate_ismN_scene( noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") y.audio += noise - # trim the output signal such if the total duration exceeds X seconds + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y.audio) > duration: - metadata.trim_meta(y, limits=[0, duration], samples=True) + metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_masa_items.py b/ivas_processing_scripts/generation/generate_masa_items.py index 6b40da78..6ddac870 100644 --- a/ivas_processing_scripts/generation/generate_masa_items.py +++ b/ivas_processing_scripts/generation/generate_masa_items.py @@ -31,6 +31,7 @@ # import logging +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -209,6 +210,7 @@ def generate_MASA_scene( # repeat for all source files offset = 0 y_int = None + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -232,13 +234,44 @@ def 
generate_MASA_scene( else: source_shift = 0.0 + # read the source shift length (in seconds) + if "shift" in scene.keys(): + source_shift = ( + scene["shift"][i] + if isinstance(scene["shift"], list) + else scene["shift"] + ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" + ) + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] - overlap + else: + source_shift = 0.0 + # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -295,6 +328,9 @@ def generate_MASA_scene( # read source file x = audio.fromfile("MONO", input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -339,26 +375,26 @@ def generate_MASA_scene( # this is the first SBA source signal y_int.audio = x.audio.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the first SBA source signal to shift it right y_int.audio = 
audioarray.trim( - y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True + y_int.audio, y_int.fs, limits=[-source_shift, 0], samples=True ) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the output SBA signal to shift it right y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True + y_int.audio, y_int.fs, limits=[delta_offset, 0], samples=True ) offset = source_shift else: # insert zeros to the new SBA source signal to shift it right x.audio = audioarray.trim( - x.audio, x.fs, limits=[delta_offset, 0], samples=True + x.audio, x.fs, limits=[-delta_offset, 0], samples=True ) # adjust the length of the audio source signal @@ -396,14 +432,15 @@ def generate_MASA_scene( # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) - duration = int( - np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len - ) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y_int.audio) > duration: y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True + y_int.audio, + y_int.fs, + limits=[0, len(y_int.audio) - duration], + samples=True, ) # adjust the loudness of the output signal diff --git a/ivas_processing_scripts/generation/generate_mc_items.py b/ivas_processing_scripts/generation/generate_mc_items.py index daacf2d8..a37a3710 100644 --- a/ivas_processing_scripts/generation/generate_mc_items.py +++ b/ivas_processing_scripts/generation/generate_mc_items.py @@ -31,6 +31,7 @@ # import logging +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -209,6 +210,7 @@ def generate_MC_scene( # repeat for all source 
files offset = 0 y_int = None + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -222,23 +224,40 @@ def generate_MC_scene( ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) - # read the shift time in seconds + # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -295,6 +314,9 @@ def generate_MC_scene( # read source file x = audio.fromfile("MONO", input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -339,26 +361,26 @@ def 
generate_MC_scene( # this is the first SBA source signal y_int.audio = x.audio.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the first SBA source signal to shift it right y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True + y_int.audio, y_int.fs, limits=[-source_shift, 0], samples=True ) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the output SBA signal to shift it right y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True + y_int.audio, y_int.fs, limits=[delta_offset, 0], samples=True ) offset = source_shift else: # insert zeros to the new SBA source signal to shift it right x.audio = audioarray.trim( - x.audio, x.fs, limits=[delta_offset, 0], samples=True + x.audio, x.fs, limits=[-delta_offset, 0], samples=True ) # adjust the length of the audio source signal diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 2ca92195..73831922 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -31,6 +31,7 @@ # import logging +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -183,6 +184,7 @@ def generate_OMASA_scene( # repeat for all source files offset = 0 + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -220,16 +222,33 @@ def generate_OMASA_scene( if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match 
= re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -300,6 +319,9 @@ def generate_OMASA_scene( # read source file x = audio.fromfile(fmt, input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -417,21 +439,21 @@ def generate_OMASA_scene( # add the first audio source signal (should be FOA/HOA2/HOA3) to the array of all source signals y_int.audio = x.audio.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(y_int, limits=[source_shift, 0], samples=True) + metadata.trim_meta(y_int, limits=[-source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the existing intermediate OSBA object to shift it right - metadata.trim_meta(y_int, limits=[-delta_offset, 0], samples=True) + metadata.trim_meta(y_int, 
limits=[delta_offset, 0], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) + metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y_int.audio) @@ -472,29 +494,16 @@ def generate_OMASA_scene( noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") y_int.audio += noise - # trim the output signal such if the total duration exceeds X seconds + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: -<<<<<<< Updated upstream - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y_int.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y_int.audio) != duration: - metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True) -======= # convert from seconds to samples (ensure multiple of 20ms) duration = int( np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len ) # check if the current length of the output signal exceeds the duration - if len(y.audio) > duration: - metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) ->>>>>>> Stashed changes + if len(y_int.audio) > duration: + metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index 67e493b5..64921b7d 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -31,6 +31,7 @@ # import logging 
+import re import sys from itertools import groupby, repeat from pathlib import Path @@ -187,6 +188,7 @@ def generate_OSBA_scene( # repeat for all source files offset = 0 + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -210,23 +212,40 @@ def generate_OSBA_scene( else scene["elevation"] ) - # read the shift time in seconds + # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error(f"Unable to parse {scene_shift_str}. 
The specification of overlap or reference is incorrect!") + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -282,6 +301,9 @@ def generate_OSBA_scene( # read source file x = audio.fromfile(fmt, input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -403,21 +425,21 @@ def generate_OSBA_scene( # if ISM, append object position to the OSBA object y.object_pos = x.object_pos.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(y, limits=[source_shift, 0], samples=True) + metadata.trim_meta(y, limits=[-source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the previous ISM signal(s) to shift them right - metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True) + metadata.trim_meta(y, limits=[delta_offset, 0], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) + metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True) # adjust the length of the audio source signal delta_length 
= len(x.audio) - len(y.audio) diff --git a/ivas_processing_scripts/generation/generate_sba_items.py b/ivas_processing_scripts/generation/generate_sba_items.py index f40ba9d8..36c8b828 100644 --- a/ivas_processing_scripts/generation/generate_sba_items.py +++ b/ivas_processing_scripts/generation/generate_sba_items.py @@ -31,6 +31,7 @@ # import logging +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -201,6 +202,7 @@ def generate_sba_scene( # repeat for all source files offset = 0 + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -214,23 +216,40 @@ def generate_sba_scene( ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) - # read the shift time in seconds + # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error(f"Unable to parse {scene_shift_str}. 
The specification of overlap or reference is incorrect!") + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -266,6 +285,9 @@ def generate_sba_scene( # read source file x = audio.fromfile("MONO", input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -310,26 +332,26 @@ def generate_sba_scene( # add source signal to the array of all source signals y.audio = x.audio.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the new audio source signal to shift it right y.audio = audioarray.trim_meta( - y.audio, y.fs, limits=[source_shift, 0], samples=True + y.audio, y.fs, limits=[-source_shift, 0], samples=True ) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the existing output signal to shift it right y.audio = audioarray.trim( - y.audio, y.fs, limits=[-delta_offset, 0], samples=True + y.audio, y.fs, limits=[delta_offset, 0], samples=True ) offset = source_shift else: # insert zeros to the new audio source signal to shift it right x.audio = audioarray.trim( - x.audio, x.fs, limits=[delta_offset, 0], samples=True + x.audio, x.fs, limits=[-delta_offset, 0], samples=True ) # adjust the length of 
the audio source signal diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py index 6bf1e95e..9bc6a73d 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -32,6 +32,7 @@ import logging import os +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -207,6 +208,7 @@ def generate_stereo_scene( # repeat for all source files offset = 0 + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -220,23 +222,40 @@ def generate_stereo_scene( ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) - # read the shift time in seconds + # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error(f"Unable to parse {scene_shift_str}. 
The specification of overlap or reference is incorrect!") + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -284,6 +303,9 @@ def generate_stereo_scene( # read the IR file (!must be in STEREO format!) IR = audio.fromfile("STEREO", IR_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # convolve MONO source audio with STEREO IR -> results in STEREO audio object x = reverb_stereo(x, IR, mode=None) @@ -311,26 +333,26 @@ def generate_stereo_scene( # add source signal to the array of all source signals y.audio = x.audio.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the new audio source signal to shift it right y.audio = audioarray.trim( - y.audio, x.fs, limits=[source_shift, 0], samples=True + y.audio, x.fs, limits=[-source_shift, 0], samples=True ) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the existing output signal to shift it right y.audio = audioarray.trim( - y.audio, y.fs, limits=[-delta_offset, 0], samples=True + y.audio, y.fs, limits=[delta_offset, 0], samples=True ) offset = source_shift else: # insert zeros to the new audio source signal to shift it right x.audio = audioarray.trim( - x.audio, x.fs, limits=[delta_offset, 0], samples=True + x.audio, 
x.fs, limits=[-delta_offset, 0], samples=True ) # adjust the length of the audio source signal -- GitLab From 85be2fd0eb514e124ab462892aed7925aca0bf7e Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Tue, 12 Aug 2025 14:55:23 +0200 Subject: [PATCH 3/5] formatting --- .../generation/generate_ismN_items.py | 8 ++++++-- .../generation/generate_mc_items.py | 17 +++++++++++------ .../generation/generate_omasa_items.py | 16 ++++++++++------ .../generation/generate_osba_items.py | 12 +++++++----- .../generation/generate_sba_items.py | 12 +++++++----- .../generation/generate_stereo_items.py | 12 +++++++----- 6 files changed, 48 insertions(+), 29 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index f57f58c3..3c474309 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -239,14 +239,18 @@ def generate_ismN_scene( # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref - match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] - logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" 
+ ) sys.exit(-1) # calculate absolute shift of the source signal in seconds diff --git a/ivas_processing_scripts/generation/generate_mc_items.py b/ivas_processing_scripts/generation/generate_mc_items.py index a37a3710..35dcbb3b 100644 --- a/ivas_processing_scripts/generation/generate_mc_items.py +++ b/ivas_processing_scripts/generation/generate_mc_items.py @@ -236,14 +236,18 @@ def generate_MC_scene( # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref - match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] - logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" 
+ ) sys.exit(-1) # calculate absolute shift of the source signal in seconds @@ -418,14 +422,15 @@ def generate_MC_scene( # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) - duration = int( - np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len - ) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y_int.audio) > duration: y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True + y_int.audio, + y_int.fs, + limits=[0, len(y_int.audio) - duration], + samples=True, ) # adjust the loudness of the output signal diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 73831922..0881c7ca 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -227,14 +227,18 @@ def generate_OMASA_scene( # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref - match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] - logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" 
+ ) sys.exit(-1) # calculate absolute shift of the source signal in seconds @@ -497,13 +501,13 @@ def generate_OMASA_scene( # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) - duration = int( - np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len - ) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y_int.audio) > duration: - metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True) + metadata.trim_meta( + y_int, limits=[0, len(y_int.audio) - duration], samples=True + ) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index 64921b7d..8d2ca0d8 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -224,14 +224,18 @@ def generate_OSBA_scene( # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref - match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] - logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" 
+ ) sys.exit(-1) # calculate absolute shift of the source signal in seconds @@ -483,9 +487,7 @@ def generate_OSBA_scene( # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) - duration = int( - np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len - ) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y.audio) > duration: diff --git a/ivas_processing_scripts/generation/generate_sba_items.py b/ivas_processing_scripts/generation/generate_sba_items.py index 36c8b828..631d6165 100644 --- a/ivas_processing_scripts/generation/generate_sba_items.py +++ b/ivas_processing_scripts/generation/generate_sba_items.py @@ -228,14 +228,18 @@ def generate_sba_scene( # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref - match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] - logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" 
+ ) sys.exit(-1) # calculate absolute shift of the source signal in seconds @@ -389,9 +393,7 @@ def generate_sba_scene( # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) - duration = int( - np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len - ) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y.audio) > duration: diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py index 9bc6a73d..1ad8a6ae 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -234,14 +234,18 @@ def generate_stereo_scene( # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref - match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] - logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" 
+ ) sys.exit(-1) # calculate absolute shift of the source signal in seconds @@ -390,9 +394,7 @@ def generate_stereo_scene( # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) - duration = int( - np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len - ) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y.audio) > duration: -- GitLab From 68510c78eca959856594de807d6b2b578bec4183 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Wed, 13 Aug 2025 10:52:35 +0200 Subject: [PATCH 4/5] update examples --- examples/ITEM_GENERATION_3ISM.yml | 7 +++++-- examples/ITEM_GENERATION_5_1_4.yml | 3 ++- examples/ITEM_GENERATION_FOA.yml | 7 ++++--- examples/ITEM_GENERATION_MASA.yml | 3 ++- examples/ITEM_GENERATION_OMASA.yml | 7 +++++-- examples/ITEM_GENERATION_OSBA.yml | 7 +++++-- examples/ITEM_GENERATION_STEREO.yml | 5 +++-- 7 files changed, 26 insertions(+), 13 deletions(-) diff --git a/examples/ITEM_GENERATION_3ISM.yml b/examples/ITEM_GENERATION_3ISM.yml index e770cadf..53dd0ded 100644 --- a/examples/ITEM_GENERATION_3ISM.yml +++ b/examples/ITEM_GENERATION_3ISM.yml @@ -37,7 +37,7 @@ duration: 8 add_low_level_random_noise: true ### Process with parallel streams -multiprocessing: False +multiprocessing: false ################################################ ### Item generation - Filename conventions @@ -93,7 +93,10 @@ provider: "va" ### input: input filename(s) ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the 
reference signal i_ref (0-based index) (positive value creates gap) +### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) +### background_level: normalized background noise loudness to X dB LKFS ### ### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder) ### Note 1: use brackets [val1, val2, ...] when specifying multiple values diff --git a/examples/ITEM_GENERATION_5_1_4.yml b/examples/ITEM_GENERATION_5_1_4.yml index 4670d197..2a0dbd27 100644 --- a/examples/ITEM_GENERATION_5_1_4.yml +++ b/examples/ITEM_GENERATION_5_1_4.yml @@ -94,7 +94,8 @@ provider: "va" ### IR: filenames(s) of the input IRs ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) ### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) ### background_level: normalized background noise loudness to X dB LKFS ### diff --git a/examples/ITEM_GENERATION_FOA.yml b/examples/ITEM_GENERATION_FOA.yml index 879735d4..016c5fcf 100644 --- a/examples/ITEM_GENERATION_FOA.yml +++ b/examples/ITEM_GENERATION_FOA.yml @@ -34,10 +34,10 @@ fade_in_out: 0.5 duration: 8 ### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence) -add_low_level_random_noise: False +add_low_level_random_noise: false ### Process with parallel streams -multiprocessing: False +multiprocessing: false ################################################ ### Item generation - Filename 
conventions @@ -94,7 +94,8 @@ use_output_prefix: "leee" ### IR: filenames(s) of the input IRs ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) ### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) ### background_level: normalized background noise loudness to X dB LKFS ### diff --git a/examples/ITEM_GENERATION_MASA.yml b/examples/ITEM_GENERATION_MASA.yml index 958a69cb..715b20c4 100644 --- a/examples/ITEM_GENERATION_MASA.yml +++ b/examples/ITEM_GENERATION_MASA.yml @@ -94,7 +94,8 @@ provider: "va" ### IR: filenames(s) of the input IRs ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) ### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) ### background_level: normalized background noise loudness to X dB LKFS ### diff --git a/examples/ITEM_GENERATION_OMASA.yml b/examples/ITEM_GENERATION_OMASA.yml index 942ad1c7..462bc54e 100644 --- a/examples/ITEM_GENERATION_OMASA.yml +++ b/examples/ITEM_GENERATION_OMASA.yml @@ -37,7 +37,7 @@ duration: 8 add_low_level_random_noise: true ### Process with parallel streams -multiprocessing: False 
+multiprocessing: false ################################################ ### Item generation - Filename conventions @@ -93,7 +93,10 @@ provider: "va" ### input: input filename(s) ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) +### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) +### background_level: normalized background noise loudness to X dB LKFS ### ### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder) ### Note 1: use brackets [val1, val2, ...] 
when specifying multiple values diff --git a/examples/ITEM_GENERATION_OSBA.yml b/examples/ITEM_GENERATION_OSBA.yml index f7c33b49..3b696838 100644 --- a/examples/ITEM_GENERATION_OSBA.yml +++ b/examples/ITEM_GENERATION_OSBA.yml @@ -37,7 +37,7 @@ duration: 8 add_low_level_random_noise: true ### Process with parallel streams -multiprocessing: False +multiprocessing: false ################################################ ### Item generation - Filename conventions @@ -93,7 +93,10 @@ provider: "va" ### input: input filename(s) ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) +### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) +### background_level: normalized background noise loudness to X dB LKFS ### ### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder) ### Note 1: use brackets [val1, val2, ...] 
when specifying multiple values diff --git a/examples/ITEM_GENERATION_STEREO.yml b/examples/ITEM_GENERATION_STEREO.yml index 48c6aa61..78426358 100644 --- a/examples/ITEM_GENERATION_STEREO.yml +++ b/examples/ITEM_GENERATION_STEREO.yml @@ -37,7 +37,7 @@ duration: 8 add_low_level_random_noise: true ### Process with parallel streams -multiprocessing: False +multiprocessing: false ################################################ ### Item generation - Filename conventions @@ -94,7 +94,8 @@ provider: "g" ### IR: filenames(s) of the input IRs ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) ### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) ### background_level: normalized background noise loudness to X dB LKFS ### -- GitLab From a8b5728f5e29a03896f0244ce5f12e70d3d16882 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Wed, 13 Aug 2025 10:55:39 +0200 Subject: [PATCH 5/5] update doc --- README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 02f00dde..d47e2ea3 100755 --- a/README.md +++ b/README.md @@ -76,13 +76,11 @@ Each entry under `scenes:` describes one test item, specifying: - `input`: list of mono `.wav` files - `azimuth` / `elevation`: spatial placement (°) - `level`: loudness in dB -- `shift`: timing offsets in seconds +- `shift`: signal offset/overlap in seconds - `background`: background noise file (applicable to STEREO and SBA only) - `background_level`: level of the background noise (applicable to STEREO and SBA only) -Dynamic 
positioning (e.g., `"-20:1.0:360"`) means the source will move over time, stepping every 20 ms. - -The total duration of the output signal can be controlled using the `duration` field. The output signal may optionally be rendered to the BINAURAL format by specifying the `binaural_output` field. +Dynamic positioning (e.g., `"-20:1.0:360"`) means the source will move over time, stepping every 20 ms. The maximum total duration of the output signal can be controlled using the `duration` field. The output signal may optionally be rendered to the BINAURAL format by specifying the `binaural_output` field. The `shift` parameter ensures time adjustment (offset) of the input signal (positive value delays the signal). Alternatively, the notation `X(i_ref)` generates overlap by `X` seconds from the reference signal `i_ref` (0-based index) (positive value creates gap). Start by running a single scene to verify settings. Output includes both audio and optional metadata files. You can enable multiprocessing by setting `multiprocessing: true`. -- GitLab