diff --git a/README.md b/README.md
index 02f00dde932e8bcd03c6a1f62b19a75597d54ebf..d47e2ea3f79c18b73d95da6790dc4f1ce1fa8073 100755
--- a/README.md
+++ b/README.md
@@ -76,13 +76,11 @@ Each entry under `scenes:` describes one test item, specifying:
 - `input`: list of mono `.wav` files
 - `azimuth` / `elevation`: spatial placement (°)
 - `level`: loudness in dB
-- `shift`: timing offsets in seconds
+- `shift`: signal offset/overlap in seconds
 - `background`: background noise file (applicable to STEREO and SBA only)
 - `background_level`: level of the background noise (applicable to STEREO and SBA only)
 
-Dynamic positioning (e.g., `"-20:1.0:360"`) means the source will move over time, stepping every 20 ms.
-
-The total duration of the output signal can be controlled using the `duration` field. The output signal may optionally be rendered to the BINAURAL format by specifying the `binaural_output` field.
+Dynamic positioning (e.g., `"-20:1.0:360"`) means the source will move over time, stepping every 20 ms. The maximum total duration of the output signal can be controlled using the `duration` field. The output signal may optionally be rendered to the BINAURAL format by specifying the `binaural_output` field. The `shift` parameter applies a time adjustment (offset) to the input signal (a positive value delays the signal). Alternatively, the notation `X(i_ref)` starts the signal `X` seconds after the end of the reference signal `i_ref` (0-based index): a positive value creates a gap, a negative value creates an overlap.
 
 Start by running a single scene to verify settings. Output includes both audio and optional metadata files. You can enable multiprocessing by setting `multiprocessing: true`.
 
diff --git a/examples/ITEM_GENERATION_3ISM.yml b/examples/ITEM_GENERATION_3ISM.yml
index e770cadf706b775682b8339677f64e863b0849b2..53dd0dedf501393b4ac35b7f19fbcd39686e23d7 100644
--- a/examples/ITEM_GENERATION_3ISM.yml
+++ b/examples/ITEM_GENERATION_3ISM.yml
@@ -37,7 +37,7 @@ duration: 8
 add_low_level_random_noise: true
 
 ### Process with parallel streams
-multiprocessing: False
+multiprocessing: false
 
 ################################################
 ### Item generation - Filename conventions
@@ -93,7 +93,10 @@ provider: "va"
 ###   input:       input filename(s)
 ###   azimuth:     azimuth in the range [-180,180]; positive values point to the left
 ###   elevation:   elevation in the range [-90,90]; positive values indicate up
-###   shift:       time adjustment of the input signal (negative value delays the signal)
+###   shift:       time adjustment of the input signal (positive value delays the signal)
+###                alternatively, the notation X(i_ref) starts the signal X seconds after the end of the reference signal i_ref (0-based index); positive X creates a gap, negative X creates an overlap
+###   background:  background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored)
+###   background_level:  normalized background noise loudness to X dB LKFS
 ###
 ### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder)
 ### Note 1: use brackets [val1, val2, ...] when specifying multiple values 
diff --git a/examples/ITEM_GENERATION_5_1_4.yml b/examples/ITEM_GENERATION_5_1_4.yml
index 4670d1979eefe3b9fc0a0aaeb6521c9eb1aadf8c..2a0dbd278315dfff0deac6e9fcdcc587d5286ed4 100644
--- a/examples/ITEM_GENERATION_5_1_4.yml
+++ b/examples/ITEM_GENERATION_5_1_4.yml
@@ -94,7 +94,8 @@ provider: "va"
 ###   IR:          filenames(s) of the input IRs 
 ###   azimuth:     azimuth in the range [-180,180]; positive values point to the left
 ###   elevation:   elevation in the range [-90,90]; positive values indicate up
-###   shift:       time adjustment of the input signal (negative value delays the signal)
+###   shift:       time adjustment of the input signal (positive value delays the signal)
+###                alternatively, the notation X(i_ref) starts the signal X seconds after the end of the reference signal i_ref (0-based index); positive X creates a gap, negative X creates an overlap
 ###   background:  background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored)
 ###   background_level:  normalized background noise loudness to X dB LKFS
 ###
diff --git a/examples/ITEM_GENERATION_FOA.yml b/examples/ITEM_GENERATION_FOA.yml
index 879735d42f6dc46fa9c996e98de087898b0fadc1..016c5fcfa268ca657667c0a18473eeec98aca06a 100644
--- a/examples/ITEM_GENERATION_FOA.yml
+++ b/examples/ITEM_GENERATION_FOA.yml
@@ -34,10 +34,10 @@ fade_in_out: 0.5
 duration: 8
 
 ### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
-add_low_level_random_noise: False
+add_low_level_random_noise: false
 
 ### Process with parallel streams
-multiprocessing: False
+multiprocessing: false
 
 ################################################
 ### Item generation - Filename conventions
@@ -94,7 +94,8 @@ use_output_prefix: "leee"
 ###   IR:          filenames(s) of the input IRs 
 ###   azimuth:     azimuth in the range [-180,180]; positive values point to the left
 ###   elevation:   elevation in the range [-90,90]; positive values indicate up
-###   shift:       time adjustment of the input signal (negative value delays the signal)
+###   shift:       time adjustment of the input signal (positive value delays the signal)
+###                alternatively, the notation X(i_ref) starts the signal X seconds after the end of the reference signal i_ref (0-based index); positive X creates a gap, negative X creates an overlap
 ###   background:  background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored)
 ###   background_level:  normalized background noise loudness to X dB LKFS
 ###
diff --git a/examples/ITEM_GENERATION_MASA.yml b/examples/ITEM_GENERATION_MASA.yml
index 958a69cb5e27c5d710a10c0af07c5501b2516b0b..715b20c4932eb6b21ebcd9642b55d4b8c4805bae 100644
--- a/examples/ITEM_GENERATION_MASA.yml
+++ b/examples/ITEM_GENERATION_MASA.yml
@@ -94,7 +94,8 @@ provider: "va"
 ###   IR:          filenames(s) of the input IRs 
 ###   azimuth:     azimuth in the range [-180,180]; positive values point to the left
 ###   elevation:   elevation in the range [-90,90]; positive values indicate up
-###   shift:       time adjustment of the input signal (negative value delays the signal)
+###   shift:       time adjustment of the input signal (positive value delays the signal)
+###                alternatively, the notation X(i_ref) starts the signal X seconds after the end of the reference signal i_ref (0-based index); positive X creates a gap, negative X creates an overlap
 ###   background:  background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored)
 ###   background_level:  normalized background noise loudness to X dB LKFS
 ###
diff --git a/examples/ITEM_GENERATION_OMASA.yml b/examples/ITEM_GENERATION_OMASA.yml
index 942ad1c7d323a88dc9fd057fc12970e767bb801f..462bc54e116c8a73869d7b79206435f97e024009 100644
--- a/examples/ITEM_GENERATION_OMASA.yml
+++ b/examples/ITEM_GENERATION_OMASA.yml
@@ -37,7 +37,7 @@ duration: 8
 add_low_level_random_noise: true
 
 ### Process with parallel streams
-multiprocessing: False
+multiprocessing: false
 
 ################################################
 ### Item generation - Filename conventions
@@ -93,7 +93,10 @@ provider: "va"
 ###   input:       input filename(s)
 ###   azimuth:     azimuth in the range [-180,180]; positive values point to the left
 ###   elevation:   elevation in the range [-90,90]; positive values indicate up
-###   shift:       time adjustment of the input signal (negative value delays the signal)
+###   shift:       time adjustment of the input signal (positive value delays the signal)
+###                alternatively, the notation X(i_ref) starts the signal X seconds after the end of the reference signal i_ref (0-based index); positive X creates a gap, negative X creates an overlap
+###   background:  background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored)
+###   background_level:  normalized background noise loudness to X dB LKFS
 ###
 ### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder)
 ### Note 1: use brackets [val1, val2, ...] when specifying multiple values 
diff --git a/examples/ITEM_GENERATION_OSBA.yml b/examples/ITEM_GENERATION_OSBA.yml
index f7c33b490f6edd62ffa2d1e1faf0b582bf614b88..3b6968381d205f73bdcf9bef3b83ceae830da258 100644
--- a/examples/ITEM_GENERATION_OSBA.yml
+++ b/examples/ITEM_GENERATION_OSBA.yml
@@ -37,7 +37,7 @@ duration: 8
 add_low_level_random_noise: true
 
 ### Process with parallel streams
-multiprocessing: False
+multiprocessing: false
 
 ################################################
 ### Item generation - Filename conventions
@@ -93,7 +93,10 @@ provider: "va"
 ###   input:       input filename(s)
 ###   azimuth:     azimuth in the range [-180,180]; positive values point to the left
 ###   elevation:   elevation in the range [-90,90]; positive values indicate up
-###   shift:       time adjustment of the input signal (negative value delays the signal)
+###   shift:       time adjustment of the input signal (positive value delays the signal)
+###                alternatively, the notation X(i_ref) starts the signal X seconds after the end of the reference signal i_ref (0-based index); positive X creates a gap, negative X creates an overlap
+###   background:  background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored)
+###   background_level:  normalized background noise loudness to X dB LKFS
 ###
 ### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder)
 ### Note 1: use brackets [val1, val2, ...] when specifying multiple values 
diff --git a/examples/ITEM_GENERATION_STEREO.yml b/examples/ITEM_GENERATION_STEREO.yml
index 48c6aa61d28ed0a481d35fafd745f5c0e842ab15..784263588b4f5390f3081799f93cad533b13b91f 100644
--- a/examples/ITEM_GENERATION_STEREO.yml
+++ b/examples/ITEM_GENERATION_STEREO.yml
@@ -37,7 +37,7 @@ duration: 8
 add_low_level_random_noise: true
 
 ### Process with parallel streams
-multiprocessing: False
+multiprocessing: false
 
 ################################################
 ### Item generation - Filename conventions
@@ -94,7 +94,8 @@ provider: "g"
 ###   IR:          filenames(s) of the input IRs 
 ###   azimuth:     azimuth in the range [-180,180]; positive values point to the left
 ###   elevation:   elevation in the range [-90,90]; positive values indicate up
-###   shift:       time adjustment of the input signal (negative value delays the signal)
+###   shift:       time adjustment of the input signal (positive value delays the signal)
+###                alternatively, the notation X(i_ref) starts the signal X seconds after the end of the reference signal i_ref (0-based index); positive X creates a gap, negative X creates an overlap
 ###   background:  background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored)
 ###   background_level:  normalized background noise loudness to X dB LKFS
 ###
diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py
index 489dbea486f9fdbe713024cfdab652945c309ef3..3c474309e24763e35b6c13c906448cb68bc565b0 100644
--- a/ivas_processing_scripts/generation/generate_ismN_items.py
+++ b/ivas_processing_scripts/generation/generate_ismN_items.py
@@ -30,6 +30,7 @@
 #  the United Nations Convention on Contracts on the International Sales of Goods.
 #
 import logging
+import re
 import sys
 from itertools import groupby, repeat
 from pathlib import Path
@@ -196,6 +197,7 @@ def generate_ismN_scene(
 
     # repeat for all source files
     offset = 0
+    end_position = []
     for i in range(N_inputs):
         # read input filename
         source_file = (
@@ -232,16 +234,37 @@ def generate_ismN_scene(
                 if isinstance(scene["shift"], list)
                 else scene["shift"]
             )
+
+            # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index
+            # of the reference signal (0-based index)
+            if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]:
+                # extract X and i_ref
+                match = re.match(
+                    r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]
+                )
+
+                if match:
+                    overlap = float(match.group(1))
+                    overlap_ref = int(match.group(2))
+                else:
+                    scene_shift_str = scene["shift"][i]
+                    logger.error(
+                        f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!"
+                    )
+                    sys.exit(-1)
+
+                # calculate absolute shift of the source signal in seconds
+                source_shift = end_position[overlap_ref] + overlap
         else:
             source_shift = 0.0
 
         # convert shift from seconds to samples and ensure it is a multiple of 20ms
-        source_shift_in_seconds = source_shift
         source_shift = source_shift * cfg.fs
         if source_shift >= 0:
             source_shift = int(np.floor(source_shift / frame_len) * frame_len)
         else:
             source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
+        source_shift_in_seconds = source_shift / cfg.fs
 
         # read the level
         if "level" in scene.keys():
@@ -278,6 +301,9 @@ def generate_ismN_scene(
         x = audio.fromtype("ISM1")
         x.audio, x.fs = audiofile.read(input_filename)
 
+        # record the end position (in seconds) of the source signal, i.e. its duration plus the shift of its starting position
+        end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds)
+
         # resample to the target fs if necessary
         if x.fs != cfg.fs:
             logger.warning(
@@ -288,12 +314,12 @@ def generate_ismN_scene(
             x.fs = cfg.fs
 
         # adjust the level of the audio source file (need to convert to MONO first)
-        if level is None:
-            # do not change the level of the audio source signal
-            logger.info("-- Level of the audio source signal is not changed")
-        elif np.isinf(level):
+        if np.isinf(level):
             # set all channels to zero
             x.audio = np.zeros_like(x.audio)
+        elif level is None:
+            # do not change the level of the audio source signal
+            logger.info("-- Level of the audio source signal is not changed")
         else:
             x_temp = audio.ChannelBasedAudio(
                 "MONO"
@@ -391,21 +417,21 @@ def generate_ismN_scene(
             y.object_pos = x.object_pos.copy()
             y.fs = x.fs
 
-            if source_shift < 0:
+            if source_shift > 0:
                 # insert zeros to the new audio source signal to shift it right
-                metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
+                metadata.trim_meta(y, limits=[-source_shift, 0], samples=True)
             else:
                 offset = source_shift
         else:
             # shift the beginning of the audio source signal
             delta_offset = source_shift - offset
-            if delta_offset > 0:
+            if delta_offset < 0:
                 # insert zeros to the previous ISM signal(s) to shift them right
-                metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True)
+                metadata.trim_meta(y, limits=[delta_offset, 0], samples=True)
                 offset = source_shift
             else:
                 # insert zeros to the new audio source signal to shift it right
-                metadata.trim_meta(x, limits=[delta_offset, 0], samples=True)
+                metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True)
 
             # adjust the length of the audio source signal
             delta_length = len(x.audio) - len(y.audio)
@@ -443,18 +469,14 @@ def generate_ismN_scene(
         noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")
         y.audio += noise
 
-    # adjust the length of the output signal
+    # trim the output signal if the total duration exceeds X seconds
     if "duration" in cfg.__dict__:
-        # trim the output signal such that the total duration is X seconds
-        duration = int(cfg.duration * cfg.fs)  # convert to samples
-    else:
-        # do not change the length of the audio signal
-        duration = len(y.audio)
-    duration = int(
-        np.floor(duration / frame_len) * frame_len
-    )  # ensure multiple of 20ms
-    if len(y.audio) != duration:
-        metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)
+        # convert from seconds to samples (ensure multiple of 20ms)
+        duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len)
+
+        # check if the current length of the output signal exceeds the duration
+        if len(y.audio) > duration:
+            metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)
 
     # adjust the loudness of the output signal
     if "loudness" in cfg.__dict__:
diff --git a/ivas_processing_scripts/generation/generate_masa_items.py b/ivas_processing_scripts/generation/generate_masa_items.py
index 7a4258232b83f9833edbe7e03a6a49de68ced734..6ddac8707f566cef93ee5542662ea7b9e740420b 100644
--- a/ivas_processing_scripts/generation/generate_masa_items.py
+++ b/ivas_processing_scripts/generation/generate_masa_items.py
@@ -31,6 +31,7 @@
 #
 
 import logging
+import re
 import sys
 from itertools import groupby, repeat
 from pathlib import Path
@@ -209,6 +210,7 @@ def generate_MASA_scene(
     # repeat for all source files
     offset = 0
     y_int = None
+    end_position = []
     for i in range(N_inputs):
         # parse parameters from the scene description
         source_file = (
@@ -232,13 +234,44 @@ def generate_MASA_scene(
         else:
             source_shift = 0.0
 
+        # read the source shift length (in seconds)
+        if "shift" in scene.keys():
+            source_shift = (
+                scene["shift"][i]
+                if isinstance(scene["shift"], list)
+                else scene["shift"]
+            )
+
+            # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index
+            # of the reference signal (0-based index)
+            if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]:
+                # extract X and i_ref
+                match = re.match(
+                    r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]
+                )
+
+                if match:
+                    overlap = float(match.group(1))
+                    overlap_ref = int(match.group(2))
+                else:
+                    scene_shift_str = scene["shift"][i]
+                    logger.error(
+                        f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!"
+                    )
+                    sys.exit(-1)
+
+                # calculate absolute shift of the source signal in seconds
+                source_shift = end_position[overlap_ref] - overlap
+        else:
+            source_shift = 0.0
+
         # convert shift from seconds to samples and ensure it is a multiple of 20ms
-        source_shift_in_seconds = source_shift
         source_shift = source_shift * cfg.fs
         if source_shift >= 0:
             source_shift = int(np.floor(source_shift / frame_len) * frame_len)
         else:
             source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
+        source_shift_in_seconds = source_shift / cfg.fs
 
         # read the level
         if "level" in scene.keys():
@@ -295,6 +328,9 @@ def generate_MASA_scene(
         # read source file
         x = audio.fromfile("MONO", input_filename)
 
+        # record the end position (in seconds) of the source signal, i.e. its duration plus the shift of its starting position
+        end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds)
+
         # resample to the target fs if necessary
         if x.fs != cfg.fs:
             logger.warning(
@@ -316,12 +352,12 @@ def generate_MASA_scene(
             x = reverb_hoa3(x, IR, mode=None)
 
         # adjust the level of the FOA/HOA2/HOA3 signal
-        if level is None:
-            # do not change the level of the audio source signal
-            logger.info("-- Level of the audio source signal is not changed")
-        elif np.isinf(level):
+        if np.isinf(level):
             # set all channels to zero
             x.audio = np.zeros_like(x.audio)
+        elif level is None:
+            # do not change the level of the audio source signal
+            logger.info("-- Level of the audio source signal is not changed")
         else:
             x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")
 
@@ -339,26 +375,26 @@ def generate_MASA_scene(
             # this is the first SBA source signal
             y_int.audio = x.audio.copy()
 
-            if source_shift < 0:
+            if source_shift > 0:
                 # insert zeros to the first SBA source signal to shift it right
                 y_int.audio = audioarray.trim(
-                    y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True
+                    y_int.audio, y_int.fs, limits=[-source_shift, 0], samples=True
                 )
             else:
                 offset = source_shift
         else:
             # shift the beginning of the audio source signal
             delta_offset = source_shift - offset
-            if delta_offset > 0:
+            if delta_offset < 0:
                 # insert zeros to the output SBA signal to shift it right
                 y_int.audio = audioarray.trim(
-                    y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True
+                    y_int.audio, y_int.fs, limits=[delta_offset, 0], samples=True
                 )
                 offset = source_shift
             else:
                 # insert zeros to the new SBA source signal to shift it right
                 x.audio = audioarray.trim(
-                    x.audio, x.fs, limits=[delta_offset, 0], samples=True
+                    x.audio, x.fs, limits=[-delta_offset, 0], samples=True
                 )
 
             # adjust the length of the audio source signal
@@ -393,20 +429,19 @@ def generate_MASA_scene(
                 y_int.audio, y_int.fs, limits=[-preamble, -postamble], samples=True
             )
 
-    # adjust the length of the output signal
+    # trim the output signal if the total duration exceeds X seconds
     if "duration" in cfg.__dict__:
-        # trim the output signal such that the total duration is X seconds
-        duration = int(cfg.duration * cfg.fs)  # convert to samples
-    else:
-        # do not change the length of the audio signal
-        duration = len(y_int.audio)
-    duration = int(
-        np.floor(duration / frame_len) * frame_len
-    )  # ensure multiple of 20ms
-    if len(y_int.audio) != duration:
-        y_int.audio = audioarray.trim(
-            y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True
-        )
+        # convert from seconds to samples (ensure multiple of 20ms)
+        duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len)
+
+        # check if the current length of the output signal exceeds the duration
+        if len(y_int.audio) > duration:
+            y_int.audio = audioarray.trim(
+                y_int.audio,
+                y_int.fs,
+                limits=[0, len(y_int.audio) - duration],
+                samples=True,
+            )
 
     # adjust the loudness of the output signal
     if "loudness" in cfg.__dict__:
diff --git a/ivas_processing_scripts/generation/generate_mc_items.py b/ivas_processing_scripts/generation/generate_mc_items.py
index df1b16451e4e3d12a48f31c9977f8e435ce5b8bd..35dcbb3be7be6b646c39e6f16b1ae84bfa8bf50b 100644
--- a/ivas_processing_scripts/generation/generate_mc_items.py
+++ b/ivas_processing_scripts/generation/generate_mc_items.py
@@ -31,6 +31,7 @@
 #
 
 import logging
+import re
 import sys
 from itertools import groupby, repeat
 from pathlib import Path
@@ -209,6 +210,7 @@ def generate_MC_scene(
     # repeat for all source files
     offset = 0
     y_int = None
+    end_position = []
     for i in range(N_inputs):
         # parse parameters from the scene description
         source_file = (
@@ -222,23 +224,44 @@ def generate_MC_scene(
         )
         IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)
 
-        # read the shift time in seconds
+        # read the source shift length (in seconds)
         if "shift" in scene.keys():
             source_shift = (
                 scene["shift"][i]
                 if isinstance(scene["shift"], list)
                 else scene["shift"]
             )
+
+            # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index
+            # of the reference signal (0-based index)
+            if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]:
+                # extract X and i_ref
+                match = re.match(
+                    r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]
+                )
+
+                if match:
+                    overlap = float(match.group(1))
+                    overlap_ref = int(match.group(2))
+                else:
+                    scene_shift_str = scene["shift"][i]
+                    logger.error(
+                        f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!"
+                    )
+                    sys.exit(-1)
+
+                # calculate absolute shift of the source signal in seconds
+                source_shift = end_position[overlap_ref] + overlap
         else:
             source_shift = 0.0
 
         # convert shift from seconds to samples and ensure it is a multiple of 20ms
-        source_shift_in_seconds = source_shift
         source_shift = source_shift * cfg.fs
         if source_shift >= 0:
             source_shift = int(np.floor(source_shift / frame_len) * frame_len)
         else:
             source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
+        source_shift_in_seconds = source_shift / cfg.fs
 
         # read the level
         if "level" in scene.keys():
@@ -295,6 +318,9 @@ def generate_MC_scene(
         # read source file
         x = audio.fromfile("MONO", input_filename)
 
+        # record the end position (in seconds) of the source signal, i.e. its duration plus the shift of its starting position
+        end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds)
+
         # resample to the target fs if necessary
         if x.fs != cfg.fs:
             logger.warning(
@@ -339,26 +365,26 @@ def generate_MC_scene(
             # this is the first SBA source signal
             y_int.audio = x.audio.copy()
 
-            if source_shift < 0:
+            if source_shift > 0:
                 # insert zeros to the first SBA source signal to shift it right
                 y_int.audio = audioarray.trim(
-                    y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True
+                    y_int.audio, y_int.fs, limits=[-source_shift, 0], samples=True
                 )
             else:
                 offset = source_shift
         else:
             # shift the beginning of the audio source signal
             delta_offset = source_shift - offset
-            if delta_offset > 0:
+            if delta_offset < 0:
                 # insert zeros to the output SBA signal to shift it right
                 y_int.audio = audioarray.trim(
-                    y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True
+                    y_int.audio, y_int.fs, limits=[delta_offset, 0], samples=True
                 )
                 offset = source_shift
             else:
                 # insert zeros to the new SBA source signal to shift it right
                 x.audio = audioarray.trim(
-                    x.audio, x.fs, limits=[delta_offset, 0], samples=True
+                    x.audio, x.fs, limits=[-delta_offset, 0], samples=True
                 )
 
             # adjust the length of the audio source signal
@@ -393,20 +419,19 @@ def generate_MC_scene(
                 y_int.audio, y_int.fs, limits=[-preamble, -postamble], samples=True
             )
 
-    # adjust the length of the output signal
+    # trim the output signal if the total duration exceeds X seconds
     if "duration" in cfg.__dict__:
-        # trim the output signal such that the total duration is X seconds
-        duration = int(cfg.duration * cfg.fs)  # convert to samples
-    else:
-        # do not change the length of the audio signal
-        duration = len(y_int.audio)
-    duration = int(
-        np.floor(duration / frame_len) * frame_len
-    )  # ensure multiple of 20ms
-    if len(y_int.audio) != duration:
-        y_int.audio = audioarray.trim(
-            y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True
-        )
+        # convert from seconds to samples (ensure multiple of 20ms)
+        duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len)
+
+        # check if the current length of the output signal exceeds the duration
+        if len(y_int.audio) > duration:
+            y_int.audio = audioarray.trim(
+                y_int.audio,
+                y_int.fs,
+                limits=[0, len(y_int.audio) - duration],
+                samples=True,
+            )
 
     # adjust the loudness of the output signal
     if "loudness" in cfg.__dict__:
diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py
index 93f003053ebb04633ee8d0a679c444a5c9a98f81..0881c7ca200e5cf639d23bfd9a1f0edd1b205b9e 100644
--- a/ivas_processing_scripts/generation/generate_omasa_items.py
+++ b/ivas_processing_scripts/generation/generate_omasa_items.py
@@ -31,6 +31,7 @@
 #
 
 import logging
+import re
 import sys
 from itertools import groupby, repeat
 from pathlib import Path
@@ -183,6 +184,7 @@ def generate_OMASA_scene(
 
     # repeat for all source files
     offset = 0
+    end_position = []
     for i in range(N_inputs):
         # parse parameters from the scene description
         source_file = (
@@ -220,16 +222,37 @@ def generate_OMASA_scene(
                 if isinstance(scene["shift"], list)
                 else scene["shift"]
             )
+
+            # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index
+            # of the reference signal (0-based index)
+            if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]:
+                # extract X and i_ref
+                match = re.match(
+                    r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]
+                )
+
+                if match:
+                    overlap = float(match.group(1))
+                    overlap_ref = int(match.group(2))
+                else:
+                    scene_shift_str = scene["shift"][i]
+                    logger.error(
+                        f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!"
+                    )
+                    sys.exit(-1)
+
+                # calculate absolute shift of the source signal in seconds
+                source_shift = end_position[overlap_ref] + overlap
         else:
             source_shift = 0.0
 
         # convert shift from seconds to samples and ensure it is a multiple of 20ms
-        source_shift_in_seconds = source_shift
         source_shift = source_shift * cfg.fs
         if source_shift >= 0:
             source_shift = int(np.floor(source_shift / frame_len) * frame_len)
         else:
             source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
+        source_shift_in_seconds = source_shift / cfg.fs
 
         # read the level
         if "level" in scene.keys():
@@ -300,6 +323,9 @@ def generate_OMASA_scene(
         # read source file
         x = audio.fromfile(fmt, input_filename)
 
+        # record the end position of the source signal in seconds (its duration plus the shift of its starting position)
+        end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds)
+
         # resample to the target fs if necessary
         if x.fs != cfg.fs:
             logger.warning(
@@ -417,21 +443,21 @@ def generate_OMASA_scene(
             # add the first audio source signal (should be FOA/HOA2/HOA3) to the array of all source signals
             y_int.audio = x.audio.copy()
 
-            if source_shift < 0:
+            if source_shift > 0:
                 # insert zeros to the new audio source signal to shift it right
-                metadata.trim_meta(y_int, limits=[source_shift, 0], samples=True)
+                metadata.trim_meta(y_int, limits=[-source_shift, 0], samples=True)
             else:
                 offset = source_shift
         else:
             # shift the beginning of the audio source signal
             delta_offset = source_shift - offset
-            if delta_offset > 0:
+            if delta_offset < 0:
                 # insert zeros to the existing intermediate OSBA object to shift it right
-                metadata.trim_meta(y_int, limits=[-delta_offset, 0], samples=True)
+                metadata.trim_meta(y_int, limits=[delta_offset, 0], samples=True)
                 offset = source_shift
             else:
                 # insert zeros to the new audio source signal to shift it right
-                metadata.trim_meta(x, limits=[delta_offset, 0], samples=True)
+                metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True)
 
             # adjust the length of the audio source signal
             delta_length = len(x.audio) - len(y_int.audio)
@@ -472,18 +498,16 @@ def generate_OMASA_scene(
         noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")
         y_int.audio += noise
 
-    # adjust the length of the output signal
+    # trim the output signal if the total duration exceeds X seconds
     if "duration" in cfg.__dict__:
-        # trim the output signal such that the total duration is X seconds
-        duration = int(cfg.duration * cfg.fs)  # convert to samples
-    else:
-        # do not change the length of the audio signal
-        duration = len(y_int.audio)
-    duration = int(
-        np.floor(duration / frame_len) * frame_len
-    )  # ensure multiple of 20ms
-    if len(y_int.audio) != duration:
-        metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True)
+        # convert from seconds to samples (ensure multiple of 20ms)
+        duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len)
+
+        # check if the current length of the output signal exceeds the duration
+        if len(y_int.audio) > duration:
+            metadata.trim_meta(
+                y_int, limits=[0, len(y_int.audio) - duration], samples=True
+            )
 
     # adjust the loudness of the output signal
     if "loudness" in cfg.__dict__:
diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py
index dd8f5b5d2a7a6ea56c98470e6c8b715add248da4..8d2ca0d85209e6e7e86f24b19daeb8fb6bb56465 100644
--- a/ivas_processing_scripts/generation/generate_osba_items.py
+++ b/ivas_processing_scripts/generation/generate_osba_items.py
@@ -31,6 +31,7 @@
 #
 
 import logging
+import re
 import sys
 from itertools import groupby, repeat
 from pathlib import Path
@@ -187,6 +188,7 @@ def generate_OSBA_scene(
 
     # repeat for all source files
     offset = 0
+    end_position = []
     for i in range(N_inputs):
         # parse parameters from the scene description
         source_file = (
@@ -210,23 +212,44 @@ def generate_OSBA_scene(
             else scene["elevation"]
         )
 
-        # read the shift time in seconds
+        # read the source shift length (in seconds)
         if "shift" in scene.keys():
             source_shift = (
                 scene["shift"][i]
                 if isinstance(scene["shift"], list)
                 else scene["shift"]
             )
+
+            # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index
+            # of the reference signal (0-based index); source_shift is tested so that a scalar (non-list) shift works too
+            if isinstance(source_shift, str) and "(" in source_shift:
+                # extract X and i_ref
+                match = re.match(
+                    r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", source_shift
+                )
+
+                if match:
+                    overlap = float(match.group(1))
+                    overlap_ref = int(match.group(2))
+                else:
+                    scene_shift_str = source_shift
+                    logger.error(
+                        f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!"
+                    )
+                    sys.exit(-1)
+
+                # calculate absolute shift of the source signal in seconds
+                source_shift = end_position[overlap_ref] + overlap
         else:
             source_shift = 0.0
 
         # convert shift from seconds to samples and ensure it is a multiple of 20ms
-        source_shift_in_seconds = source_shift
         source_shift = source_shift * cfg.fs
         if source_shift >= 0:
             source_shift = int(np.floor(source_shift / frame_len) * frame_len)
         else:
             source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
+        source_shift_in_seconds = source_shift / cfg.fs
 
         # read the level
         if "level" in scene.keys():
@@ -282,6 +305,9 @@ def generate_OSBA_scene(
         # read source file
         x = audio.fromfile(fmt, input_filename)
 
+        # record the end position of the source signal in seconds (its duration plus the shift of its starting position)
+        end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds)
+
         # resample to the target fs if necessary
         if x.fs != cfg.fs:
             logger.warning(
@@ -403,21 +429,21 @@ def generate_OSBA_scene(
                 # if ISM, append object position to the OSBA object
                 y.object_pos = x.object_pos.copy()
 
-            if source_shift < 0:
+            if source_shift > 0:
                 # insert zeros to the new audio source signal to shift it right
-                metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
+                metadata.trim_meta(y, limits=[-source_shift, 0], samples=True)
             else:
                 offset = source_shift
         else:
             # shift the beginning of the audio source signal
             delta_offset = source_shift - offset
-            if delta_offset > 0:
+            if delta_offset < 0:
                 # insert zeros to the previous ISM signal(s) to shift them right
-                metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True)
+                metadata.trim_meta(y, limits=[delta_offset, 0], samples=True)
                 offset = source_shift
             else:
                 # insert zeros to the new audio source signal to shift it right
-                metadata.trim_meta(x, limits=[delta_offset, 0], samples=True)
+                metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True)
 
             # adjust the length of the audio source signal
             delta_length = len(x.audio) - len(y.audio)
@@ -458,18 +484,14 @@ def generate_OSBA_scene(
         noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")
         y.audio += noise
 
-    # adjust the length of the output signal
+    # trim the output signal if the total duration exceeds X seconds
     if "duration" in cfg.__dict__:
-        # trim the output signal such that the total duration is X seconds
-        duration = int(cfg.duration * cfg.fs)  # convert to samples
-    else:
-        # do not change the length of the audio signal
-        duration = len(y.audio)
-    duration = int(
-        np.floor(duration / frame_len) * frame_len
-    )  # ensure multiple of 20ms
-    if len(y.audio) != duration:
-        metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)
+        # convert from seconds to samples (ensure multiple of 20ms)
+        duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len)
+
+        # check if the current length of the output signal exceeds the duration
+        if len(y.audio) > duration:
+            metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)
 
     # adjust the loudness of the output signal
     if "loudness" in cfg.__dict__:
diff --git a/ivas_processing_scripts/generation/generate_sba_items.py b/ivas_processing_scripts/generation/generate_sba_items.py
index 28fbababc787ef2d67a961deab39eef9330ad165..631d6165481f1cc09ccfe56335b3af86ec16e4b4 100644
--- a/ivas_processing_scripts/generation/generate_sba_items.py
+++ b/ivas_processing_scripts/generation/generate_sba_items.py
@@ -31,6 +31,7 @@
 #
 
 import logging
+import re
 import sys
 from itertools import groupby, repeat
 from pathlib import Path
@@ -201,6 +202,7 @@ def generate_sba_scene(
 
     # repeat for all source files
     offset = 0
+    end_position = []
     for i in range(N_inputs):
         # parse parameters from the scene description
         source_file = (
@@ -214,23 +216,44 @@ def generate_sba_scene(
         )
         IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)
 
-        # read the shift time in seconds
+        # read the source shift length (in seconds)
         if "shift" in scene.keys():
             source_shift = (
                 scene["shift"][i]
                 if isinstance(scene["shift"], list)
                 else scene["shift"]
             )
+
+            # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index
+            # of the reference signal (0-based index); source_shift is tested so that a scalar (non-list) shift works too
+            if isinstance(source_shift, str) and "(" in source_shift:
+                # extract X and i_ref
+                match = re.match(
+                    r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", source_shift
+                )
+
+                if match:
+                    overlap = float(match.group(1))
+                    overlap_ref = int(match.group(2))
+                else:
+                    scene_shift_str = source_shift
+                    logger.error(
+                        f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!"
+                    )
+                    sys.exit(-1)
+
+                # calculate absolute shift of the source signal in seconds
+                source_shift = end_position[overlap_ref] + overlap
         else:
             source_shift = 0.0
 
         # convert shift from seconds to samples and ensure it is a multiple of 20ms
-        source_shift_in_seconds = source_shift
         source_shift = source_shift * cfg.fs
         if source_shift >= 0:
             source_shift = int(np.floor(source_shift / frame_len) * frame_len)
         else:
             source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
+        source_shift_in_seconds = source_shift / cfg.fs
 
         # read the level
         if "level" in scene.keys():
@@ -266,6 +289,9 @@ def generate_sba_scene(
         # read source file
         x = audio.fromfile("MONO", input_filename)
 
+        # record the end position of the source signal in seconds (its duration plus the shift of its starting position)
+        end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds)
+
         # resample to the target fs if necessary
         if x.fs != cfg.fs:
             logger.warning(
@@ -310,26 +336,26 @@ def generate_sba_scene(
             # add source signal to the array of all source signals
             y.audio = x.audio.copy()
 
-            if source_shift < 0:
+            if source_shift > 0:
                 # insert zeros to the new audio source signal to shift it right
                 y.audio = audioarray.trim_meta(
-                    y.audio, y.fs, limits=[source_shift, 0], samples=True
+                    y.audio, y.fs, limits=[-source_shift, 0], samples=True
                 )
             else:
                 offset = source_shift
         else:
             # shift the beginning of the audio source signal
             delta_offset = source_shift - offset
-            if delta_offset > 0:
+            if delta_offset < 0:
                 # insert zeros to the existing output signal to shift it right
                 y.audio = audioarray.trim(
-                    y.audio, y.fs, limits=[-delta_offset, 0], samples=True
+                    y.audio, y.fs, limits=[delta_offset, 0], samples=True
                 )
                 offset = source_shift
             else:
                 # insert zeros to the new audio source signal to shift it right
                 x.audio = audioarray.trim(
-                    x.audio, x.fs, limits=[delta_offset, 0], samples=True
+                    x.audio, x.fs, limits=[-delta_offset, 0], samples=True
                 )
 
             # adjust the length of the audio source signal
@@ -364,20 +390,16 @@ def generate_sba_scene(
                 y.audio, y.fs, limits=[-preamble, -postamble], samples=True
             )
 
-    # adjust the length of the output signal
+    # trim the output signal if the total duration exceeds X seconds
     if "duration" in cfg.__dict__:
-        # trim the output signal such that the total duration is X seconds
-        duration = int(cfg.duration * cfg.fs)  # convert to samples
-    else:
-        # do not change the length of the audio signal
-        duration = len(y.audio)
-    duration = int(
-        np.floor(duration / frame_len) * frame_len
-    )  # ensure multiple of 20ms
-    if len(y.audio) != duration:
-        y.audio = audioarray.trim(
-            y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True
-        )
+        # convert from seconds to samples (ensure multiple of 20ms)
+        duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len)
+
+        # check if the current length of the output signal exceeds the duration
+        if len(y.audio) > duration:
+            y.audio = audioarray.trim(
+                y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True
+            )
 
     # adjust the loudness of the output signal
     if "loudness" in cfg.__dict__:
diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py
index a0d99f90cad67a8288b891f255f12cc106beba8a..1ad8a6ae47936af4a8ba9d71ab09634525fa2b95 100644
--- a/ivas_processing_scripts/generation/generate_stereo_items.py
+++ b/ivas_processing_scripts/generation/generate_stereo_items.py
@@ -32,6 +32,7 @@
 
 import logging
 import os
+import re
 import sys
 from itertools import groupby, repeat
 from pathlib import Path
@@ -207,6 +208,7 @@ def generate_stereo_scene(
 
     # repeat for all source files
     offset = 0
+    end_position = []
     for i in range(N_inputs):
         # parse parameters from the scene description
         source_file = (
@@ -220,23 +222,44 @@ def generate_stereo_scene(
         )
         IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)
 
-        # read the shift time in seconds
+        # read the source shift length (in seconds)
         if "shift" in scene.keys():
             source_shift = (
                 scene["shift"][i]
                 if isinstance(scene["shift"], list)
                 else scene["shift"]
             )
+
+            # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index
+            # of the reference signal (0-based index); source_shift is tested so that a scalar (non-list) shift works too
+            if isinstance(source_shift, str) and "(" in source_shift:
+                # extract X and i_ref
+                match = re.match(
+                    r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", source_shift
+                )
+
+                if match:
+                    overlap = float(match.group(1))
+                    overlap_ref = int(match.group(2))
+                else:
+                    scene_shift_str = source_shift
+                    logger.error(
+                        f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!"
+                    )
+                    sys.exit(-1)
+
+                # calculate absolute shift of the source signal in seconds
+                source_shift = end_position[overlap_ref] + overlap
         else:
             source_shift = 0.0
 
         # convert shift from seconds to samples and ensure it is a multiple of 20ms
-        source_shift_in_seconds = source_shift
         source_shift = source_shift * cfg.fs
         if source_shift >= 0:
             source_shift = int(np.floor(source_shift / frame_len) * frame_len)
         else:
             source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
+        source_shift_in_seconds = source_shift / cfg.fs
 
         # read the level
         if "level" in scene.keys():
@@ -284,6 +307,9 @@ def generate_stereo_scene(
         # read the IR file (!must be in STEREO format!)
         IR = audio.fromfile("STEREO", IR_filename)
 
+        # record the end position of the source signal in seconds (its duration plus the shift of its starting position)
+        end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds)
+
         # convolve MONO source audio with STEREO IR -> results in STEREO audio object
         x = reverb_stereo(x, IR, mode=None)
 
@@ -311,26 +337,26 @@ def generate_stereo_scene(
             # add source signal to the array of all source signals
             y.audio = x.audio.copy()
 
-            if source_shift < 0:
+            if source_shift > 0:
                 # insert zeros to the new audio source signal to shift it right
                 y.audio = audioarray.trim(
-                    y.audio, x.fs, limits=[source_shift, 0], samples=True
+                    y.audio, x.fs, limits=[-source_shift, 0], samples=True
                 )
             else:
                 offset = source_shift
         else:
             # shift the beginning of the audio source signal
             delta_offset = source_shift - offset
-            if delta_offset > 0:
+            if delta_offset < 0:
                 # insert zeros to the existing output signal to shift it right
                 y.audio = audioarray.trim(
-                    y.audio, y.fs, limits=[-delta_offset, 0], samples=True
+                    y.audio, y.fs, limits=[delta_offset, 0], samples=True
                 )
                 offset = source_shift
             else:
                 # insert zeros to the new audio source signal to shift it right
                 x.audio = audioarray.trim(
-                    x.audio, x.fs, limits=[delta_offset, 0], samples=True
+                    x.audio, x.fs, limits=[-delta_offset, 0], samples=True
                 )
 
             # adjust the length of the audio source signal
@@ -365,20 +391,16 @@ def generate_stereo_scene(
                 y.audio, y.fs, limits=[-preamble, -postamble], samples=True
             )
 
-    # adjust the length of the output signal
+    # trim the output signal if the total duration exceeds X seconds
     if "duration" in cfg.__dict__:
-        # trim the output signal such that the total duration is X seconds
-        duration = int(cfg.duration * cfg.fs)  # convert to samples
-    else:
-        # do not change the length of the audio signal
-        duration = len(y.audio)
-    duration = int(
-        np.floor(duration / frame_len) * frame_len
-    )  # ensure multiple of 20ms
-    if len(y.audio) != duration:
-        y.audio = audioarray.trim(
-            y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True
-        )
+        # convert from seconds to samples (ensure multiple of 20ms)
+        duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len)
+
+        # check if the current length of the output signal exceeds the duration
+        if len(y.audio) > duration:
+            y.audio = audioarray.trim(
+                y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True
+            )
 
     # adjust the loudness of the output signal
     if "loudness" in cfg.__dict__: