From 73a29eb997b576d4b1364b4092c48e0f67e11918 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Mon, 11 Aug 2025 16:22:45 +0200 Subject: [PATCH 1/5] fix the duration parameter - only trim if duration is exceeded --- .../generation/generate_ismN_items.py | 18 +++++------ .../generation/generate_masa_items.py | 30 +++++++++---------- .../generation/generate_mc_items.py | 22 +++++++------- .../generation/generate_omasa_items.py | 13 +++++++- .../generation/generate_osba_items.py | 20 ++++++------- .../generation/generate_sba_items.py | 22 +++++++------- .../generation/generate_stereo_items.py | 22 +++++++------- 7 files changed, 72 insertions(+), 75 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index 489dbea4..551058c7 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -443,18 +443,14 @@ def generate_ismN_scene( noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") y.audio += noise - # adjust the length of the output signal + # trim the output signal such if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y.audio) != duration: - metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) + # convert from seconds to samples (ensure multiple of 20ms) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) + + # check if the current length of the output signal exceeds the duration + if len(y.audio) > duration: + metadata.trim_meta(y, limits=[0, duration], samples=True) # adjust the loudness of 
the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_masa_items.py b/ivas_processing_scripts/generation/generate_masa_items.py index 7a425823..6b40da78 100644 --- a/ivas_processing_scripts/generation/generate_masa_items.py +++ b/ivas_processing_scripts/generation/generate_masa_items.py @@ -316,12 +316,12 @@ def generate_MASA_scene( x = reverb_hoa3(x, IR, mode=None) # adjust the level of the FOA/HOA2/HOA3 signal - if level is None: - # do not change the level of the audio source signal - logger.info("-- Level of the audio source signal is not changed") - elif np.isinf(level): + if np.isinf(level): # set all channels to zero x.audio = np.zeros_like(x.audio) + elif level is None: + # do not change the level of the audio source signal + logger.info("-- Level of the audio source signal is not changed") else: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") @@ -393,21 +393,19 @@ def generate_MASA_scene( y_int.audio, y_int.fs, limits=[-preamble, -postamble], samples=True ) - # adjust the length of the output signal + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y_int.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y_int.audio) != duration: - y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True + # convert from seconds to samples (ensure multiple of 20ms) + duration = int( + np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len ) + # check if the current length of the output signal exceeds the duration + if len(y_int.audio) > duration: + y_int.audio = audioarray.trim( + y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], 
samples=True + ) + # adjust the loudness of the output signal if "loudness" in cfg.__dict__: logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LKFS") diff --git a/ivas_processing_scripts/generation/generate_mc_items.py b/ivas_processing_scripts/generation/generate_mc_items.py index df1b1645..daacf2d8 100644 --- a/ivas_processing_scripts/generation/generate_mc_items.py +++ b/ivas_processing_scripts/generation/generate_mc_items.py @@ -393,21 +393,19 @@ def generate_MC_scene( y_int.audio, y_int.fs, limits=[-preamble, -postamble], samples=True ) - # adjust the length of the output signal + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y_int.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y_int.audio) != duration: - y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True + # convert from seconds to samples (ensure multiple of 20ms) + duration = int( + np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len ) + # check if the current length of the output signal exceeds the duration + if len(y_int.audio) > duration: + y_int.audio = audioarray.trim( + y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True + ) + # adjust the loudness of the output signal if "loudness" in cfg.__dict__: logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LKFS") diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 93f00305..2ca92195 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -472,8 +472,9 @@ def 
generate_OMASA_scene( noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") y_int.audio += noise - # adjust the length of the output signal + # trim the output signal such if the total duration exceeds X seconds if "duration" in cfg.__dict__: +<<<<<<< Updated upstream # trim the output signal such that the total duration is X seconds duration = int(cfg.duration * cfg.fs) # convert to samples else: @@ -484,6 +485,16 @@ def generate_OMASA_scene( ) # ensure multiple of 20ms if len(y_int.audio) != duration: metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True) +======= + # convert from seconds to samples (ensure multiple of 20ms) + duration = int( + np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len + ) + + # check if the current length of the output signal exceeds the duration + if len(y.audio) > duration: + metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) +>>>>>>> Stashed changes # adjust the loudness of the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index dd8f5b5d..67e493b5 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -458,18 +458,16 @@ def generate_OSBA_scene( noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") y.audio += noise - # adjust the length of the output signal + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y.audio) != duration: - metadata.trim_meta(y, limits=[0, 
len(y.audio) - duration], samples=True) + # convert from seconds to samples (ensure multiple of 20ms) + duration = int( + np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len + ) + + # check if the current length of the output signal exceeds the duration + if len(y.audio) > duration: + metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_sba_items.py b/ivas_processing_scripts/generation/generate_sba_items.py index 28fbabab..f40ba9d8 100644 --- a/ivas_processing_scripts/generation/generate_sba_items.py +++ b/ivas_processing_scripts/generation/generate_sba_items.py @@ -364,21 +364,19 @@ def generate_sba_scene( y.audio, y.fs, limits=[-preamble, -postamble], samples=True ) - # adjust the length of the output signal + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y.audio) != duration: - y.audio = audioarray.trim( - y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True + # convert from seconds to samples (ensure multiple of 20ms) + duration = int( + np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len ) + # check if the current length of the output signal exceeds the duration + if len(y.audio) > duration: + y.audio = audioarray.trim( + y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True + ) + # adjust the loudness of the output signal if "loudness" in cfg.__dict__: logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LKFS") diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py 
b/ivas_processing_scripts/generation/generate_stereo_items.py index a0d99f90..6bf1e95e 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -365,21 +365,19 @@ def generate_stereo_scene( y.audio, y.fs, limits=[-preamble, -postamble], samples=True ) - # adjust the length of the output signal + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y.audio) != duration: - y.audio = audioarray.trim( - y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True + # convert from seconds to samples (ensure multiple of 20ms) + duration = int( + np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len ) + # check if the current length of the output signal exceeds the duration + if len(y.audio) > duration: + y.audio = audioarray.trim( + y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True + ) + # adjust the loudness of the output signal if "loudness" in cfg.__dict__: logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LKFS") -- GitLab From 93d4dbfed845607d10b5aec936578119636e7142 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Tue, 12 Aug 2025 14:07:34 +0200 Subject: [PATCH 2/5] support for X(i_ref) notation to allow specifying overlap between items --- .../generation/generate_ismN_items.py | 46 +++++++++++---- .../generation/generate_masa_items.py | 57 +++++++++++++++---- .../generation/generate_mc_items.py | 36 +++++++++--- .../generation/generate_omasa_items.py | 53 ++++++++++------- .../generation/generate_osba_items.py | 36 +++++++++--- .../generation/generate_sba_items.py | 36 +++++++++--- 
.../generation/generate_stereo_items.py | 36 +++++++++--- 7 files changed, 228 insertions(+), 72 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index 551058c7..f57f58c3 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -30,6 +30,7 @@ # the United Nations Convention on Contracts on the International Sales of Goods. # import logging +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -196,6 +197,7 @@ def generate_ismN_scene( # repeat for all source files offset = 0 + end_position = [] for i in range(N_inputs): # read input filename source_file = ( @@ -232,16 +234,33 @@ def generate_ismN_scene( if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error(f"Unable to parse {scene_shift_str}. 
The specification of overlap or reference is incorrect!") + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -278,6 +297,9 @@ def generate_ismN_scene( x = audio.fromtype("ISM1") x.audio, x.fs = audiofile.read(input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -288,12 +310,12 @@ def generate_ismN_scene( x.fs = cfg.fs # adjust the level of the audio source file (need to convert to MONO first) - if level is None: - # do not change the level of the audio source signal - logger.info("-- Level of the audio source signal is not changed") - elif np.isinf(level): + if np.isinf(level): # set all channels to zero x.audio = np.zeros_like(x.audio) + elif level is None: + # do not change the level of the audio source signal + logger.info("-- Level of the audio source signal is not changed") else: x_temp = audio.ChannelBasedAudio( "MONO" @@ -391,21 +413,21 @@ def generate_ismN_scene( y.object_pos = x.object_pos.copy() y.fs = x.fs - if source_shift < 0: + if source_shift > 0: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(y, limits=[source_shift, 0], samples=True) + metadata.trim_meta(y, limits=[-source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal 
delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the previous ISM signal(s) to shift them right - metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True) + metadata.trim_meta(y, limits=[delta_offset, 0], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) + metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y.audio) @@ -443,14 +465,14 @@ def generate_ismN_scene( noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") y.audio += noise - # trim the output signal such if the total duration exceeds X seconds + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y.audio) > duration: - metadata.trim_meta(y, limits=[0, duration], samples=True) + metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_masa_items.py b/ivas_processing_scripts/generation/generate_masa_items.py index 6b40da78..6ddac870 100644 --- a/ivas_processing_scripts/generation/generate_masa_items.py +++ b/ivas_processing_scripts/generation/generate_masa_items.py @@ -31,6 +31,7 @@ # import logging +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -209,6 +210,7 @@ def generate_MASA_scene( # repeat for all source files offset = 0 y_int = None + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -232,13 +234,44 @@ def 
generate_MASA_scene( else: source_shift = 0.0 + # read the source shift length (in seconds) + if "shift" in scene.keys(): + source_shift = ( + scene["shift"][i] + if isinstance(scene["shift"], list) + else scene["shift"] + ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" + ) + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] - overlap + else: + source_shift = 0.0 + # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -295,6 +328,9 @@ def generate_MASA_scene( # read source file x = audio.fromfile("MONO", input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -339,26 +375,26 @@ def generate_MASA_scene( # this is the first SBA source signal y_int.audio = x.audio.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the first SBA source signal to shift it right y_int.audio = 
audioarray.trim( - y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True + y_int.audio, y_int.fs, limits=[-source_shift, 0], samples=True ) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the output SBA signal to shift it right y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True + y_int.audio, y_int.fs, limits=[delta_offset, 0], samples=True ) offset = source_shift else: # insert zeros to the new SBA source signal to shift it right x.audio = audioarray.trim( - x.audio, x.fs, limits=[delta_offset, 0], samples=True + x.audio, x.fs, limits=[-delta_offset, 0], samples=True ) # adjust the length of the audio source signal @@ -396,14 +432,15 @@ def generate_MASA_scene( # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) - duration = int( - np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len - ) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y_int.audio) > duration: y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True + y_int.audio, + y_int.fs, + limits=[0, len(y_int.audio) - duration], + samples=True, ) # adjust the loudness of the output signal diff --git a/ivas_processing_scripts/generation/generate_mc_items.py b/ivas_processing_scripts/generation/generate_mc_items.py index daacf2d8..a37a3710 100644 --- a/ivas_processing_scripts/generation/generate_mc_items.py +++ b/ivas_processing_scripts/generation/generate_mc_items.py @@ -31,6 +31,7 @@ # import logging +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -209,6 +210,7 @@ def generate_MC_scene( # repeat for all source 
files offset = 0 y_int = None + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -222,23 +224,40 @@ def generate_MC_scene( ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) - # read the shift time in seconds + # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -295,6 +314,9 @@ def generate_MC_scene( # read source file x = audio.fromfile("MONO", input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -339,26 +361,26 @@ def 
generate_MC_scene( # this is the first SBA source signal y_int.audio = x.audio.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the first SBA source signal to shift it right y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True + y_int.audio, y_int.fs, limits=[-source_shift, 0], samples=True ) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the output SBA signal to shift it right y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True + y_int.audio, y_int.fs, limits=[delta_offset, 0], samples=True ) offset = source_shift else: # insert zeros to the new SBA source signal to shift it right x.audio = audioarray.trim( - x.audio, x.fs, limits=[delta_offset, 0], samples=True + x.audio, x.fs, limits=[-delta_offset, 0], samples=True ) # adjust the length of the audio source signal diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 2ca92195..73831922 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -31,6 +31,7 @@ # import logging +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -183,6 +184,7 @@ def generate_OMASA_scene( # repeat for all source files offset = 0 + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -220,16 +222,33 @@ def generate_OMASA_scene( if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match 
= re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -300,6 +319,9 @@ def generate_OMASA_scene( # read source file x = audio.fromfile(fmt, input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -417,21 +439,21 @@ def generate_OMASA_scene( # add the first audio source signal (should be FOA/HOA2/HOA3) to the array of all source signals y_int.audio = x.audio.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(y_int, limits=[source_shift, 0], samples=True) + metadata.trim_meta(y_int, limits=[-source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the existing intermediate OSBA object to shift it right - metadata.trim_meta(y_int, limits=[-delta_offset, 0], samples=True) + metadata.trim_meta(y_int, 
limits=[delta_offset, 0], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) + metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y_int.audio) @@ -472,29 +494,16 @@ def generate_OMASA_scene( noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") y_int.audio += noise - # trim the output signal such if the total duration exceeds X seconds + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: -<<<<<<< Updated upstream - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y_int.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y_int.audio) != duration: - metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True) -======= # convert from seconds to samples (ensure multiple of 20ms) duration = int( np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len ) # check if the current length of the output signal exceeds the duration - if len(y.audio) > duration: - metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) ->>>>>>> Stashed changes + if len(y_int.audio) > duration: + metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index 67e493b5..64921b7d 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -31,6 +31,7 @@ # import logging 
+import re import sys from itertools import groupby, repeat from pathlib import Path @@ -187,6 +188,7 @@ def generate_OSBA_scene( # repeat for all source files offset = 0 + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -210,23 +212,40 @@ def generate_OSBA_scene( else scene["elevation"] ) - # read the shift time in seconds + # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error(f"Unable to parse {scene_shift_str}. 
The specification of overlap or reference is incorrect!") + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -282,6 +301,9 @@ def generate_OSBA_scene( # read source file x = audio.fromfile(fmt, input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -403,21 +425,21 @@ def generate_OSBA_scene( # if ISM, append object position to the OSBA object y.object_pos = x.object_pos.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(y, limits=[source_shift, 0], samples=True) + metadata.trim_meta(y, limits=[-source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the previous ISM signal(s) to shift them right - metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True) + metadata.trim_meta(y, limits=[delta_offset, 0], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) + metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True) # adjust the length of the audio source signal delta_length 
= len(x.audio) - len(y.audio) diff --git a/ivas_processing_scripts/generation/generate_sba_items.py b/ivas_processing_scripts/generation/generate_sba_items.py index f40ba9d8..36c8b828 100644 --- a/ivas_processing_scripts/generation/generate_sba_items.py +++ b/ivas_processing_scripts/generation/generate_sba_items.py @@ -31,6 +31,7 @@ # import logging +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -201,6 +202,7 @@ def generate_sba_scene( # repeat for all source files offset = 0 + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -214,23 +216,40 @@ def generate_sba_scene( ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) - # read the shift time in seconds + # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error(f"Unable to parse {scene_shift_str}. 
The specification of overlap or reference is incorrect!") + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -266,6 +285,9 @@ def generate_sba_scene( # read source file x = audio.fromfile("MONO", input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -310,26 +332,26 @@ def generate_sba_scene( # add source signal to the array of all source signals y.audio = x.audio.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the new audio source signal to shift it right y.audio = audioarray.trim_meta( - y.audio, y.fs, limits=[source_shift, 0], samples=True + y.audio, y.fs, limits=[-source_shift, 0], samples=True ) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the existing output signal to shift it right y.audio = audioarray.trim( - y.audio, y.fs, limits=[-delta_offset, 0], samples=True + y.audio, y.fs, limits=[delta_offset, 0], samples=True ) offset = source_shift else: # insert zeros to the new audio source signal to shift it right x.audio = audioarray.trim( - x.audio, x.fs, limits=[delta_offset, 0], samples=True + x.audio, x.fs, limits=[-delta_offset, 0], samples=True ) # adjust the length of 
the audio source signal diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py index 6bf1e95e..9bc6a73d 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -32,6 +32,7 @@ import logging import os +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -207,6 +208,7 @@ def generate_stereo_scene( # repeat for all source files offset = 0 + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -220,23 +222,40 @@ def generate_stereo_scene( ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) - # read the shift time in seconds + # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error(f"Unable to parse {scene_shift_str}. 
The specification of overlap or reference is incorrect!") + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -284,6 +303,9 @@ def generate_stereo_scene( # read the IR file (!must be in STEREO format!) IR = audio.fromfile("STEREO", IR_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # convolve MONO source audio with STEREO IR -> results in STEREO audio object x = reverb_stereo(x, IR, mode=None) @@ -311,26 +333,26 @@ def generate_stereo_scene( # add source signal to the array of all source signals y.audio = x.audio.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the new audio source signal to shift it right y.audio = audioarray.trim( - y.audio, x.fs, limits=[source_shift, 0], samples=True + y.audio, x.fs, limits=[-source_shift, 0], samples=True ) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the existing output signal to shift it right y.audio = audioarray.trim( - y.audio, y.fs, limits=[-delta_offset, 0], samples=True + y.audio, y.fs, limits=[delta_offset, 0], samples=True ) offset = source_shift else: # insert zeros to the new audio source signal to shift it right x.audio = audioarray.trim( - x.audio, x.fs, limits=[delta_offset, 0], samples=True + x.audio, 
x.fs, limits=[-delta_offset, 0], samples=True ) # adjust the length of the audio source signal -- GitLab From 85be2fd0eb514e124ab462892aed7925aca0bf7e Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Tue, 12 Aug 2025 14:55:23 +0200 Subject: [PATCH 3/5] formatting --- .../generation/generate_ismN_items.py | 8 ++++++-- .../generation/generate_mc_items.py | 17 +++++++++++------ .../generation/generate_omasa_items.py | 16 ++++++++++------ .../generation/generate_osba_items.py | 12 +++++++----- .../generation/generate_sba_items.py | 12 +++++++----- .../generation/generate_stereo_items.py | 12 +++++++----- 6 files changed, 48 insertions(+), 29 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index f57f58c3..3c474309 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -239,14 +239,18 @@ def generate_ismN_scene( # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref - match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] - logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" 
+ ) sys.exit(-1) # calculate absolute shift of the source signal in seconds diff --git a/ivas_processing_scripts/generation/generate_mc_items.py b/ivas_processing_scripts/generation/generate_mc_items.py index a37a3710..35dcbb3b 100644 --- a/ivas_processing_scripts/generation/generate_mc_items.py +++ b/ivas_processing_scripts/generation/generate_mc_items.py @@ -236,14 +236,18 @@ def generate_MC_scene( # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref - match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] - logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" 
+ ) sys.exit(-1) # calculate absolute shift of the source signal in seconds @@ -418,14 +422,15 @@ def generate_MC_scene( # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) - duration = int( - np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len - ) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y_int.audio) > duration: y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True + y_int.audio, + y_int.fs, + limits=[0, len(y_int.audio) - duration], + samples=True, ) # adjust the loudness of the output signal diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 73831922..0881c7ca 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -227,14 +227,18 @@ def generate_OMASA_scene( # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref - match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] - logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" 
+ ) sys.exit(-1) # calculate absolute shift of the source signal in seconds @@ -497,13 +501,13 @@ def generate_OMASA_scene( # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) - duration = int( - np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len - ) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y_int.audio) > duration: - metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True) + metadata.trim_meta( + y_int, limits=[0, len(y_int.audio) - duration], samples=True + ) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index 64921b7d..8d2ca0d8 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -224,14 +224,18 @@ def generate_OSBA_scene( # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref - match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] - logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" 
+ ) sys.exit(-1) # calculate absolute shift of the source signal in seconds @@ -483,9 +487,7 @@ def generate_OSBA_scene( # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) - duration = int( - np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len - ) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y.audio) > duration: diff --git a/ivas_processing_scripts/generation/generate_sba_items.py b/ivas_processing_scripts/generation/generate_sba_items.py index 36c8b828..631d6165 100644 --- a/ivas_processing_scripts/generation/generate_sba_items.py +++ b/ivas_processing_scripts/generation/generate_sba_items.py @@ -228,14 +228,18 @@ def generate_sba_scene( # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref - match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] - logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" 
+ ) sys.exit(-1) # calculate absolute shift of the source signal in seconds @@ -389,9 +393,7 @@ def generate_sba_scene( # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) - duration = int( - np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len - ) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y.audio) > duration: diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py index 9bc6a73d..1ad8a6ae 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -234,14 +234,18 @@ def generate_stereo_scene( # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref - match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] - logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" 
+ ) sys.exit(-1) # calculate absolute shift of the source signal in seconds @@ -390,9 +394,7 @@ def generate_stereo_scene( # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) - duration = int( - np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len - ) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y.audio) > duration: -- GitLab From 68510c78eca959856594de807d6b2b578bec4183 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Wed, 13 Aug 2025 10:52:35 +0200 Subject: [PATCH 4/5] update examples --- examples/ITEM_GENERATION_3ISM.yml | 7 +++++-- examples/ITEM_GENERATION_5_1_4.yml | 3 ++- examples/ITEM_GENERATION_FOA.yml | 7 ++++--- examples/ITEM_GENERATION_MASA.yml | 3 ++- examples/ITEM_GENERATION_OMASA.yml | 7 +++++-- examples/ITEM_GENERATION_OSBA.yml | 7 +++++-- examples/ITEM_GENERATION_STEREO.yml | 5 +++-- 7 files changed, 26 insertions(+), 13 deletions(-) diff --git a/examples/ITEM_GENERATION_3ISM.yml b/examples/ITEM_GENERATION_3ISM.yml index e770cadf..53dd0ded 100644 --- a/examples/ITEM_GENERATION_3ISM.yml +++ b/examples/ITEM_GENERATION_3ISM.yml @@ -37,7 +37,7 @@ duration: 8 add_low_level_random_noise: true ### Process with parallel streams -multiprocessing: False +multiprocessing: false ################################################ ### Item generation - Filename conventions @@ -93,7 +93,10 @@ provider: "va" ### input: input filename(s) ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the 
reference signal i_ref (0-based index) (positive value creates gap) +### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) +### background_level: normalized background noise loudness to X dB LKFS ### ### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder) ### Note 1: use brackets [val1, val2, ...] when specifying multiple values diff --git a/examples/ITEM_GENERATION_5_1_4.yml b/examples/ITEM_GENERATION_5_1_4.yml index 4670d197..2a0dbd27 100644 --- a/examples/ITEM_GENERATION_5_1_4.yml +++ b/examples/ITEM_GENERATION_5_1_4.yml @@ -94,7 +94,8 @@ provider: "va" ### IR: filenames(s) of the input IRs ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) ### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) ### background_level: normalized background noise loudness to X dB LKFS ### diff --git a/examples/ITEM_GENERATION_FOA.yml b/examples/ITEM_GENERATION_FOA.yml index 879735d4..016c5fcf 100644 --- a/examples/ITEM_GENERATION_FOA.yml +++ b/examples/ITEM_GENERATION_FOA.yml @@ -34,10 +34,10 @@ fade_in_out: 0.5 duration: 8 ### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence) -add_low_level_random_noise: False +add_low_level_random_noise: false ### Process with parallel streams -multiprocessing: False +multiprocessing: false ################################################ ### Item generation - Filename 
conventions @@ -94,7 +94,8 @@ use_output_prefix: "leee" ### IR: filenames(s) of the input IRs ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) ### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) ### background_level: normalized background noise loudness to X dB LKFS ### diff --git a/examples/ITEM_GENERATION_MASA.yml b/examples/ITEM_GENERATION_MASA.yml index 958a69cb..715b20c4 100644 --- a/examples/ITEM_GENERATION_MASA.yml +++ b/examples/ITEM_GENERATION_MASA.yml @@ -94,7 +94,8 @@ provider: "va" ### IR: filenames(s) of the input IRs ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) ### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) ### background_level: normalized background noise loudness to X dB LKFS ### diff --git a/examples/ITEM_GENERATION_OMASA.yml b/examples/ITEM_GENERATION_OMASA.yml index 942ad1c7..462bc54e 100644 --- a/examples/ITEM_GENERATION_OMASA.yml +++ b/examples/ITEM_GENERATION_OMASA.yml @@ -37,7 +37,7 @@ duration: 8 add_low_level_random_noise: true ### Process with parallel streams -multiprocessing: False 
+multiprocessing: false ################################################ ### Item generation - Filename conventions @@ -93,7 +93,10 @@ provider: "va" ### input: input filename(s) ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) +### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) +### background_level: normalized background noise loudness to X dB LKFS ### ### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder) ### Note 1: use brackets [val1, val2, ...] 
when specifying multiple values diff --git a/examples/ITEM_GENERATION_OSBA.yml b/examples/ITEM_GENERATION_OSBA.yml index f7c33b49..3b696838 100644 --- a/examples/ITEM_GENERATION_OSBA.yml +++ b/examples/ITEM_GENERATION_OSBA.yml @@ -37,7 +37,7 @@ duration: 8 add_low_level_random_noise: true ### Process with parallel streams -multiprocessing: False +multiprocessing: false ################################################ ### Item generation - Filename conventions @@ -93,7 +93,10 @@ provider: "va" ### input: input filename(s) ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) +### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) +### background_level: normalized background noise loudness to X dB LKFS ### ### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder) ### Note 1: use brackets [val1, val2, ...] 
when specifying multiple values diff --git a/examples/ITEM_GENERATION_STEREO.yml b/examples/ITEM_GENERATION_STEREO.yml index 48c6aa61..78426358 100644 --- a/examples/ITEM_GENERATION_STEREO.yml +++ b/examples/ITEM_GENERATION_STEREO.yml @@ -37,7 +37,7 @@ duration: 8 add_low_level_random_noise: true ### Process with parallel streams -multiprocessing: False +multiprocessing: false ################################################ ### Item generation - Filename conventions @@ -94,7 +94,8 @@ provider: "g" ### IR: filenames(s) of the input IRs ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) ### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) ### background_level: normalized background noise loudness to X dB LKFS ### -- GitLab From a8b5728f5e29a03896f0244ce5f12e70d3d16882 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Wed, 13 Aug 2025 10:55:39 +0200 Subject: [PATCH 5/5] update doc --- README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 02f00dde..d47e2ea3 100755 --- a/README.md +++ b/README.md @@ -76,13 +76,11 @@ Each entry under `scenes:` describes one test item, specifying: - `input`: list of mono `.wav` files - `azimuth` / `elevation`: spatial placement (°) - `level`: loudness in dB -- `shift`: timing offsets in seconds +- `shift`: signal offset/overlap in seconds - `background`: background noise file (applicable to STEREO and SBA only) - `background_level`: level of the background noise (applicable to STEREO and SBA only) -Dynamic 
positioning (e.g., `"-20:1.0:360"`) means the source will move over time, stepping every 20 ms. - -The total duration of the output signal can be controlled using the `duration` field. The output signal may optionally be rendered to the BINAURAL format by specifying the `binaural_output` field. +Dynamic positioning (e.g., `"-20:1.0:360"`) means the source will move over time, stepping every 20 ms. The maximum total duration of the output signal can be controlled using the `duration` field. The output signal may optionally be rendered to the BINAURAL format by specifying the `binaural_output` field. The `shift` parameter ensures time adjustment (offset) of the input signal (positive value delays the signal). Alternatively, the notation `X(i_ref)` generates overlap by `X` seconds from the reference signal `i_ref` (0-based index) (positive value creates gap). Start by running a single scene to verify settings. Output includes both audio and optional metadata files. You can enable multiprocessing by setting `multiprocessing: true`. -- GitLab