diff --git a/README.md b/README.md index 02f00dde932e8bcd03c6a1f62b19a75597d54ebf..d47e2ea3f79c18b73d95da6790dc4f1ce1fa8073 100755 --- a/README.md +++ b/README.md @@ -76,13 +76,11 @@ Each entry under `scenes:` describes one test item, specifying: - `input`: list of mono `.wav` files - `azimuth` / `elevation`: spatial placement (°) - `level`: loudness in dB -- `shift`: timing offsets in seconds +- `shift`: signal offset/overlap in seconds - `background`: background noise file (applicable to STEREO and SBA only) - `background_level`: level of the background noise (applicable to STEREO and SBA only) -Dynamic positioning (e.g., `"-20:1.0:360"`) means the source will move over time, stepping every 20 ms. - -The total duration of the output signal can be controlled using the `duration` field. The output signal may optionally be rendered to the BINAURAL format by specifying the `binaural_output` field. +Dynamic positioning (e.g., `"-20:1.0:360"`) means the source will move over time, stepping every 20 ms. The maximum total duration of the output signal can be controlled using the `duration` field. The output signal may optionally be rendered to the BINAURAL format by specifying the `binaural_output` field. The `shift` parameter applies a time offset to the input signal (a positive value delays the signal). Alternatively, the notation `X(i_ref)` positions the signal `X` seconds from the end of the reference signal `i_ref` (0-based index): a positive value creates a gap, a negative value creates an overlap. Start by running a single scene to verify settings. Output includes both audio and optional metadata files. You can enable multiprocessing by setting `multiprocessing: true`. 
diff --git a/examples/ITEM_GENERATION_3ISM.yml b/examples/ITEM_GENERATION_3ISM.yml index e770cadf706b775682b8339677f64e863b0849b2..53dd0dedf501393b4ac35b7f19fbcd39686e23d7 100644 --- a/examples/ITEM_GENERATION_3ISM.yml +++ b/examples/ITEM_GENERATION_3ISM.yml @@ -37,7 +37,7 @@ duration: 8 add_low_level_random_noise: true ### Process with parallel streams -multiprocessing: False +multiprocessing: false ################################################ ### Item generation - Filename conventions @@ -93,7 +93,10 @@ provider: "va" ### input: input filename(s) ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) +### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) +### background_level: normalized background noise loudness to X dB LKFS ### ### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder) ### Note 1: use brackets [val1, val2, ...] 
when specifying multiple values diff --git a/examples/ITEM_GENERATION_5_1_4.yml b/examples/ITEM_GENERATION_5_1_4.yml index 4670d1979eefe3b9fc0a0aaeb6521c9eb1aadf8c..2a0dbd278315dfff0deac6e9fcdcc587d5286ed4 100644 --- a/examples/ITEM_GENERATION_5_1_4.yml +++ b/examples/ITEM_GENERATION_5_1_4.yml @@ -94,7 +94,8 @@ provider: "va" ### IR: filenames(s) of the input IRs ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) ### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) ### background_level: normalized background noise loudness to X dB LKFS ### diff --git a/examples/ITEM_GENERATION_FOA.yml b/examples/ITEM_GENERATION_FOA.yml index 879735d42f6dc46fa9c996e98de087898b0fadc1..016c5fcfa268ca657667c0a18473eeec98aca06a 100644 --- a/examples/ITEM_GENERATION_FOA.yml +++ b/examples/ITEM_GENERATION_FOA.yml @@ -34,10 +34,10 @@ fade_in_out: 0.5 duration: 8 ### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence) -add_low_level_random_noise: False +add_low_level_random_noise: false ### Process with parallel streams -multiprocessing: False +multiprocessing: false ################################################ ### Item generation - Filename conventions @@ -94,7 +94,8 @@ use_output_prefix: "leee" ### IR: filenames(s) of the input IRs ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### 
shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) ### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) ### background_level: normalized background noise loudness to X dB LKFS ### diff --git a/examples/ITEM_GENERATION_MASA.yml b/examples/ITEM_GENERATION_MASA.yml index 958a69cb5e27c5d710a10c0af07c5501b2516b0b..715b20c4932eb6b21ebcd9642b55d4b8c4805bae 100644 --- a/examples/ITEM_GENERATION_MASA.yml +++ b/examples/ITEM_GENERATION_MASA.yml @@ -94,7 +94,8 @@ provider: "va" ### IR: filenames(s) of the input IRs ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) ### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) ### background_level: normalized background noise loudness to X dB LKFS ### diff --git a/examples/ITEM_GENERATION_OMASA.yml b/examples/ITEM_GENERATION_OMASA.yml index 942ad1c7d323a88dc9fd057fc12970e767bb801f..462bc54e116c8a73869d7b79206435f97e024009 100644 --- a/examples/ITEM_GENERATION_OMASA.yml +++ b/examples/ITEM_GENERATION_OMASA.yml @@ -37,7 +37,7 @@ duration: 8 add_low_level_random_noise: true ### Process with parallel streams -multiprocessing: False +multiprocessing: false ################################################ ### Item generation - Filename conventions @@ -93,7 +93,10 @@ provider: "va" ### input: input filename(s) ### azimuth: azimuth in the range 
[-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) +### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) +### background_level: normalized background noise loudness to X dB LKFS ### ### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder) ### Note 1: use brackets [val1, val2, ...] when specifying multiple values diff --git a/examples/ITEM_GENERATION_OSBA.yml b/examples/ITEM_GENERATION_OSBA.yml index f7c33b490f6edd62ffa2d1e1faf0b582bf614b88..3b6968381d205f73bdcf9bef3b83ceae830da258 100644 --- a/examples/ITEM_GENERATION_OSBA.yml +++ b/examples/ITEM_GENERATION_OSBA.yml @@ -37,7 +37,7 @@ duration: 8 add_low_level_random_noise: true ### Process with parallel streams -multiprocessing: False +multiprocessing: false ################################################ ### Item generation - Filename conventions @@ -93,7 +93,10 @@ provider: "va" ### input: input filename(s) ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) +### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) 
+### background_level: normalized background noise loudness to X dB LKFS ### ### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder) ### Note 1: use brackets [val1, val2, ...] when specifying multiple values diff --git a/examples/ITEM_GENERATION_STEREO.yml b/examples/ITEM_GENERATION_STEREO.yml index 48c6aa61d28ed0a481d35fafd745f5c0e842ab15..784263588b4f5390f3081799f93cad533b13b91f 100644 --- a/examples/ITEM_GENERATION_STEREO.yml +++ b/examples/ITEM_GENERATION_STEREO.yml @@ -37,7 +37,7 @@ duration: 8 add_low_level_random_noise: true ### Process with parallel streams -multiprocessing: False +multiprocessing: false ################################################ ### Item generation - Filename conventions @@ -94,7 +94,8 @@ provider: "g" ### IR: filenames(s) of the input IRs ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up -### shift: time adjustment of the input signal (negative value delays the signal) +### shift: time adjustment of the input signal (positive value delays the signal) +### alternatively, the notation X(i_ref) generates overlap by X seconds from the reference signal i_ref (0-based index) (positive value creates gap) ### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) ### background_level: normalized background noise loudness to X dB LKFS ### diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index 489dbea486f9fdbe713024cfdab652945c309ef3..3c474309e24763e35b6c13c906448cb68bc565b0 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -30,6 +30,7 @@ # the United Nations Convention on Contracts on the International Sales of Goods. 
# import logging +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -196,6 +197,7 @@ def generate_ismN_scene( # repeat for all source files offset = 0 + end_position = [] for i in range(N_inputs): # read input filename source_file = ( @@ -232,16 +234,37 @@ def generate_ismN_scene( if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" + ) + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -278,6 +301,9 @@ def generate_ismN_scene( x = audio.fromtype("ISM1") x.audio, x.fs = audiofile.read(input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -288,12 +314,12 @@ def generate_ismN_scene( x.fs = cfg.fs # adjust the level of the audio 
source file (need to convert to MONO first) - if level is None: - # do not change the level of the audio source signal - logger.info("-- Level of the audio source signal is not changed") - elif np.isinf(level): + if np.isinf(level): # set all channels to zero x.audio = np.zeros_like(x.audio) + elif level is None: + # do not change the level of the audio source signal + logger.info("-- Level of the audio source signal is not changed") else: x_temp = audio.ChannelBasedAudio( "MONO" @@ -391,21 +417,21 @@ def generate_ismN_scene( y.object_pos = x.object_pos.copy() y.fs = x.fs - if source_shift < 0: + if source_shift > 0: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(y, limits=[source_shift, 0], samples=True) + metadata.trim_meta(y, limits=[-source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the previous ISM signal(s) to shift them right - metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True) + metadata.trim_meta(y, limits=[delta_offset, 0], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) + metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y.audio) @@ -443,18 +469,14 @@ def generate_ismN_scene( noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") y.audio += noise - # adjust the length of the output signal + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y.audio) - duration = 
int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y.audio) != duration: - metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) + # convert from seconds to samples (ensure multiple of 20ms) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) + + # check if the current length of the output signal exceeds the duration + if len(y.audio) > duration: + metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_masa_items.py b/ivas_processing_scripts/generation/generate_masa_items.py index 7a4258232b83f9833edbe7e03a6a49de68ced734..6ddac8707f566cef93ee5542662ea7b9e740420b 100644 --- a/ivas_processing_scripts/generation/generate_masa_items.py +++ b/ivas_processing_scripts/generation/generate_masa_items.py @@ -31,6 +31,7 @@ # import logging +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -209,6 +210,7 @@ def generate_MASA_scene( # repeat for all source files offset = 0 y_int = None + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -232,13 +234,44 @@ def generate_MASA_scene( else: source_shift = 0.0 + # read the source shift length (in seconds) + if "shift" in scene.keys(): + source_shift = ( + scene["shift"][i] + if isinstance(scene["shift"], list) + else scene["shift"] + ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error( 
+ f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" + ) + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] - overlap + else: + source_shift = 0.0 + # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -295,6 +328,9 @@ def generate_MASA_scene( # read source file x = audio.fromfile("MONO", input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -316,12 +352,12 @@ def generate_MASA_scene( x = reverb_hoa3(x, IR, mode=None) # adjust the level of the FOA/HOA2/HOA3 signal - if level is None: - # do not change the level of the audio source signal - logger.info("-- Level of the audio source signal is not changed") - elif np.isinf(level): + if np.isinf(level): # set all channels to zero x.audio = np.zeros_like(x.audio) + elif level is None: + # do not change the level of the audio source signal + logger.info("-- Level of the audio source signal is not changed") else: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") @@ -339,26 +375,26 @@ def generate_MASA_scene( # this is the first SBA source signal y_int.audio = x.audio.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the first SBA source signal to shift it right y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True + y_int.audio, y_int.fs, limits=[-source_shift, 0], 
samples=True ) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the output SBA signal to shift it right y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True + y_int.audio, y_int.fs, limits=[delta_offset, 0], samples=True ) offset = source_shift else: # insert zeros to the new SBA source signal to shift it right x.audio = audioarray.trim( - x.audio, x.fs, limits=[delta_offset, 0], samples=True + x.audio, x.fs, limits=[-delta_offset, 0], samples=True ) # adjust the length of the audio source signal @@ -393,20 +429,19 @@ def generate_MASA_scene( y_int.audio, y_int.fs, limits=[-preamble, -postamble], samples=True ) - # adjust the length of the output signal + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y_int.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y_int.audio) != duration: - y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True - ) + # convert from seconds to samples (ensure multiple of 20ms) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) + + # check if the current length of the output signal exceeds the duration + if len(y_int.audio) > duration: + y_int.audio = audioarray.trim( + y_int.audio, + y_int.fs, + limits=[0, len(y_int.audio) - duration], + samples=True, + ) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_mc_items.py b/ivas_processing_scripts/generation/generate_mc_items.py index 
df1b16451e4e3d12a48f31c9977f8e435ce5b8bd..35dcbb3be7be6b646c39e6f16b1ae84bfa8bf50b 100644 --- a/ivas_processing_scripts/generation/generate_mc_items.py +++ b/ivas_processing_scripts/generation/generate_mc_items.py @@ -31,6 +31,7 @@ # import logging +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -209,6 +210,7 @@ def generate_MC_scene( # repeat for all source files offset = 0 y_int = None + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -222,23 +224,44 @@ def generate_MC_scene( ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) - # read the shift time in seconds + # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" 
+ ) + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -295,6 +318,9 @@ def generate_MC_scene( # read source file x = audio.fromfile("MONO", input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -339,26 +365,26 @@ def generate_MC_scene( # this is the first SBA source signal y_int.audio = x.audio.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the first SBA source signal to shift it right y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True + y_int.audio, y_int.fs, limits=[-source_shift, 0], samples=True ) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the output SBA signal to shift it right y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True + y_int.audio, y_int.fs, limits=[delta_offset, 0], samples=True ) offset = source_shift else: # insert zeros to the new SBA source signal to shift it right x.audio = audioarray.trim( - x.audio, x.fs, limits=[delta_offset, 0], samples=True + x.audio, x.fs, limits=[-delta_offset, 0], samples=True ) # adjust the length of the audio source signal @@ -393,20 
+419,19 @@ def generate_MC_scene( y_int.audio, y_int.fs, limits=[-preamble, -postamble], samples=True ) - # adjust the length of the output signal + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y_int.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y_int.audio) != duration: - y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True - ) + # convert from seconds to samples (ensure multiple of 20ms) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) + + # check if the current length of the output signal exceeds the duration + if len(y_int.audio) > duration: + y_int.audio = audioarray.trim( + y_int.audio, + y_int.fs, + limits=[0, len(y_int.audio) - duration], + samples=True, + ) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 93f003053ebb04633ee8d0a679c444a5c9a98f81..0881c7ca200e5cf639d23bfd9a1f0edd1b205b9e 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -31,6 +31,7 @@ # import logging +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -183,6 +184,7 @@ def generate_OMASA_scene( # repeat for all source files offset = 0 + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -220,16 +222,37 @@ def generate_OMASA_scene( if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X 
specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" + ) + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -300,6 +323,9 @@ def generate_OMASA_scene( # read source file x = audio.fromfile(fmt, input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -417,21 +443,21 @@ def generate_OMASA_scene( # add the first audio source signal (should be FOA/HOA2/HOA3) to the array of all source signals y_int.audio = x.audio.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(y_int, limits=[source_shift, 0], samples=True) + metadata.trim_meta(y_int, limits=[-source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if 
delta_offset > 0: + if delta_offset < 0: # insert zeros to the existing intermediate OSBA object to shift it right - metadata.trim_meta(y_int, limits=[-delta_offset, 0], samples=True) + metadata.trim_meta(y_int, limits=[delta_offset, 0], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) + metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y_int.audio) @@ -472,18 +498,16 @@ def generate_OMASA_scene( noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") y_int.audio += noise - # adjust the length of the output signal + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y_int.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y_int.audio) != duration: - metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True) + # convert from seconds to samples (ensure multiple of 20ms) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) + + # check if the current length of the output signal exceeds the duration + if len(y_int.audio) > duration: + metadata.trim_meta( + y_int, limits=[0, len(y_int.audio) - duration], samples=True + ) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index dd8f5b5d2a7a6ea56c98470e6c8b715add248da4..8d2ca0d85209e6e7e86f24b19daeb8fb6bb56465 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ 
b/ivas_processing_scripts/generation/generate_osba_items.py @@ -31,6 +31,7 @@ # import logging +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -187,6 +188,7 @@ def generate_OSBA_scene( # repeat for all source files offset = 0 + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -210,23 +212,44 @@ def generate_OSBA_scene( else scene["elevation"] ) - # read the shift time in seconds + # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" 
+ ) + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -282,6 +305,9 @@ def generate_OSBA_scene( # read source file x = audio.fromfile(fmt, input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -403,21 +429,21 @@ def generate_OSBA_scene( # if ISM, append object position to the OSBA object y.object_pos = x.object_pos.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(y, limits=[source_shift, 0], samples=True) + metadata.trim_meta(y, limits=[-source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the previous ISM signal(s) to shift them right - metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True) + metadata.trim_meta(y, limits=[delta_offset, 0], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) + metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y.audio) @@ -458,18 +484,14 @@ 
def generate_OSBA_scene( noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") y.audio += noise - # adjust the length of the output signal + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y.audio) != duration: - metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) + # convert from seconds to samples (ensure multiple of 20ms) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) + + # check if the current length of the output signal exceeds the duration + if len(y.audio) > duration: + metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_sba_items.py b/ivas_processing_scripts/generation/generate_sba_items.py index 28fbababc787ef2d67a961deab39eef9330ad165..631d6165481f1cc09ccfe56335b3af86ec16e4b4 100644 --- a/ivas_processing_scripts/generation/generate_sba_items.py +++ b/ivas_processing_scripts/generation/generate_sba_items.py @@ -31,6 +31,7 @@ # import logging +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -201,6 +202,7 @@ def generate_sba_scene( # repeat for all source files offset = 0 + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -214,23 +216,44 @@ def generate_sba_scene( ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) - # read the shift time in seconds + # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( 
scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" + ) + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -266,6 +289,9 @@ def generate_sba_scene( # read source file x = audio.fromfile("MONO", input_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( @@ -310,26 +336,26 @@ def generate_sba_scene( # add source signal to the array of all source signals y.audio = x.audio.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the new audio source signal to shift it right y.audio = audioarray.trim_meta( - y.audio, y.fs, limits=[source_shift, 0], samples=True + y.audio, y.fs, limits=[-source_shift, 0], samples=True ) else: offset = 
source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the existing output signal to shift it right y.audio = audioarray.trim( - y.audio, y.fs, limits=[-delta_offset, 0], samples=True + y.audio, y.fs, limits=[delta_offset, 0], samples=True ) offset = source_shift else: # insert zeros to the new audio source signal to shift it right x.audio = audioarray.trim( - x.audio, x.fs, limits=[delta_offset, 0], samples=True + x.audio, x.fs, limits=[-delta_offset, 0], samples=True ) # adjust the length of the audio source signal @@ -364,20 +390,16 @@ def generate_sba_scene( y.audio, y.fs, limits=[-preamble, -postamble], samples=True ) - # adjust the length of the output signal + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y.audio) != duration: - y.audio = audioarray.trim( - y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True - ) + # convert from seconds to samples (ensure multiple of 20ms) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) + + # check if the current length of the output signal exceeds the duration + if len(y.audio) > duration: + y.audio = audioarray.trim( + y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True + ) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py index a0d99f90cad67a8288b891f255f12cc106beba8a..1ad8a6ae47936af4a8ba9d71ab09634525fa2b95 100644 --- 
a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -32,6 +32,7 @@ import logging import os +import re import sys from itertools import groupby, repeat from pathlib import Path @@ -207,6 +208,7 @@ def generate_stereo_scene( # repeat for all source files offset = 0 + end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( @@ -220,23 +222,44 @@ def generate_stereo_scene( ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) - # read the shift time in seconds + # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) + + # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index + # of the reference signal (0-based index) + if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: + # extract X and i_ref + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) + + if match: + overlap = float(match.group(1)) + overlap_ref = int(match.group(2)) + else: + scene_shift_str = scene["shift"][i] + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" 
+ ) + sys.exit(-1) + + # calculate absolute shift of the source signal in seconds + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms - source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): @@ -284,6 +307,9 @@ def generate_stereo_scene( # read the IR file (!must be in STEREO format!) IR = audio.fromfile("STEREO", IR_filename) + # record the total duration of the source signal, taking into account the shift of the starting position + end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) + # convolve MONO source audio with STEREO IR -> results in STEREO audio object x = reverb_stereo(x, IR, mode=None) @@ -311,26 +337,26 @@ def generate_stereo_scene( # add source signal to the array of all source signals y.audio = x.audio.copy() - if source_shift < 0: + if source_shift > 0: # insert zeros to the new audio source signal to shift it right y.audio = audioarray.trim( - y.audio, x.fs, limits=[source_shift, 0], samples=True + y.audio, x.fs, limits=[-source_shift, 0], samples=True ) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset - if delta_offset > 0: + if delta_offset < 0: # insert zeros to the existing output signal to shift it right y.audio = audioarray.trim( - y.audio, y.fs, limits=[-delta_offset, 0], samples=True + y.audio, y.fs, limits=[delta_offset, 0], samples=True ) offset = source_shift else: # insert zeros to the new audio source signal to shift it right x.audio = audioarray.trim( - x.audio, x.fs, limits=[delta_offset, 0], samples=True + x.audio, x.fs, limits=[-delta_offset, 0], samples=True ) # adjust 
the length of the audio source signal @@ -365,20 +391,16 @@ def generate_stereo_scene( y.audio, y.fs, limits=[-preamble, -postamble], samples=True ) - # adjust the length of the output signal + # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: - # trim the output signal such that the total duration is X seconds - duration = int(cfg.duration * cfg.fs) # convert to samples - else: - # do not change the length of the audio signal - duration = len(y.audio) - duration = int( - np.floor(duration / frame_len) * frame_len - ) # ensure multiple of 20ms - if len(y.audio) != duration: - y.audio = audioarray.trim( - y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True - ) + # convert from seconds to samples (ensure multiple of 20ms) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) + + # check if the current length of the output signal exceeds the duration + if len(y.audio) > duration: + y.audio = audioarray.trim( + y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True + ) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: