Loading ivas_processing_scripts/generation/generate_ismN_items.py +34 −12 Original line number Diff line number Diff line Loading @@ -30,6 +30,7 @@ # the United Nations Convention on Contracts on the International Sales of Goods. # import logging import re import sys from itertools import groupby, repeat from pathlib import Path Loading Loading @@ -196,6 +197,7 @@ def generate_ismN_scene( # repeat for all source files offset = 0 end_position = [] for i in range(N_inputs): # read input filename source_file = ( Loading Loading @@ -232,16 +234,33 @@ def generate_ismN_scene( if isinstance(scene["shift"], list) else scene["shift"] ) # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") sys.exit(-1) # calculate absolute shift of the source signal in seconds source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): Loading Loading @@ -278,6 +297,9 @@ def generate_ismN_scene( x = audio.fromtype("ISM1") x.audio, x.fs = audiofile.read(input_filename) # record the total duration of the source signal, taking into account the shift of the starting position end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( Loading @@ -288,12 +310,12 @@ def generate_ismN_scene( x.fs = cfg.fs # adjust the level of the audio source file (need to convert to MONO first) if level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") elif np.isinf(level): if np.isinf(level): # set all channels to zero x.audio = np.zeros_like(x.audio) elif level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") else: x_temp = audio.ChannelBasedAudio( "MONO" Loading Loading @@ -391,21 +413,21 @@ def generate_ismN_scene( y.object_pos = x.object_pos.copy() y.fs = x.fs if source_shift < 0: if source_shift > 0: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(y, limits=[source_shift, 0], samples=True) metadata.trim_meta(y, limits=[-source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset if delta_offset > 0: if delta_offset < 0: # insert zeros to the previous ISM signal(s) to shift them right metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True) metadata.trim_meta(y, limits=[delta_offset, 0], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y.audio) Loading Loading @@ -443,14 +465,14 @@ def generate_ismN_scene( noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") y.audio += noise # trim the output signal such if the total duration exceeds X seconds # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y.audio) > duration: metadata.trim_meta(y, limits=[0, duration], samples=True) metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: Loading ivas_processing_scripts/generation/generate_masa_items.py +47 −10 Original line number Diff line number Diff line Loading @@ -31,6 +31,7 @@ # import logging import re import sys from itertools import groupby, repeat from pathlib import Path Loading Loading @@ -209,6 +210,7 @@ def generate_MASA_scene( # repeat for all source files offset = 0 y_int = None end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( Loading @@ -232,13 +234,44 @@ def generate_MASA_scene( else: source_shift = 0.0 # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref match = re.match( r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] ) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] logger.error( f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" ) sys.exit(-1) # calculate absolute shift of the source signal in seconds source_shift = end_position[overlap_ref] - overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): Loading Loading @@ -295,6 +328,9 @@ def generate_MASA_scene( # read source file x = audio.fromfile("MONO", input_filename) # record the total duration of the source signal, taking into account the shift of the starting position end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( Loading Loading @@ -339,26 +375,26 @@ def generate_MASA_scene( # this is the first SBA source signal y_int.audio = x.audio.copy() if source_shift < 0: if source_shift > 0: # insert zeros to the first SBA source signal to shift it right y_int.audio = audioarray.trim( y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True y_int.audio, y_int.fs, limits=[-source_shift, 0], samples=True ) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset if delta_offset > 0: if delta_offset < 0: # insert zeros to the output SBA signal to shift it right y_int.audio = audioarray.trim( y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True y_int.audio, y_int.fs, limits=[delta_offset, 0], samples=True ) offset = source_shift else: # insert zeros to the new SBA source signal to shift it right x.audio = audioarray.trim( x.audio, x.fs, limits=[delta_offset, 0], samples=True x.audio, x.fs, limits=[-delta_offset, 0], samples=True ) # adjust the length of the audio source signal Loading Loading @@ -396,14 +432,15 @@ def generate_MASA_scene( # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) duration = int( np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len ) duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y_int.audio) > duration: y_int.audio = audioarray.trim( y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True, ) # adjust the loudness of the output signal Loading ivas_processing_scripts/generation/generate_mc_items.py +29 −7 Original line number Diff line number Diff line Loading @@ -31,6 +31,7 @@ # import logging import re import sys from itertools import groupby, repeat from pathlib import Path Loading Loading @@ -209,6 +210,7 @@ def generate_MC_scene( # repeat for all source files offset = 0 y_int = None end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( Loading @@ -222,23 +224,40 @@ def generate_MC_scene( ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) # read the shift time in seconds # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") sys.exit(-1) # calculate absolute shift of the source signal in seconds source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): Loading Loading @@ -295,6 +314,9 @@ def generate_MC_scene( # read source file x = audio.fromfile("MONO", input_filename) # record the total duration of the source signal, taking into account the shift of the starting position end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( Loading Loading @@ -339,26 +361,26 @@ def generate_MC_scene( # this is the first SBA source signal y_int.audio = x.audio.copy() if source_shift < 0: if source_shift > 0: # insert zeros to the first SBA source signal to shift it right y_int.audio = audioarray.trim( y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True y_int.audio, y_int.fs, limits=[-source_shift, 0], samples=True ) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset if delta_offset > 0: if delta_offset < 0: # insert zeros to the output SBA signal to shift it right y_int.audio = audioarray.trim( y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True y_int.audio, y_int.fs, limits=[delta_offset, 0], samples=True ) offset = source_shift else: # insert zeros to the new SBA source signal to shift it right x.audio = audioarray.trim( x.audio, x.fs, limits=[delta_offset, 0], samples=True x.audio, x.fs, limits=[-delta_offset, 0], samples=True ) # adjust the length of the audio source signal Loading ivas_processing_scripts/generation/generate_omasa_items.py +31 −22 Original line number Diff line number Diff line Loading @@ -31,6 +31,7 @@ # import logging import re import sys from itertools import groupby, repeat from pathlib import Path Loading Loading @@ -183,6 +184,7 @@ def generate_OMASA_scene( # repeat for all source files offset = 0 end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( Loading Loading @@ -220,16 +222,33 @@ def generate_OMASA_scene( if isinstance(scene["shift"], list) else scene["shift"] ) # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") sys.exit(-1) # calculate absolute shift of the source signal in seconds source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): Loading Loading @@ -300,6 +319,9 @@ def generate_OMASA_scene( # read source file x = audio.fromfile(fmt, input_filename) # record the total duration of the source signal, taking into account the shift of the starting position end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( Loading Loading @@ -417,21 +439,21 @@ def generate_OMASA_scene( # add the first audio source signal (should be FOA/HOA2/HOA3) to the array of all source signals y_int.audio = x.audio.copy() if source_shift < 0: if source_shift > 0: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(y_int, limits=[source_shift, 0], samples=True) metadata.trim_meta(y_int, limits=[-source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset if delta_offset > 0: if delta_offset < 0: # insert zeros to the existing intermediate OSBA object to shift it right metadata.trim_meta(y_int, limits=[-delta_offset, 0], samples=True) metadata.trim_meta(y_int, limits=[delta_offset, 0], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y_int.audio) Loading Loading @@ -472,29 +494,16 @@ def generate_OMASA_scene( noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") y_int.audio += noise # trim the output signal such if the total duration exceeds X seconds # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: <<<<<<< Updated upstream # trim the output signal such that the total duration is X seconds duration = int(cfg.duration * cfg.fs) # convert to samples else: # do not change the length of the audio signal duration = len(y_int.audio) duration = int( np.floor(duration / frame_len) * frame_len ) # ensure multiple of 20ms if len(y_int.audio) != duration: metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True) ======= # convert from seconds to samples (ensure multiple of 20ms) duration = int( np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len ) # check if the current length of the output signal exceeds the duration if len(y.audio) > duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) >>>>>>> Stashed changes if len(y_int.audio) > duration: metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: Loading ivas_processing_scripts/generation/generate_osba_items.py +29 −7 Original line number Diff line number Diff line Loading @@ -31,6 +31,7 @@ # import logging import re import sys from itertools import groupby, repeat from pathlib import Path Loading Loading @@ -187,6 +188,7 @@ def generate_OSBA_scene( # repeat for all source files offset = 0 end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( Loading @@ -210,23 +212,40 @@ def generate_OSBA_scene( else scene["elevation"] ) # read the shift time in seconds # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") sys.exit(-1) # calculate absolute shift of the source signal in seconds source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): Loading Loading @@ -282,6 +301,9 @@ def generate_OSBA_scene( # read source file x = audio.fromfile(fmt, input_filename) # record the total duration of the source signal, taking into account the shift of the starting position end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( Loading Loading @@ -403,21 +425,21 @@ def generate_OSBA_scene( # if ISM, append object position to the OSBA object y.object_pos = x.object_pos.copy() if source_shift < 0: if source_shift > 0: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(y, limits=[source_shift, 0], samples=True) metadata.trim_meta(y, limits=[-source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset if delta_offset > 0: if delta_offset < 0: # insert zeros to the previous ISM signal(s) to shift them right metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True) metadata.trim_meta(y, limits=[delta_offset, 0], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y.audio) Loading Loading
ivas_processing_scripts/generation/generate_ismN_items.py +34 −12 Original line number Diff line number Diff line Loading @@ -30,6 +30,7 @@ # the United Nations Convention on Contracts on the International Sales of Goods. # import logging import re import sys from itertools import groupby, repeat from pathlib import Path Loading Loading @@ -196,6 +197,7 @@ def generate_ismN_scene( # repeat for all source files offset = 0 end_position = [] for i in range(N_inputs): # read input filename source_file = ( Loading Loading @@ -232,16 +234,33 @@ def generate_ismN_scene( if isinstance(scene["shift"], list) else scene["shift"] ) # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") sys.exit(-1) # calculate absolute shift of the source signal in seconds source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): Loading Loading @@ -278,6 +297,9 @@ def generate_ismN_scene( x = audio.fromtype("ISM1") x.audio, x.fs = audiofile.read(input_filename) # record the total duration of the source signal, taking into account the shift of the starting position end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( Loading @@ -288,12 +310,12 @@ def generate_ismN_scene( x.fs = cfg.fs # adjust the level of the audio source file (need to convert to MONO first) if level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") elif np.isinf(level): if np.isinf(level): # set all channels to zero x.audio = np.zeros_like(x.audio) elif level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") else: x_temp = audio.ChannelBasedAudio( "MONO" Loading Loading @@ -391,21 +413,21 @@ def generate_ismN_scene( y.object_pos = x.object_pos.copy() y.fs = x.fs if source_shift < 0: if source_shift > 0: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(y, limits=[source_shift, 0], samples=True) metadata.trim_meta(y, limits=[-source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset if delta_offset > 0: if delta_offset < 0: # insert zeros to the previous ISM signal(s) to shift them right metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True) metadata.trim_meta(y, limits=[delta_offset, 0], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y.audio) Loading Loading @@ -443,14 +465,14 @@ def generate_ismN_scene( noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") y.audio += noise # trim the output signal such if the total duration exceeds X seconds # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y.audio) > duration: metadata.trim_meta(y, limits=[0, duration], samples=True) metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: Loading
ivas_processing_scripts/generation/generate_masa_items.py +47 −10 Original line number Diff line number Diff line Loading @@ -31,6 +31,7 @@ # import logging import re import sys from itertools import groupby, repeat from pathlib import Path Loading Loading @@ -209,6 +210,7 @@ def generate_MASA_scene( # repeat for all source files offset = 0 y_int = None end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( Loading @@ -232,13 +234,44 @@ def generate_MASA_scene( else: source_shift = 0.0 # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref match = re.match( r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] ) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] logger.error( f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" ) sys.exit(-1) # calculate absolute shift of the source signal in seconds source_shift = end_position[overlap_ref] - overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): Loading Loading @@ -295,6 +328,9 @@ def generate_MASA_scene( # read source file x = audio.fromfile("MONO", input_filename) # record the total duration of the source signal, taking into account the shift of the starting position end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( Loading Loading @@ -339,26 +375,26 @@ def generate_MASA_scene( # this is the first SBA source signal y_int.audio = x.audio.copy() if source_shift < 0: if source_shift > 0: # insert zeros to the first SBA source signal to shift it right y_int.audio = audioarray.trim( y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True y_int.audio, y_int.fs, limits=[-source_shift, 0], samples=True ) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset if delta_offset > 0: if delta_offset < 0: # insert zeros to the output SBA signal to shift it right y_int.audio = audioarray.trim( y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True y_int.audio, y_int.fs, limits=[delta_offset, 0], samples=True ) offset = source_shift else: # insert zeros to the new SBA source signal to shift it right x.audio = audioarray.trim( x.audio, x.fs, limits=[delta_offset, 0], samples=True x.audio, x.fs, limits=[-delta_offset, 0], samples=True ) # adjust the length of the audio source signal Loading Loading @@ -396,14 +432,15 @@ def generate_MASA_scene( # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) duration = int( np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len ) duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y_int.audio) > duration: y_int.audio = audioarray.trim( y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True, ) # adjust the loudness of the output signal Loading
ivas_processing_scripts/generation/generate_mc_items.py +29 −7 Original line number Diff line number Diff line Loading @@ -31,6 +31,7 @@ # import logging import re import sys from itertools import groupby, repeat from pathlib import Path Loading Loading @@ -209,6 +210,7 @@ def generate_MC_scene( # repeat for all source files offset = 0 y_int = None end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( Loading @@ -222,23 +224,40 @@ def generate_MC_scene( ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) # read the shift time in seconds # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") sys.exit(-1) # calculate absolute shift of the source signal in seconds source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): Loading Loading @@ -295,6 +314,9 @@ def generate_MC_scene( # read source file x = audio.fromfile("MONO", input_filename) # record the total duration of the source signal, taking into account the shift of the starting position end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( Loading Loading @@ -339,26 +361,26 @@ def generate_MC_scene( # this is the first SBA source signal y_int.audio = x.audio.copy() if source_shift < 0: if source_shift > 0: # insert zeros to the first SBA source signal to shift it right y_int.audio = audioarray.trim( y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True y_int.audio, y_int.fs, limits=[-source_shift, 0], samples=True ) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset if delta_offset > 0: if delta_offset < 0: # insert zeros to the output SBA signal to shift it right y_int.audio = audioarray.trim( y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True y_int.audio, y_int.fs, limits=[delta_offset, 0], samples=True ) offset = source_shift else: # insert zeros to the new SBA source signal to shift it right x.audio = audioarray.trim( x.audio, x.fs, limits=[delta_offset, 0], samples=True x.audio, x.fs, limits=[-delta_offset, 0], samples=True ) # adjust the length of the audio source signal Loading
ivas_processing_scripts/generation/generate_omasa_items.py +31 −22 Original line number Diff line number Diff line Loading @@ -31,6 +31,7 @@ # import logging import re import sys from itertools import groupby, repeat from pathlib import Path Loading Loading @@ -183,6 +184,7 @@ def generate_OMASA_scene( # repeat for all source files offset = 0 end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( Loading Loading @@ -220,16 +222,33 @@ def generate_OMASA_scene( if isinstance(scene["shift"], list) else scene["shift"] ) # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") sys.exit(-1) # calculate absolute shift of the source signal in seconds source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): Loading Loading @@ -300,6 +319,9 @@ def generate_OMASA_scene( # read source file x = audio.fromfile(fmt, input_filename) # record the total duration of the source signal, taking into account the shift of the starting position end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( Loading Loading @@ -417,21 +439,21 @@ def generate_OMASA_scene( # add the first audio source signal (should be FOA/HOA2/HOA3) to the array of all source signals y_int.audio = x.audio.copy() if source_shift < 0: if source_shift > 0: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(y_int, limits=[source_shift, 0], samples=True) metadata.trim_meta(y_int, limits=[-source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset if delta_offset > 0: if delta_offset < 0: # insert zeros to the existing intermediate OSBA object to shift it right metadata.trim_meta(y_int, limits=[-delta_offset, 0], samples=True) metadata.trim_meta(y_int, limits=[delta_offset, 0], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y_int.audio) Loading Loading @@ -472,29 +494,16 @@ def generate_OMASA_scene( noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") y_int.audio += noise # trim the output signal such if the total duration exceeds X seconds # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: <<<<<<< Updated upstream # trim the output signal such that the total duration is X seconds duration = int(cfg.duration * cfg.fs) # convert to samples else: # do not change the length of the audio signal duration = len(y_int.audio) duration = int( np.floor(duration / frame_len) * frame_len ) # ensure multiple of 20ms if len(y_int.audio) != duration: metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True) ======= # convert from seconds to samples (ensure multiple of 20ms) duration = int( np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len ) # check if the current length of the output signal exceeds the duration if len(y.audio) > duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) >>>>>>> Stashed changes if len(y_int.audio) > duration: metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: Loading
ivas_processing_scripts/generation/generate_osba_items.py +29 −7 Original line number Diff line number Diff line Loading @@ -31,6 +31,7 @@ # import logging import re import sys from itertools import groupby, repeat from pathlib import Path Loading Loading @@ -187,6 +188,7 @@ def generate_OSBA_scene( # repeat for all source files offset = 0 end_position = [] for i in range(N_inputs): # parse parameters from the scene description source_file = ( Loading @@ -210,23 +212,40 @@ def generate_OSBA_scene( else scene["elevation"] ) # read the shift time in seconds # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") sys.exit(-1) # calculate absolute shift of the source signal in seconds source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 # convert shift from seconds to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) source_shift_in_seconds = source_shift / cfg.fs # read the level if "level" in scene.keys(): Loading Loading @@ -282,6 +301,9 @@ def generate_OSBA_scene( # read source file x = audio.fromfile(fmt, input_filename) # record the total duration of the source signal, taking into account the shift of the starting position end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds) # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( Loading Loading @@ -403,21 +425,21 @@ def generate_OSBA_scene( # if ISM, append object position to the OSBA object y.object_pos = x.object_pos.copy() if source_shift < 0: if source_shift > 0: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(y, limits=[source_shift, 0], samples=True) metadata.trim_meta(y, limits=[-source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset if delta_offset > 0: if delta_offset < 0: # insert zeros to the previous ISM signal(s) to shift them right metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True) metadata.trim_meta(y, limits=[delta_offset, 0], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y.audio) Loading