Loading ivas_processing_scripts/generation/generate_stereo_items.py +83 −36 Original line number Diff line number Diff line Loading @@ -33,7 +33,6 @@ import logging import os from itertools import groupby, repeat from math import floor from pathlib import Path import numpy as np Loading Loading @@ -202,22 +201,37 @@ def generate_stereo_scene( # extract the number of audio sources N_inputs = len(np.atleast_1d(scene["input"])) # initialize output dirs # get the output filename output_filename = Path(scene["output"]).parent / ( cfg.use_output_prefix + Path(scene["output"]).name ) # initialize output dirs dir_path = output_filename.parent if dir_path and not dir_path.exists(): dir_path.mkdir(parents=True, exist_ok=True) # initialize output audio object # initialize output STEREO object y = audio.ChannelBasedAudio(cfg.format) y.fs = cfg.fs # set the frame length frame_len = int(cfg.fs / 50) # repeat for all source files offset = 0 for i in range(N_inputs): # parse parameters from the scene description source_file = np.atleast_1d(scene["input"])[i] IR_file = np.atleast_1d(scene["IR"])[i] source_file = ( scene["input"][i] if isinstance(scene["input"], list) else scene["input"] ) IR_file = ( scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] ) # get input filename and IR filename input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) # read the overlap length if "shift" in scene.keys(): Loading @@ -229,6 +243,13 @@ def generate_stereo_scene( else: source_shift = 0.0 # convert overlap to samples and ensure it is a multiple of 20ms source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) # read the level if "level" in scene.keys(): level = ( Loading @@ -241,62 +262,88 @@ def generate_stereo_scene( logger.info(f"Convolving {source_file} with {IR_file}") # get input filename and IR filename input_filename = Path(source_file).parent / ( cfg.use_input_prefix + Path(source_file).name ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) # read source file x = audio.fromfile("MONO", input_filename, fs=cfg.fs) x = audio.fromfile("MONO", input_filename) # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!" ) resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs) x.audio = resampled_audio x.fs = cfg.fs # read the IR file (!must be in STEREO format!) IR = audio.fromfile("STEREO", IR_filename, fs=cfg.IR_fs) IR = audio.fromfile("STEREO", IR_filename) # convolve mono source signal with stereo IR # convolve MONO source audio with STEREO IR -> results in STEREO audio object x = reverb_stereo(x, IR) # adjust the level of the stereo signal x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") # shift the source signal (positive shift creates overlap, negative shift creates a gap) if int(floor(-source_shift)) != 0: x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0]) # get the number of frames (multiple of 20ms) frame_len = int(x.fs / 50) # ensure the length of the audio source signal is a multiple of 20ms if len(x.audio) % frame_len != 0: # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: N_pad = int(frame_len - len(x.audio) % frame_len) x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) # add the convolved STEREO audio source signal to the output signal if y.audio is None: # add source signal to the array of all source signals y.audio = x.audio.copy() y.fs = x.fs # if source_shift < 0: # # insert zeros to the new audio source signal to shift it right # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) offset = source_shift else: # pad the signal with zeros to have the same length as the previous signal(s) N_pad = y.audio.shape[0] - x.audio.shape[0] if N_pad != 0: x.audio = audioarray.trim( x.audio, x.fs, limits=[0, -N_pad], samples=True ) # shift the beginning of the audio source signal delta_offset = source_shift - offset if delta_offset > 0: # insert zeros to the existing output signal to shift it right y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_offset], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_offset], samples=True) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y.audio) if delta_length > 0: # pad zeros to the existing output signal y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_length], samples=True) else: # pad zeros to the new audio source signal x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_length], samples=True) # superimpose y.audio += x.audio # append pre-amble and post-amble to all sources y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble]) # append pre-amble and post-amble preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True) # add random noise if cfg.add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 np.random.seed(SEED_RANDOM_NOISE) noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") # superimpose y.audio += noise # write the output STEREO audio signal into output file # adjust the length of the output signal if "duration" in cfg.__dict__: # trim the output signal such that the total duration is X seconds duration = int(cfg.duration * cfg.fs) # convert to samples else: # do not change the length of the audio signal duration = len(y.audio) duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms if len(y.audio) != duration: y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True) # write the STEREO audio signal into output file audiofile.write(output_filename, y.audio, y.fs) Loading
ivas_processing_scripts/generation/generate_stereo_items.py +83 −36 Original line number Diff line number Diff line Loading @@ -33,7 +33,6 @@ import logging import os from itertools import groupby, repeat from math import floor from pathlib import Path import numpy as np Loading Loading @@ -202,22 +201,37 @@ def generate_stereo_scene( # extract the number of audio sources N_inputs = len(np.atleast_1d(scene["input"])) # initialize output dirs # get the output filename output_filename = Path(scene["output"]).parent / ( cfg.use_output_prefix + Path(scene["output"]).name ) # initialize output dirs dir_path = output_filename.parent if dir_path and not dir_path.exists(): dir_path.mkdir(parents=True, exist_ok=True) # initialize output audio object # initialize output STEREO object y = audio.ChannelBasedAudio(cfg.format) y.fs = cfg.fs # set the frame length frame_len = int(cfg.fs / 50) # repeat for all source files offset = 0 for i in range(N_inputs): # parse parameters from the scene description source_file = np.atleast_1d(scene["input"])[i] IR_file = np.atleast_1d(scene["IR"])[i] source_file = ( scene["input"][i] if isinstance(scene["input"], list) else scene["input"] ) IR_file = ( scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] ) # get input filename and IR filename input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) # read the overlap length if "shift" in scene.keys(): Loading @@ -229,6 +243,13 @@ def generate_stereo_scene( else: source_shift = 0.0 # convert overlap to samples and ensure it is a multiple of 20ms source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) # read the level if "level" in scene.keys(): level = ( Loading @@ -241,62 +262,88 @@ def generate_stereo_scene( logger.info(f"Convolving {source_file} with {IR_file}") # get input filename and IR filename input_filename = Path(source_file).parent / ( cfg.use_input_prefix + Path(source_file).name ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) # read source file x = audio.fromfile("MONO", input_filename, fs=cfg.fs) x = audio.fromfile("MONO", input_filename) # resample to the target fs if necessary if x.fs != cfg.fs: logger.warning( f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!" ) resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs) x.audio = resampled_audio x.fs = cfg.fs # read the IR file (!must be in STEREO format!) IR = audio.fromfile("STEREO", IR_filename, fs=cfg.IR_fs) IR = audio.fromfile("STEREO", IR_filename) # convolve mono source signal with stereo IR # convolve MONO source audio with STEREO IR -> results in STEREO audio object x = reverb_stereo(x, IR) # adjust the level of the stereo signal x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") # shift the source signal (positive shift creates overlap, negative shift creates a gap) if int(floor(-source_shift)) != 0: x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0]) # get the number of frames (multiple of 20ms) frame_len = int(x.fs / 50) # ensure the length of the audio source signal is a multiple of 20ms if len(x.audio) % frame_len != 0: # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: N_pad = int(frame_len - len(x.audio) % frame_len) x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) # add the convolved STEREO audio source signal to the output signal if y.audio is None: # add source signal to the array of all source signals y.audio = x.audio.copy() y.fs = x.fs # if source_shift < 0: # # insert zeros to the new audio source signal to shift it right # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) offset = source_shift else: # pad the signal with zeros to have the same length as the previous signal(s) N_pad = y.audio.shape[0] - x.audio.shape[0] if N_pad != 0: x.audio = audioarray.trim( x.audio, x.fs, limits=[0, -N_pad], samples=True ) # shift the beginning of the audio source signal delta_offset = source_shift - offset if delta_offset > 0: # insert zeros to the existing output signal to shift it right y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_offset], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_offset], samples=True) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y.audio) if delta_length > 0: # pad zeros to the existing output signal y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_length], samples=True) else: # pad zeros to the new audio source signal x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_length], samples=True) # superimpose y.audio += x.audio # append pre-amble and post-amble to all sources y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble]) # append pre-amble and post-amble preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True) # add random noise if cfg.add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 np.random.seed(SEED_RANDOM_NOISE) noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") # superimpose y.audio += noise # write the output STEREO audio signal into output file # adjust the length of the output signal if "duration" in cfg.__dict__: # trim the output signal such that the total duration is X seconds duration = int(cfg.duration * cfg.fs) # convert to samples else: # do not change the length of the audio signal duration = len(y.audio) duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms if len(y.audio) != duration: y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True) # write the STEREO audio signal into output file audiofile.write(output_filename, y.audio, y.fs)