Loading ivas_processing_scripts/generation/generate_ambi_items.py +43 −17 Original line number Diff line number Diff line Loading @@ -205,12 +205,12 @@ def generate_ambi_scene( source_file = ( scene["input"][i] if isinstance(scene["input"], list) else scene["input"] ) IR_file = ( scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] ) IR_file = scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] # get input filename and IR filename input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) input_filename = Path(source_file).parent / ( cfg.use_input_prefix + Path(source_file).name ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) # read the overlap length Loading Loading @@ -241,7 +241,9 @@ def generate_ambi_scene( else: level = -26 logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds") logger.info( f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds" ) # read source file x = audio.fromfile("MONO", input_filename) Loading Loading @@ -274,7 +276,9 @@ def generate_ambi_scene( # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: N_pad = int(frame_len - len(x.audio) % frame_len) x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, -N_pad], samples=True ) # add the convolved FOA/HOA2/HOA3 audio source signal to the output signal if y.audio is None: Loading @@ -283,7 +287,9 @@ def generate_ambi_scene( if source_shift < 0: # insert zeros to the new audio source signal to shift it right y.audio = audioarray.trim_meta(y.audio, y.fs, limits=[source_shift, 0], samples=True) y.audio = audioarray.trim_meta( y.audio, y.fs, limits=[source_shift, 0], samples=True ) else: offset = source_shift else: Loading @@ -291,33 +297,47 @@ def generate_ambi_scene( delta_offset = source_shift - offset if delta_offset > 0: # insert zeros to the existing output signal to shift it right y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_offset], samples=True) y.audio = audioarray.trim( y.audio, y.fs, limits=[0, -delta_offset], samples=True ) offset = source_shift else: # insert zeros to the new audio source signal to shift it right x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_offset], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, delta_offset], samples=True ) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y.audio) if delta_length > 0: # pad zeros to the existing output signal y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_length], samples=True) y.audio = audioarray.trim( y.audio, y.fs, limits=[0, -delta_length], samples=True ) else: # pad zeros to the new audio source signal x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_length], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, delta_length], samples=True ) # superimpose y.audio += x.audio # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms preamble = int( np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms postamble = int( np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms if preamble != 0 or postamble != 0: logger.info( f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" ) y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True) y.audio = audioarray.trim( y.audio, y.fs, limits=[-preamble, -postamble], samples=True ) # add random noise if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: Loading @@ -333,9 +353,13 @@ def generate_ambi_scene( else: # do not change the length of the audio signal duration = len(y.audio) duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms duration = int( np.floor(duration / frame_len) * frame_len ) # ensure multiple of 20ms if len(y.audio) != duration: y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True) y.audio = audioarray.trim( y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True ) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: Loading @@ -355,7 +379,9 @@ def generate_ambi_scene( binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") logger.info( f"-- Converting to BINAURAL output file: {binaural_output_filename}" ) binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_scenebased(y, binaudio) Loading ivas_processing_scripts/generation/generate_ismN_items.py +30 −10 Original line number Diff line number Diff line Loading @@ -208,14 +208,18 @@ def generate_ismN_scene( # read azimuth and elevation information if "azimuth" in scene.keys(): source_azi = ( scene["azimuth"][i] if isinstance(scene["azimuth"], list) else scene["azimuth"] scene["azimuth"][i] if isinstance(scene["azimuth"], list) else scene["azimuth"] ) else: source_azi = 0.0 if "elevation" in scene.keys(): source_ele = ( scene["elevation"][i] if isinstance(scene["elevation"], list) else scene["elevation"] scene["elevation"][i] if isinstance(scene["elevation"], list) else scene["elevation"] ) else: source_ele = 0.0 Loading @@ -223,7 +227,9 @@ def generate_ismN_scene( # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) else: source_shift = 0.0 Loading @@ -239,12 +245,16 @@ def generate_ismN_scene( # read the level if "level" in scene.keys(): level = ( scene["level"][i] if isinstance(scene["level"], list) else scene["level"] scene["level"][i] if isinstance(scene["level"], list) else scene["level"] ) else: level = -26 logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") logger.info( f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds" ) # read source file x = audio.fromtype("ISM1") Loading @@ -271,7 +281,9 @@ def generate_ismN_scene( # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: N_pad = int(frame_len - len(x.audio) % frame_len) x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, -N_pad], samples=True ) # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / frame_len) Loading Loading @@ -385,8 +397,12 @@ def generate_ismN_scene( # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms preamble = int( np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms postamble = int( np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms if preamble != 0 or postamble != 0: logger.info( f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" Loading @@ -407,7 +423,9 @@ def generate_ismN_scene( else: # do not change the length of the audio signal duration = len(y.audio) duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms duration = int( np.floor(duration / frame_len) * frame_len ) # ensure multiple of 20ms if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) Loading @@ -430,7 +448,9 @@ def generate_ismN_scene( binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") logger.info( f"-- Converting to BINAURAL output file: {binaural_output_filename}" ) binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_objectbased(y, binaudio) Loading ivas_processing_scripts/generation/generate_omasa_items.py +30 −10 Original line number Diff line number Diff line Loading @@ -203,14 +203,18 @@ def generate_OMASA_scene( # read azimuth and elevation information if "azimuth" in scene.keys(): source_azi = ( scene["azimuth"][i] if isinstance(scene["azimuth"], list) else scene["azimuth"] scene["azimuth"][i] if isinstance(scene["azimuth"], list) else scene["azimuth"] ) else: source_azi = 0.0 if "elevation" in scene.keys(): source_ele = ( scene["elevation"][i] if isinstance(scene["elevation"], list) else scene["elevation"] scene["elevation"][i] if isinstance(scene["elevation"], list) else scene["elevation"] ) else: source_ele = 0.0 Loading @@ -218,7 +222,9 @@ def generate_OMASA_scene( # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) else: source_shift = 0.0 Loading @@ -241,7 +247,9 @@ def generate_OMASA_scene( else: level = -26 logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") logger.info( f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds" ) # get the number of channels from the .wav file header wav_header = audiofile.parse_wave_header(input_filename) Loading Loading @@ -286,7 +294,9 @@ def generate_OMASA_scene( # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: N_pad = int(frame_len - len(x.audio) % frame_len) x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, -N_pad], samples=True ) # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / frame_len) Loading Loading @@ -421,12 +431,18 @@ def generate_OMASA_scene( y.object_pos.extend(x.object_pos) # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv"))) y.metadata_files.insert( i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")) ) # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms preamble = int( np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms postamble = int( np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms if preamble != 0 or postamble != 0: logger.info( f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" Loading @@ -447,7 +463,9 @@ def generate_OMASA_scene( else: # do not change the length of the audio signal duration = len(y.audio) duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms duration = int( np.floor(duration / frame_len) * frame_len ) # ensure multiple of 20ms if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) Loading @@ -470,7 +488,9 @@ def generate_OMASA_scene( binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") logger.info( f"-- Converting to BINAURAL output file: {binaural_output_filename}" ) binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_omasa(y, binaudio) Loading ivas_processing_scripts/generation/generate_osba_items.py +24 −8 Original line number Diff line number Diff line Loading @@ -194,7 +194,9 @@ def generate_OSBA_scene( ) # get input filename input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) input_filename = Path(source_file).parent / ( cfg.use_input_prefix + Path(source_file).name ) # read azimuth and elevation information source_azi = ( Loading Loading @@ -236,7 +238,9 @@ def generate_OSBA_scene( else: level = -26 logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") logger.info( f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds" ) # get the number of channels from the .wav file header wav_header = audiofile.parse_wave_header(input_filename) Loading Loading @@ -281,7 +285,9 @@ def generate_OSBA_scene( # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: N_pad = int(frame_len - len(x.audio) % frame_len) x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, -N_pad], samples=True ) # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / frame_len) Loading Loading @@ -403,12 +409,18 @@ def generate_OSBA_scene( y.object_pos.extend(x.object_pos) # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv"))) y.metadata_files.insert( i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")) ) # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms preamble = int( np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms postamble = int( np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms if preamble != 0 or postamble != 0: logger.info( f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" Loading @@ -429,7 +441,9 @@ def generate_OSBA_scene( else: # do not change the length of the audio signal duration = len(y.audio) duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms duration = int( np.floor(duration / frame_len) * frame_len ) # ensure multiple of 20ms if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) Loading @@ -452,7 +466,9 @@ def generate_OSBA_scene( binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") logger.info( f"-- Converting to BINAURAL output file: {binaural_output_filename}" ) binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_osba(y, binaudio) Loading ivas_processing_scripts/generation/generate_stereo_items.py +40 −16 Original line number Diff line number Diff line Loading @@ -211,12 +211,12 @@ def generate_stereo_scene( source_file = ( scene["input"][i] if isinstance(scene["input"], list) else scene["input"] ) IR_file = ( scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] ) IR_file = scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] # get input filename and IR filename input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) input_filename = Path(source_file).parent / ( cfg.use_input_prefix + Path(source_file).name ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) # read the overlap length Loading Loading @@ -247,7 +247,9 @@ def generate_stereo_scene( else: level = -26 logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds") logger.info( f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds" ) # read source file x = audio.fromfile("MONO", input_filename) Loading Loading @@ -275,7 +277,9 @@ def generate_stereo_scene( # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: N_pad = int(frame_len - len(x.audio) % frame_len) x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, -N_pad], samples=True ) # add the convolved STEREO audio source signal to the output signal if y.audio is None: Loading @@ -284,7 +288,9 @@ def generate_stereo_scene( if source_shift < 0: # insert zeros to the new audio source signal to shift it right y.audio = audioarray.trim(y.audio, x.fs, limits=[source_shift, 0], samples=True) y.audio = audioarray.trim( y.audio, x.fs, limits=[source_shift, 0], samples=True ) else: offset = source_shift else: Loading @@ -292,33 +298,47 @@ def generate_stereo_scene( delta_offset = source_shift - offset if delta_offset > 0: # insert zeros to the existing output signal to shift it right y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_offset], samples=True) y.audio = audioarray.trim( y.audio, y.fs, limits=[0, -delta_offset], samples=True ) offset = source_shift else: # insert zeros to the new audio source signal to shift it right x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_offset], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, delta_offset], samples=True ) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y.audio) if delta_length > 0: # pad zeros to the existing output signal y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_length], samples=True) y.audio = audioarray.trim( y.audio, y.fs, limits=[0, -delta_length], samples=True ) else: # pad zeros to the new audio source signal x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_length], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, delta_length], samples=True ) # superimpose y.audio += x.audio # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms preamble = int( np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms postamble = int( np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms if preamble != 0 or postamble != 0: logger.info( f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" ) y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True) y.audio = audioarray.trim( y.audio, y.fs, limits=[-preamble, -postamble], samples=True ) # add random noise if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: Loading @@ -334,9 +354,13 @@ def generate_stereo_scene( else: # do not change the length of the audio signal duration = len(y.audio) duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms duration = int( np.floor(duration / frame_len) * frame_len ) # ensure multiple of 20ms if len(y.audio) != duration: y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True) y.audio = audioarray.trim( y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True ) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: Loading Loading
ivas_processing_scripts/generation/generate_ambi_items.py +43 −17 Original line number Diff line number Diff line Loading @@ -205,12 +205,12 @@ def generate_ambi_scene( source_file = ( scene["input"][i] if isinstance(scene["input"], list) else scene["input"] ) IR_file = ( scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] ) IR_file = scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] # get input filename and IR filename input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) input_filename = Path(source_file).parent / ( cfg.use_input_prefix + Path(source_file).name ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) # read the overlap length Loading Loading @@ -241,7 +241,9 @@ def generate_ambi_scene( else: level = -26 logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds") logger.info( f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds" ) # read source file x = audio.fromfile("MONO", input_filename) Loading Loading @@ -274,7 +276,9 @@ def generate_ambi_scene( # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: N_pad = int(frame_len - len(x.audio) % frame_len) x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, -N_pad], samples=True ) # add the convolved FOA/HOA2/HOA3 audio source signal to the output signal if y.audio is None: Loading @@ -283,7 +287,9 @@ def generate_ambi_scene( if source_shift < 0: # insert zeros to the new audio source signal to shift it right y.audio = audioarray.trim_meta(y.audio, y.fs, limits=[source_shift, 0], samples=True) y.audio = audioarray.trim_meta( y.audio, y.fs, limits=[source_shift, 0], samples=True ) else: offset = source_shift else: Loading @@ -291,33 +297,47 @@ def generate_ambi_scene( delta_offset = source_shift - offset if delta_offset > 0: # insert zeros to the existing output signal to shift it right y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_offset], samples=True) y.audio = audioarray.trim( y.audio, y.fs, limits=[0, -delta_offset], samples=True ) offset = source_shift else: # insert zeros to the new audio source signal to shift it right x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_offset], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, delta_offset], samples=True ) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y.audio) if delta_length > 0: # pad zeros to the existing output signal y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_length], samples=True) y.audio = audioarray.trim( y.audio, y.fs, limits=[0, -delta_length], samples=True ) else: # pad zeros to the new audio source signal x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_length], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, delta_length], samples=True ) # superimpose y.audio += x.audio # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms preamble = int( np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms postamble = int( np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms if preamble != 0 or postamble != 0: logger.info( f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" ) y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True) y.audio = audioarray.trim( y.audio, y.fs, limits=[-preamble, -postamble], samples=True ) # add random noise if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: Loading @@ -333,9 +353,13 @@ def generate_ambi_scene( else: # do not change the length of the audio signal duration = len(y.audio) duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms duration = int( np.floor(duration / frame_len) * frame_len ) # ensure multiple of 20ms if len(y.audio) != duration: y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True) y.audio = audioarray.trim( y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True ) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: Loading @@ -355,7 +379,9 @@ def generate_ambi_scene( binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") logger.info( f"-- Converting to BINAURAL output file: {binaural_output_filename}" ) binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_scenebased(y, binaudio) Loading
ivas_processing_scripts/generation/generate_ismN_items.py +30 −10 Original line number Diff line number Diff line Loading @@ -208,14 +208,18 @@ def generate_ismN_scene( # read azimuth and elevation information if "azimuth" in scene.keys(): source_azi = ( scene["azimuth"][i] if isinstance(scene["azimuth"], list) else scene["azimuth"] scene["azimuth"][i] if isinstance(scene["azimuth"], list) else scene["azimuth"] ) else: source_azi = 0.0 if "elevation" in scene.keys(): source_ele = ( scene["elevation"][i] if isinstance(scene["elevation"], list) else scene["elevation"] scene["elevation"][i] if isinstance(scene["elevation"], list) else scene["elevation"] ) else: source_ele = 0.0 Loading @@ -223,7 +227,9 @@ def generate_ismN_scene( # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) else: source_shift = 0.0 Loading @@ -239,12 +245,16 @@ def generate_ismN_scene( # read the level if "level" in scene.keys(): level = ( scene["level"][i] if isinstance(scene["level"], list) else scene["level"] scene["level"][i] if isinstance(scene["level"], list) else scene["level"] ) else: level = -26 logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") logger.info( f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds" ) # read source file x = audio.fromtype("ISM1") Loading @@ -271,7 +281,9 @@ def generate_ismN_scene( # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: N_pad = int(frame_len - len(x.audio) % frame_len) x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, -N_pad], samples=True ) # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / frame_len) Loading Loading @@ -385,8 +397,12 @@ def generate_ismN_scene( # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms preamble = int( np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms postamble = int( np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms if preamble != 0 or postamble != 0: logger.info( f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" Loading @@ -407,7 +423,9 @@ def generate_ismN_scene( else: # do not change the length of the audio signal duration = len(y.audio) duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms duration = int( np.floor(duration / frame_len) * frame_len ) # ensure multiple of 20ms if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) Loading @@ -430,7 +448,9 @@ def generate_ismN_scene( binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") logger.info( f"-- Converting to BINAURAL output file: {binaural_output_filename}" ) binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_objectbased(y, binaudio) Loading
ivas_processing_scripts/generation/generate_omasa_items.py +30 −10 Original line number Diff line number Diff line Loading @@ -203,14 +203,18 @@ def generate_OMASA_scene( # read azimuth and elevation information if "azimuth" in scene.keys(): source_azi = ( scene["azimuth"][i] if isinstance(scene["azimuth"], list) else scene["azimuth"] scene["azimuth"][i] if isinstance(scene["azimuth"], list) else scene["azimuth"] ) else: source_azi = 0.0 if "elevation" in scene.keys(): source_ele = ( scene["elevation"][i] if isinstance(scene["elevation"], list) else scene["elevation"] scene["elevation"][i] if isinstance(scene["elevation"], list) else scene["elevation"] ) else: source_ele = 0.0 Loading @@ -218,7 +222,9 @@ def generate_OMASA_scene( # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) else: source_shift = 0.0 Loading @@ -241,7 +247,9 @@ def generate_OMASA_scene( else: level = -26 logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") logger.info( f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds" ) # get the number of channels from the .wav file header wav_header = audiofile.parse_wave_header(input_filename) Loading Loading @@ -286,7 +294,9 @@ def generate_OMASA_scene( # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: N_pad = int(frame_len - len(x.audio) % frame_len) x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, -N_pad], samples=True ) # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / frame_len) Loading Loading @@ -421,12 +431,18 @@ def generate_OMASA_scene( y.object_pos.extend(x.object_pos) # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv"))) y.metadata_files.insert( i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")) ) # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms preamble = int( np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms postamble = int( np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms if preamble != 0 or postamble != 0: logger.info( f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" Loading @@ -447,7 +463,9 @@ def generate_OMASA_scene( else: # do not change the length of the audio signal duration = len(y.audio) duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms duration = int( np.floor(duration / frame_len) * frame_len ) # ensure multiple of 20ms if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) Loading @@ -470,7 +488,9 @@ def generate_OMASA_scene( binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") logger.info( f"-- Converting to BINAURAL output file: {binaural_output_filename}" ) binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_omasa(y, binaudio) Loading
ivas_processing_scripts/generation/generate_osba_items.py +24 −8 Original line number Diff line number Diff line Loading @@ -194,7 +194,9 @@ def generate_OSBA_scene( ) # get input filename input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) input_filename = Path(source_file).parent / ( cfg.use_input_prefix + Path(source_file).name ) # read azimuth and elevation information source_azi = ( Loading Loading @@ -236,7 +238,9 @@ def generate_OSBA_scene( else: level = -26 logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") logger.info( f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds" ) # get the number of channels from the .wav file header wav_header = audiofile.parse_wave_header(input_filename) Loading Loading @@ -281,7 +285,9 @@ def generate_OSBA_scene( # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: N_pad = int(frame_len - len(x.audio) % frame_len) x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, -N_pad], samples=True ) # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / frame_len) Loading Loading @@ -403,12 +409,18 @@ def generate_OSBA_scene( y.object_pos.extend(x.object_pos) # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv"))) y.metadata_files.insert( i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")) ) # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms preamble = int( np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms postamble = int( np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms if preamble != 0 or postamble != 0: logger.info( f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" Loading @@ -429,7 +441,9 @@ def generate_OSBA_scene( else: # do not change the length of the audio signal duration = len(y.audio) duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms duration = int( np.floor(duration / frame_len) * frame_len ) # ensure multiple of 20ms if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) Loading @@ -452,7 +466,9 @@ def generate_OSBA_scene( binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") logger.info( f"-- Converting to BINAURAL output file: {binaural_output_filename}" ) binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_osba(y, binaudio) Loading
ivas_processing_scripts/generation/generate_stereo_items.py +40 −16 Original line number Diff line number Diff line Loading @@ -211,12 +211,12 @@ def generate_stereo_scene( source_file = ( scene["input"][i] if isinstance(scene["input"], list) else scene["input"] ) IR_file = ( scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] ) IR_file = scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] # get input filename and IR filename input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) input_filename = Path(source_file).parent / ( cfg.use_input_prefix + Path(source_file).name ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) # read the overlap length Loading Loading @@ -247,7 +247,9 @@ def generate_stereo_scene( else: level = -26 logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds") logger.info( f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds" ) # read source file x = audio.fromfile("MONO", input_filename) Loading Loading @@ -275,7 +277,9 @@ def generate_stereo_scene( # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: N_pad = int(frame_len - len(x.audio) % frame_len) x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, -N_pad], samples=True ) # add the convolved STEREO audio source signal to the output signal if y.audio is None: Loading @@ -284,7 +288,9 @@ def generate_stereo_scene( if source_shift < 0: # insert zeros to the new audio source signal to shift it right y.audio = audioarray.trim(y.audio, x.fs, limits=[source_shift, 0], samples=True) y.audio = audioarray.trim( y.audio, x.fs, limits=[source_shift, 0], samples=True ) else: offset = source_shift else: Loading @@ -292,33 +298,47 @@ def generate_stereo_scene( delta_offset = source_shift - offset if delta_offset > 0: # insert zeros to the existing output signal to shift it right y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_offset], samples=True) y.audio = audioarray.trim( y.audio, y.fs, limits=[0, -delta_offset], samples=True ) offset = source_shift else: # insert zeros to the new audio source signal to shift it right x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_offset], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, delta_offset], samples=True ) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y.audio) if delta_length > 0: # pad zeros to the existing output signal y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_length], samples=True) y.audio = audioarray.trim( y.audio, y.fs, limits=[0, -delta_length], samples=True ) else: # pad zeros to the new audio source signal x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_length], samples=True) x.audio = audioarray.trim( x.audio, x.fs, limits=[0, delta_length], samples=True ) # superimpose y.audio += x.audio # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms preamble = int( np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms postamble = int( np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len ) # convert to samples and ensure multiple of 20ms if preamble != 0 or postamble != 0: logger.info( f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" ) y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True) y.audio = audioarray.trim( y.audio, y.fs, limits=[-preamble, -postamble], samples=True ) # add random noise if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: Loading @@ -334,9 +354,13 @@ def generate_stereo_scene( else: # do not change the length of the audio signal duration = len(y.audio) duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms duration = int( np.floor(duration / frame_len) * frame_len ) # ensure multiple of 20ms if len(y.audio) != duration: y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True) y.audio = audioarray.trim( y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True ) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: Loading