From 4651439f24993edb5562257d94cea3af5e296fb5 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Sat, 9 Aug 2025 18:09:07 +0200 Subject: [PATCH 1/4] fix OMASA item generation --- .../audiotools/convert/osba.py | 18 ++-- .../audiotools/convert/scenebased.py | 11 ++ .../generation/generate_ismN_items.py | 8 +- .../generation/generate_masa_items.py | 8 +- .../generation/generate_mc_items.py | 8 +- .../generation/generate_omasa_items.py | 102 +++++++++--------- .../generation/generate_osba_items.py | 8 +- .../generation/generate_sba_items.py | 8 +- .../generation/generate_stereo_items.py | 8 +- 9 files changed, 96 insertions(+), 83 deletions(-) diff --git a/ivas_processing_scripts/audiotools/convert/osba.py b/ivas_processing_scripts/audiotools/convert/osba.py index 3fdd582c..28d738ec 100644 --- a/ivas_processing_scripts/audiotools/convert/osba.py +++ b/ivas_processing_scripts/audiotools/convert/osba.py @@ -129,14 +129,20 @@ def convert_osba( ) # only render SBA part to MASA - out_sba = audio.fromtype(out.name[4:]) - out_sba.metadata_file = out.metadata_files[-1] - render_sba_to_masa(sba, out_sba) + out_masa = audio.fromtype(out.name[4:]) + if out.metadata_files[-1].endswith(".met"): + # if MASA metadata filename is given, copy it + out_masa.metadata_file = copy(out.metadata_files[-1]) + render_sba_to_masa(sba, out_masa) + + # combine ISM audio with MASA audio into OMASA audio + out.audio = np.concatenate((oba.audio, out_masa.audio), axis=1) - # ism audio is passed through - ism_audio = osba.audio[:, : osba.num_ism_channels] + # combine ISM metadata with MASA metadata filenames + out.metadata_files = oba.metadata_files + [out_masa.metadata_file] - out.audio = np.concatenate((ism_audio, out_sba.audio), axis=1) + # copy ISM object positions + out.object_pos = copy(oba.object_pos) # OSBA -> OSBA elif isinstance(out, audio.OSBAAudio): diff --git a/ivas_processing_scripts/audiotools/convert/scenebased.py b/ivas_processing_scripts/audiotools/convert/scenebased.py index 
4930526b..1749a987 100755 --- a/ivas_processing_scripts/audiotools/convert/scenebased.py +++ b/ivas_processing_scripts/audiotools/convert/scenebased.py @@ -197,6 +197,17 @@ def render_sba_to_masa( masa_out: audio.MetadataAssistedSpatialAudio, ) -> None: num_tcs = masa_out.num_channels + + # convert to HOA2 if input is HOA3 + if sba_in.name.endswith("HOA3"): + warn( + "MASA conversion only supports up to 2nd order ambisonics! Converting to HOA2 format." + ) + sba_hoa2 = audio.fromtype("HOA2") + sba_hoa2.fs = sba_in.fs + render_sba_to_sba(sba_in, sba_hoa2) + sba_in = sba_hoa2 + masa = masaAnalyzer(sba_in, num_tcs, masa_out.dirs, masa_out.metadata_file) masa_out.audio = masa.audio diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index a19acd6e..489dbea4 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -288,12 +288,12 @@ def generate_ismN_scene( x.fs = cfg.fs # adjust the level of the audio source file (need to convert to MONO first) - if np.isinf(level): - # set all channels to zero - x.audio = np.zeros_like(x.audio) - elif level is None: + if level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") + elif np.isinf(level): + # set all channels to zero + x.audio = np.zeros_like(x.audio) else: x_temp = audio.ChannelBasedAudio( "MONO" diff --git a/ivas_processing_scripts/generation/generate_masa_items.py b/ivas_processing_scripts/generation/generate_masa_items.py index 7a0e020c..7a425823 100644 --- a/ivas_processing_scripts/generation/generate_masa_items.py +++ b/ivas_processing_scripts/generation/generate_masa_items.py @@ -316,12 +316,12 @@ def generate_MASA_scene( x = reverb_hoa3(x, IR, mode=None) # adjust the level of the FOA/HOA2/HOA3 signal - if np.isinf(level): - # set all channels to zero - x.audio = 
np.zeros_like(x.audio) - elif level is None: + if level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") + elif np.isinf(level): + # set all channels to zero + x.audio = np.zeros_like(x.audio) else: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") diff --git a/ivas_processing_scripts/generation/generate_mc_items.py b/ivas_processing_scripts/generation/generate_mc_items.py index 29e6b661..df1b1645 100644 --- a/ivas_processing_scripts/generation/generate_mc_items.py +++ b/ivas_processing_scripts/generation/generate_mc_items.py @@ -316,12 +316,12 @@ def generate_MC_scene( x = reverb_hoa3(x, IR, mode=None) # adjust the level of the FOA/HOA2/HOA3 signal - if np.isinf(level): - # set all channels to zero - x.audio = np.zeros_like(x.audio) - elif level is None: + if level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") + elif np.isinf(level): + # set all channels to zero + x.audio = np.zeros_like(x.audio) else: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 13fc3470..75c5579a 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -39,7 +39,7 @@ import numpy as np from ivas_processing_scripts.audiotools import audio, audioarray, audiofile, metadata from ivas_processing_scripts.audiotools.convert.omasa import convert_omasa -from ivas_processing_scripts.audiotools.convert.scenebased import render_sba_to_masa +from ivas_processing_scripts.audiotools.convert.osba import convert_osba from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm from ivas_processing_scripts.generation import config from ivas_processing_scripts.utils import apply_func_parallel 
@@ -67,13 +67,6 @@ def replace_char_seq_with_string(str, char_seq, repl_str): return "".join(result) -# function for appending string to a filename before file extension -def append_str_filename(filename, str_to_append): - p = Path(filename) - # Combine the stem, the string to append, and the suffix - return p.parent / (p.stem + str_to_append + p.suffix) - - def generate_omasa_items( cfg: config.TestConfig, logger: logging.Logger, @@ -183,6 +176,7 @@ def generate_OMASA_scene( # initialize output OMASA object y = audio.OMASAAudio(omasa_format) y.fs = cfg.fs + y_int = None # intermediate OSBA object # set the frame length frame_len = int(cfg.fs / 50) @@ -278,16 +272,29 @@ def generate_OMASA_scene( fmt = "STEREO" elif N_channels == 4: fmt = "FOA" + sba_order = 1 elif N_channels == 9: fmt = "HOA2" + sba_order = 2 elif N_channels == 16: fmt = "HOA3" + sba_order = 3 else: logger.error( f"Error: Input format of the source file with {N_channels} channels is not supported!" ) sys.exit(-1) + # initialize intermediate OSBA object + if y_int is None: + if fmt not in ["FOA", "HOA2", "HOA3"]: + logger.error("Error: Expecting FOA/HOA2/HOA3 as the first file in the list!") + sys.exit(-1) + + osba_format = f"ISM{N_ISMs}SBA{sba_order}" + y_int = audio.OSBAAudio(osba_format) + y_int.fs = cfg.fs + # read source file x = audio.fromfile(fmt, input_filename) @@ -301,12 +308,12 @@ def generate_OMASA_scene( x.fs = cfg.fs # adjust the level of the source file - if np.isinf(level): - # set all channels to zero - x.audio = np.zeros_like(x.audio) - elif level is None: + if level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") + elif np.isinf(level): + # set all channels to zero + x.audio = np.zeros_like(x.audio) else: if fmt in ["FOA", "HOA2", "HOA3"]: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") @@ -325,18 +332,8 @@ def generate_OMASA_scene( # get the number of frames (multiple of 20ms) N_frames 
= int(len(x.audio) / frame_len) - # convert the input audio source signal to MASA or ISM - if fmt in ["FOA", "HOA2", "HOA3"]: - # convert FOA/HOA2/HOA3 to MASA - x_masa = audio.MetadataAssistedSpatialAudio( - f"MASA{cfg.masa_tc}DIR{cfg.masa_dirs}" - ) - x_masa.fs = cfg.fs - # generate MASA metadata filename (should end with .met) - x_masa.metadata_file = output_filename.with_suffix(".met") - render_sba_to_masa(x, x_masa) - x = x_masa # replace x with the MASA object - elif fmt == "MONO": + # convert the input MONO audio source signal to ISM1 object + if fmt == "MONO": # convert MONO to ISM1 x_ism = audio.ObjectBasedAudio("ISM1") # ISM with 1 channel x_ism.fs = cfg.fs @@ -413,49 +410,42 @@ def generate_OMASA_scene( x = x_ism # replace x with the ISM object - # copy new audio source signal to the OMASA object - if y.audio is None: - # add the first audio source signal (should be MASA) to the array of all source signals - y.audio = x.audio.copy() - - if "MASA" in x.name: - # if MASA, append metadata file to the OMASA object - y.metadata_files.append(x.metadata_file) - else: - # if ISM, append object position to the OMASA object - y.object_pos = x.object_pos.copy() + # copy new audio source signal to the intermediate OSBA object + if y_int.audio is None: + # add the first audio source signal (should be FOA/HOA2/HOA3) to the array of all source signals + y_int.audio = x.audio.copy() if source_shift < 0: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(y, limits=[source_shift, 0], samples=True) + metadata.trim_meta(y_int, limits=[source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset if delta_offset > 0: - # insert zeros to the previous ISM signal(s) to shift them right - metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True) + # insert zeros to the existing intermediate OSBA object to shift it right + metadata.trim_meta(y_int, 
limits=[-delta_offset, 0], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) # adjust the length of the audio source signal - delta_length = len(x.audio) - len(y.audio) + delta_length = len(x.audio) - len(y_int.audio) if delta_length > 0: - # pad zeros to the previous ISM signal(s) - metadata.trim_meta(y, limits=[0, -delta_length], samples=True) + # pad zeros to the existing intermediate OSBA object signal + metadata.trim_meta(y_int, limits=[0, -delta_length], samples=True) else: # pad zeros to the new audio source signal metadata.trim_meta(x, limits=[0, delta_length], samples=True) - # append ISM signal to the OMASA object (ISM comes first !!!) - y.audio = np.insert(y.audio, [i - 1], x.audio, axis=1) - y.object_pos.extend(x.object_pos) + # append ISM signal to the intermediate OSBA object (ISM comes first !!!) + y_int.audio = np.insert(y_int.audio, [i - 1], x.audio, axis=1) + y_int.object_pos.extend(x.object_pos) # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) 
- y.metadata_files.insert( + y_int.metadata_files.insert( i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")) ) @@ -471,14 +461,14 @@ generate_OMASA_scene( logger.info( f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" ) - metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True) + metadata.trim_meta(y_int, limits=[-preamble, -postamble], samples=True) # add random noise if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 np.random.seed(SEED_RANDOM_NOISE) - noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") + noise = np.random.randint(low=-4, high=5, size=y_int.audio.shape).astype("float") - y.audio += noise + y_int.audio += noise # adjust the length of the output signal if "duration" in cfg.__dict__: @@ -486,28 +476,34 @@ duration = int(cfg.duration * cfg.fs) # convert to samples else: # do not change the length of the audio signal - duration = len(y.audio) + duration = len(y_int.audio) duration = int( np.floor(duration / frame_len) * frame_len ) # ensure multiple of 20ms - if len(y.audio) != duration: - metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) + if len(y_int.audio) != duration: + metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LKFS") - y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") + y_int.audio, _ = loudness_norm(y_int, cfg.loudness, loudness_format="BINAURAL") # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") - y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) + y_int.audio = audioarray.window(y_int.audio, y_int.fs, cfg.fade_in_out * 1000) + + # generate and insert MASA metadata filename (should 
end with .met) + y.metadata_files.append(str(output_filename.with_suffix(".met"))) + + # convert the intermediate OSBA object to OMASA object + convert_osba(y_int, y) # write the OMASA audio output to .wav file in an interleaved format and ISM metadata in .csv files audiofile.write(output_filename, y.audio, y.fs) metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files[:-1]) - # convert to OMASA output to BINAURAL, if option was chosen + # convert the OMASA output to BINAURAL output, if option was chosen if cfg.binaural_output: binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index 29ff66a7..dd8f5b5d 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -292,12 +292,12 @@ def generate_OSBA_scene( x.fs = cfg.fs # adjust the level of the source file - if np.isinf(level): - # set all channels to zero - x.audio = np.zeros_like(x.audio) - elif level is None: + if level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") + elif np.isinf(level): + # set all channels to zero + x.audio = np.zeros_like(x.audio) else: if fmt in ["FOA", "HOA2", "HOA3"]: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") diff --git a/ivas_processing_scripts/generation/generate_sba_items.py b/ivas_processing_scripts/generation/generate_sba_items.py index 4f95bf98..28fbabab 100644 --- a/ivas_processing_scripts/generation/generate_sba_items.py +++ b/ivas_processing_scripts/generation/generate_sba_items.py @@ -287,12 +287,12 @@ def generate_sba_scene( x = reverb_hoa3(x, IR, mode=None) # adjust the level of the FOA/HOA2/HOA3 signal - if np.isinf(level): - # set all channels to zero - x.audio = np.zeros_like(x.audio) - elif level 
is None: + if level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") + elif np.isinf(level): + # set all channels to zero + x.audio = np.zeros_like(x.audio) else: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py index 8b616e64..a0d99f90 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -288,12 +288,12 @@ def generate_stereo_scene( x = reverb_stereo(x, IR, mode=None) # adjust the level of the STEREO signal - if np.isinf(level): - # set all channels to zero - x.audio = np.zeros_like(x.audio) - elif level is None: + if level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") + elif np.isinf(level): + # set all channels to zero + x.audio = np.zeros_like(x.audio) else: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") -- GitLab From bd60be090610ee4c4721bf75976f9e59739a41a9 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Sat, 9 Aug 2025 18:16:21 +0200 Subject: [PATCH 2/4] fix formatting --- ivas_processing_scripts/generation/generate_omasa_items.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 75c5579a..93f00305 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -288,7 +288,9 @@ def generate_OMASA_scene( # initialize intermediate OSBA object if y_int is None: if fmt not in ["FOA", "HOA2", "HOA3"]: - logger.error("Error: Expecting FOA/HOA2/HOA3 as the first file in the list!") + logger.error( + "Error: Expecting FOA/HOA2/HOA3 
as the first file in the list!" + ) sys.exit(-1) osba_format = f"ISM{N_ISMs}SBA{sba_order}" -- GitLab From 63c54ecf94d11e3661c6580a7d3d3f2f10147725 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Sun, 10 Aug 2025 14:37:06 +0200 Subject: [PATCH 3/4] forgot to add mode parameter to reverb_hoa3(), ... --- ivas_processing_scripts/audiotools/wrappers/reverb.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ivas_processing_scripts/audiotools/wrappers/reverb.py b/ivas_processing_scripts/audiotools/wrappers/reverb.py index f8fcfaa5..57f4cd6e 100644 --- a/ivas_processing_scripts/audiotools/wrappers/reverb.py +++ b/ivas_processing_scripts/audiotools/wrappers/reverb.py @@ -285,6 +285,7 @@ def reverb_hoa2( input: Audio, hoa2_IR: Audio, align: Optional[float] = None, + mode: Optional[str] = None, ) -> Audio: """ Convolve mono audio signal with an HOA2 impulse response @@ -322,7 +323,7 @@ # separate IR into each channel IR.audio = hoa2_IR.audio[:, [i]] # convolve mono input with channel IR - ych.append(reverb(input, IR, align=align)) + ych.append(reverb(input, IR, align=align, mode=mode)) # combine into HOA2 output y = audio.fromtype("HOA2") @@ -336,6 +337,7 @@ def reverb_hoa3( input: Audio, hoa3_IR: Audio, align: Optional[float] = None, + mode: Optional[str] = None, ) -> Audio: """ Convolve mono audio signal with an HOA3 impulse response @@ -373,7 +375,7 @@ # separate IR into each channel IR.audio = hoa3_IR.audio[:, [i]] # convolve mono input with channel IR - ych.append(reverb(input, IR, align=align)) + ych.append(reverb(input, IR, align=align, mode=mode)) # combine into HOA3 output y = audio.fromtype("HOA3") -- GitLab From af01ceeb2714cd42f2bfa244bf987b63398469b6 Mon Sep 17 00:00:00 2001 From: malenovsky Date: Mon, 11 Aug 2025 12:56:56 +0200 Subject: [PATCH 4/4] Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Archit Tamarapu --- ivas_processing_scripts/audiotools/convert/osba.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/ivas_processing_scripts/audiotools/convert/osba.py b/ivas_processing_scripts/audiotools/convert/osba.py index 28d738ec..76cead88 100644 --- a/ivas_processing_scripts/audiotools/convert/osba.py +++ b/ivas_processing_scripts/audiotools/convert/osba.py @@ -130,7 +130,7 @@ def convert_osba( # only render SBA part to MASA out_masa = audio.fromtype(out.name[4:]) - if out.metadata_files[-1].endswith(".met"): + if str(out.metadata_files[-1]).endswith(".met"): # if MASA metadata filename is given, copy it out_masa.metadata_file = copy(out.metadata_files[-1]) render_sba_to_masa(sba, out_masa) -- GitLab