From 4651439f24993edb5562257d94cea3af5e296fb5 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Sat, 9 Aug 2025 18:09:07 +0200 Subject: [PATCH 1/4] fix OMASA item generation --- .../audiotools/convert/osba.py | 18 ++-- .../audiotools/convert/scenebased.py | 11 ++ .../generation/generate_ismN_items.py | 8 +- .../generation/generate_masa_items.py | 8 +- .../generation/generate_mc_items.py | 8 +- .../generation/generate_omasa_items.py | 102 +++++++++--------- .../generation/generate_osba_items.py | 8 +- .../generation/generate_sba_items.py | 8 +- .../generation/generate_stereo_items.py | 8 +- 9 files changed, 96 insertions(+), 83 deletions(-) diff --git a/ivas_processing_scripts/audiotools/convert/osba.py b/ivas_processing_scripts/audiotools/convert/osba.py index 3fdd582c..28d738ec 100644 --- a/ivas_processing_scripts/audiotools/convert/osba.py +++ b/ivas_processing_scripts/audiotools/convert/osba.py @@ -129,14 +129,20 @@ def convert_osba( ) # only render SBA part to MASA - out_sba = audio.fromtype(out.name[4:]) - out_sba.metadata_file = out.metadata_files[-1] - render_sba_to_masa(sba, out_sba) + out_masa = audio.fromtype(out.name[4:]) + if out.metadata_files[-1].endswith(".met"): + # if MASA metadata filename is given, copy it + out_masa.metadata_file = copy(out.metadata_files[-1]) + render_sba_to_masa(sba, out_masa) + + # combine ISM audio with MASA audio into OMASA audio + out.audio = np.concatenate((oba.audio, out_masa.audio), axis=1) - # ism audio is passed through - ism_audio = osba.audio[:, : osba.num_ism_channels] + # combine ISM metadata with MASA metadata filenames + out.metadata_files = oba.metadata_files + [out_masa.metadata_file] - out.audio = np.concatenate((ism_audio, out_sba.audio), axis=1) + # copy ISM object positions + out.object_pos = copy(oba.object_pos) # OSBA -> OSBA elif isinstance(out, audio.OSBAAudio): diff --git a/ivas_processing_scripts/audiotools/convert/scenebased.py b/ivas_processing_scripts/audiotools/convert/scenebased.py index 
4930526b..1749a987 100755 --- a/ivas_processing_scripts/audiotools/convert/scenebased.py +++ b/ivas_processing_scripts/audiotools/convert/scenebased.py @@ -197,6 +197,17 @@ def render_sba_to_masa( masa_out: audio.MetadataAssistedSpatialAudio, ) -> None: num_tcs = masa_out.num_channels + + # convert to HOA2 if input is HOA3 + if sba_in.name.endswith("HOA3"): + warn( + "MASA conversion only supports up to 2nd order ambisonics! Converting to HOA2 format." + ) + sba_hoa2 = audio.fromtype("HOA2") + sba_hoa2.fs = sba_in.fs + render_sba_to_sba(sba_in, sba_hoa2) + sba_in = sba_hoa2 + masa = masaAnalyzer(sba_in, num_tcs, masa_out.dirs, masa_out.metadata_file) masa_out.audio = masa.audio diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index a19acd6e..489dbea4 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -288,12 +288,12 @@ def generate_ismN_scene( x.fs = cfg.fs # adjust the level of the audio source file (need to convert to MONO first) - if np.isinf(level): - # set all channels to zero - x.audio = np.zeros_like(x.audio) - elif level is None: + if level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") + elif np.isinf(level): + # set all channels to zero + x.audio = np.zeros_like(x.audio) else: x_temp = audio.ChannelBasedAudio( "MONO" diff --git a/ivas_processing_scripts/generation/generate_masa_items.py b/ivas_processing_scripts/generation/generate_masa_items.py index 7a0e020c..7a425823 100644 --- a/ivas_processing_scripts/generation/generate_masa_items.py +++ b/ivas_processing_scripts/generation/generate_masa_items.py @@ -316,12 +316,12 @@ def generate_MASA_scene( x = reverb_hoa3(x, IR, mode=None) # adjust the level of the FOA/HOA2/HOA3 signal - if np.isinf(level): - # set all channels to zero - x.audio = 
np.zeros_like(x.audio) - elif level is None: + if level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") + elif np.isinf(level): + # set all channels to zero + x.audio = np.zeros_like(x.audio) else: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") diff --git a/ivas_processing_scripts/generation/generate_mc_items.py b/ivas_processing_scripts/generation/generate_mc_items.py index 29e6b661..df1b1645 100644 --- a/ivas_processing_scripts/generation/generate_mc_items.py +++ b/ivas_processing_scripts/generation/generate_mc_items.py @@ -316,12 +316,12 @@ def generate_MC_scene( x = reverb_hoa3(x, IR, mode=None) # adjust the level of the FOA/HOA2/HOA3 signal - if np.isinf(level): - # set all channels to zero - x.audio = np.zeros_like(x.audio) - elif level is None: + if level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") + elif np.isinf(level): + # set all channels to zero + x.audio = np.zeros_like(x.audio) else: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 13fc3470..75c5579a 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -39,7 +39,7 @@ import numpy as np from ivas_processing_scripts.audiotools import audio, audioarray, audiofile, metadata from ivas_processing_scripts.audiotools.convert.omasa import convert_omasa -from ivas_processing_scripts.audiotools.convert.scenebased import render_sba_to_masa +from ivas_processing_scripts.audiotools.convert.osba import convert_osba from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm from ivas_processing_scripts.generation import config from ivas_processing_scripts.utils import apply_func_parallel 
@@ -67,13 +67,6 @@ def replace_char_seq_with_string(str, char_seq, repl_str): return "".join(result) -# function for appending string to a filename before file extension -def append_str_filename(filename, str_to_append): - p = Path(filename) - # Combine the stem, the string to append, and the suffix - return p.parent / (p.stem + str_to_append + p.suffix) - - def generate_omasa_items( cfg: config.TestConfig, logger: logging.Logger, @@ -183,6 +176,7 @@ def generate_OMASA_scene( # initialize output OMASA object y = audio.OMASAAudio(omasa_format) y.fs = cfg.fs + y_int = None # intermediate OSBA object # set the frame length frame_len = int(cfg.fs / 50) @@ -278,16 +272,29 @@ def generate_OMASA_scene( fmt = "STEREO" elif N_channels == 4: fmt = "FOA" + sba_order = 1 elif N_channels == 9: fmt = "HOA2" + sba_order = 2 elif N_channels == 16: fmt = "HOA3" + sba_order = 3 else: logger.error( f"Error: Input format of the source file with {N_channels} channels is not supported!" ) sys.exit(-1) + # initialize intermediate OSBA object + if y_int is None: + if fmt not in ["FOA", "HOA2", "HOA3"]: + logger.error("Error: Expecting FOA/HOA2/HOA3 as the first file in the list!") + sys.exit(-1) + + osba_format = f"ISM{N_ISMs}SBA{sba_order}" + y_int = audio.OSBAAudio(osba_format) + y_int.fs = cfg.fs + # read source file x = audio.fromfile(fmt, input_filename) @@ -301,12 +308,12 @@ def generate_OMASA_scene( x.fs = cfg.fs # adjust the level of the source file - if np.isinf(level): - # set all channels to zero - x.audio = np.zeros_like(x.audio) - elif level is None: + if level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") + elif np.isinf(level): + # set all channels to zero + x.audio = np.zeros_like(x.audio) else: if fmt in ["FOA", "HOA2", "HOA3"]: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") @@ -325,18 +332,8 @@ def generate_OMASA_scene( # get the number of frames (multiple of 20ms) N_frames 
= int(len(x.audio) / frame_len) - # convert the input audio source signal to MASA or ISM - if fmt in ["FOA", "HOA2", "HOA3"]: - # convert FOA/HOA2/HOA3 to MASA - x_masa = audio.MetadataAssistedSpatialAudio( - f"MASA{cfg.masa_tc}DIR{cfg.masa_dirs}" - ) - x_masa.fs = cfg.fs - # generate MASA metadata filename (should end with .met) - x_masa.metadata_file = output_filename.with_suffix(".met") - render_sba_to_masa(x, x_masa) - x = x_masa # replace x with the MASA object - elif fmt == "MONO": + # convert the input MONO audio source signal to ISM1 object + if fmt == "MONO": # convert MONO to ISM1 x_ism = audio.ObjectBasedAudio("ISM1") # ISM with 1 channel x_ism.fs = cfg.fs @@ -413,49 +410,42 @@ def generate_OMASA_scene( x = x_ism # replace x with the ISM object - # copy new audio source signal to the OMASA object - if y.audio is None: - # add the first audio source signal (should be MASA) to the array of all source signals - y.audio = x.audio.copy() - - if "MASA" in x.name: - # if MASA, append metadata file to the OMASA object - y.metadata_files.append(x.metadata_file) - else: - # if ISM, append object position to the OMASA object - y.object_pos = x.object_pos.copy() + # copy new audio source signal to the intermediate OSBA object + if y_int.audio is None: + # add the first audio source signal (should be FOA/HOA2/HOA3) to the array of all source signals + y_int.audio = x.audio.copy() if source_shift < 0: # insert zeros to the new audio source signal to shift it right - metadata.trim_meta(y, limits=[source_shift, 0], samples=True) + metadata.trim_meta(y_int, limits=[source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset if delta_offset > 0: - # insert zeros to the previous ISM signal(s) to shift them right - metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True) + # insert zeros to the existing intermediate OSBA object to shift it right + metadata.trim_meta(y_int, 
limits=[-delta_offset, 0], samples=True) offset = source_shift else: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) # adjust the length of the audio source signal - delta_length = len(x.audio) - len(y.audio) + delta_length = len(x.audio) - len(y_int.audio) if delta_length > 0: - # pad zeros to the previous ISM signal(s) - metadata.trim_meta(y, limits=[0, -delta_length], samples=True) + # pad zeros to the existing intermediate OSBA object signal + metadata.trim_meta(y_int, limits=[0, -delta_length], samples=True) else: # pad zeros to the new audio source signal metadata.trim_meta(x, limits=[0, delta_length], samples=True) - # append ISM signal to the OMASA object (ISM comes first !!!) - y.audio = np.insert(y.audio, [i - 1], x.audio, axis=1) - y.object_pos.extend(x.object_pos) + # append ISM signal to the intermediate OSBA object (ISM comes first !!!) + y_int.audio = np.insert(y_int.audio, [i - 1], x.audio, axis=1) + y_int.object_pos.extend(x.object_pos) # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) 
- y.metadata_files.insert( + y_int.metadata_files.insert( i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")) ) @@ -471,14 +461,14 @@ generate_OMASA_scene( logger.info( f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" ) - metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True) + metadata.trim_meta(y_int, limits=[-preamble, -postamble], samples=True) # add random noise if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 np.random.seed(SEED_RANDOM_NOISE) - noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") + noise = np.random.randint(low=-4, high=5, size=y_int.audio.shape).astype("float") - y.audio += noise + y_int.audio += noise # adjust the length of the output signal if "duration" in cfg.__dict__: @@ -486,28 +476,34 @@ duration = int(cfg.duration * cfg.fs) # convert to samples else: # do not change the length of the audio signal - duration = len(y.audio) + duration = len(y_int.audio) duration = int( np.floor(duration / frame_len) * frame_len ) # ensure multiple of 20ms - if len(y.audio) != duration: - metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) + if len(y_int.audio) != duration: + metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LKFS") - y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") + y_int.audio, _ = loudness_norm(y_int, cfg.loudness, loudness_format="BINAURAL") # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") - y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) + y_int.audio = audioarray.window(y_int.audio, y_int.fs, cfg.fade_in_out * 1000) + + # generate and insert MASA metadata filename (should 
end with .met) + y.metadata_files.append(str(output_filename.with_suffix(".met"))) + + # convert the intermediate OSBA object to OMASA object + convert_osba(y_int, y) # write the OMASA audio output to .wav file in an interleaved format and ISM metadata in .csv files audiofile.write(output_filename, y.audio, y.fs) metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files[:-1]) - # convert to OMASA output to BINAURAL, if option was chosen + # convert the OMASA output to BINAURAL output, if option was chosen if cfg.binaural_output: binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index 29ff66a7..dd8f5b5d 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -292,12 +292,12 @@ def generate_OSBA_scene( x.fs = cfg.fs # adjust the level of the source file - if np.isinf(level): - # set all channels to zero - x.audio = np.zeros_like(x.audio) - elif level is None: + if level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") + elif np.isinf(level): + # set all channels to zero + x.audio = np.zeros_like(x.audio) else: if fmt in ["FOA", "HOA2", "HOA3"]: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") diff --git a/ivas_processing_scripts/generation/generate_sba_items.py b/ivas_processing_scripts/generation/generate_sba_items.py index 4f95bf98..28fbabab 100644 --- a/ivas_processing_scripts/generation/generate_sba_items.py +++ b/ivas_processing_scripts/generation/generate_sba_items.py @@ -287,12 +287,12 @@ def generate_sba_scene( x = reverb_hoa3(x, IR, mode=None) # adjust the level of the FOA/HOA2/HOA3 signal - if np.isinf(level): - # set all channels to zero - x.audio = np.zeros_like(x.audio) - elif level 
is None: + if level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") + elif np.isinf(level): + # set all channels to zero + x.audio = np.zeros_like(x.audio) else: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py index 8b616e64..a0d99f90 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -288,12 +288,12 @@ def generate_stereo_scene( x = reverb_stereo(x, IR, mode=None) # adjust the level of the STEREO signal - if np.isinf(level): - # set all channels to zero - x.audio = np.zeros_like(x.audio) - elif level is None: + if level is None: # do not change the level of the audio source signal logger.info("-- Level of the audio source signal is not changed") + elif np.isinf(level): + # set all channels to zero + x.audio = np.zeros_like(x.audio) else: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") -- GitLab From bd60be090610ee4c4721bf75976f9e59739a41a9 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Sat, 9 Aug 2025 18:16:21 +0200 Subject: [PATCH 2/4] fix formatting --- ivas_processing_scripts/generation/generate_omasa_items.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 75c5579a..93f00305 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -288,7 +288,9 @@ def generate_OMASA_scene( # initialize intermediate OSBA object if y_int is None: if fmt not in ["FOA", "HOA2", "HOA3"]: - logger.error("Error: Expecting FOA/HOA2/HOA3 as the first file in the list!") + logger.error( + "Error: Expecting FOA/HOA2/HOA3 
as the first file in the list!" + ) sys.exit(-1) osba_format = f"ISM{N_ISMs}SBA{sba_order}" -- GitLab From 63c54ecf94d11e3661c6580a7d3d3f2f10147725 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Sun, 10 Aug 2025 14:37:06 +0200 Subject: [PATCH 3/4] forgot to add mode parameter to reverb_hoa3(), ... --- ivas_processing_scripts/audiotools/wrappers/reverb.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ivas_processing_scripts/audiotools/wrappers/reverb.py b/ivas_processing_scripts/audiotools/wrappers/reverb.py index f8fcfaa5..57f4cd6e 100644 --- a/ivas_processing_scripts/audiotools/wrappers/reverb.py +++ b/ivas_processing_scripts/audiotools/wrappers/reverb.py @@ -285,6 +285,7 @@ def reverb_hoa2( input: Audio, hoa2_IR: Audio, align: Optional[float] = None, + mode: Optional[str] = None, ) -> Audio: """ Convolve mono audio signal with an HOA2 impulse response @@ -322,7 +323,7 @@ # separate IR into each channel IR.audio = hoa2_IR.audio[:, [i]] # convolve mono input with channel IR - ych.append(reverb(input, IR, align=align)) + ych.append(reverb(input, IR, align=align, mode=mode)) # combine into HOA2 output y = audio.fromtype("HOA2") @@ -336,6 +337,7 @@ def reverb_hoa3( input: Audio, hoa3_IR: Audio, align: Optional[float] = None, + mode: Optional[str] = None, ) -> Audio: """ Convolve mono audio signal with an HOA3 impulse response @@ -373,7 +375,7 @@ # separate IR into each channel IR.audio = hoa3_IR.audio[:, [i]] # convolve mono input with channel IR - ych.append(reverb(input, IR, align=align)) + ych.append(reverb(input, IR, align=align, mode=mode)) # combine into HOA3 output y = audio.fromtype("HOA3") -- GitLab From af01ceeb2714cd42f2bfa244bf987b63398469b6 Mon Sep 17 00:00:00 2001 From: malenovsky Date: Mon, 11 Aug 2025 12:56:56 +0200 Subject: [PATCH 4/4] Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Archit Tamarapu --- ivas_processing_scripts/audiotools/convert/osba.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/ivas_processing_scripts/audiotools/convert/osba.py b/ivas_processing_scripts/audiotools/convert/osba.py index 28d738ec..76cead88 100644 --- a/ivas_processing_scripts/audiotools/convert/osba.py +++ b/ivas_processing_scripts/audiotools/convert/osba.py @@ -130,7 +130,7 @@ def convert_osba( # only render SBA part to MASA out_masa = audio.fromtype(out.name[4:]) - if out.metadata_files[-1].endswith(".met"): + if str(out.metadata_files[-1]).endswith(".met"): # if MASA metadata filename is given, copy it out_masa.metadata_file = copy(out.metadata_files[-1]) render_sba_to_masa(sba, out_masa) -- GitLab