From 884d79aa4e3cfa60ba6e48317f04c9f8fb073c4c Mon Sep 17 00:00:00 2001
From: Vladimir Malenovsky <vladimir.malenovsky@usherbrooke.ca>
Date: Sun, 27 Jul 2025 14:55:06 +0200
Subject: [PATCH 1/2] add support for item generation in MASA format

---
 examples/ITEM_GENERATION_FOA.yml              |  96 ++--
 examples/ITEM_GENERATION_MASA.yml             | 177 +++++++
 examples/ITEM_GENERATION_OMASA.yml            |   4 +-
 .../generation/__init__.py                    |   8 +-
 .../generation/generate_ismN_items.py         |   2 +-
 .../generation/generate_masa_items.py         | 484 ++++++++++++++++++
 .../generation/generate_omasa_items.py        |   4 +-
 .../generation/generate_osba_items.py         |   4 +-
 .../generation/generate_sba_items.py          |   4 +-
 .../generation/generate_stereo_items.py       |   4 +-
 .../generation/process_ambi_items.py          |   2 +-
 .../generation/process_ism1_items.py          |   2 +-
 .../generation/process_ism2_items.py          |   2 +-
 .../generation/process_stereo_items.py        |   2 +-
 14 files changed, 730 insertions(+), 65 deletions(-)
 create mode 100644 examples/ITEM_GENERATION_MASA.yml
 create mode 100644 ivas_processing_scripts/generation/generate_masa_items.py

diff --git a/examples/ITEM_GENERATION_FOA.yml b/examples/ITEM_GENERATION_FOA.yml
index 46cbe845..879735d4 100644
--- a/examples/ITEM_GENERATION_FOA.yml
+++ b/examples/ITEM_GENERATION_FOA.yml
@@ -114,57 +114,57 @@ scenes:
         background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav"
         background_level: -46
 
-    # "02": 
-        # output: "out/s02.wav"
-        # description: "Car with AB microphone pickup, overlap between the talkers, car noise."
-        # input: ["items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/f2s3b_Talker1.wav"]
-        # IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"]
-        # shift: [0.0, +1.0]
-        # background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav"
-        # background_level: -46
+    "02": 
+        output: "out/s02.wav"
+        description: "Car with AB microphone pickup, overlap between the talkers, car noise."
+        input: ["items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/f2s3b_Talker1.wav"]
+        IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"]
+        shift: [0.0, +1.0]
+        background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav"
+        background_level: -46
         
-    # "03": 
-        # output: "out/s03.wav"
-        # description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
-        # input: ["items_mono/untrimmed/f3s3a_Talker2.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"]
-        # IR: ["IRs/IR_do_p04_e_05_01_FOA.wav", "IRs/IR_do_p04_e_06_01_FOA.wav"]
-        # shift: [0.0, -1.0]
+    "03": 
+        output: "out/s03.wav"
+        description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+        input: ["items_mono/untrimmed/f3s3a_Talker2.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"]
+        IR: ["IRs/IR_do_p04_e_05_01_FOA.wav", "IRs/IR_do_p04_e_06_01_FOA.wav"]
+        shift: [0.0, -1.0]
         
-    # "04": 
-        # output: "out/s04.wav"
-        # description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
-        # input: ["items_mono/untrimmed/f2s7b_Talker1.wav", "items_mono/untrimmed/f5s15a_Talker1.wav"]
-        # IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_08_01_FOA.wav"]
-        # shift: [0.0, -1.0]
-        # background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav"
-        # background_level: -46
+    "04": 
+        output: "out/s04.wav"
+        description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+        input: ["items_mono/untrimmed/f2s7b_Talker1.wav", "items_mono/untrimmed/f5s15a_Talker1.wav"]
+        IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_08_01_FOA.wav"]
+        shift: [0.0, -1.0]
+        background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav"
+        background_level: -46
         
-    # "05": 
-        # output: "out/s05.wav"
-        # description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
-        # input: ["items_mono/untrimmed/m2s15a_Talker2.wav", "items_mono/untrimmed/m1s4a_Talker1.wav"]
-        # IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"]
-        # shift: [0.0, -1.0]
-        # background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav"
-        # background_level: -46
+    "05": 
+        output: "out/s05.wav"
+        description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+        input: ["items_mono/untrimmed/m2s15a_Talker2.wav", "items_mono/untrimmed/m1s4a_Talker1.wav"]
+        IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"]
+        shift: [0.0, -1.0]
+        background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav"
+        background_level: -46
         
-    # "06": 
-        # output: "out/s06.wav"
-        # description: "Car with AB microphone pickup, no overlap between the talkers."
-        # input: ["items_mono/untrimmed/m3s8a_Talker2.wav", "items_mono/untrimmed/m4s13a_Talker1.wav"]
-        # IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"]
-        # shift: [0.0, -1.0]
+    "06": 
+        output: "out/s06.wav"
+        description: "Car with AB microphone pickup, no overlap between the talkers."
+        input: ["items_mono/untrimmed/m3s8a_Talker2.wav", "items_mono/untrimmed/m4s13a_Talker1.wav"]
+        IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"]
+        shift: [0.0, -1.0]
          
-    # "07": 
-        # output: "out/s07.wav"
-        # description: "Preliminary: Car with AB microphone pickup, no overlap between the talkers."
-        # input: ["items_mono/untrimmed/f1s20a_Talker2.wav", "items_mono/untrimmed/f5s15b_Talker1.wav"]
-        # IR: ["IRs/IR_do_p04_e_02_01_FOA.wav", "IRs/IR_do_p04_e_07_01_FOA.wav"]
-        # shift: [0.0, -1.0]
+    "07": 
+        output: "out/s07.wav"
+        description: "Preliminary: Car with AB microphone pickup, no overlap between the talkers."
+        input: ["items_mono/untrimmed/f1s20a_Talker2.wav", "items_mono/untrimmed/f5s15b_Talker1.wav"]
+        IR: ["IRs/IR_do_p04_e_02_01_FOA.wav", "IRs/IR_do_p04_e_07_01_FOA.wav"]
+        shift: [0.0, -1.0]
          
-    # "08": 
-        # output: "out/s08.wav"
-        # description: "Car with AB microphone pickup, overlap between the talkers."
-        # input: ["items_mono/untrimmed/m2s6b_Talker2.wav", "items_mono/untrimmed/f5s14a_Talker1.wav"]
-        # IR: ["IRs/IR_do_p04_e_08_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"]
-        # shift: [0.0, +1.0]
+    "08": 
+        output: "out/s08.wav"
+        description: "Car with AB microphone pickup, overlap between the talkers."
+        input: ["items_mono/untrimmed/m2s6b_Talker2.wav", "items_mono/untrimmed/f5s14a_Talker1.wav"]
+        IR: ["IRs/IR_do_p04_e_08_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"]
+        shift: [0.0, +1.0]
diff --git a/examples/ITEM_GENERATION_MASA.yml b/examples/ITEM_GENERATION_MASA.yml
new file mode 100644
index 00000000..958a69cb
--- /dev/null
+++ b/examples/ITEM_GENERATION_MASA.yml
@@ -0,0 +1,177 @@
+---
+################################################
+# Item generation - General configuration
+################################################
+
+### Any relative paths will be interpreted relative to the working directory the script is called from!
+### Usage of absolute paths is recommended.
+### Do not use file names with dots "." in them! This is not supported, use "_" instead
+### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions
+
+### Output format
+format: "MASA"
+masa_tc: 1        # applicable only to MASA/OMASA format
+masa_dirs: 1      # applicable only to MASA/OMASA format
+# sba_order: 2      # applicable only to OSBA format
+
+### Output sampling rate in Hz
+fs: 48000
+
+### Generate BINAURAL output (_BINAURAL will be appended to the output filename)
+binaural_output: true
+
+### Normalize target loudness to X LKFS 
+loudness: -26
+
+### Apply pre-amble and post-amble in X seconds 
+preamble: 0.0
+postamble: 0.0
+
+### Apply fade-in and fade-out of X seconds
+fade_in_out: 0.5
+
+### Trim the output such that the total duration is X seconds
+duration: 8
+
+### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
+add_low_level_random_noise: false
+
+### Process with parallel streams
+multiprocessing: false
+
+################################################
+### Item generation - Filename conventions
+################################################
+
+### Naming convention for the input mono files
+### The input filenames are represented by:
+###   lLLeeettszz.wav
+### where: 
+###   l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) 
+###   LL stands for the language: JP, FR, GE, MA, DA, EN
+###   eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
+###   tt stands for the talker ID: f1, f2, f3, m1, m2, m3
+###   s stands for 'sample' and zz is the sample number; 01, ..., 14
+
+### Naming convention for the generated output files
+### The output filenames are represented by:
+###   leeeayszz.wav
+### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by:
+###   leeeayszz.met for metadata-assisted spatial audio
+###   leeeayszz.wav.o.csv for object-based audio
+### where: 
+###   l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) 
+###   eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
+###   a stands for 'audio'
+###   y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06
+###   s stands for sample and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample)
+###   o stands for the object number; 0, 1, 2, 3
+
+### File designators, default is "l" for listening lab, "EN" for language, "p07" for experiment and "g" for provider
+listening_lab: "l"
+language: "EN"
+exp: "p01"
+provider: "va"
+
+### Insert prefix for all input filenames (default: "")
+### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment' 
+### the number of consecutive letters defines the length of each field
+# use_input_prefix: "lLLeee"
+
+### Insert prefix for all output filenames (default: "")
+### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment' 
+### the number of consecutive letters defines the length of each field
+# use_output_prefix: "leee"
+
+################################################
+### Item generation - Scene description
+################################################
+
+### Each scene shall be described using the following parameters/properties:
+###   output:      output filename
+###   description: textual description of the scene
+###   input:       input filename(s)
+###   IR:          filename(s) of the input IRs 
+###   azimuth:     azimuth in the range [-180,180]; positive values point to the left
+###   elevation:   elevation in the range [-90,90]; positive values indicate up
+###   shift:       time adjustment of the input signal (negative value delays the signal)
+###   background:  background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored)
+###   background_level:  normalized background noise loudness to X dB LKFS
+###
+### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder)
+### Note 1: use brackets [val1, val2, ...] when specifying multiple values 
+### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames
+### Note 3: we're using right-handed coordinate system with azimuth = 0 pointing from the nose to the screen
+
+scenes:
+    "01": 
+        output: "out/s01.wav"
+        description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+        input: ["items_mono/untrimmed/f1s4b_Talker2.wav", "items_mono/untrimmed/f2s1a_Talker1.wav"]
+        IR: ["IRs/IR_do_p04_e_01_01_FOA.wav", "IRs/IR_do_p04_e_02_01_FOA.wav"]
+        shift: [0.0, -1.0]
+        background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav"
+        background_level: -46
+
+    "02": 
+        output: "out/s02.wav"
+        description: "Car with AB microphone pickup, overlap between the talkers, car noise."
+        input: ["items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/f2s3b_Talker1.wav"]
+        IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"]
+        shift: [0.0, +1.0]
+        background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav"
+        background_level: -46
+        
+    "03": 
+        output: "out/s03.wav"
+        description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+        input: ["items_mono/untrimmed/f3s3a_Talker2.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"]
+        IR: ["IRs/IR_do_p04_e_05_01_FOA.wav", "IRs/IR_do_p04_e_06_01_FOA.wav"]
+        shift: [0.0, -1.0]
+        background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav"
+        background_level: -46
+        
+    "04": 
+        output: "out/s04.wav"
+        description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+        input: ["items_mono/untrimmed/f2s7b_Talker1.wav", "items_mono/untrimmed/f5s15a_Talker1.wav"]
+        IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_08_01_FOA.wav"]
+        shift: [0.0, -1.0]
+        background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav"
+        background_level: -46
+        
+    "05": 
+        output: "out/s05.wav"
+        description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+        input: ["items_mono/untrimmed/m2s15a_Talker2.wav", "items_mono/untrimmed/m1s4a_Talker1.wav"]
+        IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"]
+        shift: [0.0, -1.0]
+        background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav"
+        background_level: -46
+        
+    "06": 
+        output: "out/s06.wav"
+        description: "Car with AB microphone pickup, no overlap between the talkers."
+        input: ["items_mono/untrimmed/m3s8a_Talker2.wav", "items_mono/untrimmed/m4s13a_Talker1.wav"]
+        IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"]
+        shift: [0.0, -1.0]
+        background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav"
+        background_level: -46
+
+    "07": 
+        output: "out/s07.wav"
+        description: "Preliminary: Car with AB microphone pickup, no overlap between the talkers."
+        input: ["items_mono/untrimmed/f1s20a_Talker2.wav", "items_mono/untrimmed/f5s15b_Talker1.wav"]
+        IR: ["IRs/IR_do_p04_e_02_01_FOA.wav", "IRs/IR_do_p04_e_07_01_FOA.wav"]
+        shift: [0.0, -1.0]
+        background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav"
+        background_level: -46
+         
+    "08": 
+        output: "out/s08.wav"
+        description: "Car with AB microphone pickup, overlap between the talkers."
+        input: ["items_mono/untrimmed/m2s6b_Talker2.wav", "items_mono/untrimmed/f5s14a_Talker1.wav"]
+        IR: ["IRs/IR_do_p04_e_08_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"]
+        shift: [0.0, +1.0]
+        background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav"
+        background_level: -46
diff --git a/examples/ITEM_GENERATION_OMASA.yml b/examples/ITEM_GENERATION_OMASA.yml
index 1f631f3f..942ad1c7 100644
--- a/examples/ITEM_GENERATION_OMASA.yml
+++ b/examples/ITEM_GENERATION_OMASA.yml
@@ -10,8 +10,8 @@
 
 ### Output format
 format: "OMASA"
-masa_tc: 2        # applicable only to OMASA format
-masa_dirs: 2      # applicable only to OMASA format
+masa_tc: 2        # applicable only to MASA/OMASA format
+masa_dirs: 2      # applicable only to MASA/OMASA format
 # sba_order: 2      # applicable only to OSBA format
 
 ### Output sampling rate in Hz
diff --git a/ivas_processing_scripts/generation/__init__.py b/ivas_processing_scripts/generation/__init__.py
index 8a9dfb98..2035340e 100755
--- a/ivas_processing_scripts/generation/__init__.py
+++ b/ivas_processing_scripts/generation/__init__.py
@@ -40,6 +40,7 @@ from ivas_processing_scripts.constants import (
 from ivas_processing_scripts.generation import (
     config,
     generate_ismN_items,
+    generate_masa_items,
     generate_omasa_items,
     generate_osba_items,
     generate_sba_items,
@@ -90,11 +91,14 @@ def main(args):
     elif any(fmt in cfg.format for fmt in ["FOA", "HOA2", "HOA3"]):
         # generate FOA/HOA2/HOA3 items according to scene description
         generate_sba_items.generate_sba_items(cfg, logger)
     elif "OMASA" in cfg.format:
-        # generate OMASA items from FOA/HO2/HOA3 and MONO items according to scene description
+        # generate OMASA items from FOA/HOA2/HOA3 and MONO items according to scene description
         generate_omasa_items.generate_omasa_items(cfg, logger)
+    elif "MASA" in cfg.format:
+        # generate MASA items from MONO items according to scene description (must be tested after OMASA, which contains "MASA")
+        generate_masa_items.generate_masa_items(cfg, logger)
     elif "OSBA" in cfg.format:
-        # generate OSBA items from FOA/HO2/HOA3 and MONO items according to scene description
+        # generate OSBA items from FOA/HOA2/HOA3 and MONO items according to scene description
         generate_osba_items.generate_osba_items(cfg, logger)
 
     logger.handlers.clear()
diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py
index bd92367e..adec1961 100644
--- a/ivas_processing_scripts/generation/generate_ismN_items.py
+++ b/ivas_processing_scripts/generation/generate_ismN_items.py
@@ -234,7 +234,7 @@ def generate_ismN_scene(
         else:
             source_shift = 0.0
 
-        # convert overlap to samples and ensure it is a multiple of 20ms
+        # convert shift from seconds to samples and ensure it is a multiple of 20ms
         source_shift_in_seconds = source_shift
         source_shift = source_shift * cfg.fs
         if source_shift >= 0:
diff --git a/ivas_processing_scripts/generation/generate_masa_items.py b/ivas_processing_scripts/generation/generate_masa_items.py
new file mode 100644
index 00000000..5ead6147
--- /dev/null
+++ b/ivas_processing_scripts/generation/generate_masa_items.py
@@ -0,0 +1,484 @@
+#!/usr/bin/env python3
+
+#
+#  (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
+#  Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+#  Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+#  Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+#  contributors to this repository. All Rights Reserved.
+#
+#  This software is protected by copyright law and by international treaties.
+#  The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
+#  Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+#  Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+#  Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+#  contributors to this repository retain full ownership rights in their respective contributions in
+#  the software. This notice grants no license of any kind, including but not limited to patent
+#  license, nor is any license granted by implication, estoppel or otherwise.
+#
+#  Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
+#  contributions.
+#
+#  This software is provided "AS IS", without any express or implied warranties. The software is in the
+#  development stage. It is intended exclusively for experts who have experience with such software and
+#  solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+#  and fitness for a particular purpose are hereby disclaimed and excluded.
+#
+#  Any dispute, controversy or claim arising under or in relation to providing this software shall be
+#  submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+#  accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+#  the United Nations Convention on Contracts on the International Sales of Goods.
+#
+
+import logging
+import sys
+from itertools import groupby, repeat
+from pathlib import Path
+
+import numpy as np
+
+from ivas_processing_scripts.audiotools import audio, audioarray, audiofile
+from ivas_processing_scripts.audiotools.convert.masa import convert_masa
+from ivas_processing_scripts.audiotools.convert.scenebased import render_sba_to_masa
+from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm
+from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_foa, reverb_hoa2, reverb_hoa3
+from ivas_processing_scripts.generation import config
+from ivas_processing_scripts.utils import apply_func_parallel
+
+SEED_RANDOM_NOISE = 0
+
+
+# function for finding runs of one repeated character and replacing the matching runs by another string
+def replace_char_seq_with_string(str, char_seq, repl_str):
+    """Replace each run of a repeated character that contains char_seq by repl_str.
+
+    repl_str is cut to len(char_seq) first, e.g. ("lLLeee", "LL", "ENG") gives "lENeee".
+    """
+    # NOTE: the first parameter shadows the builtin "str"; its name is kept as is for the callers
+
+    # split the input into runs of identical consecutive characters
+    runs = ["".join(run) for _, run in groupby(str)]
+
+    # the replacement string must not be longer than the character sequence
+    replacement = repl_str[: len(char_seq)]
+
+    # matching runs are swapped for the replacement, all other runs are copied over unchanged
+    pieces = (replacement if char_seq in run else run for run in runs)
+
+    return "".join(pieces)
+
+
+# function for inserting a string into a filename right before the file extension
+def append_str_filename(filename, str_to_append):
+    path = Path(filename)
+    # keep the parent directory and rebuild the name as stem + appended string + suffix
+    return path.parent.joinpath("".join((path.stem, str_to_append, path.suffix)))
+
+
+def generate_masa_items(
+    cfg: config.TestConfig,
+    logger: logging.Logger,
+):
+    """Generate MASA items for all scenes of the configuration.
+
+    Settings missing from cfg are filled in with defaults before processing:
+    fs = 48000, listening_lab = "l", language = "EN", exp = "p07", provider = "g",
+    use_input_prefix = use_IR_prefix = use_output_prefix = "", multiprocessing = False.
+    Prefix templates found in cfg get their designator letters replaced by the
+    configured designators (see replace_char_seq_with_string). Every scene in
+    cfg.scenes is then passed to generate_MASA_scene through apply_func_parallel.
+
+    Args:
+        cfg (config.TestConfig): Item generation configuration, updated in place.
+        logger (logging.Logger): Logger handed over to the scene processing.
+    """
+
+    settings = vars(cfg)
+
+    # default values of the basic settings and of the file designators
+    defaults = (
+        ("fs", 48000),
+        ("listening_lab", "l"),
+        ("language", "EN"),
+        ("exp", "p07"),
+        ("provider", "g"),
+    )
+    for name, value in defaults:
+        if name not in settings:
+            setattr(cfg, name, value)
+
+    # designator letters of each filename prefix template and the setting that replaces them
+    prefix_designators = (
+        (
+            "use_input_prefix",
+            (("l", "listening_lab"), ("LL", "language"), ("eee", "exp")),
+        ),
+        (
+            "use_IR_prefix",
+            (("p", "provider"), ("LL", "language"), ("eee", "exp")),
+        ),
+        (
+            "use_output_prefix",
+            (("l", "listening_lab"), ("eee", "exp")),
+        ),
+    )
+    for prefix_name, designators in prefix_designators:
+        if prefix_name not in settings:
+            # no template given -> filenames are used without a prefix
+            setattr(cfg, prefix_name, "")
+            continue
+
+        # replace the file designators one after another, in the listed order
+        for char_seq, source_name in designators:
+            expanded = replace_char_seq_with_string(
+                getattr(cfg, prefix_name), char_seq, getattr(cfg, source_name)
+            )
+            setattr(cfg, prefix_name, expanded)
+
+    # multiprocessing is switched off unless the configuration asks for it
+    if "multiprocessing" not in settings:
+        cfg.multiprocessing = False
+
+    # one argument tuple per scene: (scene name, scene description, cfg, logger)
+    scene_args = zip(
+        cfg.scenes.keys(),
+        cfg.scenes.values(),
+        repeat(cfg),
+        repeat(logger),
+    )
+    processing_type = "mp" if cfg.multiprocessing else None
+    apply_func_parallel(
+        generate_MASA_scene,
+        scene_args,
+        type=processing_type,
+        show_progress=None,
+    )
+
+    return
+
+
+def generate_MASA_scene(
+    scene_name: str, scene: dict, cfg: config.TestConfig, logger: logging.Logger
+):
+    """
+    Processes a single scene to generate MASA item with metadata.
+
+    Args:
+        scene_name (str): The name of the scene being processed.
+        scene (dict): A dictionary containing scene description, including source files, azimuth, elevation, and other parameters.
+        cfg (config.TestConfig): Configuration object containing settings for processing, such as input/output paths, sampling rate, and loudness levels.
+        logger (logging.Logger): Logger instance for logging information and errors.
+
+    Expected Behavior:
+        - Reads audio source files and processes them based on the scene description.
+        - Generates metadata files and appends them to the MASA object.
+        - Writes the processed audio and metadata to output files.
+        - Handles various audio formats (e.g., MONO, FOA, HOA2) and applies transformations like loudness normalization, trimming, and padding.
+    """
+
+    scenes = list(cfg.scenes.keys())
+    logger.info(
+        f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}"
+    )
+
+    # extract the number of audio sources
+    N_inputs = len(np.atleast_1d(scene["input"]))
+
+    # get output filename
+    masa_format = f"MASA{cfg.masa_tc}DIR{cfg.masa_dirs}"
+    output_filename = Path(scene["output"]).parent / (
+        cfg.use_output_prefix + Path(scene["output"]).name
+    )
+
+    # initialize output dirs
+    dir_path = output_filename.parent
+    if dir_path and not dir_path.exists():
+        dir_path.mkdir(parents=True, exist_ok=True)
+
+    # initialize output MASA object
+    y = audio.MetadataAssistedSpatialAudio(masa_format)
+    y.fs = cfg.fs
+
+    # set the frame length
+    frame_len = int(cfg.fs / 50)
+
+    # repeat for all source files
+    offset = 0
+    for i in range(N_inputs):
+        # parse parameters from the scene description
+        source_file = (
+            scene["input"][i] if isinstance(scene["input"], list) else scene["input"]
+        )
+        IR_file = scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"]
+
+        # get input filename and IR filename
+        input_filename = Path(source_file).parent / (
+            cfg.use_input_prefix + Path(source_file).name
+        )
+        IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)
+
+        # read the shift time in seconds
+        if "shift" in scene.keys():
+            source_shift = (
+                scene["shift"][i]
+                if isinstance(scene["shift"], list)
+                else scene["shift"]
+            )
+        else:
+            source_shift = 0.0
+
+        # convert shift from seconds to samples and ensure it is a multiple of 20ms
+        source_shift_in_seconds = source_shift
+        source_shift = source_shift * cfg.fs
+        if source_shift >= 0:
+            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
+        else:
+            source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
+
+        # read the level
+        if "level" in scene.keys():
+            level = (
+                scene["level"][i]
+                if isinstance(scene["level"], list)
+                else scene["level"]
+            )
+        else:
+            level = -26
+
+        logger.info(
+            f"-- Convolving {source_file} with {IR_file} at {level} LKFS with shift of {source_shift_in_seconds} seconds"
+        )
+
+        # get the number of channels from the IR .wav file header
+        wav_header = audiofile.parse_wave_header(IR_filename)
+        IR_channels = wav_header["channels"]
+
+        if IR_channels == 4:
+            IR_fmt = "FOA"
+        elif IR_channels == 9:
+            IR_fmt = "HOA2"
+        elif IR_channels == 16:
+            IR_fmt = "HOA3"
+        else:
+            logger.error(
+                f"Error: Input format of the IR source file with {IR_channels} channels is not supported!"
+            )
+            sys.exit(-1)
+
+        # initialize intermediate SBA object
+        y_int = audio.SceneBasedAudio(IR_fmt)
+        y_int.fs = cfg.fs
+
+        # read source file
+        x = audio.fromfile("MONO", input_filename)
+
+        # resample to the target fs if necessary
+        if x.fs != cfg.fs:
+            logger.warning(
+                f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!"
+            )
+            resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs)
+            x.audio = resampled_audio
+            x.fs = cfg.fs
+
+        # read the IR file
+        IR = audio.fromfile(IR_fmt, IR_filename)
+
+        # convolve MONO source audio with FOA/HOA2/HOA3 IR -> results in FOA/HOA2/HOA3 audio object
+        if IR_fmt == "FOA":
+            x = reverb_foa(x, IR)
+        elif IR_fmt == "HOA2":
+            x = reverb_hoa2(x, IR)
+        elif IR_fmt == "HOA3":
+            x = reverb_hoa3(x, IR)
+
+        # adjust the level of the FOA/HOA2/HOA3 signal
+        x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")
+
+        # ensure the length of the audio source signal is a multiple of 20ms
+        if len(x.audio) % frame_len != 0:
+            # pad with zeros to ensure that the signal length is a multiple of 20ms
+            if len(x.audio) % frame_len != 0:
+                N_pad = int(frame_len - len(x.audio) % frame_len)
+                x.audio = audioarray.trim(
+                    x.audio, x.fs, limits=[0, -N_pad], samples=True
+                )
+
+        # add the convolved FOA/HOA2/HOA3 audio source signal to the intermediate SBA output signal
+        if y_int.audio is None:
+            # this is the first SBA source signal
+            y_int.audio = x.audio.copy()
+
+            if source_shift < 0:
+                # insert zeros to the first SBA source signal to shift it right
+                y_int.audio = audioarray.trim(
+                    y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True
+                )
+            else:
+                offset = source_shift
+        else:
+            # shift the beginning of the audio source signal
+            delta_offset = source_shift - offset
+            if delta_offset > 0:
+                # insert zeros to the output SBA signal to shift it right
+                audioarray.trim(
+                    y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True
+                )
+                offset = source_shift
+            else:
+                # insert zeros to the new SBA source signal to shift it right
+                audioarray.trim(
+                    x.audio, x.fs, limits=[delta_offset, 0], samples=True
+                )
+
+            # adjust the length of the audio source signal
+            delta_length = len(x.audio) - len(y_int.audio)
+            if delta_length > 0:
+                # pad zeros to the output SBA signal
+                y_int.audio = audioarray.trim(
+                    y_int.audio, y_int.fs, limits=[0, -delta_length], samples=True
+                )
+            else:
+                # pad zeros to the new SBA source signal
+                x.audio = audioarray.trim(
+                    x.audio, x.fs, limits=[0, delta_length], samples=True
+                )
+
+            # superimpose
+            y_int.audio += x.audio
+
+    # append pre-amble and post-amble
+    if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__:
+        preamble = int(
+            np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len
+        )  # convert to samples and ensure multiple of 20ms
+        postamble = int(
+            np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len
+        )  # convert to samples and ensure multiple of 20ms
+        if preamble != 0 or postamble != 0:
+            logger.info(
+                f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds"
+            )
+            y_int.audio = audioarray.trim(
+                y_int.audio, y_int.fs, limits=[-preamble, -postamble], samples=True
+            )
+
+    # adjust the length of the output signal
+    if "duration" in cfg.__dict__:
+        # trim the output signal such that the total duration is X seconds
+        duration = int(cfg.duration * cfg.fs)  # convert to samples
+    else:
+        # do not change the length of the audio signal
+        duration = len(y_int.audio)
+    duration = int(
+        np.floor(duration / frame_len) * frame_len
+    )  # ensure multiple of 20ms
+    if len(y_int.audio) != duration:
+        y_int.audio = audioarray.trim(
+            y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True
+        )
+
+    # adjust the loudness of the output signal
+    if "loudness" in cfg.__dict__:
+        logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LKFS")
+        y_int.audio, _ = loudness_norm(y_int, cfg.loudness, loudness_format="BINAURAL")
+
+    # add background noise in FOA/HOA2/HOA3 format
+    if "background" in scene.keys():
+        # check if [] are used in the background noise file name
+        if isinstance(scene["background"], list):
+            # if so, use the first element
+            background_filename = scene["background"][0]
+        else:
+            background_filename = scene["background"]
+
+        # read the background noise file
+        background_filename = Path(scene["background"]).parent / (
+            cfg.use_input_prefix + Path(scene["background"]).name
+        )
+        logger.info(f"-- Adding background noise from {background_filename}")
+        background = audio.fromfile(IR_fmt, background_filename)
+
+        # resample to the target fs if necessary
+        if background.fs != cfg.fs:
+            logger.warning(
+                f"Warning: Sample rate of the background noise is {background.fs} Hz and needs to be resampled to {cfg.fs}!"
+            )
+            resampled_audio = audioarray.resample(
+                background.audio, background.fs, cfg.fs
+            )
+            background.audio = resampled_audio
+            background.fs = cfg.fs
+
+        # adjust the length of the background noise signal
+        if len(background.audio) != len(y_int.audio):
+            background.audio = audioarray.trim(
+                background.audio,
+                background.fs,
+                limits=[0, len(background.audio) - len(y_int.audio)],
+                samples=True,
+            )
+
+        # adjust the loudness of the background noise signal
+        if "background_level" in scene.keys():
+            logger.info(
+                f"-- Rescaling background noise to target loudness: {scene['background_level']} LKFS"
+            )
+
+            # check if [] are used in the background level
+            if isinstance(scene["background_level"], list):
+                # if so, use the first element
+                scene["background_level"] = scene["background_level"][0]
+
+            # convert to float if the background level was entered in string format
+            if not isinstance(scene["background_level"], (int, float)):
+                scene["background_level"] = float(scene["background_level"])
+        else:
+            logger.warning(
+                "-- Warning: No target loudness for background noise specified, using default value of -26 LKFS"
+            )
+            scene["background_level"] = -26
+        background.audio, _ = loudness_norm(
+            background, scene["background_level"], loudness_format="STEREO", rms=True
+        )
+
+        # add the background noise to the output signal
+        y_int.audio += background.audio
+    elif (
+        "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise
+    ):
+        # create uniformly distributed noise between -4 and 4
+        np.random.seed(SEED_RANDOM_NOISE)
+        noise = np.random.randint(low=-4, high=5, size=y_int.audio.shape).astype("float")
+        y_int.audio += noise
+
+    # apply fade-in and fade-out
+    if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0:
+        logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds")
+        y_int.audio = audioarray.window(y_int.audio, y_int.fs, cfg.fade_in_out * 1000)
+
+    # generate MASA metadata filename (should end with .met)
+    y.metadata_file = output_filename.with_suffix(".met")
+
+    # convert the intermediate SBA output signal to MASA format
+    render_sba_to_masa(y_int, y)
+
+    # write the MASA audio signal to the output file
+    audiofile.write(output_filename, y.audio, y.fs)
+
+    # convert the MASA audio signal to BINAURAL, if option was chosen
+    if cfg.binaural_output:
+        binaural_output_filename = output_filename.with_name(
+            output_filename.stem + "_BINAURAL" + output_filename.suffix
+        )
+        logger.info(
+            f"-- Converting to BINAURAL output file: {binaural_output_filename}"
+        )
+        binaudio = audio.fromtype("BINAURAL")
+        binaudio.fs = y.fs
+        convert_masa(y, binaudio)
+        audiofile.write(
+            binaural_output_filename,
+            binaudio.audio,
+            binaudio.fs,
+        )
diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py
index 603a3593..972ee69a 100644
--- a/ivas_processing_scripts/generation/generate_omasa_items.py
+++ b/ivas_processing_scripts/generation/generate_omasa_items.py
@@ -78,7 +78,7 @@ def generate_omasa_items(
     cfg: config.TestConfig,
     logger: logging.Logger,
 ):
-    """Generate OMASA items with metadata from FOA/HO2 and ISMn items based on scene description"""
+    """Generate OMASA items with metadata from FOA/HOA2/HOA3 and ISMn items based on scene description"""
 
     # set the fs
     if "fs" not in cfg.__dict__:
@@ -229,7 +229,7 @@ def generate_OMASA_scene(
         else:
             source_shift = 0.0
 
-        # convert overlap to samples and ensure it is a multiple of 20ms
+        # convert shift from seconds to samples and ensure it is a multiple of 20ms
         source_shift_in_seconds = source_shift
         source_shift = source_shift * cfg.fs
         if source_shift >= 0:
diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py
index d2a71777..156c3d2d 100644
--- a/ivas_processing_scripts/generation/generate_osba_items.py
+++ b/ivas_processing_scripts/generation/generate_osba_items.py
@@ -210,7 +210,7 @@ def generate_OSBA_scene(
             else scene["elevation"]
         )
 
-        # read the overlap length
+        # read the shift time in seconds
         if "shift" in scene.keys():
             source_shift = (
                 scene["shift"][i]
@@ -220,7 +220,7 @@ def generate_OSBA_scene(
         else:
             source_shift = 0.0
 
-        # convert overlap to samples and ensure it is a multiple of 20ms
+        # convert shift from seconds to samples and ensure it is a multiple of 20ms
         source_shift_in_seconds = source_shift
         source_shift = source_shift * cfg.fs
         if source_shift >= 0:
diff --git a/ivas_processing_scripts/generation/generate_sba_items.py b/ivas_processing_scripts/generation/generate_sba_items.py
index bdb40b1b..96b542d2 100644
--- a/ivas_processing_scripts/generation/generate_sba_items.py
+++ b/ivas_processing_scripts/generation/generate_sba_items.py
@@ -213,7 +213,7 @@ def generate_sba_scene(
         )
         IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)
 
-        # read the overlap length
+        # read the shift time in seconds
         if "shift" in scene.keys():
             source_shift = (
                 scene["shift"][i]
@@ -223,7 +223,7 @@ def generate_sba_scene(
         else:
             source_shift = 0.0
 
-        # convert overlap to samples and ensure it is a multiple of 20ms
+        # convert shift from seconds to samples and ensure it is a multiple of 20ms
         source_shift_in_seconds = source_shift
         source_shift = source_shift * cfg.fs
         if source_shift >= 0:
diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py
index 92a68906..bea865de 100644
--- a/ivas_processing_scripts/generation/generate_stereo_items.py
+++ b/ivas_processing_scripts/generation/generate_stereo_items.py
@@ -219,7 +219,7 @@ def generate_stereo_scene(
         )
         IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)
 
-        # read the overlap length
+        # read the shift time in seconds
         if "shift" in scene.keys():
             source_shift = (
                 scene["shift"][i]
@@ -229,7 +229,7 @@ def generate_stereo_scene(
         else:
             source_shift = 0.0
 
-        # convert overlap to samples and ensure it is a multiple of 20ms
+        # convert shift from seconds to samples and ensure it is a multiple of 20ms
         source_shift_in_seconds = source_shift
         source_shift = source_shift * cfg.fs
         if source_shift >= 0:
diff --git a/ivas_processing_scripts/generation/process_ambi_items.py b/ivas_processing_scripts/generation/process_ambi_items.py
index 913fdcc4..f2b8982e 100644
--- a/ivas_processing_scripts/generation/process_ambi_items.py
+++ b/ivas_processing_scripts/generation/process_ambi_items.py
@@ -191,7 +191,7 @@ def generate_ambi_scene(
     # extract the number of audio sources
     N_sources = len(np.atleast_1d(scene["source"]))
 
-    # read the overlap length
+    # read the source overlap from the scene description
     if "overlap" in scene.keys():
         source_overlap = float(scene["overlap"])
     else:
diff --git a/ivas_processing_scripts/generation/process_ism1_items.py b/ivas_processing_scripts/generation/process_ism1_items.py
index ac1f273e..2177f09b 100644
--- a/ivas_processing_scripts/generation/process_ism1_items.py
+++ b/ivas_processing_scripts/generation/process_ism1_items.py
@@ -168,7 +168,7 @@ def generate_ism1_scene(
     y = audio.ChannelBasedAudio("MONO")
     y_meta = None
 
-    # read the overlap length
+    # read the source overlap from the scene description
     if "overlap" in scene.keys():
         source_overlap = float(scene["overlap"])
     else:
diff --git a/ivas_processing_scripts/generation/process_ism2_items.py b/ivas_processing_scripts/generation/process_ism2_items.py
index e944ca2c..83bd59e4 100644
--- a/ivas_processing_scripts/generation/process_ism2_items.py
+++ b/ivas_processing_scripts/generation/process_ism2_items.py
@@ -168,7 +168,7 @@ def generate_ism2_scene(
     y = audio.ChannelBasedAudio("STEREO")
     y_meta = None
 
-    # read the overlap length
+    # read the source overlap from the scene description
     if "overlap" in scene.keys():
         source_overlap = float(scene["overlap"])
     else:
diff --git a/ivas_processing_scripts/generation/process_stereo_items.py b/ivas_processing_scripts/generation/process_stereo_items.py
index b4c17197..7d05de54 100644
--- a/ivas_processing_scripts/generation/process_stereo_items.py
+++ b/ivas_processing_scripts/generation/process_stereo_items.py
@@ -187,7 +187,7 @@ def generate_stereo_scene(
     # extract the number of audio sources
     N_sources = len(np.atleast_1d(scene["source"]))
 
-    # read the overlap length
+    # read the source overlap from the scene description
     if "overlap" in scene.keys():
         source_overlap = float(scene["overlap"])
     else:
-- 
GitLab


From 19619ed031254ccd40e422872b7e39509ca3bc3d Mon Sep 17 00:00:00 2001
From: Vladimir Malenovsky <vladimir.malenovsky@usherbrooke.ca>
Date: Sun, 27 Jul 2025 14:57:30 +0200
Subject: [PATCH 2/2] fix formatting

---
 .../generation/generate_masa_items.py              | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/ivas_processing_scripts/generation/generate_masa_items.py b/ivas_processing_scripts/generation/generate_masa_items.py
index 5ead6147..65b29761 100644
--- a/ivas_processing_scripts/generation/generate_masa_items.py
+++ b/ivas_processing_scripts/generation/generate_masa_items.py
@@ -41,7 +41,11 @@ from ivas_processing_scripts.audiotools import audio, audioarray, audiofile
 from ivas_processing_scripts.audiotools.convert.masa import convert_masa
 from ivas_processing_scripts.audiotools.convert.scenebased import render_sba_to_masa
 from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm
-from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_foa, reverb_hoa2, reverb_hoa3
+from ivas_processing_scripts.audiotools.wrappers.reverb import (
+    reverb_foa,
+    reverb_hoa2,
+    reverb_hoa3,
+)
 from ivas_processing_scripts.generation import config
 from ivas_processing_scripts.utils import apply_func_parallel
 
@@ -327,9 +331,7 @@ def generate_MASA_scene(
                 offset = source_shift
             else:
                 # insert zeros to the new SBA source signal to shift it right
-                audioarray.trim(
-                    x.audio, x.fs, limits=[delta_offset, 0], samples=True
-                )
+                audioarray.trim(x.audio, x.fs, limits=[delta_offset, 0], samples=True)
 
             # adjust the length of the audio source signal
             delta_length = len(x.audio) - len(y_int.audio)
@@ -449,7 +451,9 @@ def generate_MASA_scene(
     ):
         # create uniformly distributed noise between -4 and 4
         np.random.seed(SEED_RANDOM_NOISE)
-        noise = np.random.randint(low=-4, high=5, size=y_int.audio.shape).astype("float")
+        noise = np.random.randint(low=-4, high=5, size=y_int.audio.shape).astype(
+            "float"
+        )
         y_int.audio += noise
 
     # apply fade-in and fade-out
-- 
GitLab