From 884d79aa4e3cfa60ba6e48317f04c9f8fb073c4c Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Sun, 27 Jul 2025 14:55:06 +0200 Subject: [PATCH 1/2] add support for item generation in MASA format --- examples/ITEM_GENERATION_FOA.yml | 96 ++-- examples/ITEM_GENERATION_MASA.yml | 177 +++++++ examples/ITEM_GENERATION_OMASA.yml | 4 +- .../generation/__init__.py | 8 +- .../generation/generate_ismN_items.py | 2 +- .../generation/generate_masa_items.py | 484 ++++++++++++++++++ .../generation/generate_omasa_items.py | 4 +- .../generation/generate_osba_items.py | 4 +- .../generation/generate_sba_items.py | 4 +- .../generation/generate_stereo_items.py | 4 +- .../generation/process_ambi_items.py | 2 +- .../generation/process_ism1_items.py | 2 +- .../generation/process_ism2_items.py | 2 +- .../generation/process_stereo_items.py | 2 +- 14 files changed, 730 insertions(+), 65 deletions(-) create mode 100644 examples/ITEM_GENERATION_MASA.yml create mode 100644 ivas_processing_scripts/generation/generate_masa_items.py diff --git a/examples/ITEM_GENERATION_FOA.yml b/examples/ITEM_GENERATION_FOA.yml index 46cbe845..879735d4 100644 --- a/examples/ITEM_GENERATION_FOA.yml +++ b/examples/ITEM_GENERATION_FOA.yml @@ -114,57 +114,57 @@ scenes: background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" background_level: -46 - # "02": - # output: "out/s02.wav" - # description: "Car with AB microphone pickup, overlap between the talkers, car noise." - # input: ["items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/f2s3b_Talker1.wav"] - # IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"] - # shift: [0.0, +1.0] - # background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" - # background_level: -46 + "02": + output: "out/s02.wav" + description: "Car with AB microphone pickup, overlap between the talkers, car noise." 
+ input: ["items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/f2s3b_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"] + shift: [0.0, +1.0] + background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" + background_level: -46 - # "03": - # output: "out/s03.wav" - # description: "Car with AB microphone pickup, no overlap between the talkers, car noise." - # input: ["items_mono/untrimmed/f3s3a_Talker2.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"] - # IR: ["IRs/IR_do_p04_e_05_01_FOA.wav", "IRs/IR_do_p04_e_06_01_FOA.wav"] - # shift: [0.0, -1.0] + "03": + output: "out/s03.wav" + description: "Car with AB microphone pickup, no overlap between the talkers, car noise." + input: ["items_mono/untrimmed/f3s3a_Talker2.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"] + IR: ["IRs/IR_do_p04_e_05_01_FOA.wav", "IRs/IR_do_p04_e_06_01_FOA.wav"] + shift: [0.0, -1.0] - # "04": - # output: "out/s04.wav" - # description: "Car with AB microphone pickup, no overlap between the talkers, car noise." - # input: ["items_mono/untrimmed/f2s7b_Talker1.wav", "items_mono/untrimmed/f5s15a_Talker1.wav"] - # IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_08_01_FOA.wav"] - # shift: [0.0, -1.0] - # background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" - # background_level: -46 + "04": + output: "out/s04.wav" + description: "Car with AB microphone pickup, no overlap between the talkers, car noise." + input: ["items_mono/untrimmed/f2s7b_Talker1.wav", "items_mono/untrimmed/f5s15a_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_08_01_FOA.wav"] + shift: [0.0, -1.0] + background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" + background_level: -46 - # "05": - # output: "out/s05.wav" - # description: "Car with AB microphone pickup, no overlap between the talkers, car noise." 
- # input: ["items_mono/untrimmed/m2s15a_Talker2.wav", "items_mono/untrimmed/m1s4a_Talker1.wav"] - # IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"] - # shift: [0.0, -1.0] - # background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" - # background_level: -46 + "05": + output: "out/s05.wav" + description: "Car with AB microphone pickup, no overlap between the talkers, car noise." + input: ["items_mono/untrimmed/m2s15a_Talker2.wav", "items_mono/untrimmed/m1s4a_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"] + shift: [0.0, -1.0] + background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" + background_level: -46 - # "06": - # output: "out/s06.wav" - # description: "Car with AB microphone pickup, no overlap between the talkers." - # input: ["items_mono/untrimmed/m3s8a_Talker2.wav", "items_mono/untrimmed/m4s13a_Talker1.wav"] - # IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"] - # shift: [0.0, -1.0] + "06": + output: "out/s06.wav" + description: "Car with AB microphone pickup, no overlap between the talkers." + input: ["items_mono/untrimmed/m3s8a_Talker2.wav", "items_mono/untrimmed/m4s13a_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"] + shift: [0.0, -1.0] - # "07": - # output: "out/s07.wav" - # description: "Preliminary: Car with AB microphone pickup, no overlap between the talkers." - # input: ["items_mono/untrimmed/f1s20a_Talker2.wav", "items_mono/untrimmed/f5s15b_Talker1.wav"] - # IR: ["IRs/IR_do_p04_e_02_01_FOA.wav", "IRs/IR_do_p04_e_07_01_FOA.wav"] - # shift: [0.0, -1.0] + "07": + output: "out/s07.wav" + description: "Preliminary: Car with AB microphone pickup, no overlap between the talkers." 
+ input: ["items_mono/untrimmed/f1s20a_Talker2.wav", "items_mono/untrimmed/f5s15b_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_02_01_FOA.wav", "IRs/IR_do_p04_e_07_01_FOA.wav"] + shift: [0.0, -1.0] - # "08": - # output: "out/s08.wav" - # description: "Car with AB microphone pickup, overlap between the talkers." - # input: ["items_mono/untrimmed/m2s6b_Talker2.wav", "items_mono/untrimmed/f5s14a_Talker1.wav"] - # IR: ["IRs/IR_do_p04_e_08_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"] - # shift: [0.0, +1.0] + "08": + output: "out/s08.wav" + description: "Car with AB microphone pickup, overlap between the talkers." + input: ["items_mono/untrimmed/m2s6b_Talker2.wav", "items_mono/untrimmed/f5s14a_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_08_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"] + shift: [0.0, +1.0] diff --git a/examples/ITEM_GENERATION_MASA.yml b/examples/ITEM_GENERATION_MASA.yml new file mode 100644 index 00000000..958a69cb --- /dev/null +++ b/examples/ITEM_GENERATION_MASA.yml @@ -0,0 +1,177 @@ +--- +################################################ +# Item generation - General configuration +################################################ + +### Any relative paths will be interpreted relative to the working directory the script is called from! +### Usage of absolute paths is recommended. +### Do not use file names with dots "." in them! 
This is not supported, use "_" instead +### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions + +### Output format +format: "MASA" +masa_tc: 1 # applicable only to MASA/OMASA format +masa_dirs: 1 # applicable only to MASA/OMASA format +# sba_order: 2 # applicable only to OSBA format + +### Output sampling rate in Hz +fs: 48000 + +### Generate BINAURAL output (_BINAURAL will be appended to the output filename) +binaural_output: true + +### Normalize target loudness to X LKFS +loudness: -26 + +### Apply pre-amble and post-amble in X seconds +preamble: 0.0 +postamble: 0.0 + +### Apply fade-in and fade-out of X seconds +fade_in_out: 0.5 + +### Trim the output such that the total duration is X seconds +duration: 8 + +### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence) +add_low_level_random_noise: false + +### Process with parallel streams +multiprocessing: false + +################################################ +### Item generation - Filename conventions +################################################ + +### Naming convention for the input mono files +### The input filenames are represented by: +### lLLeeettszz.wav +### where: +### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) +### LL stands for the language: JP, FR, GE, MA, DA, EN +### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09 +### tt stands for the talker ID: f1, f2, f3, m1, m2, m3 +### s stands for 'sample' and zz is the sample number; 01, ..., 14 + +### Naming convention for the generated output files +### The output filenames are represented by: +### leeeayszz.wav +### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by: +### leeeayszz.met for metadata-assisted spatial audio +### leeeayszz.wav.o.csv for 
object-based audio +### where: +### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) +### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09 +### a stands for 'audio' +### y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06 +### s stands for sample and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample) +### o stands for the object number; 0, 1, 2, 3 + +### File designators, default is "l" for listening lab, "EN" for language, "p07" for experiment and "g" for company +listening_lab: "l" +language: "EN" +exp: "p01" +provider: "va" + +### Insert prefix for all input filenames (default: "") +### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment' +### the number of consecutive letters defines the length of each field +# use_input_prefix: "lLLeee" + +### Insert prefix for all output filenames (default: "") +### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment' +### the number of consecutive letters defines the length of each field +# use_output_prefix: "leee" + +################################################ +### Item generation - Scene description +################################################ + +### Each scene shall be described using the following parameters/properties: +### output: output filename +### description: textual description of the scene +### input: input filename(s) +### IR: filename(s) of the input IRs +### azimuth: azimuth in the range [-180,180]; positive values point to the left +### elevation: elevation in the range [-90,90]; positive values indicate up +### shift: time adjustment of the input signal (negative value delays the signal) +### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) +### background_level: normalized background noise
loudness to X dB LKFS +### +### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder) +### Note 1: use brackets [val1, val2, ...] when specifying multiple values +### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames +### Note 3: we're using right-handed coordinate system with azimuth = 0 pointing from the nose to the screen + +scenes: + "01": + output: "out/s01.wav" + description: "Car with AB microphone pickup, no overlap between the talkers, car noise." + input: ["items_mono/untrimmed/f1s4b_Talker2.wav", "items_mono/untrimmed/f2s1a_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_01_01_FOA.wav", "IRs/IR_do_p04_e_02_01_FOA.wav"] + shift: [0.0, -1.0] + background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" + background_level: -46 + + "02": + output: "out/s02.wav" + description: "Car with AB microphone pickup, overlap between the talkers, car noise." + input: ["items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/f2s3b_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"] + shift: [0.0, +1.0] + background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" + background_level: -46 + + "03": + output: "out/s03.wav" + description: "Car with AB microphone pickup, no overlap between the talkers, car noise." + input: ["items_mono/untrimmed/f3s3a_Talker2.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"] + IR: ["IRs/IR_do_p04_e_05_01_FOA.wav", "IRs/IR_do_p04_e_06_01_FOA.wav"] + shift: [0.0, -1.0] + background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" + background_level: -46 + + "04": + output: "out/s04.wav" + description: "Car with AB microphone pickup, no overlap between the talkers, car noise." 
+ input: ["items_mono/untrimmed/f2s7b_Talker1.wav", "items_mono/untrimmed/f5s15a_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_08_01_FOA.wav"] + shift: [0.0, -1.0] + background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" + background_level: -46 + + "05": + output: "out/s05.wav" + description: "Car with AB microphone pickup, no overlap between the talkers, car noise." + input: ["items_mono/untrimmed/m2s15a_Talker2.wav", "items_mono/untrimmed/m1s4a_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"] + shift: [0.0, -1.0] + background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" + background_level: -46 + + "06": + output: "out/s06.wav" + description: "Car with AB microphone pickup, no overlap between the talkers." + input: ["items_mono/untrimmed/m3s8a_Talker2.wav", "items_mono/untrimmed/m4s13a_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"] + shift: [0.0, -1.0] + background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" + background_level: -46 + + "07": + output: "out/s07.wav" + description: "Preliminary: Car with AB microphone pickup, no overlap between the talkers." + input: ["items_mono/untrimmed/f1s20a_Talker2.wav", "items_mono/untrimmed/f5s15b_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_02_01_FOA.wav", "IRs/IR_do_p04_e_07_01_FOA.wav"] + shift: [0.0, -1.0] + background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" + background_level: -46 + + "08": + output: "out/s08.wav" + description: "Car with AB microphone pickup, overlap between the talkers." 
+ input: ["items_mono/untrimmed/m2s6b_Talker2.wav", "items_mono/untrimmed/f5s14a_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_08_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"] + shift: [0.0, +1.0] + background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" + background_level: -46 diff --git a/examples/ITEM_GENERATION_OMASA.yml b/examples/ITEM_GENERATION_OMASA.yml index 1f631f3f..942ad1c7 100644 --- a/examples/ITEM_GENERATION_OMASA.yml +++ b/examples/ITEM_GENERATION_OMASA.yml @@ -10,8 +10,8 @@ ### Output format format: "OMASA" -masa_tc: 2 # applicable only to OMASA format -masa_dirs: 2 # applicable only to OMASA format +masa_tc: 2 # applicable only to MASA/OMASA format +masa_dirs: 2 # applicable only to MASA/OMASA format # sba_order: 2 # applicable only to OSBA format ### Output sampling rate in Hz diff --git a/ivas_processing_scripts/generation/__init__.py b/ivas_processing_scripts/generation/__init__.py index 8a9dfb98..2035340e 100755 --- a/ivas_processing_scripts/generation/__init__.py +++ b/ivas_processing_scripts/generation/__init__.py @@ -40,6 +40,7 @@ from ivas_processing_scripts.constants import ( from ivas_processing_scripts.generation import ( config, generate_ismN_items, + generate_masa_items, generate_omasa_items, generate_osba_items, generate_sba_items, @@ -90,11 +91,14 @@ def main(args): elif any(fmt in cfg.format for fmt in ["FOA", "HOA2", "HOA3"]): # generate FOA/HOA2/HOA3 items according to scene description generate_sba_items.generate_sba_items(cfg, logger) + elif "MASA" in cfg.format and "OMASA" not in cfg.format: + # generate MASA items from MONO items according to scene description ("OMASA" also contains the substring "MASA" and must not match here) + generate_masa_items.generate_masa_items(cfg, logger) elif "OMASA" in cfg.format: - # generate OMASA items from FOA/HO2/HOA3 and MONO items according to scene description + # generate OMASA items from FOA/HOA2/HOA3 and MONO items according to scene description generate_omasa_items.generate_omasa_items(cfg, logger) elif "OSBA" in cfg.format: - # generate OSBA items from FOA/HO2/HOA3 and MONO items
according to scene description + # generate OSBA items from FOA/HOA2/HOA3 and MONO items according to scene description generate_osba_items.generate_osba_items(cfg, logger) logger.handlers.clear() diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index bd92367e..adec1961 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -234,7 +234,7 @@ def generate_ismN_scene( else: source_shift = 0.0 - # convert overlap to samples and ensure it is a multiple of 20ms + # convert shift from seconds to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: diff --git a/ivas_processing_scripts/generation/generate_masa_items.py b/ivas_processing_scripts/generation/generate_masa_items.py new file mode 100644 index 00000000..5ead6147 --- /dev/null +++ b/ivas_processing_scripts/generation/generate_masa_items.py @@ -0,0 +1,484 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. 
+# + +import logging +import sys +from itertools import groupby, repeat +from pathlib import Path + +import numpy as np + +from ivas_processing_scripts.audiotools import audio, audioarray, audiofile +from ivas_processing_scripts.audiotools.convert.masa import convert_masa +from ivas_processing_scripts.audiotools.convert.scenebased import render_sba_to_masa +from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm +from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_foa, reverb_hoa2, reverb_hoa3 +from ivas_processing_scripts.generation import config +from ivas_processing_scripts.utils import apply_func_parallel + +SEED_RANDOM_NOISE = 0 + + +# function for searching sequences of the same character and replacing them with another string +def replace_char_seq_with_string(str, char_seq, repl_str): + result = [] + + # find groups of consecutive letters + groups = ["".join(list(g)) for k, g in groupby(str)] + + # limit the length of the replacement string by the length of the character sequence + repl_str = repl_str[: len(char_seq)] + + # replace each occurrence of the sequence of characters + for g in groups: + if char_seq in g: + result.append(repl_str) + else: + result.append(g) + + return "".join(result) + + +# function for appending string to a filename before file extension +def append_str_filename(filename, str_to_append): + p = Path(filename) + # Combine the stem, the string to append, and the suffix + return p.parent / (p.stem + str_to_append + p.suffix) + + +def generate_masa_items( + cfg: config.TestConfig, + logger: logging.Logger, +): + """Generate MASA items with metadata from MONO items and FOA/HOA2/HOA3 IRs based on scene description""" + + # set the fs + if "fs" not in cfg.__dict__: + cfg.fs = 48000 + + # set the listening lab designator + if "listening_lab" not in cfg.__dict__: + cfg.listening_lab = "l" + + # set the language designator + if "language" not in cfg.__dict__: + cfg.language = "EN" + + # set the experiment
designator + if "exp" not in cfg.__dict__: + cfg.exp = "p07" + + # set the provider + if "provider" not in cfg.__dict__: + cfg.provider = "g" + + # set the prefix for all input filenames + if "use_input_prefix" not in cfg.__dict__: + cfg.use_input_prefix = "" + else: + # replace file designators + cfg.use_input_prefix = replace_char_seq_with_string( + cfg.use_input_prefix, "l", cfg.listening_lab + ) + cfg.use_input_prefix = replace_char_seq_with_string( + cfg.use_input_prefix, "LL", cfg.language + ) + cfg.use_input_prefix = replace_char_seq_with_string( + cfg.use_input_prefix, "eee", cfg.exp + ) + + # set the prefix for all IR filenames + if "use_IR_prefix" not in cfg.__dict__: + cfg.use_IR_prefix = "" + else: + # replace file designators + cfg.use_IR_prefix = replace_char_seq_with_string( + cfg.use_IR_prefix, "p", cfg.provider + ) + cfg.use_IR_prefix = replace_char_seq_with_string( + cfg.use_IR_prefix, "LL", cfg.language + ) + cfg.use_IR_prefix = replace_char_seq_with_string( + cfg.use_IR_prefix, "eee", cfg.exp + ) + + # set the prefix for all output filenames + if "use_output_prefix" not in cfg.__dict__: + cfg.use_output_prefix = "" + else: + # replace file designators + cfg.use_output_prefix = replace_char_seq_with_string( + cfg.use_output_prefix, "l", cfg.listening_lab + ) + cfg.use_output_prefix = replace_char_seq_with_string( + cfg.use_output_prefix, "eee", cfg.exp + ) + + # set multiprocessing + if "multiprocessing" not in cfg.__dict__: + cfg.multiprocessing = False + + apply_func_parallel( + generate_MASA_scene, + zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)), + type="mp" if cfg.multiprocessing else None, + show_progress=None, + ) + + return + + +def generate_MASA_scene( + scene_name: str, scene: dict, cfg: config.TestConfig, logger: logging.Logger +): + """ + Processes a single scene to generate MASA item with metadata. + + Args: + scene_name (str): The name of the scene being processed. 
+ scene (dict): A dictionary containing scene description, including source files, azimuth, elevation, and other parameters. + cfg (config.TestConfig): Configuration object containing settings for processing, such as input/output paths, sampling rate, and loudness levels. + logger (logging.Logger): Logger instance for logging information and errors. + + Expected Behavior: + - Reads audio source files and processes them based on the scene description. + - Generates metadata files and appends them to the MASA object. + - Writes the processed audio and metadata to output files. + - Handles various audio formats (e.g., MONO, FOA, HOA2) and applies transformations like loudness normalization, trimming, and padding. + """ + + scenes = list(cfg.scenes.keys()) + logger.info( + f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" + ) + + # extract the number of audio sources + N_inputs = len(np.atleast_1d(scene["input"])) + + # get output filename + masa_format = f"MASA{cfg.masa_tc}DIR{cfg.masa_dirs}" + output_filename = Path(scene["output"]).parent / ( + cfg.use_output_prefix + Path(scene["output"]).name + ) + + # initialize output dirs + dir_path = output_filename.parent + if dir_path and not dir_path.exists(): + dir_path.mkdir(parents=True, exist_ok=True) + + # initialize output MASA object + y = audio.MetadataAssistedSpatialAudio(masa_format) + y.fs = cfg.fs + + # set the frame length + frame_len = int(cfg.fs / 50) + + # repeat for all source files + offset = 0 + for i in range(N_inputs): + # parse parameters from the scene description + source_file = ( + scene["input"][i] if isinstance(scene["input"], list) else scene["input"] + ) + IR_file = scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] + + # get input filename and IR filename + input_filename = Path(source_file).parent / ( + cfg.use_input_prefix + Path(source_file).name + ) + IR_filename = Path(IR_file).parent / 
(cfg.use_IR_prefix + Path(IR_file).name) + + # read the shift time in seconds + if "shift" in scene.keys(): + source_shift = ( + scene["shift"][i] + if isinstance(scene["shift"], list) + else scene["shift"] + ) + else: + source_shift = 0.0 + + # convert shift from seconds to samples and ensure it is a multiple of 20ms + source_shift_in_seconds = source_shift + source_shift = source_shift * cfg.fs + if source_shift >= 0: + source_shift = int(np.floor(source_shift / frame_len) * frame_len) + else: + source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + + # read the level + if "level" in scene.keys(): + level = ( + scene["level"][i] + if isinstance(scene["level"], list) + else scene["level"] + ) + else: + level = -26 + + logger.info( + f"-- Convolving {source_file} with {IR_file} at {level} LKFS with shift of {source_shift_in_seconds} seconds" + ) + + # get the number of channels from the IR .wav file header + wav_header = audiofile.parse_wave_header(IR_filename) + IR_channels = wav_header["channels"] + + if IR_channels == 4: + IR_fmt = "FOA" + elif IR_channels == 9: + IR_fmt = "HOA2" + elif IR_channels == 16: + IR_fmt = "HOA3" + else: + logger.error( + f"Error: Input format of the IR source file with {IR_channels} channels is not supported!" + ) + sys.exit(-1) + + # initialize intermediate SBA object + y_int = audio.SceneBasedAudio(IR_fmt) + y_int.fs = cfg.fs + + # read source file + x = audio.fromfile("MONO", input_filename) + + # resample to the target fs if necessary + if x.fs != cfg.fs: + logger.warning( + f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!" 
+ ) + resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs) + x.audio = resampled_audio + x.fs = cfg.fs + + # read the IR file + IR = audio.fromfile(IR_fmt, IR_filename) + + # convolve MONO source audio with FOA/HOA2/HOA3 IR -> results in FOA/HOA2/HOA3 audio object + if IR_fmt == "FOA": + x = reverb_foa(x, IR) + elif IR_fmt == "HOA2": + x = reverb_hoa2(x, IR) + elif IR_fmt == "HOA3": + x = reverb_hoa3(x, IR) + + # adjust the level of the FOA/HOA2/HOA3 signal + x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") + + # ensure the length of the audio source signal is a multiple of 20ms + if len(x.audio) % frame_len != 0: + # pad with zeros to ensure that the signal length is a multiple of 20ms + if len(x.audio) % frame_len != 0: + N_pad = int(frame_len - len(x.audio) % frame_len) + x.audio = audioarray.trim( + x.audio, x.fs, limits=[0, -N_pad], samples=True + ) + + # add the convolved FOA/HOA2/HOA3 audio source signal to the intermediate SBA output signal + if y_int.audio is None: + # this is the first SBA source signal + y_int.audio = x.audio.copy() + + if source_shift < 0: + # insert zeros to the first SBA source signal to shift it right + y_int.audio = audioarray.trim( + y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True + ) + else: + offset = source_shift + else: + # shift the beginning of the audio source signal + delta_offset = source_shift - offset + if delta_offset > 0: + # insert zeros to the output SBA signal to shift it right (the result of trim() must be assigned, otherwise the shift is a no-op) + y_int.audio = audioarray.trim( + y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True + ) + offset = source_shift + else: + # insert zeros to the new SBA source signal to shift it right (the result of trim() must be assigned, otherwise the shift is a no-op) + x.audio = audioarray.trim( + x.audio, x.fs, limits=[delta_offset, 0], samples=True + ) + + # adjust the length of the audio source signal + delta_length = len(x.audio) - len(y_int.audio) + if delta_length > 0: + # pad zeros to the output SBA signal + y_int.audio = audioarray.trim( + y_int.audio, y_int.fs, limits=[0, -delta_length],
samples=True + ) + else: + # pad zeros to the new MASA source signal + x.audio = audioarray.trim( + x.audio, x.fs, limits=[0, delta_length], samples=True + ) + + # superimpose + y_int.audio += x.audio + + # append pre-amble and post-amble + if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: + preamble = int( + np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms + postamble = int( + np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms + if preamble != 0 or postamble != 0: + logger.info( + f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" + ) + y_int.audio = audioarray.trim( + y_int.audio, y_int.fs, limits=[-preamble, -postamble], samples=True + ) + + # adjust the length of the output signal + if "duration" in cfg.__dict__: + # trim the output signal such that the total duration is X seconds + duration = int(cfg.duration * cfg.fs) # convert to samples + else: + # do not change the length of the audio signal + duration = len(y_int.audio) + duration = int( + np.floor(duration / frame_len) * frame_len + ) # ensure multiple of 20ms + if len(y_int.audio) != duration: + y_int.audio = audioarray.trim( + y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True + ) + + # adjust the loudness of the output signal + if "loudness" in cfg.__dict__: + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LKFS") + y_int.audio, _ = loudness_norm(y_int, cfg.loudness, loudness_format="BINAURAL") + + # add background noise in FOA/HOA2/HOA3 format + if "background" in scene.keys(): + # check if [] are used in the background noise file name + if isinstance(scene["background"], list): + # if so, use the first element + background_filename = scene["background"][0] + else: + background_filename = scene["background"] + + # read the background noise file + background_filename = 
Path(background_filename).parent / ( + cfg.use_input_prefix + Path(background_filename).name + ) + logger.info(f"-- Adding background noise from {background_filename}") + background = audio.fromfile(IR_fmt, background_filename) + + # resample to the target fs if necessary + if background.fs != cfg.fs: + logger.warning( + f"Warning: Sample rate of the background noise is {background.fs} Hz and needs to be resampled to {cfg.fs}!" + ) + resampled_audio = audioarray.resample( + background.audio, background.fs, cfg.fs + ) + background.audio = resampled_audio + background.fs = cfg.fs + + # adjust the length of the background noise signal + if len(background.audio) != len(y_int.audio): + background.audio = audioarray.trim( + background.audio, + background.fs, + limits=[0, len(background.audio) - len(y_int.audio)], + samples=True, + ) + + # adjust the loudness of the background noise signal + if "background_level" in scene.keys(): + logger.info( + f"-- Rescaling background noise to target loudness: {scene['background_level']} LKFS" + ) + + # check if [] are used in the background level + if isinstance(scene["background_level"], list): + # if so, use the first element + scene["background_level"] = scene["background_level"][0] + + # convert to float if the background level was entered in string format + if not isinstance(scene["background_level"], (int, float)): + scene["background_level"] = float(scene["background_level"]) + else: + logger.warning( + "-- Warning: No target loudness for background noise specified, using default value of -26 LKFS" + ) + scene["background_level"] = -26 + background.audio, _ = loudness_norm( + background, scene["background_level"], loudness_format="STEREO", rms=True + ) + + # add the background noise to the output signal + y_int.audio += background.audio + elif ( + "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise + ): + # create uniformly distributed noise between -4 and 4 + np.random.seed(SEED_RANDOM_NOISE) +
noise = np.random.randint(low=-4, high=5, size=y_int.audio.shape).astype("float") + y_int.audio += noise + + # apply fade-in and fade-out + if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: + logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") + y_int.audio = audioarray.window(y_int.audio, y_int.fs, cfg.fade_in_out * 1000) + + # generate MASA metadata filename (should end with .met) + y.metadata_file = output_filename.with_suffix(".met") + + # convert the intermediate SBA output signal to MASA format + render_sba_to_masa(y_int, y) + + # write the MASA audio signal to the output file + audiofile.write(output_filename, y.audio, y.fs) + + # convert the MASA audio signal to BINAURAL, if option was chosen + if cfg.binaural_output: + binaural_output_filename = output_filename.with_name( + output_filename.stem + "_BINAURAL" + output_filename.suffix + ) + logger.info( + f"-- Converting to BINAURAL output file: {binaural_output_filename}" + ) + binaudio = audio.fromtype("BINAURAL") + binaudio.fs = y.fs + convert_masa(y, binaudio) + audiofile.write( + binaural_output_filename, + binaudio.audio, + binaudio.fs, + ) diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 603a3593..972ee69a 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -78,7 +78,7 @@ def generate_omasa_items( cfg: config.TestConfig, logger: logging.Logger, ): - """Generate OMASA items with metadata from FOA/HO2 and ISMn items based on scene description""" + """Generate OMASA items with metadata from FOA/HOA2/HOA3 and ISMn items based on scene description""" # set the fs if "fs" not in cfg.__dict__: @@ -229,7 +229,7 @@ def generate_OMASA_scene( else: source_shift = 0.0 - # convert overlap to samples and ensure it is a multiple of 20ms + # convert shift from seconds to samples and ensure it is a multiple 
of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index d2a71777..156c3d2d 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -210,7 +210,7 @@ def generate_OSBA_scene( else scene["elevation"] ) - # read the overlap length + # read the shift time in seconds if "shift" in scene.keys(): source_shift = ( scene["shift"][i] @@ -220,7 +220,7 @@ def generate_OSBA_scene( else: source_shift = 0.0 - # convert overlap to samples and ensure it is a multiple of 20ms + # convert shift from seconds to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: diff --git a/ivas_processing_scripts/generation/generate_sba_items.py b/ivas_processing_scripts/generation/generate_sba_items.py index bdb40b1b..96b542d2 100644 --- a/ivas_processing_scripts/generation/generate_sba_items.py +++ b/ivas_processing_scripts/generation/generate_sba_items.py @@ -213,7 +213,7 @@ def generate_sba_scene( ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) - # read the overlap length + # read the shift time in seconds if "shift" in scene.keys(): source_shift = ( scene["shift"][i] @@ -223,7 +223,7 @@ def generate_sba_scene( else: source_shift = 0.0 - # convert overlap to samples and ensure it is a multiple of 20ms + # convert shift from seconds to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py index 92a68906..bea865de 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ 
b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -219,7 +219,7 @@ def generate_stereo_scene( ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) - # read the overlap length + # read the shift time in seconds if "shift" in scene.keys(): source_shift = ( scene["shift"][i] @@ -229,7 +229,7 @@ def generate_stereo_scene( else: source_shift = 0.0 - # convert overlap to samples and ensure it is a multiple of 20ms + # convert shift from seconds to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: diff --git a/ivas_processing_scripts/generation/process_ambi_items.py b/ivas_processing_scripts/generation/process_ambi_items.py index 913fdcc4..f2b8982e 100644 --- a/ivas_processing_scripts/generation/process_ambi_items.py +++ b/ivas_processing_scripts/generation/process_ambi_items.py @@ -191,7 +191,7 @@ def generate_ambi_scene( # extract the number of audio sources N_sources = len(np.atleast_1d(scene["source"])) - # read the overlap length + # read the source overlap length if "overlap" in scene.keys(): source_overlap = float(scene["overlap"]) else: diff --git a/ivas_processing_scripts/generation/process_ism1_items.py b/ivas_processing_scripts/generation/process_ism1_items.py index ac1f273e..2177f09b 100644 --- a/ivas_processing_scripts/generation/process_ism1_items.py +++ b/ivas_processing_scripts/generation/process_ism1_items.py @@ -168,7 +168,7 @@ def generate_ism1_scene( y = audio.ChannelBasedAudio("MONO") y_meta = None - # read the overlap length + # read the source overlap length if "overlap" in scene.keys(): source_overlap = float(scene["overlap"]) else: diff --git a/ivas_processing_scripts/generation/process_ism2_items.py b/ivas_processing_scripts/generation/process_ism2_items.py index e944ca2c..83bd59e4 100644 --- a/ivas_processing_scripts/generation/process_ism2_items.py +++ b/ivas_processing_scripts/generation/process_ism2_items.py @@
-168,7 +168,7 @@ def generate_ism2_scene( y = audio.ChannelBasedAudio("STEREO") y_meta = None - # read the overlap length + # read the source overlap length if "overlap" in scene.keys(): source_overlap = float(scene["overlap"]) else: diff --git a/ivas_processing_scripts/generation/process_stereo_items.py b/ivas_processing_scripts/generation/process_stereo_items.py index b4c17197..7d05de54 100644 --- a/ivas_processing_scripts/generation/process_stereo_items.py +++ b/ivas_processing_scripts/generation/process_stereo_items.py @@ -187,7 +187,7 @@ def generate_stereo_scene( # extract the number of audio sources N_sources = len(np.atleast_1d(scene["source"])) - # read the overlap length + # read the source overlap length if "overlap" in scene.keys(): source_overlap = float(scene["overlap"]) else: -- GitLab From 19619ed031254ccd40e422872b7e39509ca3bc3d Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Sun, 27 Jul 2025 14:57:30 +0200 Subject: [PATCH 2/2] fix formatting --- .../generation/generate_masa_items.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_masa_items.py b/ivas_processing_scripts/generation/generate_masa_items.py index 5ead6147..65b29761 100644 --- a/ivas_processing_scripts/generation/generate_masa_items.py +++ b/ivas_processing_scripts/generation/generate_masa_items.py @@ -41,7 +41,11 @@ from ivas_processing_scripts.audiotools import audio, audioarray, audiofile from ivas_processing_scripts.audiotools.convert.masa import convert_masa from ivas_processing_scripts.audiotools.convert.scenebased import render_sba_to_masa from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm -from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_foa, reverb_hoa2, reverb_hoa3 +from ivas_processing_scripts.audiotools.wrappers.reverb import ( + reverb_foa, + reverb_hoa2, + reverb_hoa3, +) from ivas_processing_scripts.generation import config from
ivas_processing_scripts.utils import apply_func_parallel @@ -327,9 +331,7 @@ def generate_MASA_scene( offset = source_shift else: # insert zeros to the new SBA source signal to shift it right - audioarray.trim( - x.audio, x.fs, limits=[delta_offset, 0], samples=True - ) + audioarray.trim(x.audio, x.fs, limits=[delta_offset, 0], samples=True) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y_int.audio) @@ -449,7 +451,9 @@ def generate_MASA_scene( ): # create uniformly distributed noise between -4 and 4 np.random.seed(SEED_RANDOM_NOISE) - noise = np.random.randint(low=-4, high=5, size=y_int.audio.shape).astype("float") + noise = np.random.randint(low=-4, high=5, size=y_int.audio.shape).astype( + "float" + ) y_int.audio += noise # apply fade-in and fade-out -- GitLab