From 5cd9308f7883ed78c0057bf4816a36b16844952e Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Fri, 13 Jun 2025 16:14:29 +0200 Subject: [PATCH 01/42] add OMASA item generation script --- .../generation/__init__.py | 5 + .../generation/generate_omasa_items.py | 413 ++++++++++++++++++ 2 files changed, 418 insertions(+) create mode 100644 ivas_processing_scripts/generation/generate_omasa_items.py diff --git a/ivas_processing_scripts/generation/__init__.py b/ivas_processing_scripts/generation/__init__.py index bc20e51d..4b1d3279 100755 --- a/ivas_processing_scripts/generation/__init__.py +++ b/ivas_processing_scripts/generation/__init__.py @@ -45,6 +45,7 @@ from ivas_processing_scripts.generation import ( process_ism1_items, process_ism2_items, process_stereo_items, + generate_omasa_items, ) from ivas_processing_scripts.utils import create_dir @@ -103,6 +104,10 @@ def main(args): elif "FOA" in cfg.format or "HOA2" in cfg.format: # generate FOA/HOA2 items according to scene description process_ambi_items.generate_ambi_items(cfg, logger) + elif "OMASA" in cfg.format: + # generate OMASA items according to scene description + generate_omasa_items.generate_omasa_items(cfg, logger) + # copy configuration to output directory with open(cfg.output_path.joinpath(f"{'_'.join(cfg.format)}.yml"), "w") as f: diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py new file mode 100644 index 00000000..948e474b --- /dev/null +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -0,0 +1,413 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. 
+#
+# Any dispute, controversy or claim arising under or in relation to providing this software shall be
+# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+# the United Nations Convention on Contracts on the International Sales of Goods.
+#
+import pdb
+import csv
+import logging
+import os
+import sys
+from itertools import groupby, repeat
+from math import floor
+from pathlib import Path
+from sox import file_info
+
+import numpy as np
+
+from ivas_processing_scripts.audiotools import audio, audiofile, audioarray
+from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness, loudness_norm
+from ivas_processing_scripts.audiotools.convert.scenebased import render_sba_to_masa
+from ivas_processing_scripts.audiotools.convert.omasa import convert_omasa
+from ivas_processing_scripts.generation import config
+from ivas_processing_scripts.utils import apply_func_parallel
+
+import pdb
+
+SEED_RANDOM_NOISE = 0
+
+
+# function for converting a 2D numpy array to strings with 2 decimal digits
+def csv_formatdata(data):
+    for row in data:
+        yield ["%0.2f" % v for v in row]
+
+
+# function for finding sequences of the same character and replacing them with another string
+def replace_char_seq_with_string(s, char_seq, repl_str):
+    result = []
+
+    # find groups of consecutive letters
+    groups = ["".join(list(g)) for k, g in groupby(s)]
+
+    # limit the length of the replacement string by the length of the character sequence
+    repl_str = repl_str[: len(char_seq)]
+
+    # replace each occurrence of the sequence of characters
+    for g in groups:
+        if char_seq in g:
+            result.append(repl_str)
+        else:
+            result.append(g)
+
+    return "".join(result)
+
+# function for appending a string to a filename before the file extension
+def append_str_filename(filename, str_to_append):
+    p = Path(filename)
+    return "{0}{2}{1}".format(p.stem, p.suffix, str_to_append)
+
+def generate_omasa_items(
+    cfg: config.TestConfig,
+    logger: logging.Logger,
+):
+    """Generate OMASA items with metadata from FOA/HOA2 and ISMn items based on scene description"""
+
+    # set the target level
+    if "loudness" not in cfg.__dict__:
+        cfg.loudness = -26
+
+    # set the fs
+    if "fs" not in cfg.__dict__:
+        cfg.fs = 48000
+
+    # set the pre-amble and post-amble
+    if "preamble" not in cfg.__dict__:
+        cfg.preamble = 0.0
+
+    if "postamble" not in cfg.__dict__:
+        cfg.postamble = 0.0
+
+    # set the low-level random noise flag
+    if "add_low_level_random_noise" not in cfg.__dict__:
+        cfg.add_low_level_random_noise = False
+
+    # set the listening lab designator
+    if "listening_lab" not in cfg.__dict__:
+        cfg.listening_lab = "l"
+
+    # set the language designator
+    if "language" not in cfg.__dict__:
+        cfg.language = "EN"
+
+    # set the experiment designator
+    if "exp" not in cfg.__dict__:
+        cfg.exp = "p07"
+
+    # set the provider
+    if "provider" not in cfg.__dict__:
+        cfg.provider = "g"
+
+    # set the prefix for all input filenames
+    if "use_input_prefix" not in cfg.__dict__:
+        cfg.use_input_prefix = ""
+    else:
+        # replace file designators
+        cfg.use_input_prefix = replace_char_seq_with_string(
+            cfg.use_input_prefix, "l", cfg.listening_lab
+        )
+        cfg.use_input_prefix = replace_char_seq_with_string(
+            cfg.use_input_prefix, "LL", cfg.language
+        )
+        cfg.use_input_prefix = replace_char_seq_with_string(
+            cfg.use_input_prefix, "eee", cfg.exp
+        )
+
+    # set the prefix for all output filenames
+    if "use_output_prefix" not in cfg.__dict__:
+        cfg.use_output_prefix = ""
+    else:
+        # replace file designators
+        cfg.use_output_prefix = replace_char_seq_with_string(
+            cfg.use_output_prefix, "l", cfg.listening_lab
+        )
+        cfg.use_output_prefix = replace_char_seq_with_string(
+            cfg.use_output_prefix, "eee", cfg.exp
+        )
+
+    # set multiprocessing
+    if "multiprocessing" not in cfg.__dict__:
+        cfg.multiprocessing = True
+
+    apply_func_parallel(
+        generate_scene,
+        zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)),
+        None,
+        # "mp" if cfg.multiprocessing else None,
+        None,
+    )
+
+    return
+
+
+def generate_scene(
+    scene_name: str, scene: dict, cfg: config.TestConfig, logger: logging.Logger
+):
+    logger.info( f"Processing scene {scene_name}:")
+
+    # extract the number of audio sources
+    N_sources = len(np.atleast_1d(scene["source"]))
+    N_ISMs = N_sources-1
+
+    # initialize output array
+    omasa_format = f"ISM{N_ISMs}MASA{cfg.masa_tc}DIR{cfg.masa_dirs}"
+    output_filename = os.path.join( cfg.output_path, os.path.dirname(scene["name"]), cfg.use_output_prefix + append_str_filename(os.path.basename(scene["name"]), f"_s{scene_name}_{omasa_format}") )
+    y = audio.OMASAAudio(omasa_format)
+
+    # repeat for all source files
+    for i in range(N_sources):
+
+        # parse parameters from the scene description
+        source_file = (
+            scene["source"][i] if isinstance(scene["source"], list) else scene["source"]
+        )
+        source_azi = (
+            scene["azimuth"][i]
+            if isinstance(scene["azimuth"], list)
+            else scene["azimuth"]
+        )
+        source_ele = (
+            scene["elevation"][i]
+            if isinstance(scene["elevation"], list)
+            else scene["elevation"]
+        )
+
+        # read the overlap length
+        if "overlap" in scene.keys():
+            source_overlap = (
+                scene["overlap"][i]
+                if isinstance(scene["overlap"], list)
+                else scene["overlap"]
+            )
+        else:
+            source_overlap = 0.0
+
+        # read the level
+        if "level" in scene.keys():
+            level = (
+                scene["level"][i]
+                if isinstance(scene["level"], list)
+                else scene["level"]
+            )
+        else:
+            level = -26
+
+        logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}")
+
+        # get the number of channels from the .wav file header
+        N_channels = file_info.channels(os.path.join(cfg.input_path, os.path.dirname(source_file), cfg.use_input_prefix + os.path.basename(source_file)))
+
+        if N_channels == 1:
+            fmt = "MONO"
+        elif N_channels == 2:
+            fmt = "STEREO"
+        elif N_channels == 4:
+            fmt = "FOA"
+        elif N_channels == 9:
+            fmt = "HOA2"
+        elif N_channels == 16:
+            fmt = "HOA3"
+        else:
+            logger.info(f"Error: Input format of the source file with {N_channels} channels is not supported!")
+            sys.exit(-1)
+
+        if fmt in ["FOA", "HOA2"]:
+            # generate MASA metadata .met filename (should end with .met)
+            y.metadata_files.append(os.path.splitext(output_filename)[0]+".met")
+        elif fmt == "MONO":
+            # generate ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...)
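+            # e.g. an output file "out/name_s01_ISM3MASA2DIR2.wav" gets metadata files
+            # "out/name_s01_ISM3MASA2DIR2.wav.0.csv", ".wav.1.csv", ... (illustrative names)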
+ y.metadata_files.insert(i-1, f"{output_filename}.{i-1}.csv") + + # read source file + x = audio.fromfile( + fmt, + os.path.join( + cfg.input_path, + os.path.dirname(source_file), + cfg.use_input_prefix + os.path.basename(source_file), + ), + fs=cfg.fs, + ) + + # get the number of frames (multiple of 20ms) + N_frames = int(len(x.audio) / x.fs * 50) + frame_len = int(x.fs / 50) + + # trim the samples from the end to ensure that the signal length is a multiple of 20ms + audioarray.cut(x.audio, [0, N_frames * frame_len]) + + # adjust the level of the source file + if fmt in ["FOA", "HOA2"]: + x.audio, _ = loudness_norm(x, level, loudness_format="STEREO", rms=True) + else: + x.audio, _ = loudness_norm(x, level, loudness_format="MONO") + + # shift the source signal (positive shift creates overlap, negative shift creates a gap) + if int(floor(-source_overlap)) != 0: + x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_overlap, 0]) + + # pad with zeros to ensure that the signal length is a multiple of 20ms + if len(x.audio) % frame_len != 0: + # pad the source signal + N_pad = int(frame_len - len(x.audio) % frame_len) + x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) + + # convert FOA to MASA + if fmt in ["FOA", "HOA2"]: + x_masa = audio.MetadataAssistedSpatialAudio(f"MASA2DIR1") + x_masa.metadata_file = y.metadata_files[i] + render_sba_to_masa(x, x_masa) + y.audio = x_masa.audio + y.fs = x.fs + else: + # pad ISM signal with zeros to have the same length as the MASA signal + N_pad = y.audio.shape[0] - x.audio.shape[0] + if N_pad != 0: + x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) + + # append ISM signal to the OMASA object (ISM comes first !!!) + y.audio = np.insert(y.audio, [i-1], x.audio, axis=1) + + # append pre-amble and post-amble to all sources + y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble]) + + # add random noise + if cfg.add_low_level_random_noise: + # create uniformly distributed noise between -4 and 4 + np.random.seed(SEED_RANDOM_NOISE) + noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") + + # superimpose + y.audio += noise + + # generate ISM metadata files + y_meta = None + for i in range(1, N_ISMs + 1): + # parse metadata parameters from the scene description + source_azi = ( + scene["azimuth"][i] + if isinstance(scene["azimuth"], list) + else scene["azimuth"] + ) + source_ele = ( + scene["elevation"][i] + if isinstance(scene["elevation"], list) + else scene["elevation"] + ) + + # pdb.set_trace() + N_frames = int(np.rint((len(y.audio) / y.fs * 50))) + + # read azimuth information and convert to an array + if isinstance(source_azi, str): + if ":" in source_azi: + # start with the initial azimuth value and apply step N_frames times + source_azi = source_azi.split(":") + azi = np.arange( + float(eval(source_azi[0])), + float(eval(source_azi[0])) + N_frames * float(eval(source_azi[1])), + float(eval(source_azi[1])) + ) + else: + # replicate static azimuth value N_frames times + azi = np.repeat(float(eval(source_azi)), N_frames) + else: + # replicate static azimuth value N_frames times + azi = np.repeat(float(source_azi), N_frames) + + # convert azimuth from 0 .. 360 to -180 .. +180 + azi = (azi + 180) % 360 - 180 + + # check if azimuth is from -180 .. 
+180
+        if any(azi > 180) or any(azi < -180):
+            logger.error(
+                f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}"
+            )
+
+        # read elevation information and convert to an array
+        if isinstance(source_ele, str):
+            if ":" in source_ele:
+                # convert into array (initial_value:step:stop_value)
+                # note: the stop value is +-90 degrees depending on the sign of the step
+                source_ele = source_ele.split(":")
+                ele = np.arange(
+                    float(eval(source_ele[0])),
+                    np.sign(float(eval(source_ele[1]))) * 90,
+                    float(eval(source_ele[1]))
+                )[:N_frames]
+
+                # repeat the last elevation value, if array is shorter than N_frames
+                if len(ele) < N_frames:
+                    ele = np.append(ele, np.full(N_frames - len(ele), ele[-1]))
+            else:
+                # replicate static elevation value N_frames times
+                ele = np.repeat(float(eval(source_ele)), N_frames)
+        else:
+            # replicate static elevation value N_frames times
+            ele = np.repeat(float(source_ele), N_frames)
+
+        # check if elevation is from -90 .. +90
+        if any(ele > 90) or any(ele < -90):
+            logger.error(
+                f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}"
+            )
+
+        # arrange all metadata fields column-wise into a matrix
+        x_meta = np.column_stack((azi, ele))
+
+        # write to .csv output metadata file
+        with open(
+            y.metadata_files[i-1],
+            "w",
+            newline="",
+            encoding="utf-8",
+        ) as f:
+            # create csv writer
+            writer = csv.writer(f)
+
+            # write all rows to the .csv file
+            writer.writerows(csv_formatdata(x_meta))
+
+    y.init_metadata()  # this is needed to populate 'y.object_pos[]'
+
+    # write the OMASA output to .wav file in an interleaved format
+    audiofile.write( output_filename, y.audio, y.fs )
+
+    # convert the OMASA output to BINAURAL, if that option was chosen
+    if cfg.binaural_path != "":
+        binaudio = audio.fromtype("BINAURAL")
+        binaudio.fs = y.fs
+        convert_omasa(y, binaudio)
+        audiofile.write(
+            os.path.join(
+                cfg.binaural_path, append_str_filename(os.path.basename(scene["name"]), f"_s{scene_name}_{omasa_format}_BINAURAL") ),
+            binaudio.audio,
+            binaudio.fs,
+        )
+
+    return
--
GitLab

From 0774f3584f2dc911ca0587c5f08bea0a4ca7ebff Mon Sep 17 00:00:00 2001
From: Vladimir Malenovsky
Date: Mon, 16 Jun 2025 13:53:27 +0200
Subject: [PATCH 02/42] fixes in OMASA item generation script

---
 .../audiotools/wrappers/masaAnalyzer.py       |  4 ++
 .../audiotools/wrappers/masaRenderer.py       |  7 ++-
 .../generation/generate_omasa_items.py        | 45 +++++++++++++------
 3 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/ivas_processing_scripts/audiotools/wrappers/masaAnalyzer.py b/ivas_processing_scripts/audiotools/wrappers/masaAnalyzer.py
index 3eea5dbe..c826cdf4 100644
--- a/ivas_processing_scripts/audiotools/wrappers/masaAnalyzer.py
+++ b/ivas_processing_scripts/audiotools/wrappers/masaAnalyzer.py
@@ -72,6 +72,10 @@ def masaAnalyzer(
     else:
         binary = find_binary("masaAnalyzer")
 
+    # enforce metadata_out_filename to be a Path object
+    if metadata_out_filename is not None and not isinstance(metadata_out_filename, Path):
+        metadata_out_filename = Path(metadata_out_filename)
+
     if num_tcs not in [1, 2]:
         raise ValueError(f"Only 1 or 2 TCs supported, but {num_tcs} was given.")
 
diff --git a/ivas_processing_scripts/audiotools/wrappers/masaRenderer.py b/ivas_processing_scripts/audiotools/wrappers/masaRenderer.py
index 4c6c0bcf..b7406e2a 100755
--- a/ivas_processing_scripts/audiotools/wrappers/masaRenderer.py
+++ b/ivas_processing_scripts/audiotools/wrappers/masaRenderer.py
@@ -83,11 +83,16 @@ def masaRenderer(
         output_mode = "-BINAURAL"
         num_channels = 2
 
+    # enforce masa_metadata_file to
be a Path object + masa_metadata_file = masa.metadata_file + if masa_metadata_file is not None and not isinstance(masa_metadata_file, Path): + masa_metadata_file = Path(masa_metadata_file) + cmd = [ str(binary), output_mode, "", # 2 -> inputPcm - str(masa.metadata_file.resolve()), + str(masa_metadata_file.resolve()), "", # 4 -> outputPcm ] diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 948e474b..7285a61c 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -29,7 +29,7 @@ # accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and # the United Nations Convention on Contracts on the International Sales of Goods. # -import pdb + import csv import logging import os @@ -47,8 +47,6 @@ from ivas_processing_scripts.audiotools.convert.omasa import convert_omasa from ivas_processing_scripts.generation import config from ivas_processing_scripts.utils import apply_func_parallel -import pdb - SEED_RANDOM_NOISE = 0 @@ -152,14 +150,13 @@ def generate_omasa_items( # set multiprocessing if "multiprocessing" not in cfg.__dict__: - cfg.multiprocessing = True + cfg.multiprocessing = False apply_func_parallel( generate_scene, zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)), - None, - # "mp" if cfg.multiprocessing else None, - None, + type = "mp" if cfg.multiprocessing else None, + show_progress = None, ) return @@ -168,15 +165,36 @@ def generate_omasa_items( def generate_scene( scene_name: str, scene: dict, cfg: config.TestConfig, logger: logging.Logger ): + """ + Processes a single scene to generate OMASA items with metadata. + + Args: + scene_name (str): The name of the scene being processed. + scene (dict): A dictionary containing scene description, including source files, azimuth, elevation, and other parameters. + cfg (config.TestConfig): Configuration object containing settings for processing, such as input/output paths, sampling rate, and loudness levels. + logger (logging.Logger): Logger instance for logging information and errors. + + Expected Behavior: + - Reads audio source files and processes them based on the scene description. + - Generates metadata files and appends them to the OMASA object. + - Writes the processed audio and metadata to output files. + - Handles various audio formats (e.g., MONO, FOA, HOA2) and applies transformations like loudness normalization, trimming, and padding. 
+ """ logger.info( f"Processing scene {scene_name}:") # extract the number of audio sources N_sources = len(np.atleast_1d(scene["source"])) N_ISMs = N_sources-1 - # initialize output array + # initialize output dirs omasa_format = f"ISM{N_ISMs}MASA{cfg.masa_tc}DIR{cfg.masa_dirs}" output_filename = os.path.join( cfg.output_path, os.path.dirname(scene["name"]), cfg.use_output_prefix + append_str_filename(os.path.basename(scene["name"]), f"_s{scene_name}_{omasa_format}") ) + + dir_path = os.path.dirname(output_filename) + if dir_path and not os.path.exists(dir_path): + os.makedirs(dir_path, exist_ok=True) + + # initialize output OMASA object y = audio.OMASAAudio(omasa_format) # repeat for all source files @@ -236,7 +254,7 @@ def generate_scene( logger.info(f"Error: Input format of the source file with {N_channels} channels is not supported!") sys.exit(-1) - if fmt in ["FOA", "HOA2"]: + if fmt in ["FOA", "HOA2", "HOA3"]: # generate MASA metadata .met filename (should end with .met) y.metadata_files.append(os.path.splitext(output_filename)[0]+".met") elif fmt == "MONO": @@ -262,7 +280,7 @@ def generate_scene( audioarray.cut(x.audio, [0, N_frames * frame_len]) # adjust the level of the source file - if fmt in ["FOA", "HOA2"]: + if fmt in ["FOA", "HOA2", "HOA3"]: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO", rms=True) else: x.audio, _ = loudness_norm(x, level, loudness_format="MONO") @@ -277,9 +295,9 @@ def generate_scene( N_pad = int(frame_len - len(x.audio) % frame_len) x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) - # convert FOA to MASA - if fmt in ["FOA", "HOA2"]: - x_masa = audio.MetadataAssistedSpatialAudio(f"MASA2DIR1") + # convert FOA/HOA2/HOA3 to MASA + if fmt in ["FOA", "HOA2", "HOA3"]: + x_masa = audio.MetadataAssistedSpatialAudio(f"MASA{cfg.masa_tc}DIR{cfg.masa_dirs}") x_masa.metadata_file = y.metadata_files[i] render_sba_to_masa(x, x_masa) y.audio = x_masa.audio @@ -320,7 +338,6 @@ def generate_scene( else scene["elevation"] ) - # pdb.set_trace() N_frames = int(np.rint((len(y.audio) / y.fs * 50))) # read azimuth information and convert to an array -- GitLab From ade4f928eb605e71832027c9fdc03bac03087483 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 19 Jun 2025 10:33:26 +0200 Subject: [PATCH 03/42] examples of scene description files (.yml) for item generation scripts --- examples/ITEM_GENERATION_3ISM.yml | 171 ++++++++++++++++++++++++++++ examples/ITEM_GENERATION_FOA.yml | 154 +++++++++++++++++++++++++ examples/ITEM_GENERATION_OMASA.yml | 170 +++++++++++++++++++++++++++ examples/ITEM_GENERATION_OSBA.yml | 170 +++++++++++++++++++++++++++ examples/ITEM_GENERATION_STEREO.yml | 154 +++++++++++++++++++++++++ 5 files changed, 819 insertions(+) create mode 100644 examples/ITEM_GENERATION_3ISM.yml create mode 100644 examples/ITEM_GENERATION_FOA.yml create mode 100644 examples/ITEM_GENERATION_OMASA.yml create mode 100644 examples/ITEM_GENERATION_OSBA.yml create mode 100644 examples/ITEM_GENERATION_STEREO.yml diff --git a/examples/ITEM_GENERATION_3ISM.yml b/examples/ITEM_GENERATION_3ISM.yml new file mode 100644 index 00000000..7b0cb27d --- /dev/null +++ b/examples/ITEM_GENERATION_3ISM.yml @@ -0,0 +1,171 @@ +--- +################################################ +# Item generation - General configuration +################################################ + +### Any relative paths will be interpreted relative to the working directory the script is called from! +### Usage of absolute paths is recommended. 
+### Do not use file names with dots "." in them! This is not supported, use "_" instead
+### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions
+
+### Output format
+format: "ISM3"
+# masa_tc: 2
+# masa_dirs: 2
+# sba_order: 2
+
+### Output sampling rate in Hz
+fs: 48000
+
+### Generate BINAURAL output (_BINAURAL will be appended to the output filename)
+binaural_output: true
+
+### Normalize target loudness to X LKFS
+# loudness: -26
+
+### Apply pre-amble and post-amble in X seconds
+preamble: 0.0
+postamble: 0.0
+
+### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
+add_low_level_random_noise: true
+
+### Process with parallel streams
+multiprocessing: False
+
+################################################
+### Item generation - Filename conventions
+################################################
+
+### Naming convention for the input mono files
+### The input filenames are represented by:
+### lLLeeettszz.wav
+### where:
+### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com)
+### LL stands for the language: JP, FR, GE, MA, DA, EN
+### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
+### tt stands for the talker ID: f1, f2, f3, m1, m2, m3
+### s stands for 'sample' and zz is the sample number; 01, ..., 14
+
+### Naming convention for the generated output files
+### The output filenames are represented by:
+### leeeayszz.wav
+### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by:
+### leeeayszz.met for metadata-assisted spatial audio
+### leeeayszz.wav.o.csv for object-based audio
+### where:
+### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com)
+### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
+### a stands for 'audio'
+### y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06
+### s stands for 'sample' and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample)
+### o stands for the object number; 0, 1, 2, 3
+
+### File designators, default is "l" for listening lab, "EN" for language, "p07" for experiment and "g" for company
+listening_lab: "l"
+language: "EN"
+exp: "p01"
+provider: "va"
+
+### Insert prefix for all input filenames (default: "")
+### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment'
+### the number of consecutive letters define the length of each field
+# use_input_prefix: "lLLeee"
+
+### Insert prefix for all output filenames (default: "")
+### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment'
+### the number of consecutive letters define the length of each field
+# use_output_prefix: "leee"
+
+################################################
+### Item generation - Scene description
+################################################
+
+### Each scene shall be described using the following parameters/properties:
+### output: output filename
+### description: textual description of the scene
+### input: input filename(s)
+### azimuth: azimuth in the range [-180,180]; positive values point to the left
+### elevation: elevation in the range [-90,90]; positive values indicate up
+### level: target loudness of the source in LKFS (default: -26)
+### shift: time adjustment of the
input signal (negative value delays the signal) +### +### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder) +### Note 1: use brackets [val1, val2, ...] when specifying multiple values +### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames +### Note 3: we're using right-handed coordinate system with azimuth = 0 pointing from the nose to the screen + + +scenes: + + "01": + output: "out/VA_3obj_2tlks_music1.wav" + description: "Two talkers sitting at a table, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." + input: ["items_mono/untrimmed/f2s1a_Talker1.wav", "items_mono/untrimmed/m2s10a_Talker2.wav", "items_mono/music/Sc01.wav"] + azimuth: [20, -40, 45] + elevation: [0, 0, 70] + level: [-26, -26, -41] + shift: [0.0, 0.0, 0.0] + + "02": + output: "out/VA_3obj_2tlks_music2.wav" + description: "One talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." + input: ["items_mono/untrimmed/f5s10b_Talker1.wav", "items_mono/untrimmed/m3s2b_Talker2.wav", "items_mono/music/Guitar1.wav"] + azimuth: [50, "180:1:120 + 360", -120] + elevation: [0, 45, 70] + level: [-26, -26, -41] + shift: [0.0, 0.0, 0.0] + + "03": + output: "out/VA_3obj_2tlks_music3.wav" + description: "Two talkers walking side-by-side around the table, ~30% overlapping utterances." + input: ["items_mono/untrimmed/m1s2b_Talker1.wav", "items_mono/untrimmed/f3s5a_Talker2.wav", "items_mono/music/Track066.wav"] + azimuth: ["80:1:20 + 360", "80:1:20 + 360", -30] + elevation: [10, 60, 70] + level: [-26, -26, -41] + shift: [0.0, 0.0, 0.0] + + "04": + output: "out/VA_3obj_2tlks_music4.wav" + description: "Two talkers walking around the table in opposite directions, ~30% overlapping utterances." + input: ["items_mono/untrimmed/m4s12b_Talker1.wav", "items_mono/untrimmed/f1s12b_Talker2.wav", "items_mono/music/Sample02.wav"] + azimuth: ["60:1:0 + 360", "60:-1:120 - 360", 100] + elevation: [20, 50, 70] + level: [-26, -26, -41] + shift: [0.0, 0.0, 0.0] + + "05": + output: "out/VA_3obj_3tlks_1.wav" + description: "Three static talkers, partially overlapping utterances." + input: ["items_mono/untrimmed/m4s12b_Talker1.wav", "items_mono/untrimmed/f1s12b_Talker2.wav", "items_mono/untrimmed/m3s1a_Talker2.wav"] + azimuth: [30, -45, 100] + elevation: [20, 20, 30] + level: [-26, -26, -26] + shift: [0.0, 0.0, -2.5] + + "06": + output: "out/VA_3obj_3tlks_2.wav" + description: "One walking talker, two static talkers, non-overlapping utterances." + input: ["items_mono/untrimmed/f2s5a_Talker1.wav", "items_mono/untrimmed/m2s16b_Talker2.wav", "items_mono/untrimmed/m3s8b_Talker2.wav"] + azimuth: ["-20:0.5:360", 60, -45] + elevation: [10, 10, 10] + level: [-26, -26, -26] + shift: [0.0, 0.0, -3.0] + + "07": + output: "out/VA_3obj_3tlks_3.wav" + description: "Two moving talkers, one static talker, partially overlapping utterances." + input: ["items_mono/untrimmed/f1s16b_Talker2.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"] + azimuth: [-90, "0:1:360", "0:-1:-360"] + elevation: [0, 30, 30] + level: [-26, -26, -26] + shift: [0.0, 0.0, -3.0] + + "08": + output: "out/VA_3obj_3tlks_4.wav" + description: "Three walking talkers, partially overlapping utterances." 
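+    ### worked example of the moving-source notation (see Note 2 above): azimuth "-90:-1:-360"
+    ### starts at -90 degrees and steps by -1 degree per 20-ms frame, i.e. 50 degrees per second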
+ input: ["items_mono/untrimmed/f5s15b_Talker1.wav", "items_mono/untrimmed/m3s1a_Talker2.wav", "items_mono/untrimmed/m2s17b_Talker2.wav"] + azimuth: ["-90:-1:-360", "-10:1.5:360", "70:1:360"] + elevation: [0, 20, 0] + level: [-26, -26, -26] + shift: [0.0, 0.0, -3.5] + diff --git a/examples/ITEM_GENERATION_FOA.yml b/examples/ITEM_GENERATION_FOA.yml new file mode 100644 index 00000000..f94aadf2 --- /dev/null +++ b/examples/ITEM_GENERATION_FOA.yml @@ -0,0 +1,154 @@ +--- +################################################ +# Item generation - General configuration +################################################ + +### Any relative paths will be interpreted relative to the working directory the script is called from! +### Usage of absolute paths is recommended. +### Do not use file names with dots "." in them! This is not supported, use "_" instead +### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions + +### Output format +format: "FOA" +# masa_tc: 2 +# masa_dirs: 2 +# sba_order: 2 + +### Output sampling rate in Hz +fs: 48000 + +### Generate BINAURAL output (_BINAURAL will be appended to the output filename) +binaural_output: true + +### Normalize target loudness to X LKFS +loudness: -26 + +### Apply pre-amble and post-amble in X seconds +preamble: 0.5 +postamble: 1.0 + +### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence) +add_low_level_random_noise: False + +### Process with parallel streams +multiprocessing: False + +################################################ +### Item generation - Filename conventions +################################################ + +### Naming convention for the input mono files +### The input filenames are represented by: +### lLLeeettszz.wav +### where: +### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) +### LL stands for the language: JP, FR, GE, MA, DA, EN +### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09 +### tt stands for the talker ID: f1, f2, f3, m1, m2, m3 +### s stands for 'sample' and zz is the sample number; 01, ..., 14 + +### Naming convention for the generated output files +### The output filenames are represented by: +### leeeayszz.wav +### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by: +### leeeayszz.met for metadata-assisted spatial audio +### leeeayszz.wav.o.csv for object-based audio +### where: +### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) +### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09 +### a stands 'audio' +### y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06 +### s stands for sample and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample) +### o stands for the object number; 0, 1, 2, 3 + +### File designators, default is "l" for listening lab, "EN" for language, "p07" for experiment and "g" for company +listening_lab: "b" +language: "GE" +exp: "p02" +provider: "g" + +### Insert prefix for all input filenames (default: "") +### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment' +### the number of consecutive letters define the length of each field +# use_input_prefix: "lLLeee" + +### Insert prefix for all output 
filenames (default: "") +### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment' +### the number of consecutive letters define the length of each field +use_output_prefix: "leee" + +################################################ +### Item generation - Scene description +################################################ + +### Each scene shall de described using the following parameters/properties: +### output: output filename +### description: textual description of the scene +### input: input filename(s) +### IR: filenames(s) of the input IRs +### azimuth: azimuth in the range [-180,180]; positive values point to the left +### elevation: elevation in the range [-90,90]; positive values indicate up +### shift: time adjustment of the input signal (negative value delays the signal) +### +### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder) +### Note 1: use brackets [val1, val2, ...] when specifying multiple values +### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames +### Note 3: we're using right-handed coordinate system with azimuth = 0 pointing from the nose to the screen + + +scenes: + "01": + output: "out/s01.wav" + description: "Car with AB microphone pickup, no overlap between the talkers, car noise." + input: ["items_mono/untrimmed/f1s4b_Talker2.wav", "items_mono/untrimmed/f2s1a_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_01_01_FOA.wav", "IRs/IR_do_p04_e_02_01_FOA.wav"] + shift: [0.0, -1.0] + + "02": + output: "out/s02.wav" + description: "Car with AB microphone pickup, overlap between the talkers, car noise." + input: ["items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/f2s3b_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"] + shift: [0.0, +1.0] + + "03": + output: "out/s03.wav" + description: "Car with AB microphone pickup, no overlap between the talkers, car noise." + input: ["items_mono/untrimmed/f3s3a_Talker2.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"] + IR: ["IRs/IR_do_p04_e_05_01_FOA.wav", "IRs/IR_do_p04_e_06_01_FOA.wav"] + shift: [0.0, -1.0] + + "04": + output: "out/s04.wav" + description: "Car with AB microphone pickup, no overlap between the talkers, car noise." + input: ["items_mono/untrimmed/f2s7b_Talker1.wav", "items_mono/untrimmed/f5s15a_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_08_01_FOA.wav"] + shift: [0.0, -1.0] + + "05": + output: "out/s05.wav" + description: "Car with AB microphone pickup, no overlap between the talkers, car noise." + input: ["items_mono/untrimmed/m2s15a_Talker2.wav", "items_mono/untrimmed/m1s4a_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"] + shift: [0.0, -1.0] + + "06": + output: "out/s06.wav" + description: "Car with AB microphone pickup, no overlap between the talkers." + input: ["items_mono/untrimmed/m3s8a_Talker2.wav", "items_mono/untrimmed/m4s13a_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"] + shift: [0.0, -1.0] + + "07": + output: "out/s07.wav" + description: "Preliminary: Car with AB microphone pickup, no overlap between the talkers." 
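+    ### note: FOA scenes specify no azimuth/elevation; the spatial placement is presumably baked
+    ### into the FOA impulse response each mono input is convolved with (cf. the reverb wrappers)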
+ input: ["items_mono/untrimmed/f1s20a_Talker2.wav", "items_mono/untrimmed/f5s15b_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_02_01_FOA.wav", "IRs/IR_do_p04_e_07_01_FOA.wav"] + shift: [0.0, -1.0] + + "08": + output: "out/s08.wav" + description: "Car with AB microphone pickup, overlap between the talkers." + input: ["items_mono/untrimmed/m2s6b_Talker2.wav", "items_mono/untrimmed/f5s14a_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_08_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"] + shift: [0.0, +1.0] diff --git a/examples/ITEM_GENERATION_OMASA.yml b/examples/ITEM_GENERATION_OMASA.yml new file mode 100644 index 00000000..ecf3f33c --- /dev/null +++ b/examples/ITEM_GENERATION_OMASA.yml @@ -0,0 +1,170 @@ +--- +################################################ +# Item generation - General configuration +################################################ + +### Any relative paths will be interpreted relative to the working directory the script is called from! +### Usage of absolute paths is recommended. +### Do not use file names with dots "." in them! This is not supported, use "_" instead +### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions + +### Output format +format: "OMASA" +masa_tc: 2 +masa_dirs: 2 +# sba_order: 2 + +### Output sampling rate in Hz +fs: 48000 + +### Generate BINAURAL output (_BINAURAL will be appended to the output filename) +binaural_output: true + +### Normalize target loudness to X LKFS +# loudness: -26 + +### Apply pre-amble and post-amble in X seconds +preamble: 0.0 +postamble: 0.0 + +### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence) +add_low_level_random_noise: true + +### Process with parallel streams +multiprocessing: False + +################################################ +### Item generation - Filename conventions +################################################ + +### Naming convention for the input mono files +### The input filenames are represented by: +### lLLeeettszz.wav +### where: +### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) +### LL stands for the language: JP, FR, GE, MA, DA, EN +### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09 +### tt stands for the talker ID: f1, f2, f3, m1, m2, m3 +### s stands for 'sample' and zz is the sample number; 01, ..., 14 + +### Naming convention for the generated output files +### The output filenames are represented by: +### leeeayszz.wav +### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by: +### leeeayszz.met for metadata-assisted spatial audio +### leeeayszz.wav.o.csv for object-based audio +### where: +### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) +### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09 +### a stands 'audio' +### y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06 +### s stands for sample and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample) +### o stands for the object number; 0, 1, 2, 3 + +### File designators, default is "l" for listening lab, "EN" for language, "p07" for experiment and "g" for company +listening_lab: "l" +language: "EN" +exp: "p01" +provider: "va" + +### Insert prefix for all input filenames (default: "") +### l stands for 
the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment' +### the number of consecutive letters define the length of each field +# use_input_prefix: "lLLeee" + +### Insert prefix for all output filenames (default: "") +### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment' +### the number of consecutive letters define the length of each field +# use_output_prefix: "leee" + +################################################ +### Item generation - Scene description +################################################ + +### Each scene shall de described using the following parameters/properties: +### output: output filename +### description: textual description of the scene +### input: input filename(s) +### azimuth: azimuth in the range [-180,180]; positive values point to the left +### elevation: elevation in the range [-90,90]; positive values indicate up +### shift: time adjustment of the input signal (negative value delays the signal) +### +### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder) +### Note 1: use brackets [val1, val2, ...] when specifying multiple values +### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames +### Note 3: we're using right-handed coordinate system with azimuth = 0 pointing from the nose to the screen + +scenes: + "01": + output: "out/VA_3tlks_music.wav" + description: "Three talkers over music background" + input: ["items_hoa2/bm7aa1s01.wav", "items_mono/untrimmed/m4s12b_Talker1.wav", "items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/m3s1a_Talker2.wav"] + azimuth: [0, 30, -45, 100] + elevation: [0, 20, 20, 30] + level: [-36, -26, -26, -26] + shift: [0.0, 0.0, 0.0, -2.0] + + "02": + output: "out/VA_3tlks_music.wav" + description: "Three talkers over music background" + input: ["items_hoa2/bm7aa1s03.wav", "items_mono/untrimmed/f2s5a_Talker1.wav", "items_mono/untrimmed/f5s10a_Talker1.wav", "items_mono/untrimmed/m3s8b_Talker2.wav"] + azimuth: [0, "-20:0.5:360", "60:-0.5:-360", 60] + elevation: [0, 10, 10, 10] + level: [-46, -26, -26, -26] + shift: [0.0, 0.0, -2.0, -2.5] + + "03": + output: "out/VA_3tlks_music.wav" + description: "Three talkers over music background" + input: ["items_hoa2/bm7aa1s05.wav", "items_mono/untrimmed/f1s16b_Talker2.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"] + azimuth: [0, -90, "0:1:360", "0:-1:-360"] + elevation: [0, 0, 30, 30] + level: [-36, -26, -26, -26] + shift: [0.0, 0.0, 0.0, -2.6] + + "04": + output: "out/VA_3tlks_music.wav" + description: "Three talkers over music background" + input: ["items_hoa2/bm7aa1s07.wav", "items_mono/untrimmed/f5s15b_Talker1.wav", "items_mono/untrimmed/m1s7a_Talker1.wav", "items_mono/untrimmed/m1s6b_Talker1.wav"] + azimuth: [0, "-90:-1:-360", "-10:1.5:360", "70:1:360"] + elevation: [0, 0, 20, 0] + level: [-46, -26, -36, -26] + shift: [0.0, -2.0, 0.0, -3.5] + + "05": + output: "out/VA_2tlks_1obj_music.wav" + description: "Two talkers, one musical object over music background" + input: ["items_hoa2/bm7aa1s09.wav", "items_mono/untrimmed/f2s1a_Talker1.wav", "items_mono/untrimmed/f2s5a_Talker1.wav", "music/item_lxa3s3.48k.wav"] + azimuth: [0, 20, -40, 45] + elevation: [0, 0, 0, 70] + level: [-36, -36, -26, -41] + shift: [0.0, 0.0, -2.0, 0.0] + + "06": + output: "out/VA_2tlks_1obj_music.wav" + description: 
"Two talkers, one musical object over music background" + input: ["items_hoa2/bm7aa1s11.wav", "items_mono/untrimmed/f5s10b_Talker1.wav", "items_mono/untrimmed/m1s4a_Talker1.wav", "music/item_lxa3s5.48k.wav"] + azimuth: [0, 50, "180:1:360", -120] + elevation: [0, 0, 45, 70] + level: [-46, -26, -26, -41] + shift: [0.0, 0.0, -2.5, 0.0] + + "07": + output: "out/VA_2tlks_1obj_music.wav" + description: "Two talkers, one musical object over music background" + input: ["items_hoa2/bm7aa1s13.wav", "items_mono/untrimmed/m1s2b_Talker1.wav", "items_mono/untrimmed/f3s5a_Talker2.wav", "music/641692__theflyfishingfilmmaker__classical-violin-minor-10s-mono.wav"] + azimuth: [0, "80:1:20 + 360", "80:1:20 + 360", -30] + elevation: [0, 10, 60, 70] + level: [-36, -26, -26, -36] + shift: [0.0, 0.0, 0.0, 0.0] + + "08": + output: "out/VA_2tlks_1obj_music.wav" + description: "Two talkers, one musical object over music background" + input: ["items_hoa2/bm7aa1s15.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f2s4a_Talker1.wav", "music/item_lxa4s2.48k.wav"] + azimuth: [0, "60:1:0 + 360", "60:-1:120 - 360", 100] + elevation: [0, 20, 50, 70] + level: [-46, -26, -26, -41] + shift: [0.0, 0.0, -1.0, -0.5] + + diff --git a/examples/ITEM_GENERATION_OSBA.yml b/examples/ITEM_GENERATION_OSBA.yml new file mode 100644 index 00000000..748a0ad9 --- /dev/null +++ b/examples/ITEM_GENERATION_OSBA.yml @@ -0,0 +1,170 @@ +--- +################################################ +# Item generation - General configuration +################################################ + +### Any relative paths will be interpreted relative to the working directory the script is called from! +### Usage of absolute paths is recommended. +### Do not use file names with dots "." in them! This is not supported, use "_" instead +### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions + +### Output format +format: "OSBA" +# masa_tc: 2 +# masa_dirs: 2 +sba_order: 2 + +### Output sampling rate in Hz +fs: 48000 + +### Generate BINAURAL output (_BINAURAL will be appended to the output filename) +binaural_output: true + +### Normalize target loudness to X LKFS +# loudness: -26 + +### Apply pre-amble and post-amble in X seconds +preamble: 0.0 +postamble: 0.0 + +### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence) +add_low_level_random_noise: true + +### Process with parallel streams +multiprocessing: False + +################################################ +### Item generation - Filename conventions +################################################ + +### Naming convention for the input mono files +### The input filenames are represented by: +### lLLeeettszz.wav +### where: +### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) +### LL stands for the language: JP, FR, GE, MA, DA, EN +### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09 +### tt stands for the talker ID: f1, f2, f3, m1, m2, m3 +### s stands for 'sample' and zz is the sample number; 01, ..., 14 + +### Naming convention for the generated output files +### The output filenames are represented by: +### leeeayszz.wav +### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by: +### leeeayszz.met for metadata-assisted spatial audio +### leeeayszz.wav.o.csv for object-based audio +### where: +### l 
stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com)
+### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
+### a stands for 'audio'
+### y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06
+### s stands for 'sample' and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample)
+### o stands for the object number; 0, 1, 2, 3
+
+### File designators, default is "l" for listening lab, "EN" for language, "p07" for experiment and "g" for company
+listening_lab: "l"
+language: "EN"
+exp: "p01"
+provider: "va"
+
+### Insert prefix for all input filenames (default: "")
+### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment'
+### the number of consecutive letters define the length of each field
+# use_input_prefix: "lLLeee"
+
+### Insert prefix for all output filenames (default: "")
+### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment'
+### the number of consecutive letters define the length of each field
+# use_output_prefix: "leee"
+
+################################################
+### Item generation - Scene description
+################################################
+
+### Each scene shall be described using the following parameters/properties:
+### output: output filename
+### description: textual description of the scene
+### input: input filename(s)
+### azimuth: azimuth in the range [-180,180]; positive values point to the left
+### elevation: elevation in the range [-90,90]; positive values indicate up
+### level: target loudness of the source in LKFS (default: -26)
+### shift: time adjustment of the input signal (negative value delays the signal)
+###
+### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder)
+### Note 1: use brackets [val1, val2, ...]
when specifying multiple values +### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames +### Note 3: we're using right-handed coordinate system with azimuth = 0 pointing from the nose to the screen + +scenes: + "01": + output: "out/VA_3tlks_music.wav" + description: "Three talkers over music background" + input: ["items_hoa2/bm7aa1s01.wav", "items_mono/untrimmed/m4s12b_Talker1.wav", "items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/m3s1a_Talker2.wav"] + azimuth: [0, 30, -45, 100] + elevation: [0, 20, 20, 30] + level: [-36, -26, -26, -26] + shift: [0.0, 0.0, 0.0, -2.0] + + "02": + output: "out/VA_3tlks_music.wav" + description: "Three talkers over music background" + input: ["items_hoa2/bm7aa1s03.wav", "items_mono/untrimmed/f2s5a_Talker1.wav", "items_mono/untrimmed/f5s10a_Talker1.wav", "items_mono/untrimmed/m3s8b_Talker2.wav"] + azimuth: [0, "-20:0.5:360", "60:-0.5:-360", 60] + elevation: [0, 10, 10, 10] + level: [-46, -26, -26, -26] + shift: [0.0, 0.0, -2.0, -2.5] + + "03": + output: "out/VA_3tlks_music.wav" + description: "Three talkers over music background" + input: ["items_hoa2/bm7aa1s05.wav", "items_mono/untrimmed/f1s16b_Talker2.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"] + azimuth: [0, -90, "0:1:360", "0:-1:-360"] + elevation: [0, 0, 30, 30] + level: [-36, -26, -26, -26] + shift: [0.0, 0.0, 0.0, -2.6] + + "04": + output: "out/VA_3tlks_music.wav" + description: "Three talkers over music background" + input: ["items_hoa2/bm7aa1s07.wav", "items_mono/untrimmed/f5s15b_Talker1.wav", "items_mono/untrimmed/m1s7a_Talker1.wav", "items_mono/untrimmed/m1s6b_Talker1.wav"] + azimuth: [0, "-90:-1:-360", "-10:1.5:360", "70:1:360"] + elevation: [0, 0, 20, 0] + level: [-46, -26, -36, -26] + shift: [0.0, -2.0, 0.0, -3.5] + + "05": + output: "out/VA_2tlks_1obj_music.wav" + description: "Two talkers, one musical object over music background" + input: ["items_hoa2/bm7aa1s09.wav", "items_mono/untrimmed/f2s1a_Talker1.wav", "items_mono/untrimmed/f2s5a_Talker1.wav", "music/item_lxa3s3.48k.wav"] + azimuth: [0, 20, -40, 45] + elevation: [0, 0, 0, 70] + level: [-36, -36, -26, -41] + shift: [0.0, 0.0, -2.0, 0.0] + + "06": + output: "out/VA_2tlks_1obj_music.wav" + description: "Two talkers, one musical object over music background" + input: ["items_hoa2/bm7aa1s11.wav", "items_mono/untrimmed/f5s10b_Talker1.wav", "items_mono/untrimmed/m1s4a_Talker1.wav", "music/item_lxa3s5.48k.wav"] + azimuth: [0, 50, "180:1:360", -120] + elevation: [0, 0, 45, 70] + level: [-46, -26, -26, -41] + shift: [0.0, 0.0, -2.5, 0.0] + + "07": + output: "out/VA_2tlks_1obj_music.wav" + description: "Two talkers, one musical object over music background" + input: ["items_hoa2/bm7aa1s13.wav", "items_mono/untrimmed/m1s2b_Talker1.wav", "items_mono/untrimmed/f3s5a_Talker2.wav", "music/641692__theflyfishingfilmmaker__classical-violin-minor-10s-mono.wav"] + azimuth: [0, "80:1:20 + 360", "80:1:20 + 360", -30] + elevation: [0, 10, 60, 70] + level: [-36, -26, -26, -36] + shift: [0.0, 0.0, 0.0, 0.0] + + "08": + output: "out/VA_2tlks_1obj_music.wav" + description: "Two talkers, one musical object over music background" + input: ["items_hoa2/bm7aa1s15.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f2s4a_Talker1.wav", "music/item_lxa4s2.48k.wav"] + azimuth: [0, "60:1:0 + 360", "60:-1:120 - 360", 100] + elevation: [0, 20, 50, 70] + level: [-46, -26, -26, -41] + shift: [0.0, 0.0, -1.0, -0.5] + + diff --git 
a/examples/ITEM_GENERATION_STEREO.yml b/examples/ITEM_GENERATION_STEREO.yml
new file mode 100644
index 00000000..c9c5a983
--- /dev/null
+++ b/examples/ITEM_GENERATION_STEREO.yml
@@ -0,0 +1,154 @@
+---
+################################################
+# Item generation - General configuration
+################################################
+
+### Any relative paths will be interpreted relative to the working directory the script is called from!
+### Usage of absolute paths is recommended.
+### Do not use file names with dots "." in them! This is not supported, use "_" instead
+### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions
+
+### Output format
+format: "STEREO"
+# masa_tc: 2
+# masa_dirs: 2
+# sba_order: 2
+
+### Output sampling rate in Hz
+fs: 48000
+
+### Generate BINAURAL output (_BINAURAL will be appended to the output filename)
+binaural_output: true
+
+### Normalize target loudness to X LKFS
+loudness: -26
+
+### Apply pre-amble and post-amble in X seconds
+preamble: 0.5
+postamble: 1.0
+
+### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
+add_low_level_random_noise: true
+
+### Process with parallel streams
+multiprocessing: False
+
+################################################
+### Item generation - Filename conventions
+################################################
+
+### Naming convention for the input mono files
+### The input filenames are represented by:
+### lLLeeettszz.wav
+### where:
+### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com)
+### LL stands for the language: JP, FR, GE, MA, DA, EN
+### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
+### tt stands for the talker ID: f1, f2, f3, m1, m2, m3
+### s stands for 'sample' and zz is the sample number; 01, ..., 14
+
+### Naming convention for the generated output files
+### The output filenames are represented by:
+### leeeayszz.wav
+### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by:
+### leeeayszz.met for metadata-assisted spatial audio
+### leeeayszz.wav.o.csv for object-based audio
+### where:
+### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com)
+### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
+### a stands for 'audio'
+### y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06
+### s stands for 'sample' and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample)
+### o stands for the object number; 0, 1, 2, 3
+
+### File designators, default is "l" for listening lab, "EN" for language, "p07" for experiment and "g" for company
+listening_lab: "b"
+language: "GE"
+exp: "p02"
+provider: "g"
+
+### Insert prefix for all input filenames (default: "")
+### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment'
+### the number of consecutive letters define the length of each field
+# use_input_prefix: "lLLeee"
+
+### Insert prefix for all output filenames (default: "")
+### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment'
+### the number of consecutive letters define the length of each field
+# use_output_prefix: "leee"
+
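+### Example (illustrative): with the designators above (listening_lab "b", language "GE", exp "p02"),
+### an input prefix "lLLeee" would resolve to "bGEp02" and an output prefix "leee" to "bp02"
+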
+################################################
+### Item generation - Scene description
+################################################
+
+### Each scene shall be described using the following parameters/properties:
+### output: output filename
+### description: textual description of the scene
+### input: input filename(s)
+### IR: filename(s) of the input IRs
+### azimuth: azimuth in the range [-180,180]; positive values point to the left
+### elevation: elevation in the range [-90,90]; positive values indicate up
+### shift: time adjustment of the input signal (negative value delays the signal)
+###
+### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder)
+### Note 1: use brackets [val1, val2, ...] when specifying multiple values
+### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames
+### Note 3: we're using right-handed coordinate system with azimuth = 0 pointing from the nose to the screen
+
+
+scenes:
+  "01":
+    output: "out/a1s01.wav"
+    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+    input: ["items_mono/untrimmed/f1s4b_Talker2.wav", "items_mono/untrimmed/f2s1a_Talker1.wav"]
+    IR: ["IRs/Car_TalkPos1_Stereo_M5_SinSweep_2chn.wav", "IRs/Car_TalkPos2_Stereo_M5_SinSweep_2chn.wav"]
+    shift: [0.0, -1.0]
+
+  "02":
+    output: "out/a1s02.wav"
+    description: "Car with AB microphone pickup, overlap between the talkers, car noise."
+    input: ["items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/f2s3b_Talker1.wav"]
+    IR: ["IRs/Car_TalkPos3_Stereo_M5_SinSweep_2chn.wav", "IRs/Car_TalkPos4_Stereo_M5_SinSweep_2chn.wav"]
+    shift: [0.0, +1.0]
+
+  "03":
+    output: "out/a1s03.wav"
+    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+    input: ["items_mono/untrimmed/f3s3a_Talker2.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"]
+    IR: ["IRs/Car_TalkPos1_Stereo_M5_SinSweep_2chn.wav", "IRs/Car_TalkPos1_Stereo_M5_SinSweep_2chn.wav"]
+    shift: [0.0, -1.0]
+
+  "04":
+    output: "out/a1s04.wav"
+    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+    input: ["items_mono/untrimmed/f2s7b_Talker1.wav", "items_mono/untrimmed/f5s15a_Talker1.wav"]
+    IR: ["IRs/FreeField_IR_Python_AB_20cm_Pos1.wav", "IRs/FreeField_IR_Python_AB_20cm_Pos2.wav"]
+    shift: [0.0, -1.0]
+
+  "05":
+    output: "out/a1s05.wav"
+    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+    input: ["items_mono/untrimmed/m2s15a_Talker2.wav", "items_mono/untrimmed/m1s4a_Talker1.wav"]
+    IR: ["IRs/FreeField_IR_Python_AB_20cm_Pos3.wav", "IRs/FreeField_IR_Python_AB_20cm_Pos4.wav"]
+    shift: [0.0, -1.0]
+
+  "06":
+    output: "out/a1s06.wav"
+    description: "Car with AB microphone pickup, no overlap between the talkers."
+    input: ["items_mono/untrimmed/m3s8a_Talker2.wav", "items_mono/untrimmed/m4s13a_Talker1.wav"]
+    IR: ["IRs/IR_do_p04_e_01_01_FOA.wav", "IRs/IR_do_p04_e_02_01_FOA.wav"]
+    shift: [0.0, -1.0]
+
+  "07":
+    output: "out/a1s07.wav"
+    description: "Preliminary: Car with AB microphone pickup, no overlap between the talkers."
+    input: ["items_mono/untrimmed/f1s20a_Talker2.wav", "items_mono/untrimmed/f5s15b_Talker1.wav"]
+    IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"]
+    shift: [0.0, -1.0]
+
+  "08":
+    output: "out/a2s01.wav"
+    description: "Car with AB microphone pickup, overlap between the talkers."
+ input: ["items_mono/untrimmed/m2s6b_Talker2.wav", "items_mono/untrimmed/f5s14a_Talker1.wav"] + IR: ["IRs/IR_g_p01_a_07_00_stAB100.wav", "IRs/IR_g_p01_a_06_00_stAB100.wav"] + shift: [0.0, +1.0] -- GitLab From f5121821c94f9553dc5b99ee1510a4e768a7083f Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 19 Jun 2025 10:34:38 +0200 Subject: [PATCH 04/42] complement reverb tool with HOA3 format, corrections and fixes --- .../audiotools/wrappers/reverb.py | 105 +++++++++++++++--- 1 file changed, 87 insertions(+), 18 deletions(-) diff --git a/ivas_processing_scripts/audiotools/wrappers/reverb.py b/ivas_processing_scripts/audiotools/wrappers/reverb.py index 0c570e42..dc53a75c 100644 --- a/ivas_processing_scripts/audiotools/wrappers/reverb.py +++ b/ivas_processing_scripts/audiotools/wrappers/reverb.py @@ -38,6 +38,7 @@ from typing import Optional import numpy as np from scipy.fft import fft +from ivas_processing_scripts.audiotools import audio from ivas_processing_scripts.audiotools.audio import Audio from ivas_processing_scripts.audiotools.audiofile import read, write from ivas_processing_scripts.audiotools.wrappers.filter import resample_itu @@ -128,7 +129,9 @@ def reverb( output = copy(tmp_input) output.audio, _ = read(tmp_output_file, nchannels=1, fs=tmp_input.fs) - # reverse the resampling + # remove trailing part (to ensure that the length of the output is the same as the input) + output.audio = output.audio[:-(IR.audio.shape[0]-1), :] + if old_fs: output.audio = resample_itu(output, old_fs) output.fs = old_fs @@ -183,9 +186,8 @@ def reverb_stereo( y_right = reverb(input, IR_right, align=align) # combine into stereo output - y = copy(input) - y.name = "STEREO" - y.num_channels = 2 + y = audio.fromtype('STEREO') + y.fs = input.fs y.audio = np.column_stack([y_left.audio, y_right.audio]) return y @@ -197,14 +199,14 @@ def reverb_foa( align: Optional[float] = None, ) -> Audio: """ - Wrapper for the ITU-T reverb binary to convolve mono audio signal with an FOA impulse response + Convolve mono audio signal with an FOA impulse response Parameters ---------- input: Audio Input audio signal - IR: Audio - Impulse response + foa_IR: Audio + FOA impulse response align: float multiplicative factor to apply to the reverberated sound in order to align its energy level with the second file @@ -249,10 +251,9 @@ def reverb_foa( y_y = reverb(input, IR_y, align=align) y_z = reverb(input, IR_z, align=align) - # combine into foa output - y = copy(input) - y.name = "FOA" - y.num_channels = 4 + # combine into FOA output + y = audio.fromtype('FOA') + y.fs = input.fs y.audio = np.column_stack([y_w.audio, y_x.audio, y_y.audio, y_z.audio]) return y @@ -264,14 +265,14 @@ def reverb_hoa2( align: Optional[float] = None, ) -> Audio: """ - Wrapper for the ITU-T reverb binary to convolve mono audio signal with an HOA2 impulse response + Convolve mono audio signal with an HOA2 impulse response Parameters ---------- input: Audio Input audio signal - IR: Audio - Impulse response + hoa2_IR: Audio + HOA2 impulse response align: float multiplicative factor to apply to the reverberated sound in order to align its energy level with the second file @@ -301,10 +302,71 @@ def reverb_hoa2( # convolve mono input with channel IR ych.append(reverb(input, IR, align=align)) - # combine into hoa2 output - y = copy(input) - y.name = "HOA2" - y.num_channels = numchannels + # combine into HOA2 output + y = audio.fromtype('HOA2') + y.fs = input.fs + y.audio = np.column_stack( + [ + ych[0].audio, + ych[1].audio, + ych[2].audio, + 
ych[3].audio, + ych[4].audio, + ych[5].audio, + ych[6].audio, + ych[7].audio, + ych[8].audio, + ] + ) + + return y + +def reverb_hoa3( + input: Audio, + hoa3_IR: Audio, + align: Optional[float] = None, +) -> Audio: + """ + Convolve mono audio signal with an HOA3 impulse response + + Parameters + ---------- + input: Audio + Input audio signal + hoa3_IR: Audio + HOA3 impulse response + align: float + multiplicative factor to apply to the reverberated sound in order to align its energy level with the second file + + Returns + ------- + output: Audio + Convolved audio signal with HOA3 IR + """ + + # convert to float32 + hoa3_IR.audio = np.float32(hoa3_IR.audio) + + numchannels = 16 # HOA3 by definition + + # calculate the scaling (multiplicative) factor such that the maximum gain of the IR filter across all frequencies is 0dB + if align is None: + H = fft(hoa3_IR.audio, axis=0) + align = 1.0 / np.max(np.abs(H)) + + IR = copy(hoa3_IR) + IR.name = "MONO" + IR.num_channels = 1 + ych = [] + for i in range(numchannels): + # separate IR into each channel + IR.audio = np.reshape(hoa3_IR.audio[:, i], (-1, 1)) + # convolve mono input with channel IR + ych.append(reverb(input, IR, align=align)) + + # combine into HOA3 output + y = audio.fromtype('HOA3') + y.fs = input.fs y.audio = np.column_stack( [ ych[0].audio, @@ -316,6 +378,13 @@ def reverb_hoa2( ych[6].audio, ych[7].audio, ych[8].audio, + ych[9].audio, + ych[10].audio, + ych[11].audio, + ych[12].audio, + ych[13].audio, + ych[14].audio, + ych[15].audio, ] ) -- GitLab From a240ad8989d78228e60f8287dfc84586dc9ca6b7 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 19 Jun 2025 10:35:56 +0200 Subject: [PATCH 05/42] update of initialization and caller functions --- .../generation/__init__.py | 52 +++++++------------ ivas_processing_scripts/generation/config.py | 4 -- .../generation/constants.py | 3 +- 3 files changed, 20 insertions(+), 39 deletions(-) diff --git a/ivas_processing_scripts/generation/__init__.py b/ivas_processing_scripts/generation/__init__.py index 4b1d3279..a3c77e3a 100755 --- a/ivas_processing_scripts/generation/__init__.py +++ b/ivas_processing_scripts/generation/__init__.py @@ -31,7 +31,6 @@ # import logging - import yaml from ivas_processing_scripts.constants import ( @@ -41,11 +40,11 @@ from ivas_processing_scripts.constants import ( ) from ivas_processing_scripts.generation import ( config, - process_ambi_items, - process_ism1_items, - process_ism2_items, - process_stereo_items, + generate_ambi_items, + generate_stereo_items, + generate_ismN_items, generate_omasa_items, + generate_osba_items, ) from ivas_processing_scripts.utils import create_dir @@ -62,16 +61,12 @@ def logging_init(args, cfg): logger.addHandler(console_handler) # main log file - file_handler = logging.FileHandler( - cfg.output_path.joinpath(f"{cfg.format}{LOGGER_SUFFIX}"), mode="w" - ) + file_handler = logging.FileHandler( f"{cfg.format}{LOGGER_SUFFIX}", mode="w" ) file_handler.setFormatter(logging.Formatter(LOGGER_FORMAT, datefmt=LOGGER_DATEFMT)) file_handler.setLevel(logging.DEBUG if args.debug else logging.INFO) logger.addHandler(file_handler) logger.info(f"Processing item generation configuration file {args.config}") - logger.info(f"Input path: {cfg.input_path.absolute()}") - logger.info(f"Output path: {cfg.output_path.absolute()}") return logger @@ -81,36 +76,27 @@ def main(args): cfg = config.TestConfig(args.config) # create output directories for categories - for cat in range(1, 7): - create_dir(cfg.output_path.joinpath(f"cat{cat}")) + # for cat 
in range(1, 7): + # create_dir(cfg.output_path.joinpath(f"cat{cat}")) # set up logging logger = logging_init(args, cfg) - # make format a list - if not isinstance(cfg.format, list): - cfg.format = [cfg.format] - - # generate ISM and STEREO items - if "ISM1" in cfg.format: - # generate ISM1 items with metadata according to scene description - process_ism1_items.generate_ism1_items(cfg, logger) - elif "ISM2" in cfg.format: - # generate ISM2 items with metadata according to scene description - process_ism2_items.generate_ism2_items(cfg, logger) + # generate items in the requested format + if "ISM" in cfg.format: + # generate ISMn items from MONO items according to scene description + generate_ismN_items.generate_ismN_items(cfg, logger) elif "STEREO" in cfg.format: # generate STEREO items according to scene description - process_stereo_items.generate_stereo_items(cfg, logger) - elif "FOA" in cfg.format or "HOA2" in cfg.format: - # generate FOA/HOA2 items according to scene description - process_ambi_items.generate_ambi_items(cfg, logger) + generate_stereo_items.generate_stereo_items(cfg, logger) + elif any(fmt in cfg.format for fmt in ["FOA", "HOA2", "HOA3"]): + # generate FOA/HOA2/HOA3 items according to scene description + generate_ambi_items.generate_ambi_items(cfg, logger) elif "OMASA" in cfg.format: - # generate OMASA items according to scene description + # generate OMASA items from FOA/HO2/HOA3 and MONO items according to scene description generate_omasa_items.generate_omasa_items(cfg, logger) - - - # copy configuration to output directory - with open(cfg.output_path.joinpath(f"{'_'.join(cfg.format)}.yml"), "w") as f: - yaml.safe_dump(cfg._yaml_dump, f) + elif "OSBA" in cfg.format: + # generate OSBA items from FOA/HO2/HOA3 and MONO items according to scene description + generate_osba_items.generate_osba_items(cfg, logger) logger.handlers.clear() diff --git a/ivas_processing_scripts/generation/config.py b/ivas_processing_scripts/generation/config.py index 79b878fc..bfb676bc 100644 --- a/ivas_processing_scripts/generation/config.py +++ b/ivas_processing_scripts/generation/config.py @@ -78,10 +78,6 @@ class TestConfig: # store the merged config for writing to file later self._yaml_dump = self._dump_yaml(cfg) - # convert to Path - self.input_path = Path(self.input_path) - self.output_path = Path(self.output_path) - def _parse_yaml(self, filename): """parse configuration file""" with open(filename) as fp: diff --git a/ivas_processing_scripts/generation/constants.py b/ivas_processing_scripts/generation/constants.py index 8114a91a..8319d318 100644 --- a/ivas_processing_scripts/generation/constants.py +++ b/ivas_processing_scripts/generation/constants.py @@ -61,7 +61,6 @@ DEFAULT_CONFIG_BINARIES = { REQUIRED_KEYS = [ "format", - "input_path", - "output_path", + "fs", "scenes", ] -- GitLab From 800d5656018be3c344dd602da7c873e40d424dff Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 19 Jun 2025 10:36:32 +0200 Subject: [PATCH 06/42] update all item generation scripts for all formats --- ...s_ambi_items.py => generate_ambi_items.py} | 232 ++++------ ...s_ism2_items.py => generate_ismN_items.py} | 252 +++++------ .../generation/generate_omasa_items.py | 96 ++-- .../generation/generate_osba_items.py | 420 ++++++++++++++++++ ...ereo_items.py => generate_stereo_items.py} | 198 ++++----- .../generation/process_ism1_items.py | 356 --------------- 6 files changed, 769 insertions(+), 785 deletions(-) rename ivas_processing_scripts/generation/{process_ambi_items.py => generate_ambi_items.py} 
(60%) rename ivas_processing_scripts/generation/{process_ism2_items.py => generate_ismN_items.py} (65%) create mode 100644 ivas_processing_scripts/generation/generate_osba_items.py rename ivas_processing_scripts/generation/{process_stereo_items.py => generate_stereo_items.py} (64%) delete mode 100644 ivas_processing_scripts/generation/process_ism1_items.py diff --git a/ivas_processing_scripts/generation/process_ambi_items.py b/ivas_processing_scripts/generation/generate_ambi_items.py similarity index 60% rename from ivas_processing_scripts/generation/process_ambi_items.py rename to ivas_processing_scripts/generation/generate_ambi_items.py index 9cfb9ee9..f1885727 100644 --- a/ivas_processing_scripts/generation/process_ambi_items.py +++ b/ivas_processing_scripts/generation/generate_ambi_items.py @@ -34,12 +34,15 @@ import logging import os from itertools import groupby, repeat from math import floor +from pathlib import Path import numpy as np -from ivas_processing_scripts.audiotools import audio, audiofile, convert -from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness -from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_foa, reverb_hoa2 +from ivas_processing_scripts.audiotools import audio, audioarray, audiofile, convert +from ivas_processing_scripts.audiotools.convert.objectbased import convert_objectbased +from ivas_processing_scripts.audiotools.convert.scenebased import convert_scenebased +from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness, loudness_norm +from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_foa, reverb_hoa2, reverb_hoa3 from ivas_processing_scripts.generation import config from ivas_processing_scripts.utils import apply_func_parallel @@ -76,7 +79,7 @@ def generate_ambi_items( cfg: config.TestConfig, logger: logging.Logger, ): - """Generate FOA/HOA2 items from mono items based on scene description""" + """Generate FOA/HOA2/HOA3 items from mono items based on scene description""" # set the target level if "loudness" not in cfg.__dict__: @@ -97,18 +100,10 @@ def generate_ambi_items( if "postamble" not in cfg.__dict__: cfg.postamble = 0.0 - # set the IR path - if "IR_path" not in cfg.__dict__: - cfg.IR_path = os.path.join(os.path.dirname(__file__), "IR") - # set the pre-amble and post-amble if "add_low_level_random_noise" not in cfg.__dict__: cfg.add_low_level_random_noise = False - # setup binaural rendering - if "binaural_path" not in cfg.__dict__: - cfg.binaural_path = "" - # set the listening lab designator if "listening_lab" not in cfg.__dict__: cfg.listening_lab = "l" @@ -157,7 +152,7 @@ def generate_ambi_items( # set the prefix for all output filenames if "use_output_prefix" not in cfg.__dict__: - cfg.use_output_prefix = None + cfg.use_output_prefix = "" else: # replace file designators cfg.use_output_prefix = replace_char_seq_with_string( @@ -169,13 +164,13 @@ def generate_ambi_items( # set multiprocessing if "multiprocessing" not in cfg.__dict__: - cfg.multiprocessing = True + cfg.multiprocessing = False apply_func_parallel( generate_ambi_scene, zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)), - None, - "mp" if cfg.multiprocessing else None, + type = "mp" if cfg.multiprocessing else None, + show_progress = None, ) return @@ -184,140 +179,114 @@ def generate_ambi_items( def generate_ambi_scene( scene_name: str, scene: dict, cfg: config.TestConfig, logger: logging.Logger ): + """ + Processes a single scene to generate FOA/HOA2/HOA3 item. 
+ + Args: + scene_name (str): The name of the scene being processed. + scene (dict): A dictionary containing scene description, including source files, azimuth, elevation, and other parameters. + cfg (config.TestConfig): Configuration object containing settings for processing, such as input/output paths, sampling rate, and loudness levels. + logger (logging.Logger): Logger instance for logging information and errors. + + Expected Behavior: + - Reads mono audio source files and processes them based on the scene description. + - Writes the processed FOA/HOA2/HOA3 audio to the output file. + """ + logger.info( - f"Processing scene: {scene_name} out of {len(cfg.scenes)} scenes, name: {scene['name']}" + f"Processing scene: {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" ) # extract the number of audio sources - N_sources = len(np.atleast_1d(scene["source"])) + N_inputs = len(np.atleast_1d(scene["input"])) - # read the overlap length - if "overlap" in scene.keys(): - source_overlap = float(scene["overlap"]) - else: - source_overlap = 0.0 + # initialize output dirs + output_filename = Path(scene["output"]).parent / (cfg.use_output_prefix + Path(scene["output"]).name) - # read the ambi format - if "format" in scene.keys(): - ambi_format = scene["format"] - else: - ambi_format = "FOA" + dir_path = output_filename.parent + if dir_path and not dir_path.exists(): + dir_path.mkdir(parents=True, exist_ok=True) + + # initialize output audio object + y = audio.SceneBasedAudio(cfg.format) + + for i in range(N_inputs): - len_s1 = 0 - y = audio.SceneBasedAudio(ambi_format) - for i in range(N_sources): # parse parameters from the scene description - source_file = np.atleast_1d(scene["source"])[i] + source_file = np.atleast_1d(scene["input"])[i] IR_file = np.atleast_1d(scene["IR"])[i] + # read the overlap length + if "shift" in scene.keys(): + source_shift = ( + scene["shift"][i] + if isinstance(scene["shift"], list) + else scene["shift"] + ) + else: + source_shift = 0.0 + + # read the level + if "level" in scene.keys(): + level = ( + scene["level"][i] + if isinstance(scene["level"], list) + else scene["level"] + ) + else: + level = -26 + logger.info(f"Convolving {source_file} with {IR_file}") - # read source file - x = audio.fromfile( - "MONO", - os.path.join( - cfg.input_path, - os.path.dirname(source_file), - cfg.use_input_prefix + os.path.basename(source_file), - ), - fs=cfg.fs, - ) + # get input filename and IR filename + input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) + IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) - # read the IR file - IR = audio.fromfile( - ambi_format, - os.path.join( - cfg.IR_path, - os.path.dirname(IR_file), - cfg.use_IR_prefix + os.path.basename(IR_file), - ), - fs=cfg.IR_fs, - ) + # read source file + x = audio.fromfile( "MONO", input_filename, fs=cfg.fs ) - if i == 0: - len_s1 = x.audio.shape[0] + # read the IR file (!must be in target format!) 
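+        # (an ambisonic IR of order N carries (N+1)^2 channels, i.e.
+        # FOA -> 4, HOA2 -> 9 and HOA3 -> 16 channels)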
+ IR = audio.fromfile( cfg.format, IR_filename, fs=cfg.IR_fs ) - # convolve with the FOA/HOA2 IR - if ambi_format == "FOA": + # convolve with the FOA/HOA2/HOA3 IR + if cfg.format == "FOA": x = reverb_foa(x, IR) - elif ambi_format == "HOA2": + elif cfg.format == "HOA2": x = reverb_hoa2(x, IR) + elif cfg.format == "HOA3": + x = reverb_hoa3(x, IR) - # adjust the level of the foa signal - _, scale_factor, _ = get_loudness(x, cfg.loudness, "BINAURAL") - x.audio *= scale_factor - - # shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap) - if i > 0: - # get the length of the first source file - N_delay = len_s1 + # adjust the level of the target signal + x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") - # add the shift - N_delay += int(-source_overlap * x.fs) + # shift the source signal (positive shift creates overlap, negative shift creates a gap) + if int(floor(-source_shift)) != 0: + x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0]) - # insert all-zero preamble - pre = np.zeros((N_delay, x.audio.shape[1])) - x.audio = np.concatenate([pre, x.audio]) + # get the number of frames (multiple of 20ms) + frame_len = int(x.fs / 50) + N_frames = int(len(x.audio) / frame_len) # pad with zeros to ensure that the signal length is a multiple of 20ms - N_frame = x.fs / 50 - if len(x.audio) % N_frame != 0: - N_pad = int(N_frame - len(x.audio) % N_frame) + if len(x.audio) % frame_len != 0: + N_pad = int(frame_len - len(x.audio) % frame_len) + x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) - # insert all-zero preamble - pre = np.zeros((N_pad, x.audio.shape[1])) - x.audio = np.concatenate([pre, x.audio]) - - # add source signal to the array of source signals - y.fs = x.fs if y.audio is None: + # add source signal to the array of all source signals y.audio = x.audio.copy() + y.fs = x.fs else: - # pad with zeros to have equal length of all source signals - if x.audio.shape[0] > y.audio.shape[0]: - y.audio = np.vstack( - ( - y.audio, - np.zeros( - ( - x.audio.shape[0] - y.audio.shape[0], - y.audio.shape[1], - ) - ), - ) - ) - elif y.audio.shape[0] > x.audio.shape[0]: - x.audio = np.vstack( - ( - x.audio, - np.zeros( - ( - y.audio.shape[0] - x.audio.shape[0], - x.audio.shape[1], - ) - ), - ) - ) - + # adjust the signal length (trim from the end or pad with zeros) to align its length with the previous signal(s) + N_pad = y.audio.shape[0] - x.audio.shape[0] + if N_pad != 0: + x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) + # superimpose y.audio += x.audio # append pre-amble and post-amble to all sources - if cfg.preamble != 0.0: - # ensure that pre-amble is a multiple of 20ms - N_pre = int(floor(cfg.preamble * 50) / 50 * y.fs) - - # insert all-zero preamble to all sources - pre = np.zeros((N_pre, y.audio.shape[1])) - y.audio = np.concatenate([pre, y.audio]) - - if cfg.postamble != 0.0: - # ensure that post-mable is a multiple of 20ms - N_post = int(floor(cfg.postamble * 50) / 50 * y.fs) - - # append all-zero postamble to all sources - post = np.zeros((N_post, y.audio.shape[1])) - y.audio = np.concatenate([y.audio, post]) + y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble]) # add random noise if cfg.add_low_level_random_noise: @@ -328,26 +297,19 @@ def generate_ambi_scene( # superimpose y.audio += noise - # write the reverberated audio into output file - audiofile.write( - os.path.join( - cfg.output_path, - os.path.dirname(scene["name"]), - 
cfg.use_output_prefix + os.path.basename(scene["name"]), - ), - y.audio, - y.fs, - ) + # write the FOA/HOA2/HOA3 audio into output file + audiofile.write( output_filename, y.audio, y.fs ) - # convert to binaural if option chosen - if cfg.binaural_path != "": + # convert to BINAURAL, if option was chosen + if cfg.binaural_output: binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs - convert.format_conversion(y, binaudio) + convert_scenebased(y, binaudio) + binaural_output_filename = output_filename.with_name(output_filename.stem + "_BINAURAL" + output_filename.suffix) audiofile.write( - os.path.join(cfg.binaural_path, scene["name"]), + binaural_output_filename, binaudio.audio, binaudio.fs, ) - + logger.info(f"Written BINAURAL output to: {binaural_output_filename}") return diff --git a/ivas_processing_scripts/generation/process_ism2_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py similarity index 65% rename from ivas_processing_scripts/generation/process_ism2_items.py rename to ivas_processing_scripts/generation/generate_ismN_items.py index 357a7276..ea5baf96 100644 --- a/ivas_processing_scripts/generation/process_ism2_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -# (C) 2022-2024 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other @@ -29,17 +29,16 @@ # accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and # the United Nations Convention on Contracts on the International Sales of Goods. 
# - import csv import logging -import os from itertools import groupby, repeat from math import floor - import numpy as np +from pathlib import Path -from ivas_processing_scripts.audiotools import audio, audiofile -from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness +from ivas_processing_scripts.audiotools import audio, audioarray, audiofile +from ivas_processing_scripts.audiotools.convert.objectbased import convert_objectbased +from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness, loudness_norm from ivas_processing_scripts.generation import config from ivas_processing_scripts.utils import apply_func_parallel @@ -72,11 +71,17 @@ def replace_char_seq_with_string(str, char_seq, repl_str): return "".join(result) -def generate_ism2_items( +# function for appending string to a filename before file extension +def append_str_filename(filename, str_to_append): + p = Path(filename) + return p.parent / (p.stem + str_to_append + p.suffix) + + +def generate_ismN_items( cfg: config.TestConfig, logger: logging.Logger, ): - """Generate ISM2 items with metadata from mono items based on scene description""" + """Generate ISMN items with metadata from mono items based on scene description""" # set the target level if "loudness" not in cfg.__dict__: @@ -130,7 +135,7 @@ def generate_ism2_items( # set the prefix for all output filenames if "use_output_prefix" not in cfg.__dict__: - cfg.use_output_prefix = None + cfg.use_output_prefix = "" else: # replace file designators cfg.use_output_prefix = replace_char_seq_with_string( @@ -142,43 +147,60 @@ def generate_ism2_items( # set multiprocessing if "multiprocessing" not in cfg.__dict__: - cfg.multiprocessing = True + cfg.multiprocessing = False apply_func_parallel( - generate_ism2_scene, + generate_ismN_scene, zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)), - None, - "mp" if cfg.multiprocessing else None, + type = "mp" if cfg.multiprocessing else None, + show_progress = None, ) return -def generate_ism2_scene( +def generate_ismN_scene( scene_name: str, scene: dict, cfg: config.TestConfig, logger: logging.Logger ): + """ + Processes a single scene to generate N ISM items with metadata. + + Args: + scene_name (str): The name of the scene being processed. + scene (dict): A dictionary containing scene description, including source files, azimuth, elevation, and other parameters. + cfg (config.TestConfig): Configuration object containing settings for processing, such as input/output paths, sampling rate, and loudness levels. + logger (logging.Logger): Logger instance for logging information and errors. + + Expected Behavior: + - Reads audio source files and processes them based on the scene description. + - Generates metadata files and appends them to the ISM objects. + - Writes the processed audio and metadata to output files. 
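+    - Optionally renders a BINAURAL version of the output item ("_BINAURAL" is appended to the output filename).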
+ """ + logger.info( - f"Processing {scene_name} out of {len(cfg.scenes)} scenes, name: {scene['name']}" + f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" ) # extract the number of audio sources - N_sources = len(np.atleast_1d(scene["source"])) + N_inputs = len(np.atleast_1d(scene["input"])) - # initialize output arrays - y = audio.ChannelBasedAudio("STEREO") - y_meta = None + # initialize output dirs + ism_format = f"ISM{N_inputs}" + output_filename = Path(scene["output"]).parent / (cfg.use_output_prefix + Path(scene["output"]).name) - # read the overlap length - if "overlap" in scene.keys(): - source_overlap = float(scene["overlap"]) - else: - source_overlap = 0.0 + dir_path = output_filename.parent + if dir_path and not dir_path.exists(): + dir_path.mkdir(parents=True, exist_ok=True) + + # initialize output ISM object + y = audio.ObjectBasedAudio(ism_format) # repeat for all source files - for i in range(N_sources): + for i in range(N_inputs): + # parse parameters from the scene description source_file = ( - scene["source"][i] if isinstance(scene["source"], list) else scene["source"] + scene["input"][i] if isinstance(scene["input"], list) else scene["input"] ) source_azi = ( scene["azimuth"][i] @@ -191,92 +213,69 @@ def generate_ism2_scene( else scene["elevation"] ) + # read the overlap length + if "shift" in scene.keys(): + source_shift = ( + scene["shift"][i] + if isinstance(scene["shift"], list) + else scene["shift"] + ) + else: + source_shift = 0.0 + + # read the level + if "level" in scene.keys(): + level = ( + scene["level"][i] + if isinstance(scene["level"], list) + else scene["level"] + ) + else: + level = -26 + logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") - # read source file - x = audio.fromfile( - "MONO", - os.path.join( - cfg.input_path, - os.path.dirname(source_file), - cfg.use_input_prefix + os.path.basename(source_file), - ), - fs=cfg.fs, - ) + # get input filename + input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) - # get the number of frames (multiple of 20ms) - N_frames = int(len(x.audio) / x.fs * 50) - frame_len = int(x.fs / 50) + # generate ISM metadata .csv filename (should end with .wav..0.csv, .wav.1.csv, ...) 
+ y.metadata_files.insert(i, str(output_filename.with_suffix(f".{i}.csv"))) - # trim the samples from the end to ensure that the signal length is a multiple of 20ms - x.audio = x.audio[: N_frames * frame_len] + # read source file + x = audio.fromfile( "MONO", input_filename, fs=cfg.fs ) # adjust the level of the source file - _, scale_factor, _ = get_loudness(x, cfg.loudness, "MONO") - x.audio *= scale_factor - - # shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap) - if i > 0: - # get the length of the first source file - N_delay = len(y.audio[:, 0]) + x.audio, _ = loudness_norm(x, level, loudness_format="MONO") - # add the shift value (ensure that the shift is a multiple of 20ms) - N_delay += int(floor(-source_overlap * 50) / 50 * x.fs) + # shift the source signal (positive shift creates overlap, negative shift creates a gap) + if int(floor(-source_shift)) != 0: + x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0]) - # insert all-zero signal - pre = np.zeros((N_delay, x.audio.shape[1])) - x.audio = np.concatenate([pre, x.audio]) + # get the number of frames (multiple of 20ms) + frame_len = int(x.fs / 50) + N_frames = int(len(x.audio) / frame_len) # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: - # pad the source signal N_pad = int(frame_len - len(x.audio) % frame_len) - post = np.zeros((N_pad, x.audio.shape[1])) - x.audio = np.concatenate([x.audio, post]) + x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) - # add source signal to the array of all source signals - y.fs = x.fs if y.audio is None: + # add source signal to the array of all source signals y.audio = x.audio.copy() + y.fs = x.fs else: - # pad with zeros to have the same length of all source signals - if x.audio.shape[0] > y.audio.shape[0]: - y.audio = np.vstack( - ( - y.audio, - np.zeros( - (x.audio.shape[0] - y.audio.shape[0], y.audio.shape[1]) - ), - ) - ) - elif y.audio.shape[0] > x.audio.shape[0]: - x.audio = np.vstack( - ( - x.audio, - np.zeros( - (y.audio.shape[0] - x.audio.shape[0], x.audio.shape[1]) - ), - ) - ) - y.audio = np.hstack((y.audio, x.audio)) + # pad ISM signal with zeros to have the same length as the MASA signal + N_pad = y.audio.shape[0] - x.audio.shape[0] + if N_pad != 0: + x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) + + # append ISM signal to the ISM object + y.audio = np.append(y.audio, x.audio, axis=1) # append pre-amble and post-amble to all sources - if cfg.preamble != 0.0: - # ensure that pre-amble is a multiple of 20ms - N_pre = int(floor(cfg.preamble * 50) / 50 * y.fs) - - # insert all-zero preamble to all sources - pre = np.zeros((N_pre, y.audio.shape[1])) - y.audio = np.concatenate([pre, y.audio]) - - if cfg.postamble != 0.0: - # ensure that post-mable is a multiple of 20ms - N_post = int(floor(cfg.postamble * 50) / 50 * y.fs) - - # append all-zero postamble to all sources - post = np.zeros((N_post, y.audio.shape[1])) - y.audio = np.concatenate([y.audio, post]) - + y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble]) + # add random noise if cfg.add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 @@ -286,8 +285,8 @@ def generate_ism2_scene( # superimpose y.audio += noise - # create metadata files - for i in range(N_sources): + # generate ISM metadata + for i in range(N_inputs): # parse metadata parameters from the scene description source_azi = ( 
scene["azimuth"][i] @@ -300,9 +299,9 @@ def generate_ism2_scene( else scene["elevation"] ) - N_frames = int(len(y.audio) / y.fs * 50) + N_frames = int(np.rint((len(y.audio) / y.fs * 50))) - # read azimuth information and convert to an array + # read azimuth information and convert to an array if isinstance(source_azi, str): if ":" in source_azi: # start with the initial azimuth value and apply step N_frames times @@ -310,7 +309,7 @@ def generate_ism2_scene( azi = np.arange( float(eval(source_azi[0])), float(eval(source_azi[0])) + N_frames * float(eval(source_azi[1])), - float(eval(source_azi[1])), + float(eval(source_azi[1])) ) else: # replicate static azimuth value N_frames times @@ -318,7 +317,7 @@ def generate_ism2_scene( else: # replicate static azimuth value N_frames times azi = np.repeat(float(source_azi), N_frames) - + # convert azimuth from 0 .. 360 to -180 .. +180 azi = (azi + 180) % 360 - 180 @@ -337,7 +336,7 @@ def generate_ism2_scene( ele = np.arange( float(eval(source_ele[0])), np.sign(float(eval(source_ele[1]))) * 90, - float(eval(source_ele[1])), + float(eval(source_ele[1])) )[:N_frames] # repeat the last elevation value, if array is shorter than N_frames @@ -358,35 +357,10 @@ def generate_ism2_scene( # arrange all metadata fields column-wise into a matrix x_meta = np.column_stack((azi, ele)) - - x_meta = x_meta[np.newaxis, :] - if y_meta is None: - y_meta = x_meta - else: - y_meta = np.concatenate([y_meta, x_meta]) - - # write individual ISM audio streams to the output file in an interleaved format - audiofile.write( - os.path.join( - cfg.output_path, - os.path.dirname(scene["name"]), - cfg.use_output_prefix + os.path.basename(scene["name"]), - ), - y.audio, - y.fs, - ) - - # write individual ISM metadata to output files in .csv format - for i in range(N_sources): - # generate .csv filename (should end with .0.csv, .1.csv, ...) 
-        csv_filename = os.path.join(
-            cfg.output_path,
-            os.path.dirname(scene["name"]),
-            cfg.use_output_prefix + os.path.basename(scene["name"]) + f".{i}.csv",
-        )
-
+
+        # write to .csv output metadata file
         with open(
-            csv_filename,
+            y.metadata_files[i],
             "w",
             newline="",
             encoding="utf-8",
@@ -395,6 +369,22 @@
         ) as f:
             # create csv writer
             writer = csv.writer(f)
 
             # write all rows to the .csv file
-            writer.writerows(csv_formatdata(y_meta[i]))
+            writer.writerows(csv_formatdata(x_meta))
+
+    y.init_metadata()  # this is needed to populate 'y.object_pos[]'
+
+    # write the ISM output to .wav file in an interleaved format
+    audiofile.write( output_filename, y.audio, y.fs )
+
+    # convert the ISM output to BINAURAL, if option was chosen
+    if cfg.binaural_output:
+        binaudio = audio.fromtype("BINAURAL")
+        binaudio.fs = y.fs
+        convert_objectbased(y, binaudio)
+        binaural_output_filename = output_filename.with_name(output_filename.stem + "_BINAURAL" + output_filename.suffix)
+        audiofile.write(
+            binaural_output_filename,
+            binaudio.audio,
+            binaudio.fs,
+        )
 
-    return
diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py
index 7285a61c..f4ba7711 100644
--- a/ivas_processing_scripts/generation/generate_omasa_items.py
+++ b/ivas_processing_scripts/generation/generate_omasa_items.py
@@ -32,12 +32,10 @@
 
 import csv
 import logging
-import os
+import sys
 from itertools import groupby, repeat
 from math import floor
 from pathlib import Path
-from sox import file_info
-
 import numpy as np
 
 from ivas_processing_scripts.audiotools import audio, audiofile, audioarray
@@ -77,8 +75,9 @@
 
 # function for appending string to a filename before file extension
 def append_str_filename(filename, str_to_append):
-    p = Path(filename)
-    return "{0}{2}{1}".format(p.stem, p.suffix, str_to_append)
+    p = Path(filename)
+    # Combine the stem, the string to append, and the suffix
+    return p.parent / (p.stem + str_to_append + p.suffix)
 
 def generate_omasa_items(
     cfg: config.TestConfig,
@@ -153,7 +152,7 @@
         cfg.multiprocessing = False
 
     apply_func_parallel(
-        generate_scene,
+        generate_OMASA_scene,
         zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)),
         type = "mp" if cfg.multiprocessing else None,
         show_progress = None,
@@ -162,7 +161,7 @@
     return
 
 
-def generate_scene(
+def generate_OMASA_scene(
     scene_name: str, scene: dict, cfg: config.TestConfig, logger: logging.Logger
 ):
     """
@@ -180,29 +179,33 @@
     - Writes the processed audio and metadata to output files.
    - Handles various audio formats (e.g., MONO, FOA, HOA2) and applies transformations like loudness normalization, trimming, and padding.
""" - logger.info( f"Processing scene {scene_name}:") + + logger.info( + f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" + ) # extract the number of audio sources - N_sources = len(np.atleast_1d(scene["source"])) - N_ISMs = N_sources-1 + N_inputs = len(np.atleast_1d(scene["input"])) + N_ISMs = N_inputs-1 - # initialize output dirs + # get output filename omasa_format = f"ISM{N_ISMs}MASA{cfg.masa_tc}DIR{cfg.masa_dirs}" - output_filename = os.path.join( cfg.output_path, os.path.dirname(scene["name"]), cfg.use_output_prefix + append_str_filename(os.path.basename(scene["name"]), f"_s{scene_name}_{omasa_format}") ) + output_filename = Path(scene["output"]).parent / (cfg.use_output_prefix + Path(scene["output"]).name) - dir_path = os.path.dirname(output_filename) - if dir_path and not os.path.exists(dir_path): - os.makedirs(dir_path, exist_ok=True) + # initialize output dirs + dir_path = output_filename.parent + if dir_path and not dir_path.exists(): + dir_path.mkdir(parents=True, exist_ok=True) # initialize output OMASA object y = audio.OMASAAudio(omasa_format) # repeat for all source files - for i in range(N_sources): + for i in range(N_inputs): # parse parameters from the scene description source_file = ( - scene["source"][i] if isinstance(scene["source"], list) else scene["source"] + scene["input"][i] if isinstance(scene["input"], list) else scene["input"] ) source_azi = ( scene["azimuth"][i] @@ -215,15 +218,15 @@ def generate_scene( else scene["elevation"] ) - # read the overlap length - if "overlap" in scene.keys(): - source_overlap = ( - scene["overlap"][i] - if isinstance(scene["overlap"], list) - else scene["overlap"] + # read the shift length + if "shift" in scene.keys(): + source_shift = ( + scene["shift"][i] + if isinstance(scene["shift"], list) + else scene["shift"] ) else: - source_overlap = 0.0 + source_shift = 0.0 # read the level if "level" in scene.keys(): @@ -237,9 +240,13 @@ def generate_scene( logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") + # get input filename + input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) + # get the number of channels from the .wav file header - N_channels = file_info.channels(os.path.join(cfg.input_path, os.path.dirname(source_file), cfg.use_input_prefix + os.path.basename(source_file))) - + wav_header = audiofile.parse_wave_header(input_filename) + N_channels = wav_header['channels'] + if N_channels == 1: fmt = "MONO" elif N_channels == 2: @@ -256,29 +263,14 @@ def generate_scene( if fmt in ["FOA", "HOA2", "HOA3"]: # generate MASA metadata .met filename (should end with .met) - y.metadata_files.append(os.path.splitext(output_filename)[0]+".met") + y.metadata_files.append(output_filename.with_suffix(".met")) elif fmt == "MONO": - # generate ISM metadata .csv filename (should end with .wav..0.csv, .wav.1.csv, ...) - y.metadata_files.insert(i-1, f"{output_filename}.{i-1}.csv") + # generate ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) 
+ y.metadata_files.insert(i-1, output_filename.with_suffix(f".{i-1}.csv")) # read source file - x = audio.fromfile( - fmt, - os.path.join( - cfg.input_path, - os.path.dirname(source_file), - cfg.use_input_prefix + os.path.basename(source_file), - ), - fs=cfg.fs, - ) + x = audio.fromfile( fmt, input_filename, fs=cfg.fs ) - # get the number of frames (multiple of 20ms) - N_frames = int(len(x.audio) / x.fs * 50) - frame_len = int(x.fs / 50) - - # trim the samples from the end to ensure that the signal length is a multiple of 20ms - audioarray.cut(x.audio, [0, N_frames * frame_len]) - # adjust the level of the source file if fmt in ["FOA", "HOA2", "HOA3"]: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO", rms=True) @@ -286,8 +278,12 @@ def generate_scene( x.audio, _ = loudness_norm(x, level, loudness_format="MONO") # shift the source signal (positive shift creates overlap, negative shift creates a gap) - if int(floor(-source_overlap)) != 0: - x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_overlap, 0]) + if int(floor(-source_shift)) != 0: + x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0]) + + # get the number of frames (multiple of 20ms) + frame_len = int(x.fs / 50) + N_frames = int(len(x.audio) / frame_len) # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: @@ -416,13 +412,13 @@ def generate_scene( audiofile.write( output_filename, y.audio, y.fs ) # convert to OMASA output to BINAURAL, if option was chosen - if cfg.binaural_path != "": + if cfg.binaural_output: binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_omasa(y, binaudio) + binaural_output_filename = output_filename.with_name(output_filename.stem + "_BINAURAL" + output_filename.suffix) audiofile.write( - os.path.join( - cfg.binaural_path, append_str_filename(os.path.basename(scene["name"]), f"_s{scene_name}_{omasa_format}_BINAURAL") ), + binaural_output_filename, binaudio.audio, binaudio.fs, ) diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py new file mode 100644 index 00000000..a2e53d12 --- /dev/null +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -0,0 +1,420 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. 
+#
+# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
+# contributions.
+#
+# This software is provided "AS IS", without any express or implied warranties. The software is in the
+# development stage. It is intended exclusively for experts who have experience with such software and
+# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+# and fitness for a particular purpose are hereby disclaimed and excluded.
+#
+# Any dispute, controversy or claim arising under or in relation to providing this software shall be
+# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+# the United Nations Convention on Contracts on the International Sales of Goods.
+#
+
+import csv
+import logging
+import os
+import sys
+from itertools import groupby, repeat
+from math import floor
+from pathlib import Path
+import numpy as np
+
+from ivas_processing_scripts.audiotools import audio, audiofile, audioarray
+from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness, loudness_norm
+from ivas_processing_scripts.audiotools.convert.scenebased import render_sba_to_masa
+from ivas_processing_scripts.audiotools.convert.osba import convert_osba
+from ivas_processing_scripts.generation import config
+from ivas_processing_scripts.utils import apply_func_parallel
+
+SEED_RANDOM_NOISE = 0
+
+
+# function for converting nd numpy array to strings with 2 decimal digits
+def csv_formatdata(data):
+    for row in data:
+        yield ["%0.2f" % v for v in row]
+
+
+# function for searching sequences of the same character and replacing them by another string
+def replace_char_seq_with_string(str, char_seq, repl_str):
+    result = []
+
+    # find groups of consecutive letters
+    groups = ["".join(list(g)) for k, g in groupby(str)]
+
+    # limit the length of the replacement string by the length of the character sequence
+    repl_str = repl_str[: len(char_seq)]
+
+    # replace each occurrence of the sequence of characters
+    for g in groups:
+        if char_seq in g:
+            result.append(repl_str)
+        else:
+            result.append(g)
+
+    return "".join(result)
+
+# function for appending string to a filename before file extension
+def append_str_filename(filename, str_to_append):
+    p = Path(filename)
+    # combine the parent, the stem, the string to append, and the suffix
+    return p.parent / (p.stem + str_to_append + p.suffix)
+
+def generate_osba_items(
+    cfg: config.TestConfig,
+    logger: logging.Logger,
+):
+    """Generate OSBA items from FOA/HOA2/HOA3 and ISMn items based on scene description"""
+
+    # set the target level
+    if "loudness" not in cfg.__dict__:
+        cfg.loudness = -26
+
+    # set the fs
+    if "fs" not in cfg.__dict__:
+        cfg.fs = 48000
+
+    # set the pre-amble and post-amble
+    if "preamble" not in cfg.__dict__:
+        cfg.preamble = 0.0
+
+    if "postamble" not in cfg.__dict__:
+        cfg.postamble = 0.0
+
+    # set the low-level random noise option
+    if "add_low_level_random_noise" not in cfg.__dict__:
+        cfg.add_low_level_random_noise = False
+
+    # set the listening lab designator
+    if "listening_lab" not in cfg.__dict__:
+        cfg.listening_lab = "l"
+
+    # set the language designator
+    if "language" not in cfg.__dict__:
+        cfg.language = "EN"
+
+    # set the experiment designator
+    if "exp" not in cfg.__dict__:
+        cfg.exp = "p07"
+
+    # set the provider
+    if "provider" not in cfg.__dict__:
+        cfg.provider = "g"
+
+    # set the prefix for all input filenames
+    if "use_input_prefix" not in cfg.__dict__:
+        cfg.use_input_prefix = ""
+    else:
+        # replace file designators
+        cfg.use_input_prefix = replace_char_seq_with_string(
+            cfg.use_input_prefix, "l", cfg.listening_lab
+        )
+        cfg.use_input_prefix = replace_char_seq_with_string(
+            cfg.use_input_prefix, "LL", cfg.language
+        )
+        cfg.use_input_prefix = replace_char_seq_with_string(
+            cfg.use_input_prefix, "eee", cfg.exp
+        )
+
+    # set the prefix for all output filenames
+    if "use_output_prefix" not in cfg.__dict__:
+        cfg.use_output_prefix = ""
+    else:
+        # replace file designators
+        cfg.use_output_prefix = replace_char_seq_with_string(
+            cfg.use_output_prefix, "l", cfg.listening_lab
+        )
+        cfg.use_output_prefix = replace_char_seq_with_string(
+            cfg.use_output_prefix, "eee", cfg.exp
+        )
+
+    # set multiprocessing
+    if "multiprocessing" not in cfg.__dict__:
+        cfg.multiprocessing = False
+
+    apply_func_parallel(
+        generate_OSBA_scene,
+        zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)),
+        type = "mp" if cfg.multiprocessing else None,
+        show_progress = None,
+    )
+
+    return
+
+
+def generate_OSBA_scene(
+    scene_name: str, scene: dict, cfg: config.TestConfig, logger: logging.Logger
+):
+    """
+    Processes a single scene to generate an OSBA item.
+
+    Args:
+        scene_name (str): The name of the scene being processed.
+        scene (dict): A dictionary containing scene description, including source files, azimuth, elevation, and other parameters.
+        cfg (config.TestConfig): Configuration object containing settings for processing, such as sampling rate and loudness levels.
+        logger (logging.Logger): Logger instance for logging information and errors.
+
+    Expected Behavior:
+    - Reads audio source files and processes them based on the scene description.
+    - Generates an OSBA object.
+    - Writes the processed audio to output files.
+    - Handles various audio formats (e.g., FOA, HOA2, HOA3) and applies transformations like loudness normalization, trimming, and padding.
+ """ + + logger.info( + f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" + ) + + # extract the number of audio sources + N_inputs = len(np.atleast_1d(scene["input"])) + N_ISMs = N_inputs-1 + + # get input and output filenames + osba_format = f"ISM{N_ISMs}SBA{cfg.sba_order}" + output_filename = Path(scene["output"]).parent / (cfg.use_output_prefix + Path(scene["output"]).name) + + # initialize output dirs + dir_path = output_filename.parent + if dir_path and not dir_path.exists(): + dir_path.mkdir(parents=True, exist_ok=True) + + # initialize output OSBA object + y = audio.OSBAAudio(osba_format) + + # repeat for all source files + for i in range(N_inputs): + + # parse parameters from the scene description + source_file = ( + scene["input"][i] if isinstance(scene["input"], list) else scene["input"] + ) + source_azi = ( + scene["azimuth"][i] + if isinstance(scene["azimuth"], list) + else scene["azimuth"] + ) + source_ele = ( + scene["elevation"][i] + if isinstance(scene["elevation"], list) + else scene["elevation"] + ) + + # read the overlap length + if "shift" in scene.keys(): + source_shift = ( + scene["shift"][i] + if isinstance(scene["shift"], list) + else scene["shift"] + ) + else: + source_shift = 0.0 + + # read the level + if "level" in scene.keys(): + level = ( + scene["level"][i] + if isinstance(scene["level"], list) + else scene["level"] + ) + else: + level = -26 + + logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") + + # get input filename + input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) + + # get the number of channels from the .wav file header + wav_header = audiofile.parse_wave_header(input_filename) + N_channels = wav_header['channels'] + + if N_channels == 1: + fmt = "MONO" + elif N_channels == 2: + fmt = "STEREO" + elif N_channels == 4: + fmt = "FOA" + elif N_channels == 9: + fmt = "HOA2" + elif N_channels == 16: + fmt = "HOA3" + else: + logger.info(f"Error: Input format of the source file with {N_channels} channels is not supported!") + sys.exit(-1) + + if fmt == "MONO": + # generate ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) 
+ y.metadata_files.insert(i-1, f"{output_filename}.{i-1}.csv") + + # read source file + x = audio.fromfile( fmt, input_filename, fs=cfg.fs ) + + # adjust the level of the source file + if fmt in ["FOA", "HOA2", "HOA3"]: + x.audio, _ = loudness_norm(x, level, loudness_format="STEREO", rms=True) + else: + x.audio, _ = loudness_norm(x, level, loudness_format="MONO") + + # shift the source signal (positive shift creates overlap, negative shift creates a gap) + if int(floor(-source_shift)) != 0: + x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0]) + + # get the number of frames (multiple of 20ms) + frame_len = int(x.fs / 50) + N_frames = int(len(x.audio) / frame_len) + + # pad with zeros to ensure that the signal length is a multiple of 20ms + if len(x.audio) % frame_len != 0: + # pad the source signal + N_pad = int(frame_len - len(x.audio) % frame_len) + x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) + + if fmt in ["FOA", "HOA2", "HOA3"]: + # copy FOA/HOA2/HOA3 signal to the OSBA oject + y.audio = x.audio + y.fs = x.fs + else: + # pad ISM signal with zeros to have the same length as the SBA signal + N_pad = y.audio.shape[0] - x.audio.shape[0] + if N_pad != 0: + x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) + + # append ISM signal to the OSBA object (ISM comes first !!!) + y.audio = np.insert(y.audio, [i-1], x.audio, axis=1) + + # append pre-amble and post-amble to all sources + y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble]) + + # add random noise + if cfg.add_low_level_random_noise: + # create uniformly distributed noise between -4 and 4 + np.random.seed(SEED_RANDOM_NOISE) + noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") + + # superimpose + y.audio += noise + + # generate ISM metadata files + y_meta = None + for i in range(1, N_ISMs + 1): + # parse metadata parameters from the scene description + source_azi = ( + scene["azimuth"][i] + if isinstance(scene["azimuth"], list) + else scene["azimuth"] + ) + source_ele = ( + scene["elevation"][i] + if isinstance(scene["elevation"], list) + else scene["elevation"] + ) + + N_frames = int(np.rint((len(y.audio) / y.fs * 50))) + + # read azimuth information and convert to an array + if isinstance(source_azi, str): + if ":" in source_azi: + # start with the initial azimuth value and apply step N_frames times + source_azi = source_azi.split(":") + azi = np.arange( + float(eval(source_azi[0])), + float(eval(source_azi[0])) + N_frames * float(eval(source_azi[1])), + float(eval(source_azi[1])) + ) + else: + # replicate static azimuth value N_frames times + azi = np.repeat(float(eval(source_azi)), N_frames) + else: + # replicate static azimuth value N_frames times + azi = np.repeat(float(source_azi), N_frames) + + # convert azimuth from 0 .. 360 to -180 .. +180 + azi = (azi + 180) % 360 - 180 + + # check if azimuth is from -180 .. 
+180 + if any(azi > 180) or any(azi < -180): + logger.error( + f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}" + ) + + # read elevation information and convert to an array + if isinstance(source_ele, str): + if ":" in source_ele: + # convert into array (initial_value:step:stop_value) + # note: the stop_value value is +-90 degrees depending on the sign of the step + source_ele = source_ele.split(":") + ele = np.arange( + float(eval(source_ele[0])), + np.sign(float(eval(source_ele[1]))) * 90, + float(eval(source_ele[1])) + )[:N_frames] + + # repeat the last elevation value, if array is shorter than N_frames + if len(ele) < N_frames: + ele = np.append(ele, np.full(N_frames - len(ele), ele[-1])) + else: + # replicate static elevation value N_frames times + ele = np.repeat(float(eval(source_ele)), N_frames) + else: + # replicate static elevation value N_frames times + ele = np.repeat(float(source_ele), N_frames) + + # check if elevation is from -90 .. +90 + if any(ele > 90) or any(ele < -90): + logger.error( + f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}" + ) + + # arrange all metadata fields column-wise into a matrix + x_meta = np.column_stack((azi, ele)) + + # write to .csv output metadata file + with open( + y.metadata_files[i-1], + "w", + newline="", + encoding="utf-8", + ) as f: + # create csv writer + writer = csv.writer(f) + + # write all rows to the .csv file + writer.writerows(csv_formatdata(x_meta)) + + y.init_metadata() # this is needed to populate 'y.object_pos[]' + + # write the OSBA output to .wav file in an interleaved format + audiofile.write( output_filename, y.audio, y.fs ) + + # convert the OSBA output to BINAURAL, if option was chosen + if cfg.binaural_output: + binaudio = audio.fromtype("BINAURAL") + binaudio.fs = y.fs + convert_osba(y, binaudio) + binaural_output_filename = output_filename.with_name(output_filename.stem + "_BINAURAL" + output_filename.suffix) + audiofile.write( + binaural_output_filename, + binaudio.audio, + binaudio.fs, + ) + + return diff --git a/ivas_processing_scripts/generation/process_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py similarity index 64% rename from ivas_processing_scripts/generation/process_stereo_items.py rename to ivas_processing_scripts/generation/generate_stereo_items.py index 1875f148..1c9251ad 100644 --- a/ivas_processing_scripts/generation/process_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -34,11 +34,12 @@ import logging import os from itertools import groupby, repeat from math import floor +from pathlib import Path import numpy as np -from ivas_processing_scripts.audiotools import audio, audiofile -from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness +from ivas_processing_scripts.audiotools import audio, audioarray, audiofile +from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness, loudness_norm from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_stereo from ivas_processing_scripts.generation import config from ivas_processing_scripts.utils import apply_func_parallel @@ -153,7 +154,7 @@ def generate_stereo_items( # set the prefix for all output filenames if "use_output_prefix" not in cfg.__dict__: - cfg.use_output_prefix = None + cfg.use_output_prefix = "" else: # replace file designators cfg.use_output_prefix = replace_char_seq_with_string( @@ -165,13 +166,13 @@ def generate_stereo_items( # set multiprocessing if "multiprocessing" not in cfg.__dict__: - 
cfg.multiprocessing = True + cfg.multiprocessing = False apply_func_parallel( generate_stereo_scene, zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)), - None, - "mp" if cfg.multiprocessing else None, + type = "mp" if cfg.multiprocessing else None, + show_progress = None, ) return @@ -180,131 +181,109 @@ def generate_stereo_items( def generate_stereo_scene( scene_name: str, scene: dict, cfg: config.TestConfig, logger: logging.Logger ): + """ + Processes a single scene to generate STEREO item. + + Args: + scene_name (str): The name of the scene being processed. + scene (dict): A dictionary containing scene description, including source files, azimuth, elevation, and other parameters. + cfg (config.TestConfig): Configuration object containing settings for processing, such as input/output paths, sampling rate, and loudness levels. + logger (logging.Logger): Logger instance for logging information and errors. + + Expected Behavior: + - Reads mono audio source files and processes them based on the scene description. + - Writes the processed STEREO audio to output file. + """ + logger.info( - f"Processing scene: {scene_name} out of {len(cfg.scenes)} scenes, name: {scene['name']}" + f"Processing scene: {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" ) # extract the number of audio sources - N_sources = len(np.atleast_1d(scene["source"])) + N_inputs = len(np.atleast_1d(scene["input"])) - # read the overlap length - if "overlap" in scene.keys(): - source_overlap = float(scene["overlap"]) - else: - source_overlap = 0.0 + # initialize output dirs + output_filename = Path(scene["output"]).parent / (cfg.use_output_prefix + Path(scene["output"]).name) + + dir_path = output_filename.parent + if dir_path and not dir_path.exists(): + dir_path.mkdir(parents=True, exist_ok=True) + + # initialize output audio object + y = audio.ChannelBasedAudio(cfg.format) + + for i in range(N_inputs): - len_s1 = 0 - y = audio.ChannelBasedAudio("STEREO") - for i in range(N_sources): # parse parameters from the scene description - source_file = np.atleast_1d(scene["source"])[i] + source_file = np.atleast_1d(scene["input"])[i] IR_file = np.atleast_1d(scene["IR"])[i] + # read the overlap length + if "shift" in scene.keys(): + source_shift = ( + scene["shift"][i] + if isinstance(scene["shift"], list) + else scene["shift"] + ) + else: + source_shift = 0.0 + + # read the level + if "level" in scene.keys(): + level = ( + scene["level"][i] + if isinstance(scene["level"], list) + else scene["level"] + ) + else: + level = -26 + logger.info(f"Convolving {source_file} with {IR_file}") - # read source file - x = audio.fromfile( - "MONO", - os.path.join( - cfg.input_path, - os.path.dirname(source_file), - cfg.use_input_prefix + os.path.basename(source_file), - ), - fs=cfg.fs, - ) + # get input filename and IR filename + input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) + IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) - # read the IR file - IR = audio.fromfile( - "STEREO", - os.path.join( - cfg.IR_path, - os.path.dirname(IR_file), - cfg.use_IR_prefix + os.path.basename(IR_file), - ), - fs=cfg.IR_fs, - ) + # read source file + x = audio.fromfile( "MONO", input_filename, fs=cfg.fs ) - if i == 0: - len_s1 = x.audio.shape[0] + # read the IR file (!must be in STEREO format!) 
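        # (note: reverb_stereo, as the reverb.py hunks later in this series show,
        #  convolves the mono source with the left and right IR channels
        #  separately and column-stacks the two results into an L/R pair, so a
        #  two-channel IR file is expected here)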
+ IR = audio.fromfile( "STEREO", IR_filename, fs=cfg.IR_fs ) - # convolve with stereo IR + # convolve mono source signal with stereo IR x = reverb_stereo(x, IR) # adjust the level of the stereo signal - _, scale_factor, _ = get_loudness(x, cfg.loudness, "STEREO") - x.audio *= scale_factor + x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") - # shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap) - if i > 0: - # get the length of the first source file - N_delay = len_s1 + # shift the source signal (positive shift creates overlap, negative shift creates a gap) + if int(floor(-source_shift)) != 0: + x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0]) - # add the shift - N_delay += int(-source_overlap * x.fs) + # get the number of frames (multiple of 20ms) + frame_len = int(x.fs / 50) + N_frames = int(len(x.audio) / frame_len) - # insert all-zero preamble - pre = np.zeros((N_delay, x.audio.shape[1])) - x.audio = np.concatenate([pre, x.audio]) + # pad with zeros to ensure that the signal length is a multiple of 20ms + if len(x.audio) % frame_len != 0: + N_pad = int(frame_len - len(x.audio) % frame_len) + x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) - # add source signal to the array of source signals - y.fs = x.fs if y.audio is None: + # add source signal to the array of all source signals y.audio = x.audio.copy() + y.fs = x.fs else: - # pad with zeros to have equal length of all source signals - if x.audio.shape[0] > y.audio.shape[0]: - y.audio = np.vstack( - ( - y.audio, - np.zeros( - ( - x.audio.shape[0] - y.audio.shape[0], - y.audio.shape[1], - ) - ), - ) - ) - elif y.audio.shape[0] > x.audio.shape[0]: - x.audio = np.vstack( - ( - x.audio, - np.zeros( - ( - y.audio.shape[0] - x.audio.shape[0], - x.audio.shape[1], - ) - ), - ) - ) - + # pad the signal with zeros to have the same length as the previous signal(s) + N_pad = y.audio.shape[0] - x.audio.shape[0] + if N_pad != 0: + x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) + # superimpose y.audio += x.audio # append pre-amble and post-amble to all sources - if cfg.preamble != 0.0: - # ensure that pre-amble is a multiple of 20ms - N_pre = int(floor(cfg.preamble * 50) / 50 * y.fs) - - # insert all-zero preamble to all sources - pre = np.zeros((N_pre, y.audio.shape[1])) - y.audio = np.concatenate([pre, y.audio]) - - if cfg.postamble != 0.0: - # ensure that post-mable is a multiple of 20ms - N_post = int(floor(cfg.postamble * 50) / 50 * y.fs) - - # append all-zero postamble to all sources - post = np.zeros((N_post, y.audio.shape[1])) - y.audio = np.concatenate([y.audio, post]) - - # pad with zeros to ensure that the signal length is a multiple of 20ms - N_frame = y.fs / 50 - if y.audio.shape[0] % N_frame != 0: - N_pad = int(N_frame - y.audio.shape[0] % N_frame) - - # insert all-zero postamble - post = np.zeros((N_pad, y.audio.shape[1])) - y.audio = np.concatenate([y.audio, post]) + y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble]) # add random noise if cfg.add_low_level_random_noise: @@ -315,13 +294,6 @@ def generate_stereo_scene( # superimpose y.audio += noise - # write the reverberated audio into output file - audiofile.write( - os.path.join( - cfg.output_path, - os.path.dirname(scene["name"]), - cfg.use_output_prefix + os.path.basename(scene["name"]), - ), - y.audio, - y.fs, - ) + # write the output STEREO audio signal into output file + audiofile.write( output_filename, 
y.audio, y.fs ) + diff --git a/ivas_processing_scripts/generation/process_ism1_items.py b/ivas_processing_scripts/generation/process_ism1_items.py deleted file mode 100644 index c7a48684..00000000 --- a/ivas_processing_scripts/generation/process_ism1_items.py +++ /dev/null @@ -1,356 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2024 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
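# (the whole of process_ism1_items.py is removed here; judging by the later
#  hunks, its role is taken over by the generic generate_ismN_items.py, which
#  derives the ISM{N} format from the number of inputs listed in the scene
#  description instead of hard-coding a single object)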
-# - -import csv -import logging -import os -from itertools import groupby, repeat -from math import floor - -import numpy as np - -from ivas_processing_scripts.audiotools import audio, audiofile -from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness -from ivas_processing_scripts.generation import config -from ivas_processing_scripts.utils import apply_func_parallel - -SEED_RANDOM_NOISE = 0 - - -# function for converting nd numpy array to strings with 2 decimal digits -def csv_formatdata(data): - for row in data: - yield ["%0.2f" % v for v in row] - - -# function for searching sequences of same the same character and replacing it by another string -def replace_char_seq_with_string(str, char_seq, repl_str): - result = [] - - # find groups of consecutive letters - groups = ["".join(list(g)) for k, g in groupby(str)] - - # limit the length of the replacement string by the length of the character sequence - repl_str = repl_str[: len(char_seq)] - - # replace each occurence of the sequence of characters - for g in groups: - if char_seq in g: - result.append(repl_str) - else: - result.append(g) - - return "".join(result) - - -def generate_ism1_items( - cfg: config.TestConfig, - logger: logging.Logger, -): - """Generate ISM2 items with metadata from mono items based on scene description""" - - # set the target level - if "loudness" not in cfg.__dict__: - cfg.loudness = -26 - - # set the fs - if "fs" not in cfg.__dict__: - cfg.fs = 48000 - - # set the pre-amble and post-amble - if "preamble" not in cfg.__dict__: - cfg.preamble = 0.0 - - if "postamble" not in cfg.__dict__: - cfg.postamble = 0.0 - - # set the pre-amble and post-amble - if "add_low_level_random_noise" not in cfg.__dict__: - cfg.add_low_level_random_noise = False - - # set the listening lab designator - if "listening_lab" not in cfg.__dict__: - cfg.listening_lab = "l" - - # set the language designator - if "language" not in cfg.__dict__: - cfg.language = "EN" - - # set the experiment designator - if "exp" not in cfg.__dict__: - cfg.exp = "p06" - - # set the provider - if "provider" not in cfg.__dict__: - cfg.provider = "g" - - # set the prefix for all input filenames - if "use_input_prefix" not in cfg.__dict__: - cfg.use_input_prefix = "" - else: - # replace file designators - cfg.use_input_prefix = replace_char_seq_with_string( - cfg.use_input_prefix, "l", cfg.listening_lab - ) - cfg.use_input_prefix = replace_char_seq_with_string( - cfg.use_input_prefix, "LL", cfg.language - ) - cfg.use_input_prefix = replace_char_seq_with_string( - cfg.use_input_prefix, "eee", cfg.exp - ) - - # set the prefix for all output filenames - if "use_output_prefix" not in cfg.__dict__: - cfg.use_output_prefix = None - else: - # replace file designators - cfg.use_output_prefix = replace_char_seq_with_string( - cfg.use_output_prefix, "l", cfg.listening_lab - ) - cfg.use_output_prefix = replace_char_seq_with_string( - cfg.use_output_prefix, "eee", cfg.exp - ) - - # set multiprocessing - if "multiprocessing" not in cfg.__dict__: - cfg.multiprocessing = True - - apply_func_parallel( - generate_ism1_scene, - zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)), - None, - "mp" if cfg.multiprocessing else None, - ) - - return - - -def generate_ism1_scene( - scene_name: str, scene: dict, cfg: config.TestConfig, logger: logging.Logger -): - logger.info( - f"Processing {scene_name} out of {len(cfg.scenes)} scenes, name: {scene['name']}" - ) - - # extract the number of audio sources - N_sources = 
len(np.atleast_1d(scene["source"])) - - # initialize output arrays - y = audio.ChannelBasedAudio("MONO") - y_meta = None - - # read the overlap length - if "overlap" in scene.keys(): - source_overlap = float(scene["overlap"]) - else: - source_overlap = 0.0 - - logger.info( - f"Encoding {scene['source']} at position(s) {scene['azimuth']},{scene['elevation']}" - ) - - # repeat for all source files - for i in range(N_sources): - # parse parameters from the scene description - source_file = ( - scene["source"][i] if isinstance(scene["source"], list) else scene["source"] - ) - - # read source file - x = audio.fromfile( - "MONO", - os.path.join( - cfg.input_path, - os.path.dirname(source_file), - cfg.use_input_prefix + os.path.basename(source_file), - ), - fs=cfg.fs, - ) - - # get the number of frames (multiple of 20ms) - N_frames = int(len(x.audio) / x.fs * 50) - frame_len = int(x.fs / 50) - - # trim the samples from the end to ensure that the signal length is a multiple of 20ms - x.audio = x.audio[: N_frames * frame_len] - - # adjust the level of the source file - _, scale_factor, _ = get_loudness(x, cfg.loudness, "MONO") - x.audio *= scale_factor - - # shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap) - if i > 0: - # get the length of the first source file - N_delay = len(y.audio) - - # add the shift value (ensure that the shift is a multiple of 20ms) - N_delay += int(floor(-source_overlap * 50) / 50 * x.fs) - - # insert all-zero signal - pre = np.zeros((N_delay, 1)) - x.audio = np.concatenate([pre, x.audio]) - - # pad with zeros to ensure that the signal length is a multiple of 20ms - if len(x.audio) % frame_len != 0: - # pad the source signal - N_pad = int(frame_len - len(x.audio) % frame_len) - post = np.zeros((N_pad, 1)) - x.audio = np.concatenate([x.audio, post]) - - # superimpose all source signals together - y.fs = x.fs - if y.audio is None: - y.audio = x.audio.copy() - else: - y.audio.resize(x.audio.shape, refcheck=False) - y.audio += x.audio - - # append pre-amble and post-amble to all sources - if cfg.preamble != 0.0: - # ensure that pre-amble is a multiple of 20ms - N_pre = int(floor(cfg.preamble * 50) / 50 * y.fs) - - # insert all-zero preamble to all sources - pre = np.zeros((N_pre, y.audio.shape[1])) - y.audio = np.concatenate([pre, y.audio]) - - if cfg.postamble != 0.0: - # ensure that post-amble is a multiple of 20ms - N_post = int(floor(cfg.postamble * 50) / 50 * y.fs) - - # append all-zero postamble to all sources - post = np.zeros((N_post, y.audio.shape[1])) - y.audio = np.concatenate([y.audio, post]) - - # add random noise - if cfg.add_low_level_random_noise: - # create uniformly distributed noise between -4 and 4 - np.random.seed(SEED_RANDOM_NOISE) - noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") - - # superimpose - y.audio += noise - - # process azimuth and elevation - source_azi = scene["azimuth"] - source_ele = scene["elevation"] - - N_frames = int(len(y.audio) / y.fs * 50) - - # read azimuth information and convert to an array - if isinstance(source_azi, str): - if ":" in source_azi: - # start with the initial azimuth value and apply step N_frames times - source_azi = source_azi.split(":") - azi = np.arange( - float(eval(source_azi[0])), - float(eval(source_azi[0])) + N_frames * float(eval(source_azi[1])), - float(eval(source_azi[1])), - ) - else: - # replicate static azimuth value N_frames times - azi = np.repeat(float(eval(source_azi)), N_frames) - else: - # replicate static 
azimuth value N_frames times - azi = np.repeat(float(source_azi), N_frames) - - # convert azimuth from 0 .. 360 to -180 .. +180 - azi = (azi + 180) % 360 - 180 - - # check, if azimuth is from -180 .. +180 - if any(azi > 180) or any(azi < -180): - logger.error( - f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}" - ) - - # read elevation information and convert to an array - if isinstance(source_ele, str): - if ":" in source_ele: - # convert into array (initial_value:step:stop_value) - # note: the stop_value value is +-90 degrees depending on the sign of the step - source_ele = source_ele.split(":") - ele = np.arange( - float(eval(source_ele[0])), - np.sign(float(eval(source_ele[1]))) * 90, - float(eval(source_ele[1])), - )[:N_frames] - - # repeat the last elevation value, if array is shorter than N_frames - if len(ele) < N_frames: - ele = np.append(ele, np.full(N_frames - len(ele), ele[-1])) - else: - # replicate static elevation value N_frames times - ele = np.repeat(float(eval(source_ele)), N_frames) - else: - # replicate static elevation value N_frames times - ele = np.repeat(float(source_ele), N_frames) - - # check if elevation is from -90 .. +90 - if any(ele > 90) or any(ele < -90): - logger.error( - f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}" - ) - - # arrange all metadata fields column-wise into a matrix - y_meta = np.column_stack((azi, ele)) - - # write ISM audio stream to the output file - audiofile.write( - os.path.join( - cfg.output_path, - os.path.dirname(scene["name"]), - cfg.use_output_prefix + os.path.basename(scene["name"]), - ), - y.audio, - y.fs, - ) - - # write ISM metadata to the output file in .0.csv format - csv_filename = os.path.join( - cfg.output_path, - os.path.dirname(scene["name"]), - cfg.use_output_prefix + os.path.basename(scene["name"]) + ".0.csv", - ) - - with open( - csv_filename, - "w", - newline="", - encoding="utf-8", - ) as f: - # create csv writer - writer = csv.writer(f) - - # write all rows to the .csv file - writer.writerows(csv_formatdata(y_meta)) - - return -- GitLab From e15eb5ad4f7c9f98b3c950eb5bb74a85596c8fec Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 19 Jun 2025 10:44:51 +0200 Subject: [PATCH 07/42] formatting --- .../audiotools/wrappers/masaAnalyzer.py | 4 +- .../audiotools/wrappers/reverb.py | 11 ++- .../generation/__init__.py | 5 +- .../generation/generate_ambi_items.py | 44 ++++++--- .../generation/generate_ismN_items.py | 54 ++++++----- .../generation/generate_omasa_items.py | 93 +++++++++++-------- .../generation/generate_osba_items.py | 91 ++++++++++-------- .../generation/generate_stereo_items.py | 35 ++++--- 8 files changed, 202 insertions(+), 135 deletions(-) diff --git a/ivas_processing_scripts/audiotools/wrappers/masaAnalyzer.py b/ivas_processing_scripts/audiotools/wrappers/masaAnalyzer.py index c826cdf4..b59a6bf7 100644 --- a/ivas_processing_scripts/audiotools/wrappers/masaAnalyzer.py +++ b/ivas_processing_scripts/audiotools/wrappers/masaAnalyzer.py @@ -73,7 +73,9 @@ def masaAnalyzer( binary = find_binary("masaAnalyzer") # enforce metadata_out_filename to be a Path object - if metadata_out_filename is not None and not isinstance(metadata_out_filename, Path): + if metadata_out_filename is not None and not isinstance( + metadata_out_filename, Path + ): metadata_out_filename = Path(metadata_out_filename) if num_tcs not in [1, 2]: diff --git a/ivas_processing_scripts/audiotools/wrappers/reverb.py b/ivas_processing_scripts/audiotools/wrappers/reverb.py index dc53a75c..55379b6b 
100644 --- a/ivas_processing_scripts/audiotools/wrappers/reverb.py +++ b/ivas_processing_scripts/audiotools/wrappers/reverb.py @@ -130,7 +130,7 @@ def reverb( output.audio, _ = read(tmp_output_file, nchannels=1, fs=tmp_input.fs) # remove trailing part (to ensure that the length of the output is the same as the input) - output.audio = output.audio[:-(IR.audio.shape[0]-1), :] + output.audio = output.audio[: -(IR.audio.shape[0] - 1), :] if old_fs: output.audio = resample_itu(output, old_fs) @@ -186,7 +186,7 @@ def reverb_stereo( y_right = reverb(input, IR_right, align=align) # combine into stereo output - y = audio.fromtype('STEREO') + y = audio.fromtype("STEREO") y.fs = input.fs y.audio = np.column_stack([y_left.audio, y_right.audio]) @@ -252,7 +252,7 @@ def reverb_foa( y_z = reverb(input, IR_z, align=align) # combine into FOA output - y = audio.fromtype('FOA') + y = audio.fromtype("FOA") y.fs = input.fs y.audio = np.column_stack([y_w.audio, y_x.audio, y_y.audio, y_z.audio]) @@ -303,7 +303,7 @@ def reverb_hoa2( ych.append(reverb(input, IR, align=align)) # combine into HOA2 output - y = audio.fromtype('HOA2') + y = audio.fromtype("HOA2") y.fs = input.fs y.audio = np.column_stack( [ @@ -321,6 +321,7 @@ def reverb_hoa2( return y + def reverb_hoa3( input: Audio, hoa3_IR: Audio, @@ -365,7 +366,7 @@ def reverb_hoa3( ych.append(reverb(input, IR, align=align)) # combine into HOA3 output - y = audio.fromtype('HOA3') + y = audio.fromtype("HOA3") y.fs = input.fs y.audio = np.column_stack( [ diff --git a/ivas_processing_scripts/generation/__init__.py b/ivas_processing_scripts/generation/__init__.py index a3c77e3a..b41b1b44 100755 --- a/ivas_processing_scripts/generation/__init__.py +++ b/ivas_processing_scripts/generation/__init__.py @@ -31,6 +31,7 @@ # import logging + import yaml from ivas_processing_scripts.constants import ( @@ -41,10 +42,10 @@ from ivas_processing_scripts.constants import ( from ivas_processing_scripts.generation import ( config, generate_ambi_items, - generate_stereo_items, generate_ismN_items, generate_omasa_items, generate_osba_items, + generate_stereo_items, ) from ivas_processing_scripts.utils import create_dir @@ -61,7 +62,7 @@ def logging_init(args, cfg): logger.addHandler(console_handler) # main log file - file_handler = logging.FileHandler( f"{cfg.format}{LOGGER_SUFFIX}", mode="w" ) + file_handler = logging.FileHandler(f"{cfg.format}{LOGGER_SUFFIX}", mode="w") file_handler.setFormatter(logging.Formatter(LOGGER_FORMAT, datefmt=LOGGER_DATEFMT)) file_handler.setLevel(logging.DEBUG if args.debug else logging.INFO) logger.addHandler(file_handler) diff --git a/ivas_processing_scripts/generation/generate_ambi_items.py b/ivas_processing_scripts/generation/generate_ambi_items.py index f1885727..add1d472 100644 --- a/ivas_processing_scripts/generation/generate_ambi_items.py +++ b/ivas_processing_scripts/generation/generate_ambi_items.py @@ -41,8 +41,15 @@ import numpy as np from ivas_processing_scripts.audiotools import audio, audioarray, audiofile, convert from ivas_processing_scripts.audiotools.convert.objectbased import convert_objectbased from ivas_processing_scripts.audiotools.convert.scenebased import convert_scenebased -from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness, loudness_norm -from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_foa, reverb_hoa2, reverb_hoa3 +from ivas_processing_scripts.audiotools.wrappers.bs1770 import ( + get_loudness, + loudness_norm, +) +from ivas_processing_scripts.audiotools.wrappers.reverb import ( + 
reverb_foa, + reverb_hoa2, + reverb_hoa3, +) from ivas_processing_scripts.generation import config from ivas_processing_scripts.utils import apply_func_parallel @@ -169,8 +176,8 @@ def generate_ambi_items( apply_func_parallel( generate_ambi_scene, zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)), - type = "mp" if cfg.multiprocessing else None, - show_progress = None, + type="mp" if cfg.multiprocessing else None, + show_progress=None, ) return @@ -192,7 +199,7 @@ def generate_ambi_scene( - Reads mono audio source files and processes them based on the scene description. - Writes the processed FOA/HOA2/HOA3 audio to the output file. """ - + logger.info( f"Processing scene: {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" ) @@ -201,7 +208,9 @@ def generate_ambi_scene( N_inputs = len(np.atleast_1d(scene["input"])) # initialize output dirs - output_filename = Path(scene["output"]).parent / (cfg.use_output_prefix + Path(scene["output"]).name) + output_filename = Path(scene["output"]).parent / ( + cfg.use_output_prefix + Path(scene["output"]).name + ) dir_path = output_filename.parent if dir_path and not dir_path.exists(): @@ -211,7 +220,6 @@ def generate_ambi_scene( y = audio.SceneBasedAudio(cfg.format) for i in range(N_inputs): - # parse parameters from the scene description source_file = np.atleast_1d(scene["input"])[i] IR_file = np.atleast_1d(scene["IR"])[i] @@ -239,14 +247,16 @@ def generate_ambi_scene( logger.info(f"Convolving {source_file} with {IR_file}") # get input filename and IR filename - input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) - IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) + input_filename = Path(source_file).parent / ( + cfg.use_input_prefix + Path(source_file).name + ) + IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) # read source file - x = audio.fromfile( "MONO", input_filename, fs=cfg.fs ) + x = audio.fromfile("MONO", input_filename, fs=cfg.fs) # read the IR file (!must be in target format!) 
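        # (the "target format" requirement means the IR channel count has to match
        #  cfg.format: 4 channels for FOA, 9 for HOA2, 16 for HOA3, i.e. the same
        #  counts that the OMASA/OSBA scripts below use for format detection)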
- IR = audio.fromfile( cfg.format, IR_filename, fs=cfg.IR_fs ) + IR = audio.fromfile(cfg.format, IR_filename, fs=cfg.IR_fs) # convolve with the FOA/HOA2/HOA3 IR if cfg.format == "FOA": @@ -280,8 +290,10 @@ def generate_ambi_scene( # adjust the signal length (trim from the end or pad with zeros) to align its length with the previous signal(s) N_pad = y.audio.shape[0] - x.audio.shape[0] if N_pad != 0: - x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) - + x.audio = audioarray.trim( + x.audio, x.fs, limits=[0, -N_pad], samples=True + ) + # superimpose y.audio += x.audio @@ -298,14 +310,16 @@ def generate_ambi_scene( y.audio += noise # write the FOA/HOA2/HOA3 audio into output file - audiofile.write( output_filename, y.audio, y.fs ) + audiofile.write(output_filename, y.audio, y.fs) # convert to BINAURAL, if option was chosen if cfg.binaural_output: binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_scenebased(y, binaudio) - binaural_output_filename = output_filename.with_name(output_filename.stem + "_BINAURAL" + output_filename.suffix) + binaural_output_filename = output_filename.with_name( + output_filename.stem + "_BINAURAL" + output_filename.suffix + ) audiofile.write( binaural_output_filename, binaudio.audio, diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index ea5baf96..3af2ad7b 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -33,12 +33,16 @@ import csv import logging from itertools import groupby, repeat from math import floor -import numpy as np from pathlib import Path +import numpy as np + from ivas_processing_scripts.audiotools import audio, audioarray, audiofile from ivas_processing_scripts.audiotools.convert.objectbased import convert_objectbased -from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness, loudness_norm +from ivas_processing_scripts.audiotools.wrappers.bs1770 import ( + get_loudness, + loudness_norm, +) from ivas_processing_scripts.generation import config from ivas_processing_scripts.utils import apply_func_parallel @@ -71,7 +75,7 @@ def replace_char_seq_with_string(str, char_seq, repl_str): return "".join(result) -# function for appending string to a filename before file extension +# function for appending string to a filename before file extension def append_str_filename(filename, str_to_append): p = Path(filename) return p.parent / (p.stem + str_to_append + p.suffix) @@ -152,8 +156,8 @@ def generate_ismN_items( apply_func_parallel( generate_ismN_scene, zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)), - type = "mp" if cfg.multiprocessing else None, - show_progress = None, + type="mp" if cfg.multiprocessing else None, + show_progress=None, ) return @@ -186,7 +190,9 @@ def generate_ismN_scene( # initialize output dirs ism_format = f"ISM{N_inputs}" - output_filename = Path(scene["output"]).parent / (cfg.use_output_prefix + Path(scene["output"]).name) + output_filename = Path(scene["output"]).parent / ( + cfg.use_output_prefix + Path(scene["output"]).name + ) dir_path = output_filename.parent if dir_path and not dir_path.exists(): @@ -197,7 +203,6 @@ def generate_ismN_scene( # repeat for all source files for i in range(N_inputs): - # parse parameters from the scene description source_file = ( scene["input"][i] if isinstance(scene["input"], list) else scene["input"] @@ -222,7 +227,7 @@ def generate_ismN_scene( ) 
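        # (for reference, a scene entry feeding this parser could look like the
        #  following; the key names are the ones read here, the values are
        #  hypothetical:
        #    scene1:
        #      input: ["speech/talker1.wav", "speech/talker2.wav"]
        #      shift: [0.0, 1.5]          # seconds, per source
        #      level: [-26, -31]          # per-source loudness target
        #      azimuth: ["30", "-30:1"]   # static, or "start:step" per 20ms frame
        #      elevation: ["0", "15"]
        #      output: "items/item01.wav")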
else: source_shift = 0.0 - + # read the level if "level" in scene.keys(): level = ( @@ -236,13 +241,15 @@ def generate_ismN_scene( logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") # get input filename - input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) + input_filename = Path(source_file).parent / ( + cfg.use_input_prefix + Path(source_file).name + ) # generate ISM metadata .csv filename (should end with .wav..0.csv, .wav.1.csv, ...) y.metadata_files.insert(i, str(output_filename.with_suffix(f".{i}.csv"))) # read source file - x = audio.fromfile( "MONO", input_filename, fs=cfg.fs ) + x = audio.fromfile("MONO", input_filename, fs=cfg.fs) # adjust the level of the source file x.audio, _ = loudness_norm(x, level, loudness_format="MONO") @@ -268,14 +275,16 @@ def generate_ismN_scene( # pad ISM signal with zeros to have the same length as the MASA signal N_pad = y.audio.shape[0] - x.audio.shape[0] if N_pad != 0: - x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) - + x.audio = audioarray.trim( + x.audio, x.fs, limits=[0, -N_pad], samples=True + ) + # append ISM signal to the ISM object y.audio = np.append(y.audio, x.audio, axis=1) # append pre-amble and post-amble to all sources y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble]) - + # add random noise if cfg.add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 @@ -301,7 +310,7 @@ def generate_ismN_scene( N_frames = int(np.rint((len(y.audio) / y.fs * 50))) - # read azimuth information and convert to an array + # read azimuth information and convert to an array if isinstance(source_azi, str): if ":" in source_azi: # start with the initial azimuth value and apply step N_frames times @@ -309,7 +318,7 @@ def generate_ismN_scene( azi = np.arange( float(eval(source_azi[0])), float(eval(source_azi[0])) + N_frames * float(eval(source_azi[1])), - float(eval(source_azi[1])) + float(eval(source_azi[1])), ) else: # replicate static azimuth value N_frames times @@ -317,7 +326,7 @@ def generate_ismN_scene( else: # replicate static azimuth value N_frames times azi = np.repeat(float(source_azi), N_frames) - + # convert azimuth from 0 .. 360 to -180 .. 
+180 azi = (azi + 180) % 360 - 180 @@ -336,7 +345,7 @@ def generate_ismN_scene( ele = np.arange( float(eval(source_ele[0])), np.sign(float(eval(source_ele[1]))) * 90, - float(eval(source_ele[1])) + float(eval(source_ele[1])), )[:N_frames] # repeat the last elevation value, if array is shorter than N_frames @@ -357,7 +366,7 @@ def generate_ismN_scene( # arrange all metadata fields column-wise into a matrix x_meta = np.column_stack((azi, ele)) - + # write to .csv output metadata file with open( y.metadata_files[i], @@ -371,20 +380,21 @@ def generate_ismN_scene( # write all rows to the .csv file writer.writerows(csv_formatdata(x_meta)) - y.init_metadata() # this is needed to populate 'y.object_pos[]' + y.init_metadata() # this is needed to populate 'y.object_pos[]' # write the OMASA output to .wav file in an interleaved format - audiofile.write( output_filename, y.audio, y.fs ) + audiofile.write(output_filename, y.audio, y.fs) # convert to ISM output to BINAURAL, if option was chosen if cfg.binaural_output: binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_objectbased(y, binaudio) - binaural_output_filename = output_filename.with_name(output_filename.stem + "_BINAURAL" + output_filename.suffix) + binaural_output_filename = output_filename.with_name( + output_filename.stem + "_BINAURAL" + output_filename.suffix + ) audiofile.write( binaural_output_filename, binaudio.audio, binaudio.fs, ) - diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index f4ba7711..3cbc94bf 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -36,12 +36,16 @@ import sys from itertools import groupby, repeat from math import floor from pathlib import Path + import numpy as np -from ivas_processing_scripts.audiotools import audio, audiofile, audioarray -from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness, loudness_norm -from ivas_processing_scripts.audiotools.convert.scenebased import render_sba_to_masa +from ivas_processing_scripts.audiotools import audio, audioarray, audiofile from ivas_processing_scripts.audiotools.convert.omasa import convert_omasa +from ivas_processing_scripts.audiotools.convert.scenebased import render_sba_to_masa +from ivas_processing_scripts.audiotools.wrappers.bs1770 import ( + get_loudness, + loudness_norm, +) from ivas_processing_scripts.generation import config from ivas_processing_scripts.utils import apply_func_parallel @@ -72,13 +76,15 @@ def replace_char_seq_with_string(str, char_seq, repl_str): result.append(g) return "".join(result) - -# function for appending string to a filename before file extension + + +# function for appending string to a filename before file extension def append_str_filename(filename, str_to_append): p = Path(filename) # Combine the stem, the string to append, and the suffix return p.parent / (p.stem + str_to_append + p.suffix) + def generate_omasa_items( cfg: config.TestConfig, logger: logging.Logger, @@ -154,8 +160,8 @@ def generate_omasa_items( apply_func_parallel( generate_OMASA_scene, zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)), - type = "mp" if cfg.multiprocessing else None, - show_progress = None, + type="mp" if cfg.multiprocessing else None, + show_progress=None, ) return @@ -183,14 +189,16 @@ def generate_OMASA_scene( logger.info( f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" ) - + # 
extract the number of audio sources N_inputs = len(np.atleast_1d(scene["input"])) - N_ISMs = N_inputs-1 + N_ISMs = N_inputs - 1 # get output filename omasa_format = f"ISM{N_ISMs}MASA{cfg.masa_tc}DIR{cfg.masa_dirs}" - output_filename = Path(scene["output"]).parent / (cfg.use_output_prefix + Path(scene["output"]).name) + output_filename = Path(scene["output"]).parent / ( + cfg.use_output_prefix + Path(scene["output"]).name + ) # initialize output dirs dir_path = output_filename.parent @@ -202,7 +210,6 @@ def generate_OMASA_scene( # repeat for all source files for i in range(N_inputs): - # parse parameters from the scene description source_file = ( scene["input"][i] if isinstance(scene["input"], list) else scene["input"] @@ -227,7 +234,7 @@ def generate_OMASA_scene( ) else: source_shift = 0.0 - + # read the level if "level" in scene.keys(): level = ( @@ -241,11 +248,13 @@ def generate_OMASA_scene( logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") # get input filename - input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) - + input_filename = Path(source_file).parent / ( + cfg.use_input_prefix + Path(source_file).name + ) + # get the number of channels from the .wav file header wav_header = audiofile.parse_wave_header(input_filename) - N_channels = wav_header['channels'] + N_channels = wav_header["channels"] if N_channels == 1: fmt = "MONO" @@ -258,25 +267,27 @@ def generate_OMASA_scene( elif N_channels == 16: fmt = "HOA3" else: - logger.info(f"Error: Input format of the source file with {N_channels} channels is not supported!") + logger.info( + f"Error: Input format of the source file with {N_channels} channels is not supported!" + ) sys.exit(-1) - + if fmt in ["FOA", "HOA2", "HOA3"]: # generate MASA metadata .met filename (should end with .met) y.metadata_files.append(output_filename.with_suffix(".met")) elif fmt == "MONO": # generate ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) 
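        # (nb: pathlib's with_suffix() replaces the final ".wav" suffix, so the
        #  name produced here is "item.0.csv" rather than the "item.wav.0.csv"
        #  pattern the comment above suggests; the OSBA variant below keeps the
        #  ".wav" part by formatting f"{output_filename}.{i-1}.csv" as a string)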
- y.metadata_files.insert(i-1, output_filename.with_suffix(f".{i-1}.csv")) - + y.metadata_files.insert(i - 1, output_filename.with_suffix(f".{i-1}.csv")) + # read source file - x = audio.fromfile( fmt, input_filename, fs=cfg.fs ) + x = audio.fromfile(fmt, input_filename, fs=cfg.fs) # adjust the level of the source file if fmt in ["FOA", "HOA2", "HOA3"]: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO", rms=True) else: x.audio, _ = loudness_norm(x, level, loudness_format="MONO") - + # shift the source signal (positive shift creates overlap, negative shift creates a gap) if int(floor(-source_shift)) != 0: x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0]) @@ -284,7 +295,7 @@ def generate_OMASA_scene( # get the number of frames (multiple of 20ms) frame_len = int(x.fs / 50) N_frames = int(len(x.audio) / frame_len) - + # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: # pad the source signal @@ -293,7 +304,9 @@ def generate_OMASA_scene( # convert FOA/HOA2/HOA3 to MASA if fmt in ["FOA", "HOA2", "HOA3"]: - x_masa = audio.MetadataAssistedSpatialAudio(f"MASA{cfg.masa_tc}DIR{cfg.masa_dirs}") + x_masa = audio.MetadataAssistedSpatialAudio( + f"MASA{cfg.masa_tc}DIR{cfg.masa_dirs}" + ) x_masa.metadata_file = y.metadata_files[i] render_sba_to_masa(x, x_masa) y.audio = x_masa.audio @@ -302,14 +315,16 @@ def generate_OMASA_scene( # pad ISM signal with zeros to have the same length as the MASA signal N_pad = y.audio.shape[0] - x.audio.shape[0] if N_pad != 0: - x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) - + x.audio = audioarray.trim( + x.audio, x.fs, limits=[0, -N_pad], samples=True + ) + # append ISM signal to the OMASA object (ISM comes first !!!) - y.audio = np.insert(y.audio, [i-1], x.audio, axis=1) + y.audio = np.insert(y.audio, [i - 1], x.audio, axis=1) # append pre-amble and post-amble to all sources y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble]) - + # add random noise if cfg.add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 @@ -336,7 +351,7 @@ def generate_OMASA_scene( N_frames = int(np.rint((len(y.audio) / y.fs * 50))) - # read azimuth information and convert to an array + # read azimuth information and convert to an array if isinstance(source_azi, str): if ":" in source_azi: # start with the initial azimuth value and apply step N_frames times @@ -344,7 +359,7 @@ def generate_OMASA_scene( azi = np.arange( float(eval(source_azi[0])), float(eval(source_azi[0])) + N_frames * float(eval(source_azi[1])), - float(eval(source_azi[1])) + float(eval(source_azi[1])), ) else: # replicate static azimuth value N_frames times @@ -352,7 +367,7 @@ def generate_OMASA_scene( else: # replicate static azimuth value N_frames times azi = np.repeat(float(source_azi), N_frames) - + # convert azimuth from 0 .. 360 to -180 .. 
+180 azi = (azi + 180) % 360 - 180 @@ -371,7 +386,7 @@ def generate_OMASA_scene( ele = np.arange( float(eval(source_ele[0])), np.sign(float(eval(source_ele[1]))) * 90, - float(eval(source_ele[1])) + float(eval(source_ele[1])), )[:N_frames] # repeat the last elevation value, if array is shorter than N_frames @@ -392,10 +407,10 @@ def generate_OMASA_scene( # arrange all metadata fields column-wise into a matrix x_meta = np.column_stack((azi, ele)) - + # write to .csv output metadata file with open( - y.metadata_files[i-1], + y.metadata_files[i - 1], "w", newline="", encoding="utf-8", @@ -406,21 +421,23 @@ def generate_OMASA_scene( # write all rows to the .csv file writer.writerows(csv_formatdata(x_meta)) - y.init_metadata() # this is needed to populate 'y.object_pos[]' + y.init_metadata() # this is needed to populate 'y.object_pos[]' # write the OMASA output to .wav file in an interleaved format - audiofile.write( output_filename, y.audio, y.fs ) - + audiofile.write(output_filename, y.audio, y.fs) + # convert to OMASA output to BINAURAL, if option was chosen if cfg.binaural_output: binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_omasa(y, binaudio) - binaural_output_filename = output_filename.with_name(output_filename.stem + "_BINAURAL" + output_filename.suffix) + binaural_output_filename = output_filename.with_name( + output_filename.stem + "_BINAURAL" + output_filename.suffix + ) audiofile.write( binaural_output_filename, binaudio.audio, binaudio.fs, ) - + return diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index a2e53d12..7cb4ab68 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -37,12 +37,16 @@ import sys from itertools import groupby, repeat from math import floor from pathlib import Path + import numpy as np -from ivas_processing_scripts.audiotools import audio, audiofile, audioarray -from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness, loudness_norm -from ivas_processing_scripts.audiotools.convert.scenebased import render_sba_to_masa +from ivas_processing_scripts.audiotools import audio, audioarray, audiofile from ivas_processing_scripts.audiotools.convert.osba import convert_osba +from ivas_processing_scripts.audiotools.convert.scenebased import render_sba_to_masa +from ivas_processing_scripts.audiotools.wrappers.bs1770 import ( + get_loudness, + loudness_norm, +) from ivas_processing_scripts.generation import config from ivas_processing_scripts.utils import apply_func_parallel @@ -73,11 +77,13 @@ def replace_char_seq_with_string(str, char_seq, repl_str): result.append(g) return "".join(result) - -# function for appending string to a filename before file extension + + +# function for appending string to a filename before file extension def append_str_filename(filename, str_to_append): - p = Path(filename) - return "{0}{2}{1}".format(p.stem, p.suffix, str_to_append) + p = Path(filename) + return "{0}{2}{1}".format(p.stem, p.suffix, str_to_append) + def generate_osba_items( cfg: config.TestConfig, @@ -154,8 +160,8 @@ def generate_osba_items( apply_func_parallel( generate_OSBA_scene, zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)), - type = "mp" if cfg.multiprocessing else None, - show_progress = None, + type="mp" if cfg.multiprocessing else None, + show_progress=None, ) return @@ -186,11 +192,13 @@ def generate_OSBA_scene( # extract the number of 
audio sources N_inputs = len(np.atleast_1d(scene["input"])) - N_ISMs = N_inputs-1 + N_ISMs = N_inputs - 1 # get input and output filenames osba_format = f"ISM{N_ISMs}SBA{cfg.sba_order}" - output_filename = Path(scene["output"]).parent / (cfg.use_output_prefix + Path(scene["output"]).name) + output_filename = Path(scene["output"]).parent / ( + cfg.use_output_prefix + Path(scene["output"]).name + ) # initialize output dirs dir_path = output_filename.parent @@ -202,7 +210,6 @@ def generate_OSBA_scene( # repeat for all source files for i in range(N_inputs): - # parse parameters from the scene description source_file = ( scene["input"][i] if isinstance(scene["input"], list) else scene["input"] @@ -227,7 +234,7 @@ def generate_OSBA_scene( ) else: source_shift = 0.0 - + # read the level if "level" in scene.keys(): level = ( @@ -241,12 +248,14 @@ def generate_OSBA_scene( logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") # get input filename - input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) + input_filename = Path(source_file).parent / ( + cfg.use_input_prefix + Path(source_file).name + ) # get the number of channels from the .wav file header wav_header = audiofile.parse_wave_header(input_filename) - N_channels = wav_header['channels'] - + N_channels = wav_header["channels"] + if N_channels == 1: fmt = "MONO" elif N_channels == 2: @@ -258,22 +267,24 @@ def generate_OSBA_scene( elif N_channels == 16: fmt = "HOA3" else: - logger.info(f"Error: Input format of the source file with {N_channels} channels is not supported!") + logger.info( + f"Error: Input format of the source file with {N_channels} channels is not supported!" + ) sys.exit(-1) - + if fmt == "MONO": # generate ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) - y.metadata_files.insert(i-1, f"{output_filename}.{i-1}.csv") - + y.metadata_files.insert(i - 1, f"{output_filename}.{i-1}.csv") + # read source file - x = audio.fromfile( fmt, input_filename, fs=cfg.fs ) + x = audio.fromfile(fmt, input_filename, fs=cfg.fs) # adjust the level of the source file if fmt in ["FOA", "HOA2", "HOA3"]: x.audio, _ = loudness_norm(x, level, loudness_format="STEREO", rms=True) else: x.audio, _ = loudness_norm(x, level, loudness_format="MONO") - + # shift the source signal (positive shift creates overlap, negative shift creates a gap) if int(floor(-source_shift)) != 0: x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0]) @@ -281,7 +292,7 @@ def generate_OSBA_scene( # get the number of frames (multiple of 20ms) frame_len = int(x.fs / 50) N_frames = int(len(x.audio) / frame_len) - + # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: # pad the source signal @@ -296,14 +307,16 @@ def generate_OSBA_scene( # pad ISM signal with zeros to have the same length as the SBA signal N_pad = y.audio.shape[0] - x.audio.shape[0] if N_pad != 0: - x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) - + x.audio = audioarray.trim( + x.audio, x.fs, limits=[0, -N_pad], samples=True + ) + # append ISM signal to the OSBA object (ISM comes first !!!) 
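        # (np.insert with a list index splices the object columns in front of the
        #  ambisonic bed, so objects stay in scene order ahead of the bed; a
        #  minimal sketch of the resulting channel layout:
        #    >>> import numpy as np
        #    >>> bed = np.zeros((960, 4))   # FOA bed: W, X, Y, Z
        #    >>> obj = np.ones((960, 1))    # one ISM track
        #    >>> np.insert(bed, [0], obj, axis=1).shape
        #    (960, 5)                       # object channel comes first)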
- y.audio = np.insert(y.audio, [i-1], x.audio, axis=1) + y.audio = np.insert(y.audio, [i - 1], x.audio, axis=1) # append pre-amble and post-amble to all sources y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble]) - + # add random noise if cfg.add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 @@ -330,7 +343,7 @@ def generate_OSBA_scene( N_frames = int(np.rint((len(y.audio) / y.fs * 50))) - # read azimuth information and convert to an array + # read azimuth information and convert to an array if isinstance(source_azi, str): if ":" in source_azi: # start with the initial azimuth value and apply step N_frames times @@ -338,7 +351,7 @@ def generate_OSBA_scene( azi = np.arange( float(eval(source_azi[0])), float(eval(source_azi[0])) + N_frames * float(eval(source_azi[1])), - float(eval(source_azi[1])) + float(eval(source_azi[1])), ) else: # replicate static azimuth value N_frames times @@ -346,7 +359,7 @@ def generate_OSBA_scene( else: # replicate static azimuth value N_frames times azi = np.repeat(float(source_azi), N_frames) - + # convert azimuth from 0 .. 360 to -180 .. +180 azi = (azi + 180) % 360 - 180 @@ -365,7 +378,7 @@ def generate_OSBA_scene( ele = np.arange( float(eval(source_ele[0])), np.sign(float(eval(source_ele[1]))) * 90, - float(eval(source_ele[1])) + float(eval(source_ele[1])), )[:N_frames] # repeat the last elevation value, if array is shorter than N_frames @@ -386,10 +399,10 @@ def generate_OSBA_scene( # arrange all metadata fields column-wise into a matrix x_meta = np.column_stack((azi, ele)) - + # write to .csv output metadata file with open( - y.metadata_files[i-1], + y.metadata_files[i - 1], "w", newline="", encoding="utf-8", @@ -400,21 +413,23 @@ def generate_OSBA_scene( # write all rows to the .csv file writer.writerows(csv_formatdata(x_meta)) - y.init_metadata() # this is needed to populate 'y.object_pos[]' + y.init_metadata() # this is needed to populate 'y.object_pos[]' # write the OSBA output to .wav file in an interleaved format - audiofile.write( output_filename, y.audio, y.fs ) - + audiofile.write(output_filename, y.audio, y.fs) + # convert the OSBA output to BINAURAL, if option was chosen if cfg.binaural_output: binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_osba(y, binaudio) - binaural_output_filename = output_filename.with_name(output_filename.stem + "_BINAURAL" + output_filename.suffix) + binaural_output_filename = output_filename.with_name( + output_filename.stem + "_BINAURAL" + output_filename.suffix + ) audiofile.write( binaural_output_filename, binaudio.audio, binaudio.fs, ) - + return diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py index 1c9251ad..8ab8eaaf 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -39,7 +39,10 @@ from pathlib import Path import numpy as np from ivas_processing_scripts.audiotools import audio, audioarray, audiofile -from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness, loudness_norm +from ivas_processing_scripts.audiotools.wrappers.bs1770 import ( + get_loudness, + loudness_norm, +) from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_stereo from ivas_processing_scripts.generation import config from ivas_processing_scripts.utils import apply_func_parallel @@ -171,8 +174,8 @@ def generate_stereo_items( apply_func_parallel( 
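        # (the keyword form assumes a signature along the lines of
        #  apply_func_parallel(func, args, show_progress=None, type=None);
        #  the earlier positional call passed the same two values, so this is
        #  a readability fix rather than a behaviour change)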
generate_stereo_scene, zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)), - type = "mp" if cfg.multiprocessing else None, - show_progress = None, + type="mp" if cfg.multiprocessing else None, + show_progress=None, ) return @@ -194,7 +197,7 @@ def generate_stereo_scene( - Reads mono audio source files and processes them based on the scene description. - Writes the processed STEREO audio to output file. """ - + logger.info( f"Processing scene: {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" ) @@ -203,7 +206,9 @@ def generate_stereo_scene( N_inputs = len(np.atleast_1d(scene["input"])) # initialize output dirs - output_filename = Path(scene["output"]).parent / (cfg.use_output_prefix + Path(scene["output"]).name) + output_filename = Path(scene["output"]).parent / ( + cfg.use_output_prefix + Path(scene["output"]).name + ) dir_path = output_filename.parent if dir_path and not dir_path.exists(): @@ -213,7 +218,6 @@ def generate_stereo_scene( y = audio.ChannelBasedAudio(cfg.format) for i in range(N_inputs): - # parse parameters from the scene description source_file = np.atleast_1d(scene["input"])[i] IR_file = np.atleast_1d(scene["IR"])[i] @@ -241,14 +245,16 @@ def generate_stereo_scene( logger.info(f"Convolving {source_file} with {IR_file}") # get input filename and IR filename - input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) - IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) + input_filename = Path(source_file).parent / ( + cfg.use_input_prefix + Path(source_file).name + ) + IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) # read source file - x = audio.fromfile( "MONO", input_filename, fs=cfg.fs ) + x = audio.fromfile("MONO", input_filename, fs=cfg.fs) # read the IR file (!must be in STEREO format!) 
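        # (a few lines below, loudness_norm(x, level, loudness_format=...) is
        #  assumed to return the rescaled samples together with the gain that was
        #  applied, since every call site unpacks it as "x.audio, _ = ..."; it
        #  replaces the older get_loudness()/scale_factor two-step, and the
        #  per-item "level" key defaults to -26 when a scene does not set one)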
- IR = audio.fromfile( "STEREO", IR_filename, fs=cfg.IR_fs ) + IR = audio.fromfile("STEREO", IR_filename, fs=cfg.IR_fs) # convolve mono source signal with stereo IR x = reverb_stereo(x, IR) @@ -277,8 +283,10 @@ def generate_stereo_scene( # pad the signal with zeros to have the same length as the previous signal(s) N_pad = y.audio.shape[0] - x.audio.shape[0] if N_pad != 0: - x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) - + x.audio = audioarray.trim( + x.audio, x.fs, limits=[0, -N_pad], samples=True + ) + # superimpose y.audio += x.audio @@ -295,5 +303,4 @@ def generate_stereo_scene( y.audio += noise # write the output STEREO audio signal into output file - audiofile.write( output_filename, y.audio, y.fs ) - + audiofile.write(output_filename, y.audio, y.fs) -- GitLab From dbcc02845cee329241787b6ecd7eeb7735eb27ff Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 19 Jun 2025 12:22:15 +0200 Subject: [PATCH 08/42] Apply flake8 linter --- ivas_processing_scripts/generation/__init__.py | 3 --- ivas_processing_scripts/generation/generate_ismN_items.py | 5 +---- ivas_processing_scripts/generation/generate_omasa_items.py | 6 +----- 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/ivas_processing_scripts/generation/__init__.py b/ivas_processing_scripts/generation/__init__.py index b41b1b44..8cdc0cb0 100755 --- a/ivas_processing_scripts/generation/__init__.py +++ b/ivas_processing_scripts/generation/__init__.py @@ -32,8 +32,6 @@ import logging -import yaml - from ivas_processing_scripts.constants import ( LOGGER_DATEFMT, LOGGER_FORMAT, @@ -47,7 +45,6 @@ from ivas_processing_scripts.generation import ( generate_osba_items, generate_stereo_items, ) -from ivas_processing_scripts.utils import create_dir def logging_init(args, cfg): diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index 3af2ad7b..d232a425 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -39,10 +39,7 @@ import numpy as np from ivas_processing_scripts.audiotools import audio, audioarray, audiofile from ivas_processing_scripts.audiotools.convert.objectbased import convert_objectbased -from ivas_processing_scripts.audiotools.wrappers.bs1770 import ( - get_loudness, - loudness_norm, -) +from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm from ivas_processing_scripts.generation import config from ivas_processing_scripts.utils import apply_func_parallel diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 3cbc94bf..c98979b4 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -42,10 +42,7 @@ import numpy as np from ivas_processing_scripts.audiotools import audio, audioarray, audiofile from ivas_processing_scripts.audiotools.convert.omasa import convert_omasa from ivas_processing_scripts.audiotools.convert.scenebased import render_sba_to_masa -from ivas_processing_scripts.audiotools.wrappers.bs1770 import ( - get_loudness, - loudness_norm, -) +from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm from ivas_processing_scripts.generation import config from ivas_processing_scripts.utils import apply_func_parallel @@ -335,7 +332,6 @@ def generate_OMASA_scene( y.audio += noise # generate ISM metadata files - 
y_meta = None for i in range(1, N_ISMs + 1): # parse metadata parameters from the scene description source_azi = ( -- GitLab From 37681b6cb4fd711aad04adbfd7de5d5feda1683a Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 19 Jun 2025 12:22:42 +0200 Subject: [PATCH 09/42] add flake8 settings --- .flake8 | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..ed0109ee --- /dev/null +++ b/.flake8 @@ -0,0 +1,4 @@ +[flake8] +max-line-length = 88 +ignore = E203,E402,E501,E741 +exclude = .git,__pycache__,build,dist \ No newline at end of file -- GitLab From 906f53ba591ec187b75fac20643cf4b90675de3a Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 19 Jun 2025 12:57:09 +0200 Subject: [PATCH 10/42] use .flake8 local config file --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 31f4512c..2277f43a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -121,7 +121,7 @@ lint: - linux allow_failure: true script: - - flake8 --max-line-length 88 --extend-ignore=E203,E402,E501,E741 + - flake8 --config .flake8 format: stage: analyze -- GitLab From c95ffcec134d5d091843401a4dc0548652f2e51e Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 19 Jun 2025 12:58:24 +0200 Subject: [PATCH 11/42] ignore W504 - break line after logical operator (this allows breaking long if conditions on multiple lines) --- .flake8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.flake8 b/.flake8 index ed0109ee..09fd6f0b 100644 --- a/.flake8 +++ b/.flake8 @@ -1,4 +1,4 @@ [flake8] max-line-length = 88 -ignore = E203,E402,E501,E741 +ignore = E203,E402,E501,E741, W504 exclude = .git,__pycache__,build,dist \ No newline at end of file -- GitLab From 8998f3e0795fd8b6ec5061430cc73934b7ffd0a6 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 19 Jun 2025 12:58:57 +0200 Subject: [PATCH 12/42] fix Flake8 errors --- ivas_processing_scripts/__init__.py | 11 +++++------ ivas_processing_scripts/generation/config.py | 4 +--- .../generation/generate_ambi_items.py | 10 ++-------- .../generation/generate_osba_items.py | 8 +------- .../generation/generate_stereo_items.py | 6 +----- 5 files changed, 10 insertions(+), 29 deletions(-) diff --git a/ivas_processing_scripts/__init__.py b/ivas_processing_scripts/__init__.py index 36014870..d34709d5 100755 --- a/ivas_processing_scripts/__init__.py +++ b/ivas_processing_scripts/__init__.py @@ -108,8 +108,7 @@ def main(args): # Re-ordering items based on concatenation order if hasattr(cfg, "preprocessing_2"): if ( - cfg.preprocessing_2.get("concatenate_input") - and cfg.preprocessing_2.get("concatenation_order", None) is not None + cfg.preprocessing_2.get("concatenate_input") and cfg.preprocessing_2.get("concatenation_order", None) is not None ): cfg.items_list = reorder_items_list( cfg.items_list, cfg.preprocessing_2["concatenation_order"] @@ -169,10 +168,10 @@ def main(args): cfg.pre2 = cfg.proc_chains[0]["processes"][0] # preprocess background noise if ( - hasattr(cfg, "preprocessing") - and hasattr(cfg.pre2, "background_noise") - and cfg.pre2.background_noise is not None - and cfg.pre2.background_noise.get("background_noise_path") + hasattr(cfg, "preprocessing") and + hasattr(cfg.pre2, "background_noise") and + cfg.pre2.background_noise is not None and + cfg.pre2.background_noise.get("background_noise_path") ): preprocess_background_noise(cfg) # preprocess 2 diff --git 
a/ivas_processing_scripts/generation/config.py b/ivas_processing_scripts/generation/config.py index bfb676bc..8b955eb1 100644 --- a/ivas_processing_scripts/generation/config.py +++ b/ivas_processing_scripts/generation/config.py @@ -30,10 +30,8 @@ # the United Nations Convention on Contracts on the International Sales of Goods. # -from copy import deepcopy -from pathlib import Path - import yaml +from copy import deepcopy from ivas_processing_scripts.generation.constants import DEFAULT_CONFIG, REQUIRED_KEYS diff --git a/ivas_processing_scripts/generation/generate_ambi_items.py b/ivas_processing_scripts/generation/generate_ambi_items.py index add1d472..bf69bae1 100644 --- a/ivas_processing_scripts/generation/generate_ambi_items.py +++ b/ivas_processing_scripts/generation/generate_ambi_items.py @@ -31,20 +31,15 @@ # import logging -import os from itertools import groupby, repeat from math import floor from pathlib import Path import numpy as np -from ivas_processing_scripts.audiotools import audio, audioarray, audiofile, convert -from ivas_processing_scripts.audiotools.convert.objectbased import convert_objectbased +from ivas_processing_scripts.audiotools import audio, audioarray, audiofile from ivas_processing_scripts.audiotools.convert.scenebased import convert_scenebased -from ivas_processing_scripts.audiotools.wrappers.bs1770 import ( - get_loudness, - loudness_norm, -) +from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm from ivas_processing_scripts.audiotools.wrappers.reverb import ( reverb_foa, reverb_hoa2, @@ -275,7 +270,6 @@ def generate_ambi_scene( # get the number of frames (multiple of 20ms) frame_len = int(x.fs / 50) - N_frames = int(len(x.audio) / frame_len) # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index 7cb4ab68..b082c184 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -32,7 +32,6 @@ import csv import logging -import os import sys from itertools import groupby, repeat from math import floor @@ -42,11 +41,7 @@ import numpy as np from ivas_processing_scripts.audiotools import audio, audioarray, audiofile from ivas_processing_scripts.audiotools.convert.osba import convert_osba -from ivas_processing_scripts.audiotools.convert.scenebased import render_sba_to_masa -from ivas_processing_scripts.audiotools.wrappers.bs1770 import ( - get_loudness, - loudness_norm, -) +from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm from ivas_processing_scripts.generation import config from ivas_processing_scripts.utils import apply_func_parallel @@ -327,7 +322,6 @@ def generate_OSBA_scene( y.audio += noise # generate ISM metadata files - y_meta = None for i in range(1, N_ISMs + 1): # parse metadata parameters from the scene description source_azi = ( diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py index 8ab8eaaf..84e908ec 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -39,10 +39,7 @@ from pathlib import Path import numpy as np from ivas_processing_scripts.audiotools import audio, audioarray, audiofile -from ivas_processing_scripts.audiotools.wrappers.bs1770 import ( - get_loudness, - 
loudness_norm, -) +from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_stereo from ivas_processing_scripts.generation import config from ivas_processing_scripts.utils import apply_func_parallel @@ -268,7 +265,6 @@ def generate_stereo_scene( # get the number of frames (multiple of 20ms) frame_len = int(x.fs / 50) - N_frames = int(len(x.audio) / frame_len) # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: -- GitLab From f6a8e6f4352a1833f5ec9075b137b34052169d7e Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 19 Jun 2025 13:01:21 +0200 Subject: [PATCH 13/42] add also W503 --- .flake8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.flake8 b/.flake8 index 09fd6f0b..0f2c5a5d 100644 --- a/.flake8 +++ b/.flake8 @@ -1,4 +1,4 @@ [flake8] max-line-length = 88 -ignore = E203,E402,E501,E741, W504 +ignore = E203,E402,E501,E741,W503,W504 exclude = .git,__pycache__,build,dist \ No newline at end of file -- GitLab From eba32da2a16d6211429075ec1349b0b018a951d3 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 19 Jun 2025 13:06:01 +0200 Subject: [PATCH 14/42] formatting --- ivas_processing_scripts/__init__.py | 11 ++++++----- ivas_processing_scripts/generation/config.py | 4 ++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/ivas_processing_scripts/__init__.py b/ivas_processing_scripts/__init__.py index d34709d5..36014870 100755 --- a/ivas_processing_scripts/__init__.py +++ b/ivas_processing_scripts/__init__.py @@ -108,7 +108,8 @@ def main(args): # Re-ordering items based on concatenation order if hasattr(cfg, "preprocessing_2"): if ( - cfg.preprocessing_2.get("concatenate_input") and cfg.preprocessing_2.get("concatenation_order", None) is not None + cfg.preprocessing_2.get("concatenate_input") + and cfg.preprocessing_2.get("concatenation_order", None) is not None ): cfg.items_list = reorder_items_list( cfg.items_list, cfg.preprocessing_2["concatenation_order"] @@ -168,10 +169,10 @@ def main(args): cfg.pre2 = cfg.proc_chains[0]["processes"][0] # preprocess background noise if ( - hasattr(cfg, "preprocessing") and - hasattr(cfg.pre2, "background_noise") and - cfg.pre2.background_noise is not None and - cfg.pre2.background_noise.get("background_noise_path") + hasattr(cfg, "preprocessing") + and hasattr(cfg.pre2, "background_noise") + and cfg.pre2.background_noise is not None + and cfg.pre2.background_noise.get("background_noise_path") ): preprocess_background_noise(cfg) # preprocess 2 diff --git a/ivas_processing_scripts/generation/config.py b/ivas_processing_scripts/generation/config.py index 8b955eb1..b46b0c7f 100644 --- a/ivas_processing_scripts/generation/config.py +++ b/ivas_processing_scripts/generation/config.py @@ -30,11 +30,11 @@ # the United Nations Convention on Contracts on the International Sales of Goods. 
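
A note on the W503/W504 pair: pycodestyle treats the two warnings as mutually exclusive, since any line break inside a boolean expression triggers one of them depending on whether the operator ends the previous line or starts the next one. With both ignored in .flake8, either layout passes the lint stage. A minimal sketch with made-up variable names:

    concatenate_input = True
    concatenation_order = ["item1", "item2"]

    # W504-style break: the logical operator ends the line
    if (concatenate_input and
            concatenation_order is not None):
        print("W504 layout")

    # W503-style break: the logical operator starts the continuation line
    if (concatenate_input
            and concatenation_order is not None):
        print("W503 layout")

The W503 layout is the one the surrounding formatting commits settle on; it matches the convention used by black.
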
# -import yaml from copy import deepcopy - from ivas_processing_scripts.generation.constants import DEFAULT_CONFIG, REQUIRED_KEYS +import yaml + def merge_dicts(base: dict, other: dict) -> None: """ -- GitLab From 8281d7262cf9add6c17ae61512eadcd5a5050cba Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 19 Jun 2025 13:08:26 +0200 Subject: [PATCH 15/42] formatting --- ivas_processing_scripts/generation/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ivas_processing_scripts/generation/config.py b/ivas_processing_scripts/generation/config.py index b46b0c7f..c35c7134 100644 --- a/ivas_processing_scripts/generation/config.py +++ b/ivas_processing_scripts/generation/config.py @@ -31,10 +31,11 @@ # from copy import deepcopy -from ivas_processing_scripts.generation.constants import DEFAULT_CONFIG, REQUIRED_KEYS import yaml +from ivas_processing_scripts.generation.constants import DEFAULT_CONFIG, REQUIRED_KEYS + def merge_dicts(base: dict, other: dict) -> None: """ -- GitLab From e81d59a05de01edfe7e2147d0c1cefade40136e2 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 19 Jun 2025 13:17:33 +0200 Subject: [PATCH 16/42] update year to 2022-2026 --- ivas_processing_scripts/generation/__init__.py | 2 +- ivas_processing_scripts/generation/__main__.py | 2 +- ivas_processing_scripts/generation/config.py | 2 +- ivas_processing_scripts/generation/constants.py | 2 +- ivas_processing_scripts/generation/generate_ambi_items.py | 2 +- ivas_processing_scripts/generation/generate_stereo_items.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ivas_processing_scripts/generation/__init__.py b/ivas_processing_scripts/generation/__init__.py index 8cdc0cb0..5d8fd1a6 100755 --- a/ivas_processing_scripts/generation/__init__.py +++ b/ivas_processing_scripts/generation/__init__.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -# (C) 2022-2024 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# (C) 2022-2026 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other diff --git a/ivas_processing_scripts/generation/__main__.py b/ivas_processing_scripts/generation/__main__.py index 5914d7f9..a40766c9 100755 --- a/ivas_processing_scripts/generation/__main__.py +++ b/ivas_processing_scripts/generation/__main__.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -# (C) 2022-2024 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# (C) 2022-2026 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other diff --git a/ivas_processing_scripts/generation/config.py b/ivas_processing_scripts/generation/config.py index c35c7134..2c409bd7 100644 --- a/ivas_processing_scripts/generation/config.py +++ b/ivas_processing_scripts/generation/config.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -# (C) 2022-2024 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# (C) 2022-2026 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other diff --git a/ivas_processing_scripts/generation/constants.py b/ivas_processing_scripts/generation/constants.py index 8319d318..d1a057f6 100644 --- a/ivas_processing_scripts/generation/constants.py +++ b/ivas_processing_scripts/generation/constants.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -# (C) 2022-2024 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# (C) 2022-2026 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other diff --git a/ivas_processing_scripts/generation/generate_ambi_items.py b/ivas_processing_scripts/generation/generate_ambi_items.py index bf69bae1..dec02cb2 100644 --- a/ivas_processing_scripts/generation/generate_ambi_items.py +++ b/ivas_processing_scripts/generation/generate_ambi_items.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -# (C) 2022-2024 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# (C) 2022-2026 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py index 84e908ec..d97a4c7b 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -# (C) 2022-2024 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# (C) 2022-2026 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -- GitLab From eacaa86fccbe566de05f6dadad0867b1c5cb1eb1 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 19 Jun 2025 13:24:14 +0200 Subject: [PATCH 17/42] use np.savetxt() instead of formatted write to .csv file --- .../generation/generate_ambi_items.py | 6 ------ .../generation/generate_omasa_items.py | 19 +------------------ .../generation/generate_osba_items.py | 19 +------------------ 3 files changed, 2 insertions(+), 42 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_ambi_items.py b/ivas_processing_scripts/generation/generate_ambi_items.py index dec02cb2..6e4fe8ea 100644 --- a/ivas_processing_scripts/generation/generate_ambi_items.py +++ b/ivas_processing_scripts/generation/generate_ambi_items.py @@ -51,12 +51,6 @@ from ivas_processing_scripts.utils import apply_func_parallel SEED_RANDOM_NOISE = 0 -# function for converting nd numpy array to strings with 2 decimal digits -def csv_formatdata(data): - for row in data: - yield ["%0.2f" % v for v in row] - - # function for searching sequences of same the same character and replacing it by another string def replace_char_seq_with_string(str, char_seq, repl_str): result = [] diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index c98979b4..b51375df 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -30,7 +30,6 @@ # the United Nations Convention on Contracts on the International Sales of Goods. # -import csv import logging import sys from itertools import groupby, repeat @@ -49,12 +48,6 @@ from ivas_processing_scripts.utils import apply_func_parallel SEED_RANDOM_NOISE = 0 -# function for converting nd numpy array to strings with 2 decimal digits -def csv_formatdata(data): - for row in data: - yield ["%0.2f" % v for v in row] - - # function for searching sequences of same the same character and replacing it by another string def replace_char_seq_with_string(str, char_seq, repl_str): result = [] @@ -405,17 +398,7 @@ def generate_OMASA_scene( x_meta = np.column_stack((azi, ele)) # write to .csv output metadata file - with open( - y.metadata_files[i - 1], - "w", - newline="", - encoding="utf-8", - ) as f: - # create csv writer - writer = csv.writer(f) - - # write all rows to the .csv file - writer.writerows(csv_formatdata(x_meta)) + np.savetxt(y.metadata_files[i - 1], x_meta, fmt="%0.2f", delimiter=",", encoding="utf-8") y.init_metadata() # this is needed to populate 'y.object_pos[]' diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index b082c184..b344abaa 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -30,7 +30,6 @@ # the United Nations Convention on Contracts on the International Sales of Goods. 
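
As a quick equivalence check for the np.savetxt() switch in this commit, both code paths emit identical rows; a small self-contained sketch (array values and file names are illustrative only):

    import csv

    import numpy as np

    x_meta = np.array([[12.301, -6.789], [90.0, 45.5]])

    # old path: csv.writer plus a generator formatting each value to 2 decimals
    with open("meta_writer.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(["%0.2f" % v for v in row] for row in x_meta)

    # new path: np.savetxt with the equivalent format string
    np.savetxt("meta_savetxt.csv", x_meta, fmt="%0.2f", delimiter=",", encoding="utf-8")

    # both files now hold the same "12.30,-6.79" style rows
    with open("meta_writer.csv") as f1, open("meta_savetxt.csv") as f2:
        assert f1.read().splitlines() == f2.read().splitlines()

One format string replaces the csv_formatdata() generator; np.savetxt handles the per-row joining and the line terminators itself.
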
# -import csv import logging import sys from itertools import groupby, repeat @@ -48,12 +47,6 @@ from ivas_processing_scripts.utils import apply_func_parallel SEED_RANDOM_NOISE = 0 -# function for converting nd numpy array to strings with 2 decimal digits -def csv_formatdata(data): - for row in data: - yield ["%0.2f" % v for v in row] - - # function for searching sequences of same the same character and replacing it by another string def replace_char_seq_with_string(str, char_seq, repl_str): result = [] @@ -395,17 +388,7 @@ def generate_OSBA_scene( x_meta = np.column_stack((azi, ele)) # write to .csv output metadata file - with open( - y.metadata_files[i - 1], - "w", - newline="", - encoding="utf-8", - ) as f: - # create csv writer - writer = csv.writer(f) - - # write all rows to the .csv file - writer.writerows(csv_formatdata(x_meta)) + np.savetxt(y.metadata_files[i - 1], x_meta, fmt="%0.2f", delimiter=",", encoding="utf-8") y.init_metadata() # this is needed to populate 'y.object_pos[]' -- GitLab From 613537e25087b7377c5fbc869440f0f8f6455b69 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 19 Jun 2025 13:49:17 +0200 Subject: [PATCH 18/42] cleanup --- .../generation/generate_ismN_items.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index d232a425..aa4f4078 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -365,17 +365,7 @@ def generate_ismN_scene( x_meta = np.column_stack((azi, ele)) # write to .csv output metadata file - with open( - y.metadata_files[i], - "w", - newline="", - encoding="utf-8", - ) as f: - # create csv writer - writer = csv.writer(f) - - # write all rows to the .csv file - writer.writerows(csv_formatdata(x_meta)) + np.savetxt(y.metadata_files[i-1], x_meta, fmt="%0.2f", delimiter=",", encoding="utf-8") y.init_metadata() # this is needed to populate 'y.object_pos[]' -- GitLab From 4f27b5b2566ed8d29c39a100db6f7f227d357dda Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 19 Jun 2025 13:51:13 +0200 Subject: [PATCH 19/42] formatting --- ivas_processing_scripts/generation/generate_ismN_items.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index aa4f4078..3c4e7115 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -29,7 +29,6 @@ # accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and # the United Nations Convention on Contracts on the International Sales of Goods. 
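
The .csv files written here follow the ISM metadata convention used throughout these scripts: one row per 20 ms frame, with azimuth and elevation in degrees, so they round-trip cleanly through NumPy. A sketch of writing and reading such a file (the trajectory values and file name are synthetic):

    import numpy as np

    # three 20 ms frames of a slowly panning object: azimuth, elevation
    meta_out = np.array([[10.0, 0.0], [12.5, 1.0], [15.0, 2.0]])
    np.savetxt("example.0.csv", meta_out, fmt="%0.2f", delimiter=",", encoding="utf-8")

    # one row per frame comes back as an (N_frames, 2) array
    meta_in = np.loadtxt("example.0.csv", delimiter=",")
    azi, ele = meta_in[:, 0], meta_in[:, 1]
    print(f"{len(meta_in)} frames ({len(meta_in) * 20} ms), azimuth {azi[0]} -> {azi[-1]} degrees")
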
# -import csv import logging from itertools import groupby, repeat from math import floor @@ -365,7 +364,7 @@ def generate_ismN_scene( x_meta = np.column_stack((azi, ele)) # write to .csv output metadata file - np.savetxt(y.metadata_files[i-1], x_meta, fmt="%0.2f", delimiter=",", encoding="utf-8") + np.savetxt(y.metadata_files[i - 1], x_meta, fmt="%0.2f", delimiter=",", encoding="utf-8") y.init_metadata() # this is needed to populate 'y.object_pos[]' -- GitLab From 07b19e4b17950abd5bceed3f15555e3eac55ff1a Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Wed, 25 Jun 2025 12:53:58 +0200 Subject: [PATCH 20/42] minor improvements and simplifications --- .../audiotools/wrappers/reverb.py | 43 +++---------------- 1 file changed, 6 insertions(+), 37 deletions(-) diff --git a/ivas_processing_scripts/audiotools/wrappers/reverb.py b/ivas_processing_scripts/audiotools/wrappers/reverb.py index 55379b6b..7b7123a8 100644 --- a/ivas_processing_scripts/audiotools/wrappers/reverb.py +++ b/ivas_processing_scripts/audiotools/wrappers/reverb.py @@ -285,7 +285,7 @@ def reverb_hoa2( # convert to float32 hoa2_IR.audio = np.float32(hoa2_IR.audio) - numchannels = 9 # HOA2 by definition + numchannels = hoa2_IR.num_channels # calculate the scaling (multiplicative) factor such that the maximum gain of the IR filter across all frequencies is 0dB if align is None: @@ -298,26 +298,14 @@ def reverb_hoa2( ych = [] for i in range(numchannels): # separate IR into each channel - IR.audio = np.reshape(hoa2_IR.audio[:, i], (-1, 1)) + IR.audio = hoa2_IR.audio[:, [i]] # convolve mono input with channel IR ych.append(reverb(input, IR, align=align)) # combine into HOA2 output y = audio.fromtype("HOA2") y.fs = input.fs - y.audio = np.column_stack( - [ - ych[0].audio, - ych[1].audio, - ych[2].audio, - ych[3].audio, - ych[4].audio, - ych[5].audio, - ych[6].audio, - ych[7].audio, - ych[8].audio, - ] - ) + y.audio = np.column_stack([ych[i].audio for i in range(numchannels)]) return y @@ -348,7 +336,7 @@ def reverb_hoa3( # convert to float32 hoa3_IR.audio = np.float32(hoa3_IR.audio) - numchannels = 16 # HOA3 by definition + numchannels = hoa3_IR.num_channels # calculate the scaling (multiplicative) factor such that the maximum gain of the IR filter across all frequencies is 0dB if align is None: @@ -361,32 +349,13 @@ def reverb_hoa3( ych = [] for i in range(numchannels): # separate IR into each channel - IR.audio = np.reshape(hoa3_IR.audio[:, i], (-1, 1)) + IR.audio = hoa3_IR.audio[:, [i]] # convolve mono input with channel IR ych.append(reverb(input, IR, align=align)) # combine into HOA3 output y = audio.fromtype("HOA3") y.fs = input.fs - y.audio = np.column_stack( - [ - ych[0].audio, - ych[1].audio, - ych[2].audio, - ych[3].audio, - ych[4].audio, - ych[5].audio, - ych[6].audio, - ych[7].audio, - ych[8].audio, - ych[9].audio, - ych[10].audio, - ych[11].audio, - ych[12].audio, - ych[13].audio, - ych[14].audio, - ych[15].audio, - ] - ) + y.audio = np.column_stack([ych[i].audio for i in range(numchannels)]) return y -- GitLab From 582e619ea20783d665c3df149e020368f730ed04 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Wed, 25 Jun 2025 12:57:21 +0200 Subject: [PATCH 21/42] refactoring of the script to use the trim_meta() function --- .../generation/generate_ismN_items.py | 257 ++++++++++-------- 1 file changed, 149 insertions(+), 108 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index 3c4e7115..a1689ddc 100644 --- 
a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -31,12 +31,11 @@ # import logging from itertools import groupby, repeat -from math import floor from pathlib import Path import numpy as np -from ivas_processing_scripts.audiotools import audio, audioarray, audiofile +from ivas_processing_scripts.audiotools import audio, audioarray, audiofile, metadata from ivas_processing_scripts.audiotools.convert.objectbased import convert_objectbased from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm from ivas_processing_scripts.generation import config @@ -196,126 +195,108 @@ def generate_ismN_scene( # initialize output ISM object y = audio.ObjectBasedAudio(ism_format) + y.fs = cfg.fs + + # set the frame length + frame_len = int(cfg.fs / 50) # repeat for all source files + offset = 0 for i in range(N_inputs): - # parse parameters from the scene description + # read input filename source_file = ( scene["input"][i] if isinstance(scene["input"], list) else scene["input"] ) - source_azi = ( - scene["azimuth"][i] - if isinstance(scene["azimuth"], list) - else scene["azimuth"] - ) - source_ele = ( - scene["elevation"][i] - if isinstance(scene["elevation"], list) - else scene["elevation"] + + input_filename = Path(source_file).parent / ( + cfg.use_input_prefix + Path(source_file).name ) - # read the overlap length + # read azimuth and elevation information + if "azimuth" in scene.keys(): + source_azi = ( + scene["azimuth"][i] if isinstance(scene["azimuth"], list) else scene["azimuth"] + ) + else: + source_azi = 0.0 + + if "elevation" in scene.keys(): + source_ele = ( + scene["elevation"][i] if isinstance(scene["elevation"], list) else scene["elevation"] + ) + else: + source_ele = 0.0 + + # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( - scene["shift"][i] - if isinstance(scene["shift"], list) - else scene["shift"] + scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) else: source_shift = 0.0 + # convert overlap to samples and ensure it is a multiple of 20ms + source_shift = source_shift * cfg.fs + if source_shift >= 0: + source_shift = int(np.floor(source_shift / frame_len) * frame_len) + else: + source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + # read the level if "level" in scene.keys(): level = ( - scene["level"][i] - if isinstance(scene["level"], list) - else scene["level"] + scene["level"][i] if isinstance(scene["level"], list) else scene["level"] ) else: level = -26 logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") - # get input filename - input_filename = Path(source_file).parent / ( - cfg.use_input_prefix + Path(source_file).name - ) - - # generate ISM metadata .csv filename (should end with .wav..0.csv, .wav.1.csv, ...) 
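
The shift conversion above quantizes the per-source 'shift' value (seconds) to a whole number of 20 ms frames, rounding toward zero for both signs so a partial frame never leaks into the alignment. A worked example under assumed values (48 kHz, hence 960 samples per frame):

    import numpy as np

    fs = 48000
    frame_len = int(fs / 50)  # 960 samples per 20 ms frame

    for shift_s in (0.05, -0.05):
        shift = shift_s * fs  # +/-2400.0 samples, i.e. 2.5 frames
        if shift >= 0:
            shift = int(np.floor(shift / frame_len) * frame_len)
        else:
            shift = int(np.ceil(shift / frame_len) * frame_len)
        # 2400 -> 1920 and -2400 -> -1920: both are truncated toward zero
        print(f"{shift_s:+.2f} s -> {shift:+d} samples ({shift // frame_len:+d} frames)")

A positive shift overlaps the new source with the previous ones; a negative shift leaves a gap, as noted in the original scene-description handling.
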
-        y.metadata_files.insert(i, str(output_filename.with_suffix(f".{i}.csv")))
-
-        # read source file
-        x = audio.fromfile("MONO", input_filename, fs=cfg.fs)
-
-        # adjust the level of the source file
-        x.audio, _ = loudness_norm(x, level, loudness_format="MONO")
+        # read source file
+        x = audio.fromtype("ISM1")
+        x.audio, x.fs = audiofile.read(input_filename)
 
-        # shift the source signal (positive shift creates overlap, negative shift creates a gap)
-        if int(floor(-source_shift)) != 0:
-            x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0])
+        # resample to the target fs if necessary
+        if x.fs != cfg.fs:
+            logger.warning(
+                f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!"
+            )
+            resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs)
+            x.audio = resampled_audio
+            x.fs = cfg.fs
+
+        # adjust the level of the audio source file (need to convert to MONO first)
+        x_temp = audio.ChannelBasedAudio("MONO")  # create a temporary mono audio object
+        x_temp.audio = x.audio.copy()
+        x_temp.fs = x.fs
+        x_temp.audio, _ = loudness_norm(x_temp, level, loudness_format="MONO")
+        x.audio = x_temp.audio
+
+        # pad with zeros to ensure that the length of the audio source signal is a multiple of 20ms
+        if len(x.audio) % frame_len != 0:
+            N_pad = int(frame_len - len(x.audio) % frame_len)
+            x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)
 
         # get the number of frames (multiple of 20ms)
-        frame_len = int(x.fs / 50)
         N_frames = int(len(x.audio) / frame_len)
 
-        # pad with zeros to ensure that the signal length is a multiple of 20ms
-        if len(x.audio) % frame_len != 0:
-            N_pad = int(frame_len - len(x.audio) % frame_len)
-            x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)
-
-        if y.audio is None:
-            # add source signal to the array of all source signals
-            y.audio = x.audio.copy()
-            y.fs = x.fs
-        else:
-            # pad ISM signal with zeros to have the same length as the MASA signal
-            N_pad = y.audio.shape[0] - x.audio.shape[0]
-            if N_pad != 0:
-                x.audio = audioarray.trim(
-                    x.audio, x.fs, limits=[0, -N_pad], samples=True
-                )
-
-            # append ISM signal to the ISM object
-            y.audio = np.append(y.audio, x.audio, axis=1)
-
-    # append pre-amble and post-amble to all sources
-    y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble])
-
-    # add random noise
-    if cfg.add_low_level_random_noise:
-        # create uniformly distributed noise between -4 and 4
-        np.random.seed(SEED_RANDOM_NOISE)
-        noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")
-
-        # superimpose
-        y.audio += noise
-
-    # generate ISM metadata
-    for i in range(N_inputs):
-        # parse metadata parameters from the scene description
-        source_azi = (
-            scene["azimuth"][i]
-            if isinstance(scene["azimuth"], list)
-            else scene["azimuth"]
-        )
-        source_ele = (
-            scene["elevation"][i]
-            if isinstance(scene["elevation"], list)
-            else scene["elevation"]
-        )
-
-        N_frames = int(np.rint((len(y.audio) / y.fs * 50)))
-
-        # read azimuth information and convert to an array
+        # convert azimuth information in case of a moving object
         if isinstance(source_azi, str):
             if ":" in source_azi:
-                # start with the initial azimuth value and apply step N_frames times
-                source_azi = source_azi.split(":")
-                azi = np.arange(
-                    float(eval(source_azi[0])),
-                    float(eval(source_azi[0])) + N_frames * float(eval(source_azi[1])),
-                    float(eval(source_azi[1])),
-                )
+                # convert into array (initial_value:step:stop_value)
+                start_str, step_str, stop_str = source_azi.split(":")
+                start = float(eval(start_str))
+                step = float(eval(step_str))
+                stop = float(eval(stop_str))
+                azi = np.arange(start, stop, step)
+
+                # adjust length to N_frames
+                if len(azi) > N_frames:
+                    azi = azi[:N_frames]
+                elif len(azi) < N_frames:
+                    azi = np.append(azi, np.full(N_frames - len(azi), azi[-1]))
             else:
                 # replicate static azimuth value N_frames times
                 azi = np.repeat(float(eval(source_azi)), N_frames)
@@ -332,21 +313,22 @@ def generate_ismN_scene(
                 f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}"
             )
 
-        # read elevation information and convert to an array
+        # convert elevation information in case of a moving object
        if isinstance(source_ele, str):
             if ":" in source_ele:
                 # convert into array (initial_value:step:stop_value)
-                # note: the stop_value value is +-90 degrees depending on the sign of the step
-                source_ele = source_ele.split(":")
-                ele = np.arange(
-                    float(eval(source_ele[0])),
-                    np.sign(float(eval(source_ele[1]))) * 90,
-                    float(eval(source_ele[1])),
-                )[:N_frames]
-
-                # repeat the last elevation value, if array is shorter than N_frames
-                if len(ele) < N_frames:
+                start_str, step_str, stop_str = source_ele.split(":")
+                start = float(eval(start_str))
+                step = float(eval(step_str))
+                stop = float(eval(stop_str))
+                ele = np.arange(start, stop, step)
+
+                # adjust length to N_frames
+                if len(ele) > N_frames:
+                    ele = ele[:N_frames]
+                elif len(ele) < N_frames:
                     ele = np.append(ele, np.full(N_frames - len(ele), ele[-1]))
+
             else:
                 # replicate static elevation value N_frames times
                 ele = np.repeat(float(eval(source_ele)), N_frames)
@@ -354,24 +336,83 @@ def generate_ismN_scene(
         # replicate static elevation value N_frames times
         ele = np.repeat(float(source_ele), N_frames)
 
+        # wrap elevation angle to -90 .. +90
+        ele = ((ele + 90) % 180) - 90
+
         # check if elevation is from -90 .. 
+90 if any(ele > 90) or any(ele < -90): logger.error( f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}" ) + # generate radius vector with all values equal to 1.0 + rad = np.ones(N_frames) + # arrange all metadata fields column-wise into a matrix - x_meta = np.column_stack((azi, ele)) + x.object_pos.append(np.column_stack((azi, ele, rad))) - # write to .csv output metadata file - np.savetxt(y.metadata_files[i - 1], x_meta, fmt="%0.2f", delimiter=",", encoding="utf-8") + # copy new audio source signal to the ISMn object + if y.audio is None: + # add the first audio source signal to the array of all source signals + y.audio = x.audio.copy() + y.object_pos = x.object_pos.copy() + y.fs = x.fs + # if source_shift < 0: + # # insert zeros to the new audio source signal to shift it right + # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) + offset = source_shift + else: + # shift the beginning of the audio source signal + delta_offset = source_shift - offset + if delta_offset > 0: + # insert zeros to the previous ISM signal(s) to shift them right + metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True) + offset = source_shift + else: + # insert zeros to the new audio source signal to shift it right + metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) + + # adjust the length of the audio source signal + delta_length = len(x.audio) - len(y.audio) + if delta_length > 0: + # pad zeros to the previous ISM signal(s) + metadata.trim_meta(y, limits=[0, -delta_length], samples=True) + else: + # pad zeros to the new audio source signal + metadata.trim_meta(x, limits=[0, delta_length], samples=True) + + y.audio = np.append(y.audio, x.audio, axis=1) + y.object_pos.extend(x.object_pos) + + # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) 
+ y.metadata_files.insert(i, str(output_filename.with_suffix(f".{i}.csv"))) + + # append pre-amble and post-amble + metadata.trim_meta(y, limits=[-cfg.preamble * 1000, -cfg.postamble * 1000]) - y.init_metadata() # this is needed to populate 'y.object_pos[]' + # add random noise + if cfg.add_low_level_random_noise: + # create uniformly distributed noise between -4 and 4 + np.random.seed(SEED_RANDOM_NOISE) + noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") + y.audio += noise + + # adjust the length of the output signal + if "duration" in cfg.__dict__: + # trim the output signal such that the total duration is X seconds + duration = int(cfg.duration * cfg.fs) # convert to samples + else: + # do not change the length of the audio signal + duration = len(y.audio) + duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms + if len(y.audio) != duration: + metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) - # write the OMASA output to .wav file in an interleaved format + # write the ISMn output to .wav file in an interleaved format and ISM metadata in .csv files audiofile.write(output_filename, y.audio, y.fs) + metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files) - # convert to ISM output to BINAURAL, if option was chosen + # convert to BINAURAL, if option was chosen if cfg.binaural_output: binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs -- GitLab From ac43b06b4280d512cf233a04da84a2876d1ef6fe Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Wed, 25 Jun 2025 13:03:25 +0200 Subject: [PATCH 22/42] add the duration field to the exemplary scene description files (.yml) --- examples/ITEM_GENERATION_3ISM.yml | 3 +++ examples/ITEM_GENERATION_FOA.yml | 3 +++ examples/ITEM_GENERATION_OMASA.yml | 3 +++ examples/ITEM_GENERATION_OSBA.yml | 3 +++ examples/ITEM_GENERATION_STEREO.yml | 3 +++ 5 files changed, 15 insertions(+) diff --git a/examples/ITEM_GENERATION_3ISM.yml b/examples/ITEM_GENERATION_3ISM.yml index 7b0cb27d..753e763e 100644 --- a/examples/ITEM_GENERATION_3ISM.yml +++ b/examples/ITEM_GENERATION_3ISM.yml @@ -27,6 +27,9 @@ binaural_output: true preamble: 0.0 postamble: 0.0 +### Trim the output such that the total duration is X seconds +duration: 8 + ### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence) add_low_level_random_noise: true diff --git a/examples/ITEM_GENERATION_FOA.yml b/examples/ITEM_GENERATION_FOA.yml index f94aadf2..2990fba6 100644 --- a/examples/ITEM_GENERATION_FOA.yml +++ b/examples/ITEM_GENERATION_FOA.yml @@ -27,6 +27,9 @@ loudness: -26 preamble: 0.5 postamble: 1.0 +### Trim the output such that the total duration is X seconds +duration: 8 + ### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence) add_low_level_random_noise: False diff --git a/examples/ITEM_GENERATION_OMASA.yml b/examples/ITEM_GENERATION_OMASA.yml index ecf3f33c..31529e1f 100644 --- a/examples/ITEM_GENERATION_OMASA.yml +++ b/examples/ITEM_GENERATION_OMASA.yml @@ -27,6 +27,9 @@ binaural_output: true preamble: 0.0 postamble: 0.0 +### Trim the output such that the total duration is X seconds +duration: 8 + ### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence) add_low_level_random_noise: true diff --git a/examples/ITEM_GENERATION_OSBA.yml b/examples/ITEM_GENERATION_OSBA.yml index 748a0ad9..f1b7c3ea 100644 --- a/examples/ITEM_GENERATION_OSBA.yml +++ 
b/examples/ITEM_GENERATION_OSBA.yml @@ -27,6 +27,9 @@ binaural_output: true preamble: 0.0 postamble: 0.0 +### Trim the output such that the total duration is X seconds +duration: 8 + ### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence) add_low_level_random_noise: true diff --git a/examples/ITEM_GENERATION_STEREO.yml b/examples/ITEM_GENERATION_STEREO.yml index c9c5a983..7c4391dc 100644 --- a/examples/ITEM_GENERATION_STEREO.yml +++ b/examples/ITEM_GENERATION_STEREO.yml @@ -27,6 +27,9 @@ loudness: -26 preamble: 0.5 postamble: 1.0 +### Trim the output such that the total duration is X seconds +duration: 8 + ### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence) add_low_level_random_noise: true -- GitLab From c1ec9329128b3a4788d2cb27fed6606481a764f7 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Wed, 25 Jun 2025 14:43:47 +0200 Subject: [PATCH 23/42] refactoring to use the trim_meta() function --- .../generation/generate_omasa_items.py | 320 +++++++++++------- 1 file changed, 191 insertions(+), 129 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index b51375df..8f890332 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -38,7 +38,7 @@ from pathlib import Path import numpy as np -from ivas_processing_scripts.audiotools import audio, audioarray, audiofile +from ivas_processing_scripts.audiotools import audio, audioarray, audiofile, metadata from ivas_processing_scripts.audiotools.convert.omasa import convert_omasa from ivas_processing_scripts.audiotools.convert.scenebased import render_sba_to_masa from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm @@ -197,34 +197,62 @@ def generate_OMASA_scene( # initialize output OMASA object y = audio.OMASAAudio(omasa_format) + y.fs = cfg.fs + + # set the frame length + frame_len = int(cfg.fs / 50) # repeat for all source files + offset = 0 for i in range(N_inputs): # parse parameters from the scene description source_file = ( scene["input"][i] if isinstance(scene["input"], list) else scene["input"] ) - source_azi = ( - scene["azimuth"][i] - if isinstance(scene["azimuth"], list) - else scene["azimuth"] - ) - source_ele = ( - scene["elevation"][i] - if isinstance(scene["elevation"], list) - else scene["elevation"] + + # get input filename + input_filename = Path(source_file).parent / ( + cfg.use_input_prefix + Path(source_file).name ) - # read the shift length + # read azimuth and elevation information + if "azimuth" in scene.keys(): + source_azi = ( + scene["azimuth"][i] if isinstance(scene["azimuth"], list) else scene["azimuth"] + ) + else: + source_azi = 0.0 + + if "elevation" in scene.keys(): + source_ele = ( + scene["elevation"][i] if isinstance(scene["elevation"], list) else scene["elevation"] + ) + else: + source_ele = 0.0 + + # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( - scene["shift"][i] - if isinstance(scene["shift"], list) - else scene["shift"] + scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] ) else: source_shift = 0.0 + # convert overlap to samples and ensure it is a multiple of 20ms + source_shift = source_shift * cfg.fs + if source_shift >= 0: + source_shift = int(np.floor(source_shift / frame_len) * frame_len) + else: + source_shift = int(np.ceil(source_shift / 
frame_len) * frame_len)
+
+        # read the level
+        if "level" in scene.keys():
+            level = (
+                scene["level"][i] if isinstance(scene["level"], list) else scene["level"]
+            )
+        else:
+            level = -26
+
         # read the level
         if "level" in scene.keys():
             level = (
@@ -237,11 +265,6 @@ def generate_OMASA_scene(
 
         logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}")
 
-        # get input filename
-        input_filename = Path(source_file).parent / (
-            cfg.use_input_prefix + Path(source_file).name
-        )
-
         # get the number of channels from the .wav file header
         wav_header = audiofile.parse_wave_header(input_filename)
         N_channels = wav_header["channels"]
@@ -257,153 +280,194 @@ def generate_OMASA_scene(
         elif N_channels == 16:
             fmt = "HOA3"
         else:
-            logger.info(
+            logger.error(
                 f"Error: Input format of the source file with {N_channels} channels is not supported!"
             )
             sys.exit(-1)
 
-        if fmt in ["FOA", "HOA2", "HOA3"]:
-            # generate MASA metadata .met filename (should end with .met)
-            y.metadata_files.append(output_filename.with_suffix(".met"))
-        elif fmt == "MONO":
-            # generate ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...)
-            y.metadata_files.insert(i - 1, output_filename.with_suffix(f".{i-1}.csv"))
-
         # read source file
         x = audio.fromfile(fmt, input_filename, fs=cfg.fs)
 
+        # resample to the target fs if necessary
+        if x.fs != cfg.fs:
+            logger.warning(
+                f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!"
+            )
+            resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs)
+            x.audio = resampled_audio
+            x.fs = cfg.fs
+
         # adjust the level of the source file
         if fmt in ["FOA", "HOA2", "HOA3"]:
             x.audio, _ = loudness_norm(x, level, loudness_format="STEREO", rms=True)
         else:
             x.audio, _ = loudness_norm(x, level, loudness_format="MONO")
 
-        # shift the source signal (positive shift creates overlap, negative shift creates a gap)
-        if int(floor(-source_shift)) != 0:
-            x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0])
+        # pad with zeros to ensure that the length of the audio source signal is a multiple of 20ms
+        if len(x.audio) % frame_len != 0:
+            N_pad = int(frame_len - len(x.audio) % frame_len)
+            x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)
 
         # get the number of frames (multiple of 20ms)
-        frame_len = int(x.fs / 50)
         N_frames = int(len(x.audio) / frame_len)
 
-        # pad with zeros to ensure that the signal length is a multiple of 20ms
-        if len(x.audio) % frame_len != 0:
-            # pad the source signal
-            N_pad = int(frame_len - len(x.audio) % frame_len)
-            x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)
-
-        # convert FOA/HOA2/HOA3 to MASA
+        # convert input audio source signal to MASA or ISM
         if fmt in ["FOA", "HOA2", "HOA3"]:
+            # convert FOA/HOA2/HOA3 to MASA
             x_masa = audio.MetadataAssistedSpatialAudio(
                 f"MASA{cfg.masa_tc}DIR{cfg.masa_dirs}"
             )
-            x_masa.metadata_file = y.metadata_files[i]
+            x_masa.fs = cfg.fs
+            # generate MASA metadata filename (should end with .met)
+            x_masa.metadata_file = output_filename.with_suffix(".met")
             render_sba_to_masa(x, x_masa)
-            y.audio = x_masa.audio
-            y.fs = x.fs
-        else:
-            # pad ISM signal with zeros to have the same length as the MASA signal
-            N_pad = y.audio.shape[0] - x.audio.shape[0]
-            if N_pad != 0:
-                x.audio = audioarray.trim(
-                    x.audio, x.fs, limits=[0, -N_pad], samples=True
-                )
-
-            # append ISM signal to the OMASA object (ISM comes first !!!)
-            y.audio = np.insert(y.audio, [i - 1], x.audio, axis=1)
+            x = x_masa  # replace x with the MASA object
+        elif fmt == "MONO":
+            # convert MONO to ISM1
+            x_ism = audio.ObjectBasedAudio("ISM1")  # ISM with 1 channel
+            x_ism.fs = cfg.fs
+            x_ism.audio = x.audio.copy()
+
+            # convert azimuth information in case of a moving object
+            if isinstance(source_azi, str):
+                if ":" in source_azi:
+                    # convert into array (initial_value:step:stop_value)
+                    start_str, step_str, stop_str = source_azi.split(":")
+                    start = float(eval(start_str))
+                    step = float(eval(step_str))
+                    stop = float(eval(stop_str))
+                    azi = np.arange(start, stop, step)
+
+                    # adjust length to N_frames
+                    if len(azi) > N_frames:
+                        azi = azi[:N_frames]
+                    elif len(azi) < N_frames:
+                        azi = np.append(azi, np.full(N_frames - len(azi), azi[-1]))
+                else:
+                    # replicate static azimuth value N_frames times
+                    azi = np.repeat(float(eval(source_azi)), N_frames)
+            else:
+                # replicate static azimuth value N_frames times
+                azi = np.repeat(float(source_azi), N_frames)
 
-    # append pre-amble and post-amble to all sources
-    y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble])
+            # convert azimuth from 0 .. 360 to -180 .. +180
+            azi = (azi + 180) % 360 - 180
 
-    # add random noise
-    if cfg.add_low_level_random_noise:
-        # create uniformly distributed noise between -4 and 4
-        np.random.seed(SEED_RANDOM_NOISE)
-        noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")
+            # check if azimuth is from -180 .. +180
+            if any(azi > 180) or any(azi < -180):
+                logger.error(
+                    f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}"
+                )
 
-        # superimpose
-        y.audio += noise
+            # convert elevation information in case of a moving object
+            if isinstance(source_ele, str):
+                if ":" in source_ele:
+                    # convert into array (initial_value:step:stop_value)
+                    start_str, step_str, stop_str = source_ele.split(":")
+                    start = float(eval(start_str))
+                    step = float(eval(step_str))
+                    stop = float(eval(stop_str))
+                    ele = np.arange(start, stop, step)
+
+                    # adjust length to N_frames
+                    if len(ele) > N_frames:
+                        ele = ele[:N_frames]
+                    elif len(ele) < N_frames:
+                        ele = np.append(ele, np.full(N_frames - len(ele), ele[-1]))
+
+                else:
+                    # replicate static elevation value N_frames times
+                    ele = np.repeat(float(eval(source_ele)), N_frames)
+            else:
+                # replicate static elevation value N_frames times
+                ele = np.repeat(float(source_ele), N_frames)
 
-    # generate ISM metadata files
-    for i in range(1, N_ISMs + 1):
-        # parse metadata parameters from the scene description
-        source_azi = (
-            scene["azimuth"][i]
-            if isinstance(scene["azimuth"], list)
-            else scene["azimuth"]
-        )
-        source_ele = (
-            scene["elevation"][i]
-            if isinstance(scene["elevation"], list)
-            else scene["elevation"]
-        )
+            # wrap elevation angle to -90 .. +90
+            ele = ((ele + 90) % 180) - 90
 
-        N_frames = int(np.rint((len(y.audio) / y.fs * 50)))
-
-        # read azimuth information and convert to an array
-        if isinstance(source_azi, str):
-            if ":" in source_azi:
-                # start with the initial azimuth value and apply step N_frames times
-                source_azi = source_azi.split(":")
-                azi = np.arange(
-                    float(eval(source_azi[0])),
-                    float(eval(source_azi[0])) + N_frames * float(eval(source_azi[1])),
-                    float(eval(source_azi[1])),
+            # check if elevation is from -90 .. 
+90 + if any(ele > 90) or any(ele < -90): + logger.error( + f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}" ) - else: - # replicate static azimuth value N_frames times - azi = np.repeat(float(eval(source_azi)), N_frames) - else: - # replicate static azimuth value N_frames times - azi = np.repeat(float(source_azi), N_frames) - # convert azimuth from 0 .. 360 to -180 .. +180 - azi = (azi + 180) % 360 - 180 + # generate radius vector with all values equal to 1.0 + rad = np.ones(N_frames) - # check if azimuth is from -180 .. +180 - if any(azi > 180) or any(azi < -180): - logger.error( - f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}" - ) + # arrange all metadata fields column-wise into a matrix + x_ism.object_pos.append(np.column_stack((azi, ele, rad))) + + x = x_ism # replace x with the ISM object - # read elevation information and convert to an array - if isinstance(source_ele, str): - if ":" in source_ele: - # convert into array (initial_value:step:stop_value) - # note: the stop_value value is +-90 degrees depending on the sign of the step - source_ele = source_ele.split(":") - ele = np.arange( - float(eval(source_ele[0])), - np.sign(float(eval(source_ele[1]))) * 90, - float(eval(source_ele[1])), - )[:N_frames] - - # repeat the last elevation value, if array is shorter than N_frames - if len(ele) < N_frames: - ele = np.append(ele, np.full(N_frames - len(ele), ele[-1])) + # copy new audio source signal to the OMASA object + if y.audio is None: + # add the first audio source signal (should be MASA) to the array of all source signals + y.audio = x.audio.copy() + + if "MASA" in x.name: + # if MASA, append metadata file to the OMASA object + y.metadata_files.append(x.metadata_file) else: - # replicate static elevation value N_frames times - ele = np.repeat(float(eval(source_ele)), N_frames) + # if ISM, append metadata file to the OMASA object + y.object_pos = x.object_pos.copy() + + # if source_shift < 0: + # # insert zeros to the new audio source signal to shift it right + # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) + offset = source_shift else: - # replicate static elevation value N_frames times - ele = np.repeat(float(source_ele), N_frames) + # shift the beginning of the audio source signal + delta_offset = source_shift - offset + if delta_offset > 0: + # insert zeros to the previous ISM signal(s) to shift them right + metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True) + offset = source_shift + else: + # insert zeros to the new audio source signal to shift it right + metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) + + # adjust the length of the audio source signal + delta_length = len(x.audio) - len(y.audio) + if delta_length > 0: + # pad zeros to the previous ISM signal(s) + metadata.trim_meta(y, limits=[0, -delta_length], samples=True) + else: + # pad zeros to the new audio source signal + metadata.trim_meta(x, limits=[0, delta_length], samples=True) - # check if elevation is from -90 .. +90 - if any(ele > 90) or any(ele < -90): - logger.error( - f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}" - ) + # append ISM signal to the OMASA object (ISM comes first !!!) + y.audio = np.insert(y.audio, [i - 1], x.audio, axis=1) + y.object_pos.extend(x.object_pos) - # arrange all metadata fields column-wise into a matrix - x_meta = np.column_stack((azi, ele)) + # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) 
+ y.metadata_files.insert(i-1, str(output_filename.with_suffix(f".{i-1}.csv"))) - # write to .csv output metadata file - np.savetxt(y.metadata_files[i - 1], x_meta, fmt="%0.2f", delimiter=",", encoding="utf-8") + # append pre-amble and post-amble + metadata.trim_meta(y, limits=[-cfg.preamble * 1000, -cfg.postamble * 1000]) - y.init_metadata() # this is needed to populate 'y.object_pos[]' + # add random noise + if cfg.add_low_level_random_noise: + # create uniformly distributed noise between -4 and 4 + np.random.seed(SEED_RANDOM_NOISE) + noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") + y.audio += noise - # write the OMASA output to .wav file in an interleaved format + # adjust the length of the output signal + if "duration" in cfg.__dict__: + # trim the output signal such that the total duration is X seconds + duration = int(cfg.duration * cfg.fs) # convert to samples + else: + # do not change the length of the audio signal + duration = len(y.audio) + duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms + if len(y.audio) != duration: + metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) + + # write the ISMn output to .wav file in an interleaved format and ISM metadata in .csv files audiofile.write(output_filename, y.audio, y.fs) + metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files[:-1]) # convert to OMASA output to BINAURAL, if option was chosen if cfg.binaural_output: @@ -418,5 +482,3 @@ def generate_OMASA_scene( binaudio.audio, binaudio.fs, ) - - return -- GitLab From 8134591d8809454c96f483c9fe06636e9f1c2bb8 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Wed, 25 Jun 2025 16:42:47 +0200 Subject: [PATCH 24/42] corrections and cleanup --- .../generation/generate_omasa_items.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 8f890332..565cba19 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -33,7 +33,6 @@ import logging import sys from itertools import groupby, repeat -from math import floor from pathlib import Path import numpy as np @@ -245,14 +244,6 @@ def generate_OMASA_scene( else: source_shift = int(np.ceil(source_shift / frame_len) * frame_len) - # read the level - if "level" in scene.keys(): - level = ( - scene["level"][i] if isinstance(scene["level"], list) else scene["level"] - ) - else: - level = -26 - # read the level if "level" in scene.keys(): level = ( @@ -313,7 +304,7 @@ def generate_OMASA_scene( # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / frame_len) - # convert input audio source signal to MASA or ISM + # convert the input audio source signal to MASA or ISM if fmt in ["FOA", "HOA2", "HOA3"]: # convert FOA/HOA2/HOA3 to MASA x_masa = audio.MetadataAssistedSpatialAudio( @@ -410,7 +401,7 @@ def generate_OMASA_scene( # if MASA, append metadata file to the OMASA object y.metadata_files.append(x.metadata_file) else: - # if ISM, append metadata file to the OMASA object + # if ISM, append object position to the OMASA object y.object_pos = x.object_pos.copy() # if source_shift < 0: @@ -442,7 +433,7 @@ def generate_OMASA_scene( y.object_pos.extend(x.object_pos) # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) 
- y.metadata_files.insert(i-1, str(output_filename.with_suffix(f".{i-1}.csv"))) + y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv"))) # append pre-amble and post-amble metadata.trim_meta(y, limits=[-cfg.preamble * 1000, -cfg.postamble * 1000]) @@ -465,7 +456,7 @@ def generate_OMASA_scene( if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) - # write the ISMn output to .wav file in an interleaved format and ISM metadata in .csv files + # write the OMASA audio output to .wav file in an interleaved format and ISM metadata in .csv files audiofile.write(output_filename, y.audio, y.fs) metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files[:-1]) -- GitLab From 661bc864bc586a0b712c7a3e59457ed6570d1c13 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Wed, 25 Jun 2025 16:43:23 +0200 Subject: [PATCH 25/42] refactoring to use trim_meta() function --- .../generation/generate_osba_items.py | 265 +++++++++++------- 1 file changed, 159 insertions(+), 106 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index b344abaa..64b5a030 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -38,7 +38,7 @@ from pathlib import Path import numpy as np -from ivas_processing_scripts.audiotools import audio, audioarray, audiofile +from ivas_processing_scripts.audiotools import audio, audioarray, audiofile, metadata from ivas_processing_scripts.audiotools.convert.osba import convert_osba from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm from ivas_processing_scripts.generation import config @@ -195,13 +195,25 @@ def generate_OSBA_scene( # initialize output OSBA object y = audio.OSBAAudio(osba_format) + y.fs = cfg.fs + + # set the frame length + frame_len = int(cfg.fs / 50) # repeat for all source files + offset = 0 for i in range(N_inputs): # parse parameters from the scene description source_file = ( scene["input"][i] if isinstance(scene["input"], list) else scene["input"] ) + + # get input filename + input_filename = Path(source_file).parent / ( + cfg.use_input_prefix + Path(source_file).name + ) + + # read azimuth and elevation information source_azi = ( scene["azimuth"][i] if isinstance(scene["azimuth"], list) @@ -223,6 +235,13 @@ def generate_OSBA_scene( else: source_shift = 0.0 + # convert overlap to samples and ensure it is a multiple of 20ms + source_shift = source_shift * cfg.fs + if source_shift >= 0: + source_shift = int(np.floor(source_shift / frame_len) * frame_len) + else: + source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + # read the level if "level" in scene.keys(): level = ( @@ -260,140 +279,176 @@ def generate_OSBA_scene( ) sys.exit(-1) - if fmt == "MONO": - # generate ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) - y.metadata_files.insert(i - 1, f"{output_filename}.{i-1}.csv") - # read source file x = audio.fromfile(fmt, input_filename, fs=cfg.fs) + # resample to the target fs if necessary + if x.fs != cfg.fs: + logger.warning( + f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!" 
---
 .../generation/generate_osba_items.py | 265 +++++++++++-------
 1 file changed, 159 insertions(+), 106 deletions(-)

diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py
index b344abaa..64b5a030 100644
--- a/ivas_processing_scripts/generation/generate_osba_items.py
+++ b/ivas_processing_scripts/generation/generate_osba_items.py
@@ -38,7 +38,7 @@ from pathlib import Path
 
 import numpy as np
 
-from ivas_processing_scripts.audiotools import audio, audioarray, audiofile
+from ivas_processing_scripts.audiotools import audio, audioarray, audiofile, metadata
 from ivas_processing_scripts.audiotools.convert.osba import convert_osba
 from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm
 from ivas_processing_scripts.generation import config
@@ -195,13 +195,25 @@ def generate_OSBA_scene(
 
     # initialize output OSBA object
     y = audio.OSBAAudio(osba_format)
+    y.fs = cfg.fs
+
+    # set the frame length
+    frame_len = int(cfg.fs / 50)
 
     # repeat for all source files
+    offset = 0
     for i in range(N_inputs):
         # parse parameters from the scene description
         source_file = (
             scene["input"][i] if isinstance(scene["input"], list) else scene["input"]
         )
+
+        # get input filename
+        input_filename = Path(source_file).parent / (
+            cfg.use_input_prefix + Path(source_file).name
+        )
+
+        # read azimuth and elevation information
         source_azi = (
             scene["azimuth"][i]
             if isinstance(scene["azimuth"], list)
@@ -223,6 +235,13 @@
         else:
             source_shift = 0.0
 
+        # convert overlap to samples and ensure it is a multiple of 20ms
+        source_shift = source_shift * cfg.fs
+        if source_shift >= 0:
+            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
+        else:
+            source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
+
         # read the level
         if "level" in scene.keys():
             level = (
@@ -260,140 +279,176 @@
             )
             sys.exit(-1)
 
-        if fmt == "MONO":
-            # generate ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...)
-            y.metadata_files.insert(i - 1, f"{output_filename}.{i-1}.csv")
-
         # read source file
         x = audio.fromfile(fmt, input_filename, fs=cfg.fs)
 
+        # resample to the target fs if necessary
+        if x.fs != cfg.fs:
+            logger.warning(
+                f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!"
+            )
+            resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs)
+            x.audio = resampled_audio
+            x.fs = cfg.fs
+
         # adjust the level of the source file
         if fmt in ["FOA", "HOA2", "HOA3"]:
             x.audio, _ = loudness_norm(x, level, loudness_format="STEREO", rms=True)
         else:
             x.audio, _ = loudness_norm(x, level, loudness_format="MONO")
 
-        # shift the source signal (positive shift creates overlap, negative shift creates a gap)
-        if int(floor(-source_shift)) != 0:
-            x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0])
+        # ensure the length of the audio source signal is a multiple of 20ms
+        if len(x.audio) % frame_len != 0:
+            # pad with zeros to ensure that the signal length is a multiple of 20ms
+            if len(x.audio) % frame_len != 0:
+                N_pad = int(frame_len - len(x.audio) % frame_len)
+                x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)
 
         # get the number of frames (multiple of 20ms)
-        frame_len = int(x.fs / 50)
         N_frames = int(len(x.audio) / frame_len)
 
-        # pad with zeros to ensure that the signal length is a multiple of 20ms
-        if len(x.audio) % frame_len != 0:
-            # pad the source signal
-            N_pad = int(frame_len - len(x.audio) % frame_len)
-            x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)
+        # convert the input audio source signal to ISM
+        if fmt == "MONO":
+            # convert MONO to ISM1
+            x_ism = audio.ObjectBasedAudio("ISM1")  # ISM with 1 channel
+            x_ism.fs = cfg.fs
+            x_ism.audio = x.audio.copy()
+
+            # convert azimuth information in case of moving object
+            if isinstance(source_azi, str):
+                if ":" in source_azi:
+                    # convert into array (initial_value:step:stop_value)
+                    start_str, step_str, stop_str = source_azi.split(":")
+                    start = float(eval(start_str))
+                    step = float(eval(step_str))
+                    stop = float(eval(stop_str))
+                    azi = np.arange(start, stop, step)
+
+                    # adjust length to N_frames
+                    if len(azi) > N_frames:
+                        azi = azi[:N_frames]
+                    elif len(azi) < N_frames:
+                        azi = np.append(azi, np.full(N_frames - len(azi), azi[-1]))
+                else:
+                    # replicate static azimuth value N_frames times
+                    azi = np.repeat(float(eval(source_azi)), N_frames)
+            else:
+                # replicate static azimuth value N_frames times
+                azi = np.repeat(float(source_azi), N_frames)
 
-        if fmt in ["FOA", "HOA2", "HOA3"]:
-            # copy FOA/HOA2/HOA3 signal to the OSBA oject
-            y.audio = x.audio
-            y.fs = x.fs
-        else:
-            # pad ISM signal with zeros to have the same length as the SBA signal
-            N_pad = y.audio.shape[0] - x.audio.shape[0]
-            if N_pad != 0:
-                x.audio = audioarray.trim(
-                    x.audio, x.fs, limits=[0, -N_pad], samples=True
+            # convert azimuth from 0 .. 360 to -180 .. +180
+            azi = (azi + 180) % 360 - 180
+
+            # check if azimuth is from -180 .. +180
+            if any(azi > 180) or any(azi < -180):
+                logger.error(
+                    f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}"
                 )
-            # append ISM signal to the OSBA object (ISM comes first !!!)
-            y.audio = np.insert(y.audio, [i - 1], x.audio, axis=1)
+            # convert elevation information in case of moving object
+            if isinstance(source_ele, str):
+                if ":" in source_ele:
+                    # convert into array (initial_value:step:stop_value)
+                    start_str, step_str, stop_str = source_ele.split(":")
+                    start = float(eval(start_str))
+                    step = float(eval(step_str))
+                    stop = float(eval(stop_str))
+                    ele = np.arange(start, stop, step)
+
+                    # adjust length to N_frames
+                    if len(ele) > N_frames:
+                        ele = ele[:N_frames]
+                    elif len(ele) < N_frames:
+                        ele = np.append(ele, np.full(N_frames - len(ele), ele[-1]))
+
+                else:
+                    # replicate static elevation value N_frames times
+                    ele = np.repeat(float(eval(source_ele)), N_frames)
+            else:
+                # replicate static elevation value N_frames times
+                ele = np.repeat(float(source_ele), N_frames)
 
-    # append pre-amble and post-amble to all sources
-    y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble])
+            # wrap elevation angle to -90 .. +90
+            ele = ((ele + 90) % 180) - 90
 
-    # add random noise
-    if cfg.add_low_level_random_noise:
-        # create uniformly distributed noise between -4 and 4
-        np.random.seed(SEED_RANDOM_NOISE)
-        noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")
+            # check if elevation is from -90 .. +90
+            if any(ele > 90) or any(ele < -90):
+                logger.error(
+                    f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}"
                 )
 
-        # superimpose
-        y.audio += noise
+            # generate radius vector with all values equal to 1.0
+            rad = np.ones(N_frames)
 
-    # generate ISM metadata files
-    for i in range(1, N_ISMs + 1):
-        # parse metadata parameters from the scene description
-        source_azi = (
-            scene["azimuth"][i]
-            if isinstance(scene["azimuth"], list)
-            else scene["azimuth"]
-        )
-        source_ele = (
-            scene["elevation"][i]
-            if isinstance(scene["elevation"], list)
-            else scene["elevation"]
-        )
+            # arrange all metadata fields column-wise into a matrix
+            x_ism.object_pos.append(np.column_stack((azi, ele, rad)))
 
-        N_frames = int(np.rint((len(y.audio) / y.fs * 50)))
-
-        # read azimuth information and convert to an array
-        if isinstance(source_azi, str):
-            if ":" in source_azi:
-                # start with the initial azimuth value and apply step N_frames times
-                source_azi = source_azi.split(":")
-                azi = np.arange(
-                    float(eval(source_azi[0])),
-                    float(eval(source_azi[0])) + N_frames * float(eval(source_azi[1])),
-                    float(eval(source_azi[1])),
-                )
-            else:
-                # replicate static azimuth value N_frames times
-                azi = np.repeat(float(eval(source_azi)), N_frames)
-        else:
-            # replicate static azimuth value N_frames times
-            azi = np.repeat(float(source_azi), N_frames)
+            x = x_ism  # replace x with the ISM object
 
-        # convert azimuth from 0 .. 360 to -180 .. +180
-        azi = (azi + 180) % 360 - 180
+        # copy new audio source signal to the OSBA object
+        if y.audio is None:
+            # add the first audio source signal (should be FOA/HOA2/HOA3) to the array of all source signals
+            y.audio = x.audio.copy()
 
-        # check if azimuth is from -180 .. +180
-        if any(azi > 180) or any(azi < -180):
-            logger.error(
-                f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}"
-            )
+            if fmt == "MONO":
+                # if ISM, append object position to the OSBA object
+                y.object_pos = x.object_pos.copy()
 
-        # read elevation information and convert to an array
-        if isinstance(source_ele, str):
-            if ":" in source_ele:
-                # convert into array (initial_value:step:stop_value)
-                # note: the stop_value value is +-90 degrees depending on the sign of the step
-                source_ele = source_ele.split(":")
-                ele = np.arange(
-                    float(eval(source_ele[0])),
-                    np.sign(float(eval(source_ele[1]))) * 90,
-                    float(eval(source_ele[1])),
-                )[:N_frames]
-
-                # repeat the last elevation value, if array is shorter than N_frames
-                if len(ele) < N_frames:
-                    ele = np.append(ele, np.full(N_frames - len(ele), ele[-1]))
-            else:
-                # replicate static elevation value N_frames times
-                ele = np.repeat(float(eval(source_ele)), N_frames)
+            # if source_shift < 0:
+            #     # insert zeros to the new audio source signal to shift it right
+            #     metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
+            offset = source_shift
         else:
-            # replicate static elevation value N_frames times
-            ele = np.repeat(float(source_ele), N_frames)
+            # shift the beginning of the audio source signal
+            delta_offset = source_shift - offset
+            if delta_offset > 0:
+                # insert zeros to the previous ISM signal(s) to shift them right
+                metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True)
+                offset = source_shift
+            else:
+                # insert zeros to the new audio source signal to shift it right
+                metadata.trim_meta(x, limits=[delta_offset, 0], samples=True)
+
+            # adjust the length of the audio source signal
+            delta_length = len(x.audio) - len(y.audio)
+            if delta_length > 0:
+                # pad zeros to the previous ISM signal(s)
+                metadata.trim_meta(y, limits=[0, -delta_length], samples=True)
+            else:
+                # pad zeros to the new audio source signal
+                metadata.trim_meta(x, limits=[0, delta_length], samples=True)
 
-        # check if elevation is from -90 .. +90
-        if any(ele > 90) or any(ele < -90):
-            logger.error(
-                f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}"
-            )
+            # append ISM signal to the OSBA object (ISM comes first !!!)
+            y.audio = np.insert(y.audio, [i - 1], x.audio, axis=1)
+            y.object_pos.extend(x.object_pos)
 
-        # arrange all metadata fields column-wise into a matrix
-        x_meta = np.column_stack((azi, ele))
+        # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...)
+        y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")))
 
-        # write to .csv output metadata file
-        np.savetxt(y.metadata_files[i - 1], x_meta, fmt="%0.2f", delimiter=",", encoding="utf-8")
+    # append pre-amble and post-amble
+    metadata.trim_meta(y, limits=[-cfg.preamble * 1000, -cfg.postamble * 1000])
 
-    y.init_metadata()  # this is needed to populate 'y.object_pos[]'
+    # add random noise
+    if cfg.add_low_level_random_noise:
+        # create uniformly distributed noise between -4 and 4
+        np.random.seed(SEED_RANDOM_NOISE)
+        noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")
+        y.audio += noise
 
-    # write the OSBA output to .wav file in an interleaved format
+    # adjust the length of the output signal
+    if "duration" in cfg.__dict__:
+        # trim the output signal such that the total duration is X seconds
+        duration = int(cfg.duration * cfg.fs)  # convert to samples
+    else:
+        # do not change the length of the audio signal
+        duration = len(y.audio)
+    duration = int(np.floor(duration / frame_len) * frame_len)  # ensure multiple of 20ms
+    if len(y.audio) != duration:
+        metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)
+
+    # write the OSBA audio output to .wav file in an interleaved format and ISM metadata in .csv files
     audiofile.write(output_filename, y.audio, y.fs)
+    metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files)
 
     # convert the OSBA output to BINAURAL, if option was chosen
     if cfg.binaural_output:
@@ -408,5 +463,3 @@
             binaudio.audio,
             binaudio.fs,
         )
-
-    return
-- 
GitLab


From c88f0b6d53938ce19f90e14c257f31cc5e3cf3c6 Mon Sep 17 00:00:00 2001
From: Vladimir Malenovsky
Date: Wed, 25 Jun 2025 17:51:31 +0200
Subject: [PATCH 26/42] refactor to use source shifts
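
Each source can now be shifted relative to the start of the item. Shifts are
given in seconds and are rounded toward zero onto the 20 ms frame grid before
use. A minimal sketch of that convention (illustrative only; fs in Hz):

    import numpy as np

    def align_shift_to_frames(shift_s: float, fs: int) -> int:
        frame_len = int(fs / 50)  # 20 ms frame in samples
        shift = shift_s * fs
        frames = np.floor(shift / frame_len) if shift >= 0 else np.ceil(shift / frame_len)
        return int(frames * frame_len)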
---
 .../generation/generate_stereo_items.py | 119 ++++++++++++------
 1 file changed, 83 insertions(+), 36 deletions(-)

diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py
index d97a4c7b..7d079c0c 100644
--- a/ivas_processing_scripts/generation/generate_stereo_items.py
+++ b/ivas_processing_scripts/generation/generate_stereo_items.py
@@ -33,7 +33,6 @@ import logging
 import os
 from itertools import groupby, repeat
-from math import floor
 from pathlib import Path
 
 import numpy as np
@@ -202,22 +201,37 @@ def generate_stereo_scene(
     # extract the number of audio sources
     N_inputs = len(np.atleast_1d(scene["input"]))
 
-    # initialize output dirs
+    # get the output filename
     output_filename = Path(scene["output"]).parent / (
         cfg.use_output_prefix + Path(scene["output"]).name
     )
 
+    # initialize output dirs
     dir_path = output_filename.parent
     if dir_path and not dir_path.exists():
         dir_path.mkdir(parents=True, exist_ok=True)
 
-    # initialize output audio object
+    # initialize output STEREO object
     y = audio.ChannelBasedAudio(cfg.format)
+    y.fs = cfg.fs
+
+    # set the frame length
+    frame_len = int(cfg.fs / 50)
 
     # repeat for all source files
+    offset = 0
     for i in range(N_inputs):
         # parse parameters from the scene description
-        source_file = np.atleast_1d(scene["input"])[i]
-        IR_file = np.atleast_1d(scene["IR"])[i]
+        source_file = (
+            scene["input"][i] if isinstance(scene["input"], list) else scene["input"]
+        )
+        IR_file = (
+            scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"]
+        )
+
+        # get input filename and IR filename
+        input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name)
+        IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)
 
         # read the overlap length
         if "shift" in scene.keys():
@@ -229,6 +243,13 @@
         else:
             source_shift = 0.0
 
+        # convert overlap to samples and ensure it is a multiple of 20ms
+        source_shift = source_shift * cfg.fs
+        if source_shift >= 0:
+            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
+        else:
+            source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
+
         # read the level
         if "level" in scene.keys():
             level = (
@@ -241,62 +262,88 @@
 
         logger.info(f"Convolving {source_file} with {IR_file}")
 
-        # get input filename and IR filename
-        input_filename = Path(source_file).parent / (
-            cfg.use_input_prefix + Path(source_file).name
-        )
-        IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)
-
         # read source file
-        x = audio.fromfile("MONO", input_filename, fs=cfg.fs)
+        x = audio.fromfile("MONO", input_filename)
+
+        # resample to the target fs if necessary
+        if x.fs != cfg.fs:
+            logger.warning(
+                f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!"
+            )
+            resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs)
+            x.audio = resampled_audio
+            x.fs = cfg.fs
 
         # read the IR file (!must be in STEREO format!)
-        IR = audio.fromfile("STEREO", IR_filename, fs=cfg.IR_fs)
+        IR = audio.fromfile("STEREO", IR_filename)
 
-        # convolve mono source signal with stereo IR
+        # convolve MONO source audio with STEREO IR -> results in STEREO audio object
        x = reverb_stereo(x, IR)
 
         # adjust the level of the stereo signal
         x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")
 
-        # shift the source signal (positive shift creates overlap, negative shift creates a gap)
-        if int(floor(-source_shift)) != 0:
-            x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0])
-
-        # get the number of frames (multiple of 20ms)
-        frame_len = int(x.fs / 50)
-
-        # pad with zeros to ensure that the signal length is a multiple of 20ms
+        # ensure the length of the audio source signal is a multiple of 20ms
         if len(x.audio) % frame_len != 0:
-            N_pad = int(frame_len - len(x.audio) % frame_len)
-            x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)
+            # pad with zeros to ensure that the signal length is a multiple of 20ms
+            if len(x.audio) % frame_len != 0:
+                N_pad = int(frame_len - len(x.audio) % frame_len)
+                x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)
 
+        # add the convolved STEREO audio source signal to the output signal
         if y.audio is None:
             # add source signal to the array of all source signals
             y.audio = x.audio.copy()
-            y.fs = x.fs
+
+            # if source_shift < 0:
+            #     # insert zeros to the new audio source signal to shift it right
+            #     metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
+            offset = source_shift
         else:
-            # pad the signal with zeros to have the same length as the previous signal(s)
-            N_pad = y.audio.shape[0] - x.audio.shape[0]
-            if N_pad != 0:
-                x.audio = audioarray.trim(
-                    x.audio, x.fs, limits=[0, -N_pad], samples=True
-                )
+            # shift the beginning of the audio source signal
+            delta_offset = source_shift - offset
+            if delta_offset > 0:
+                # insert zeros to the existing output signal to shift it right
+                y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_offset], samples=True)
+                offset = source_shift
+            else:
+                # insert zeros to the new audio source signal to shift it right
+                x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_offset], samples=True)
+
+            # adjust the length of the audio source signal
+            delta_length = len(x.audio) - len(y.audio)
+            if delta_length > 0:
+                # pad zeros to the existing output signal
+                y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_length], samples=True)
+            else:
+                # pad zeros to the new audio source signal
+                x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_length], samples=True)
 
             # superimpose
             y.audio += x.audio
 
-    # append pre-amble and post-amble to all sources
-    y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble])
+    # append pre-amble and post-amble
+    preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
+    postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
+    y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True)
 
     # add random noise
     if cfg.add_low_level_random_noise:
         # create uniformly distributed noise between -4 and 4
         np.random.seed(SEED_RANDOM_NOISE)
         noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")
-
-        # superimpose
         y.audio += noise
 
-    # write the output STEREO audio signal into output file
+    # adjust the length of the output signal
+    if "duration" in cfg.__dict__:
+        # trim the output signal such that the total duration is X seconds
+        duration = int(cfg.duration * cfg.fs)  # convert to samples
+    else:
+        # do not change the length of the audio signal
+        duration = len(y.audio)
+    duration = int(np.floor(duration / frame_len) * frame_len)  # ensure multiple of 20ms
+    if len(y.audio) != duration:
+        y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True)
+
+    # write the STEREO audio signal into output file
     audiofile.write(output_filename, y.audio, y.fs)
-- 
GitLab


From b323333d753738e6e9e73086844c1dd8eabcacf1 Mon Sep 17 00:00:00 2001
From: Vladimir Malenovsky
Date: Wed, 25 Jun 2025 17:52:35 +0200
Subject: [PATCH 27/42] adjust pre-amble and post-amble to 20ms boundary
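
The pre-amble and post-amble are specified in seconds; they are now converted
to samples and snapped down to a whole number of 20 ms frames before the
output signal is extended, e.g. (same convention as in the scripts):

    frame_len = int(cfg.fs / 50)  # 20 ms frame in samples
    preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len)
    postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len)
    y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True)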
isinstance(scene["input"], list) else scene["input"] + ) + IR_file = ( + scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] + ) + + # get input filename and IR filename + input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) + IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) # read the overlap length if "shift" in scene.keys(): @@ -223,6 +237,13 @@ def generate_ambi_scene( else: source_shift = 0.0 + # convert overlap to samples and ensure it is a multiple of 20ms + source_shift = source_shift * cfg.fs + if source_shift >= 0: + source_shift = int(np.floor(source_shift / frame_len) * frame_len) + else: + source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + # read the level if "level" in scene.keys(): level = ( @@ -235,19 +256,22 @@ def generate_ambi_scene( logger.info(f"Convolving {source_file} with {IR_file}") - # get input filename and IR filename - input_filename = Path(source_file).parent / ( - cfg.use_input_prefix + Path(source_file).name - ) - IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) - # read source file - x = audio.fromfile("MONO", input_filename, fs=cfg.fs) + x = audio.fromfile("MONO", input_filename) + + # resample to the target fs if necessary + if x.fs != cfg.fs: + logger.warning( + f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!" + ) + resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs) + x.audio = resampled_audio + x.fs = cfg.fs # read the IR file (!must be in target format!) - IR = audio.fromfile(cfg.format, IR_filename, fs=cfg.IR_fs) + IR = audio.fromfile(cfg.format, IR_filename) - # convolve with the FOA/HOA2/HOA3 IR + # convolve MONO source audio with FOA/HOA2/HOA3 IR -> results in FOA/HOA2/HOA3 audio object if cfg.format == "FOA": x = reverb_foa(x, IR) elif cfg.format == "HOA2": @@ -258,46 +282,69 @@ def generate_ambi_scene( # adjust the level of the target signal x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") - # shift the source signal (positive shift creates overlap, negative shift creates a gap) - if int(floor(-source_shift)) != 0: - x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0]) - - # get the number of frames (multiple of 20ms) - frame_len = int(x.fs / 50) - - # pad with zeros to ensure that the signal length is a multiple of 20ms + # ensure the length of the audio source signal is a multiple of 20ms if len(x.audio) % frame_len != 0: - N_pad = int(frame_len - len(x.audio) % frame_len) - x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) + # pad with zeros to ensure that the signal length is a multiple of 20ms + if len(x.audio) % frame_len != 0: + N_pad = int(frame_len - len(x.audio) % frame_len) + x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) + # add the convolved FOA/HOA2/HOA3 audio source signal to the output signal if y.audio is None: # add source signal to the array of all source signals y.audio = x.audio.copy() - y.fs = x.fs + + # if source_shift < 0: + # # insert zeros to the new audio source signal to shift it right + # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) + offset = source_shift else: - # adjust the signal length (trim from the end or pad with zeros) to align its length with the previous signal(s) - N_pad = y.audio.shape[0] - x.audio.shape[0] - if N_pad != 0: - x.audio = audioarray.trim( - x.audio, x.fs, limits=[0, -N_pad], samples=True - ) + # shift the 
beginning of the audio source signal + delta_offset = source_shift - offset + if delta_offset > 0: + # insert zeros to the existing output signal to shift it right + y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_offset], samples=True) + offset = source_shift + else: + # insert zeros to the new audio source signal to shift it right + x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_offset], samples=True) + + # adjust the length of the audio source signal + delta_length = len(x.audio) - len(y.audio) + if delta_length > 0: + # pad zeros to the existing output signal + y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_length], samples=True) + else: + # pad zeros to the new audio source signal + x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_length], samples=True) # superimpose y.audio += x.audio - # append pre-amble and post-amble to all sources - y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble]) + # append pre-amble and post-amble + preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True) # add random noise if cfg.add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 np.random.seed(SEED_RANDOM_NOISE) noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") - - # superimpose y.audio += noise - # write the FOA/HOA2/HOA3 audio into output file + # adjust the length of the output signal + if "duration" in cfg.__dict__: + # trim the output signal such that the total duration is X seconds + duration = int(cfg.duration * cfg.fs) # convert to samples + else: + # do not change the length of the audio signal + duration = len(y.audio) + duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms + if len(y.audio) != duration: + y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True) + + # write the FOA/HOA2/HOA3 audio signal into output file audiofile.write(output_filename, y.audio, y.fs) # convert to BINAURAL, if option was chosen @@ -314,4 +361,3 @@ def generate_ambi_scene( binaudio.fs, ) logger.info(f"Written BINAURAL output to: {binaural_output_filename}") - return diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index a1689ddc..1fbacb83 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -388,7 +388,9 @@ def generate_ismN_scene( y.metadata_files.insert(i, str(output_filename.with_suffix(f".{i}.csv"))) # append pre-amble and post-amble - metadata.trim_meta(y, limits=[-cfg.preamble * 1000, -cfg.postamble * 1000]) + preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True) # add random noise if cfg.add_low_level_random_noise: diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 565cba19..b510e164 100644 --- 
a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -277,7 +277,7 @@ def generate_OMASA_scene( sys.exit(-1) # read source file - x = audio.fromfile(fmt, input_filename, fs=cfg.fs) + x = audio.fromfile(fmt, input_filename) # resample to the target fs if necessary if x.fs != cfg.fs: @@ -290,7 +290,7 @@ def generate_OMASA_scene( # adjust the level of the source file if fmt in ["FOA", "HOA2", "HOA3"]: - x.audio, _ = loudness_norm(x, level, loudness_format="STEREO", rms=True) + x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") else: x.audio, _ = loudness_norm(x, level, loudness_format="MONO") @@ -436,7 +436,9 @@ def generate_OMASA_scene( y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv"))) # append pre-amble and post-amble - metadata.trim_meta(y, limits=[-cfg.preamble * 1000, -cfg.postamble * 1000]) + preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True) # add random noise if cfg.add_low_level_random_noise: diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index 64b5a030..09cbfeae 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -33,7 +33,6 @@ import logging import sys from itertools import groupby, repeat -from math import floor from pathlib import Path import numpy as np @@ -182,7 +181,7 @@ def generate_OSBA_scene( N_inputs = len(np.atleast_1d(scene["input"])) N_ISMs = N_inputs - 1 - # get input and output filenames + # get OSBA format and output filename osba_format = f"ISM{N_ISMs}SBA{cfg.sba_order}" output_filename = Path(scene["output"]).parent / ( cfg.use_output_prefix + Path(scene["output"]).name @@ -209,9 +208,7 @@ def generate_OSBA_scene( ) # get input filename - input_filename = Path(source_file).parent / ( - cfg.use_input_prefix + Path(source_file).name - ) + input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) # read azimuth and elevation information source_azi = ( @@ -254,11 +251,6 @@ def generate_OSBA_scene( logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") - # get input filename - input_filename = Path(source_file).parent / ( - cfg.use_input_prefix + Path(source_file).name - ) - # get the number of channels from the .wav file header wav_header = audiofile.parse_wave_header(input_filename) N_channels = wav_header["channels"] @@ -280,7 +272,7 @@ def generate_OSBA_scene( sys.exit(-1) # read source file - x = audio.fromfile(fmt, input_filename, fs=cfg.fs) + x = audio.fromfile(fmt, input_filename) # resample to the target fs if necessary if x.fs != cfg.fs: @@ -293,7 +285,7 @@ def generate_OSBA_scene( # adjust the level of the source file if fmt in ["FOA", "HOA2", "HOA3"]: - x.audio, _ = loudness_norm(x, level, loudness_format="STEREO", rms=True) + x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") else: x.audio, _ = loudness_norm(x, level, loudness_format="MONO") @@ -307,7 +299,7 @@ def generate_OSBA_scene( # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / frame_len) - # convert the input audio source signal to ISM + # convert the input 
MONO audio source signal to ISM1 object if fmt == "MONO": # convert MONO to ISM1 x_ism = audio.ObjectBasedAudio("ISM1") # ISM with 1 channel @@ -426,7 +418,9 @@ def generate_OSBA_scene( y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv"))) # append pre-amble and post-amble - metadata.trim_meta(y, limits=[-cfg.preamble * 1000, -cfg.postamble * 1000]) + preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True) # add random noise if cfg.add_low_level_random_noise: -- GitLab From 245c7bf65f595e2744f8ff512c24651fcd66bb50 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 26 Jun 2025 09:44:48 +0200 Subject: [PATCH 28/42] suppres the "Chunk (non-data) not understood, skipping it." warning message with a global constant --- ivas_processing_scripts/audiotools/audiofile.py | 4 +++- ivas_processing_scripts/audiotools/constants.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ivas_processing_scripts/audiotools/audiofile.py b/ivas_processing_scripts/audiotools/audiofile.py index 44ef2abc..8d8d1601 100755 --- a/ivas_processing_scripts/audiotools/audiofile.py +++ b/ivas_processing_scripts/audiotools/audiofile.py @@ -40,7 +40,7 @@ import numpy as np import scipy.io.wavfile as wav from .audioarray import trim, window -from .constants import VERT_HOA_CHANNELS_ACN +from .constants import SUPPRESS_CHUNK_WARNING_WAV_READ, VERT_HOA_CHANNELS_ACN logger = logging.getLogger("__main__") logger.setLevel(logging.DEBUG) @@ -80,6 +80,8 @@ def read( with catch_warnings(record=True) as warnings_list: fs, data = wav.read(filename) for w in warnings_list: + if SUPPRESS_CHUNK_WARNING_WAV_READ and "Chunk (non-data) not understood, skipping it." 
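
The constant keeps the existing warning loop intact and only filters out the
known harmless message. An equivalent effect could be achieved with the
warnings module; sketch for comparison (assumes SciPy's WavFileWarning class):

    import warnings
    from scipy.io.wavfile import WavFileWarning

    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="Chunk .non-data. not understood, skipping it.",
            category=WavFileWarning,
        )
        fs, data = wav.read(filename)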
---
 ivas_processing_scripts/audiotools/audiofile.py | 4 +++-
 ivas_processing_scripts/audiotools/constants.py | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/ivas_processing_scripts/audiotools/audiofile.py b/ivas_processing_scripts/audiotools/audiofile.py
index 44ef2abc..8d8d1601 100755
--- a/ivas_processing_scripts/audiotools/audiofile.py
+++ b/ivas_processing_scripts/audiotools/audiofile.py
@@ -40,7 +40,7 @@ import numpy as np
 import scipy.io.wavfile as wav
 
 from .audioarray import trim, window
-from .constants import VERT_HOA_CHANNELS_ACN
+from .constants import SUPPRESS_CHUNK_WARNING_WAV_READ, VERT_HOA_CHANNELS_ACN
 
 logger = logging.getLogger("__main__")
 logger.setLevel(logging.DEBUG)
@@ -80,6 +80,8 @@ def read(
     with catch_warnings(record=True) as warnings_list:
         fs, data = wav.read(filename)
         for w in warnings_list:
+            if SUPPRESS_CHUNK_WARNING_WAV_READ and "Chunk (non-data) not understood, skipping it." in str(w.message):
+                continue
             print(f"{filename} : {w.message} ( {w.filename}:{w.lineno} )")
     if data.dtype == np.int32:
         data = np.interp(
diff --git a/ivas_processing_scripts/audiotools/constants.py b/ivas_processing_scripts/audiotools/constants.py
index c5dbe6ca..925ed2a2 100755
--- a/ivas_processing_scripts/audiotools/constants.py
+++ b/ivas_processing_scripts/audiotools/constants.py
@@ -32,6 +32,8 @@
 
 import numpy as np
 
+SUPPRESS_CHUNK_WARNING_WAV_READ = True  # suppress warning from .wav read() when chunk size is not a multiple of 2
+
 BINAURAL_AUDIO_FORMATS = {
     "BINAURAL": {
         "num_channels": 2,
-- 
GitLab


From bc5275f3412e68f60e96f5296a38daf8003b2c21 Mon Sep 17 00:00:00 2001
From: Vladimir Malenovsky
Date: Thu, 26 Jun 2025 09:45:53 +0200
Subject: [PATCH 29/42] add fade-in fade-out option
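
fade_in_out gives the fade length in seconds; the generation scripts apply it
through audioarray.window(). Conceptually this corresponds to the following
sketch (the raised-cosine ramp shape is an assumption about the window):

    import numpy as np

    def fade_in_out(x: np.ndarray, fs: int, fade_s: float) -> np.ndarray:
        n = int(fade_s * fs)
        ramp = 0.5 * (1.0 - np.cos(np.linspace(0.0, np.pi, n)))  # 0 -> 1
        x = x.astype(float)
        x[:n] *= ramp[:, None]           # fade-in over the first n samples
        x[-n:] *= ramp[::-1][:, None]    # fade-out over the last n samples
        return x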
---
 examples/ITEM_GENERATION_3ISM.yml   | 13 ++++++++-----
 examples/ITEM_GENERATION_FOA.yml    |  9 ++++++---
 examples/ITEM_GENERATION_OMASA.yml  |  9 ++++++---
 examples/ITEM_GENERATION_OSBA.yml   |  9 ++++++---
 examples/ITEM_GENERATION_STEREO.yml |  9 ++++++---
 5 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/examples/ITEM_GENERATION_3ISM.yml b/examples/ITEM_GENERATION_3ISM.yml
index 753e763e..e770cadf 100644
--- a/examples/ITEM_GENERATION_3ISM.yml
+++ b/examples/ITEM_GENERATION_3ISM.yml
@@ -10,9 +10,9 @@
 ### Output format
 format: "ISM3"
-# masa_tc: 2
-# masa_dirs: 2
-# sba_order: 2
+# masa_tc: 2      # applicable only to OMASA format
+# masa_dirs: 2    # applicable only to OMASA format
+# sba_order: 2    # applicable only to OSBA format
 
 ### Output sampling rate in Hz
 fs: 48000
@@ -27,6 +27,9 @@ binaural_output: true
 preamble: 0.0
 postamble: 0.0
 
+### Apply fade-in and fade-out of X seconds
+fade_in_out: 0.5
+
 ### Trim the output such that the total duration is X seconds
 duration: 8
 
@@ -107,7 +110,7 @@ scenes:
     azimuth: [20, -40, 45]
     elevation: [0, 0, 70]
     level: [-26, -26, -41]
-    shift: [0.0, 0.0, 0.0]
+    shift: [-1.0, -2.0, 2.0]
 
   "02":
     output: "out/VA_3obj_2tlks_music2.wav"
@@ -116,7 +119,7 @@
     azimuth: [50, "180:1:120 + 360", -120]
     elevation: [0, 45, 70]
     level: [-26, -26, -41]
-    shift: [0.0, 0.0, 0.0]
+    shift: [1.0, -2.0, -1.0]
 
   "03":
     output: "out/VA_3obj_2tlks_music3.wav"
diff --git a/examples/ITEM_GENERATION_FOA.yml b/examples/ITEM_GENERATION_FOA.yml
index 2990fba6..2287af4c 100644
--- a/examples/ITEM_GENERATION_FOA.yml
+++ b/examples/ITEM_GENERATION_FOA.yml
@@ -10,9 +10,9 @@
 ### Output format
 format: "FOA"
-# masa_tc: 2
-# masa_dirs: 2
-# sba_order: 2
+# masa_tc: 2      # applicable only to OMASA format
+# masa_dirs: 2    # applicable only to OMASA format
+# sba_order: 2    # applicable only to OSBA format
 
 ### Output sampling rate in Hz
 fs: 48000
@@ -27,6 +27,9 @@ loudness: -26
 preamble: 0.5
 postamble: 1.0
 
+### Apply fade-in and fade-out of X seconds
+fade_in_out: 0.5
+
 ### Trim the output such that the total duration is X seconds
 duration: 8
 
diff --git a/examples/ITEM_GENERATION_OMASA.yml b/examples/ITEM_GENERATION_OMASA.yml
index 31529e1f..4c8db6d2 100644
--- a/examples/ITEM_GENERATION_OMASA.yml
+++ b/examples/ITEM_GENERATION_OMASA.yml
@@ -10,9 +10,9 @@
 ### Output format
 format: "OMASA"
-masa_tc: 2
-masa_dirs: 2
-# sba_order: 2
+masa_tc: 2        # applicable only to OMASA format
+masa_dirs: 2      # applicable only to OMASA format
+# sba_order: 2    # applicable only to OSBA format
 
 ### Output sampling rate in Hz
 fs: 48000
@@ -27,6 +27,9 @@ binaural_output: true
 preamble: 0.0
 postamble: 0.0
 
+### Apply fade-in and fade-out of X seconds
+fade_in_out: 0.5
+
 ### Trim the output such that the total duration is X seconds
 duration: 8
 
diff --git a/examples/ITEM_GENERATION_OSBA.yml b/examples/ITEM_GENERATION_OSBA.yml
index f1b7c3ea..b7e1400f 100644
--- a/examples/ITEM_GENERATION_OSBA.yml
+++ b/examples/ITEM_GENERATION_OSBA.yml
@@ -10,9 +10,9 @@
 ### Output format
 format: "OSBA"
-# masa_tc: 2
-# masa_dirs: 2
-sba_order: 2
+# masa_tc: 2      # applicable only to OMASA format
+# masa_dirs: 2    # applicable only to OMASA format
+sba_order: 2      # applicable only to OSBA format
 
 ### Output sampling rate in Hz
 fs: 48000
@@ -27,6 +27,9 @@ binaural_output: true
 preamble: 0.0
 postamble: 0.0
 
+### Apply fade-in and fade-out of X seconds
+fade_in_out: 0.5
+
 ### Trim the output such that the total duration is X seconds
 duration: 8
 
diff --git a/examples/ITEM_GENERATION_STEREO.yml b/examples/ITEM_GENERATION_STEREO.yml
index 7c4391dc..14731b4b 100644
--- a/examples/ITEM_GENERATION_STEREO.yml
+++ b/examples/ITEM_GENERATION_STEREO.yml
@@ -10,9 +10,9 @@
 ### Output format
 format: "STEREO"
-# masa_tc: 2
-# masa_dirs: 2
-# sba_order: 2
+# masa_tc: 2      # applicable only to OMASA format
+# masa_dirs: 2    # applicable only to OMASA format
+# sba_order: 2    # applicable only to OSBA format
 
 ### Output sampling rate in Hz
 fs: 48000
@@ -27,6 +27,9 @@ loudness: -26
 preamble: 0.5
 postamble: 1.0
 
+### Apply fade-in and fade-out of X seconds
+fade_in_out: 0.5
+
 ### Trim the output such that the total duration is X seconds
 duration: 8
 
-- 
GitLab


From 8cc50861d286991f306dabd45b94351b956676c2 Mon Sep 17 00:00:00 2001
From: Vladimir Malenovsky
Date: Thu, 26 Jun 2025 09:48:48 +0200
Subject: [PATCH 30/42] improve printout messages, apply fade-in/fade-out,
 correct input source shifting
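
The corrected shifting keeps a running offset (the shift of the sources mixed
so far); depending on the sign of the difference between the new shift and
that offset, either the existing mix or the new source is zero-padded at its
start, and both are then padded at the end to equal length before summing.
A minimal mono sketch of the idea, using plain zero-padding in place of
audioarray.trim()/metadata.trim_meta():

    import numpy as np

    def mix_with_shift(mix, offset, x, shift):
        if mix is None:
            if shift < 0:
                x = np.pad(x, (-shift, 0))  # negative shift delays the first source
            else:
                offset = shift
            return x, offset
        delta = shift - offset
        if delta > 0:
            mix = np.pad(mix, (delta, 0))   # delay the existing mix
            offset = shift
        else:
            x = np.pad(x, (-delta, 0))      # delay the new source
        if len(x) > len(mix):               # equalize lengths before summing
            mix = np.pad(mix, (0, len(x) - len(mix)))
        else:
            x = np.pad(x, (0, len(mix) - len(x)))
        return mix + x, offset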
""" + scenes = list(cfg.scenes.keys()) logger.info( - f"Processing scene: {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" + f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" ) # extract the number of audio sources @@ -238,6 +239,7 @@ def generate_ambi_scene( source_shift = 0.0 # convert overlap to samples and ensure it is a multiple of 20ms + source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) @@ -254,7 +256,7 @@ def generate_ambi_scene( else: level = -26 - logger.info(f"Convolving {source_file} with {IR_file}") + logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds") # read source file x = audio.fromfile("MONO", input_filename) @@ -279,7 +281,7 @@ def generate_ambi_scene( elif cfg.format == "HOA3": x = reverb_hoa3(x, IR) - # adjust the level of the target signal + # adjust the level of the FOA/HOA2/HOA3 signal x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") # ensure the length of the audio source signal is a multiple of 20ms @@ -294,10 +296,11 @@ def generate_ambi_scene( # add source signal to the array of all source signals y.audio = x.audio.copy() - # if source_shift < 0: - # # insert zeros to the new audio source signal to shift it right - # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) - offset = source_shift + if source_shift < 0: + # insert zeros to the new audio source signal to shift it right + y.audio = audioarray.trim_meta(y.audio, y.fs, limits=[source_shift, 0], samples=True) + else: + offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset @@ -344,20 +347,29 @@ def generate_ambi_scene( if len(y.audio) != duration: y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True) + # adjust the loudness of the output signal + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") + + # apply fade-in and fade-out + if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: + logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") + y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) + # write the FOA/HOA2/HOA3 audio signal into output file audiofile.write(output_filename, y.audio, y.fs) # convert to BINAURAL, if option was chosen if cfg.binaural_output: - binaudio = audio.fromtype("BINAURAL") - binaudio.fs = y.fs - convert_scenebased(y, binaudio) binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) + logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") + binaudio = audio.fromtype("BINAURAL") + binaudio.fs = y.fs + convert_scenebased(y, binaudio) audiofile.write( binaural_output_filename, binaudio.audio, binaudio.fs, ) - logger.info(f"Written BINAURAL output to: {binaural_output_filename}") diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index 1fbacb83..d71d0fe9 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -176,8 +176,9 @@ def generate_ismN_scene( - Writes the processed audio and metadata to output files. 
""" + scenes = list(cfg.scenes.keys()) logger.info( - f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" + f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" ) # extract the number of audio sources @@ -236,6 +237,7 @@ def generate_ismN_scene( source_shift = 0.0 # convert overlap to samples and ensure it is a multiple of 20ms + source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) @@ -250,7 +252,7 @@ def generate_ismN_scene( else: level = -26 - logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") + logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") # read source file x = audio.fromtype("ISM1") @@ -357,10 +359,12 @@ def generate_ismN_scene( y.audio = x.audio.copy() y.object_pos = x.object_pos.copy() y.fs = x.fs - # if source_shift < 0: - # # insert zeros to the new audio source signal to shift it right - # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) - offset = source_shift + + if source_shift < 0: + # insert zeros to the new audio source signal to shift it right + metadata.trim_meta(y, limits=[source_shift, 0], samples=True) + else: + offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset @@ -410,18 +414,28 @@ def generate_ismN_scene( if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) + # adjust the loudness of the output signal + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") + + # apply fade-in and fade-out + if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: + logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") + y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) + # write the ISMn output to .wav file in an interleaved format and ISM metadata in .csv files audiofile.write(output_filename, y.audio, y.fs) metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files) # convert to BINAURAL, if option was chosen if cfg.binaural_output: - binaudio = audio.fromtype("BINAURAL") - binaudio.fs = y.fs - convert_objectbased(y, binaudio) binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) + logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") + binaudio = audio.fromtype("BINAURAL") + binaudio.fs = y.fs + convert_objectbased(y, binaudio) audiofile.write( binaural_output_filename, binaudio.audio, diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index b510e164..5657cc89 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -175,8 +175,9 @@ def generate_OMASA_scene( - Handles various audio formats (e.g., MONO, FOA, HOA2) and applies transformations like loudness normalization, trimming, and padding. 
""" + scenes = list(cfg.scenes.keys()) logger.info( - f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" + f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" ) # extract the number of audio sources @@ -238,6 +239,7 @@ def generate_OMASA_scene( source_shift = 0.0 # convert overlap to samples and ensure it is a multiple of 20ms + source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) @@ -254,7 +256,7 @@ def generate_OMASA_scene( else: level = -26 - logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") + logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") # get the number of channels from the .wav file header wav_header = audiofile.parse_wave_header(input_filename) @@ -404,10 +406,11 @@ def generate_OMASA_scene( # if ISM, append object position to the OMASA object y.object_pos = x.object_pos.copy() - # if source_shift < 0: - # # insert zeros to the new audio source signal to shift it right - # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) - offset = source_shift + if source_shift < 0: + # insert zeros to the new audio source signal to shift it right + metadata.trim_meta(y, limits=[source_shift, 0], samples=True) + else: + offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset @@ -458,18 +461,28 @@ def generate_OMASA_scene( if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) + # adjust the loudness of the output signal + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") + + # apply fade-in and fade-out + if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: + logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") + y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) + # write the OMASA audio output to .wav file in an interleaved format and ISM metadata in .csv files audiofile.write(output_filename, y.audio, y.fs) metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files[:-1]) # convert to OMASA output to BINAURAL, if option was chosen if cfg.binaural_output: - binaudio = audio.fromtype("BINAURAL") - binaudio.fs = y.fs - convert_omasa(y, binaudio) binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) + logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") + binaudio = audio.fromtype("BINAURAL") + binaudio.fs = y.fs + convert_omasa(y, binaudio) audiofile.write( binaural_output_filename, binaudio.audio, diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index 09cbfeae..c13a2e7b 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -173,8 +173,9 @@ def generate_OSBA_scene( - Handles various audio formats (e.g., FOA, HOA2, HOA3) and applies transformations like loudness normalization, trimming, and padding. 
""" + scenes = list(cfg.scenes.keys()) logger.info( - f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" + f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" ) # extract the number of audio sources @@ -233,6 +234,7 @@ def generate_OSBA_scene( source_shift = 0.0 # convert overlap to samples and ensure it is a multiple of 20ms + source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) @@ -249,7 +251,7 @@ def generate_OSBA_scene( else: level = -26 - logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") + logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") # get the number of channels from the .wav file header wav_header = audiofile.parse_wave_header(input_filename) @@ -266,7 +268,7 @@ def generate_OSBA_scene( elif N_channels == 16: fmt = "HOA3" else: - logger.info( + logger.error( f"Error: Input format of the source file with {N_channels} channels is not supported!" ) sys.exit(-1) @@ -386,10 +388,11 @@ def generate_OSBA_scene( # if ISM, append object position to the OSBA object y.object_pos = x.object_pos.copy() - # if source_shift < 0: - # # insert zeros to the new audio source signal to shift it right - # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) - offset = source_shift + if source_shift < 0: + # insert zeros to the new audio source signal to shift it right + metadata.trim_meta(y, limits=[source_shift, 0], samples=True) + else: + offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset @@ -440,18 +443,28 @@ def generate_OSBA_scene( if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) + # adjust the loudness of the output signal + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") + + # apply fade-in and fade-out + if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: + logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") + y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) + # write the OSBA audio output to .wav file in an interleaved format and ISM metadata in .csv files audiofile.write(output_filename, y.audio, y.fs) metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files) # convert the OSBA output to BINAURAL, if option was chosen if cfg.binaural_output: - binaudio = audio.fromtype("BINAURAL") - binaudio.fs = y.fs - convert_osba(y, binaudio) binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) + logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") + binaudio = audio.fromtype("BINAURAL") + binaudio.fs = y.fs + convert_osba(y, binaudio) audiofile.write( binaural_output_filename, binaudio.audio, diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py index 7d079c0c..7b155bd5 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -194,8 +194,9 @@ def generate_stereo_scene( - Writes the processed STEREO audio to output file. 
""" + scenes = list(cfg.scenes.keys()) logger.info( - f"Processing scene: {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" + f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" ) # extract the number of audio sources @@ -244,6 +245,7 @@ def generate_stereo_scene( source_shift = 0.0 # convert overlap to samples and ensure it is a multiple of 20ms + source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) @@ -260,7 +262,7 @@ def generate_stereo_scene( else: level = -26 - logger.info(f"Convolving {source_file} with {IR_file}") + logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds") # read source file x = audio.fromfile("MONO", input_filename) @@ -280,7 +282,7 @@ def generate_stereo_scene( # convolve MONO source audio with STEREO IR -> results in STEREO audio object x = reverb_stereo(x, IR) - # adjust the level of the stereo signal + # adjust the level of the STEREO signal x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") # ensure the length of the audio source signal is a multiple of 20ms @@ -295,10 +297,11 @@ def generate_stereo_scene( # add source signal to the array of all source signals y.audio = x.audio.copy() - # if source_shift < 0: - # # insert zeros to the new audio source signal to shift it right - # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) - offset = source_shift + if source_shift < 0: + # insert zeros to the new audio source signal to shift it right + y.audio = audioarray.trim(y.audio, x.fs, limits=[source_shift, 0], samples=True) + else: + offset = source_shift else: # shift the beginning of the audio source signal delta_offset = source_shift - offset @@ -345,5 +348,14 @@ def generate_stereo_scene( if len(y.audio) != duration: y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True) + # adjust the loudness of the output signal + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="STEREO") + + # apply fade-in and fade-out + if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: + logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") + y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) + # write the STEREO audio signal into output file audiofile.write(output_filename, y.audio, y.fs) -- GitLab From 1d40ba983b06fd5af83e3c3e8e6ce8ef1d77a4a4 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 26 Jun 2025 10:09:53 +0200 Subject: [PATCH 31/42] correction of output filenames --- examples/ITEM_GENERATION_OMASA.yml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/ITEM_GENERATION_OMASA.yml b/examples/ITEM_GENERATION_OMASA.yml index 4c8db6d2..1f631f3f 100644 --- a/examples/ITEM_GENERATION_OMASA.yml +++ b/examples/ITEM_GENERATION_OMASA.yml @@ -102,7 +102,7 @@ provider: "va" scenes: "01": - output: "out/VA_3tlks_music.wav" + output: "out/VA_3tlks_music_s01.wav" description: "Three talkers over music background" input: ["items_hoa2/bm7aa1s01.wav", "items_mono/untrimmed/m4s12b_Talker1.wav", "items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/m3s1a_Talker2.wav"] azimuth: [0, 30, -45, 100] @@ -111,7 +111,7 @@ scenes: shift: [0.0, 0.0, 0.0, -2.0] "02": - output: 
"out/VA_3tlks_music.wav" + output: "out/VA_3tlks_music_s02.wav" description: "Three talkers over music background" input: ["items_hoa2/bm7aa1s03.wav", "items_mono/untrimmed/f2s5a_Talker1.wav", "items_mono/untrimmed/f5s10a_Talker1.wav", "items_mono/untrimmed/m3s8b_Talker2.wav"] azimuth: [0, "-20:0.5:360", "60:-0.5:-360", 60] @@ -120,7 +120,7 @@ scenes: shift: [0.0, 0.0, -2.0, -2.5] "03": - output: "out/VA_3tlks_music.wav" + output: "out/VA_3tlks_music_s03.wav" description: "Three talkers over music background" input: ["items_hoa2/bm7aa1s05.wav", "items_mono/untrimmed/f1s16b_Talker2.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"] azimuth: [0, -90, "0:1:360", "0:-1:-360"] @@ -129,7 +129,7 @@ scenes: shift: [0.0, 0.0, 0.0, -2.6] "04": - output: "out/VA_3tlks_music.wav" + output: "out/VA_3tlks_music_s04.wav" description: "Three talkers over music background" input: ["items_hoa2/bm7aa1s07.wav", "items_mono/untrimmed/f5s15b_Talker1.wav", "items_mono/untrimmed/m1s7a_Talker1.wav", "items_mono/untrimmed/m1s6b_Talker1.wav"] azimuth: [0, "-90:-1:-360", "-10:1.5:360", "70:1:360"] @@ -138,36 +138,36 @@ scenes: shift: [0.0, -2.0, 0.0, -3.5] "05": - output: "out/VA_2tlks_1obj_music.wav" + output: "out/VA_2tlks_1obj_music_s05.wav" description: "Two talkers, one musical object over music background" - input: ["items_hoa2/bm7aa1s09.wav", "items_mono/untrimmed/f2s1a_Talker1.wav", "items_mono/untrimmed/f2s5a_Talker1.wav", "music/item_lxa3s3.48k.wav"] + input: ["items_hoa2/bm7aa1s09.wav", "items_mono/untrimmed/f2s1a_Talker1.wav", "items_mono/untrimmed/f2s5a_Talker1.wav", "items_mono/music/item_lxa3s3.48k.wav"] azimuth: [0, 20, -40, 45] elevation: [0, 0, 0, 70] level: [-36, -36, -26, -41] shift: [0.0, 0.0, -2.0, 0.0] "06": - output: "out/VA_2tlks_1obj_music.wav" + output: "out/VA_2tlks_1obj_music_s06.wav" description: "Two talkers, one musical object over music background" - input: ["items_hoa2/bm7aa1s11.wav", "items_mono/untrimmed/f5s10b_Talker1.wav", "items_mono/untrimmed/m1s4a_Talker1.wav", "music/item_lxa3s5.48k.wav"] + input: ["items_hoa2/bm7aa1s11.wav", "items_mono/untrimmed/f5s10b_Talker1.wav", "items_mono/untrimmed/m1s4a_Talker1.wav", "items_mono/music/item_lxa3s5.48k.wav"] azimuth: [0, 50, "180:1:360", -120] elevation: [0, 0, 45, 70] level: [-46, -26, -26, -41] shift: [0.0, 0.0, -2.5, 0.0] "07": - output: "out/VA_2tlks_1obj_music.wav" + output: "out/VA_2tlks_1obj_music_s07.wav" description: "Two talkers, one musical object over music background" - input: ["items_hoa2/bm7aa1s13.wav", "items_mono/untrimmed/m1s2b_Talker1.wav", "items_mono/untrimmed/f3s5a_Talker2.wav", "music/641692__theflyfishingfilmmaker__classical-violin-minor-10s-mono.wav"] + input: ["items_hoa2/bm7aa1s13.wav", "items_mono/untrimmed/m1s2b_Talker1.wav", "items_mono/untrimmed/f3s5a_Talker2.wav", "items_mono/music/641692__theflyfishingfilmmaker__classical-violin-minor-10s-mono.wav"] azimuth: [0, "80:1:20 + 360", "80:1:20 + 360", -30] elevation: [0, 10, 60, 70] level: [-36, -26, -26, -36] shift: [0.0, 0.0, 0.0, 0.0] "08": - output: "out/VA_2tlks_1obj_music.wav" + output: "out/VA_2tlks_1obj_music_s08.wav" description: "Two talkers, one musical object over music background" - input: ["items_hoa2/bm7aa1s15.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f2s4a_Talker1.wav", "music/item_lxa4s2.48k.wav"] + input: ["items_hoa2/bm7aa1s15.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f2s4a_Talker1.wav", "items_mono/music/item_lxa4s2.48k.wav"] azimuth: [0, 
"60:1:0 + 360", "60:-1:120 - 360", 100] elevation: [0, 20, 50, 70] level: [-46, -26, -26, -41] -- GitLab From caced321e8cfba7a5529e66fc59ef43907f268a4 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 26 Jun 2025 10:23:22 +0200 Subject: [PATCH 32/42] skip pre-amble and post-amble if not specified in the .yml file (saves processing time) --- .../generation/generate_ambi_items.py | 33 +++++++------------ .../generation/generate_ismN_items.py | 26 +++++++-------- .../generation/generate_omasa_items.py | 33 +++++++------------ .../generation/generate_osba_items.py | 33 +++++++------------ .../generation/generate_stereo_items.py | 33 +++++++------------ 5 files changed, 60 insertions(+), 98 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_ambi_items.py b/ivas_processing_scripts/generation/generate_ambi_items.py index 6cd3de83..24bf89e9 100644 --- a/ivas_processing_scripts/generation/generate_ambi_items.py +++ b/ivas_processing_scripts/generation/generate_ambi_items.py @@ -76,10 +76,6 @@ def generate_ambi_items( ): """Generate FOA/HOA2/HOA3 items from mono items based on scene description""" - # set the target level - if "loudness" not in cfg.__dict__: - cfg.loudness = -26 - # set the fs if "fs" not in cfg.__dict__: cfg.fs = 48000 @@ -88,17 +84,6 @@ def generate_ambi_items( if "IR_fs" not in cfg.__dict__: cfg.IR_fs = 48000 - # set the pre-amble and post-amble - if "preamble" not in cfg.__dict__: - cfg.preamble = 0.0 - - if "postamble" not in cfg.__dict__: - cfg.postamble = 0.0 - - # set the pre-amble and post-amble - if "add_low_level_random_noise" not in cfg.__dict__: - cfg.add_low_level_random_noise = False - # set the listening lab designator if "listening_lab" not in cfg.__dict__: cfg.listening_lab = "l" @@ -325,12 +310,17 @@ def generate_ambi_scene( y.audio += x.audio # append pre-amble and post-amble - preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True) + if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: + logger.info( + f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" + ) + if any([cfg.preamble, cfg.postamble]): + preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True) # add random noise - if cfg.add_low_level_random_noise: + if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 np.random.seed(SEED_RANDOM_NOISE) noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") @@ -348,8 +338,9 @@ def generate_ambi_scene( y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal - logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") - y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") + if "loudness" in cfg.__dict__: + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + y.audio, _ = loudness_norm(y, cfg.loudness, 
loudness_format="BINAURAL") # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index d71d0fe9..559910e9 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -82,10 +82,6 @@ def generate_ismN_items( ): """Generate ISMN items with metadata from mono items based on scene description""" - # set the target level - if "loudness" not in cfg.__dict__: - cfg.loudness = -26 - # set the fs if "fs" not in cfg.__dict__: cfg.fs = 48000 @@ -97,10 +93,6 @@ def generate_ismN_items( if "postamble" not in cfg.__dict__: cfg.postamble = 0.0 - # set the pre-amble and post-amble - if "add_low_level_random_noise" not in cfg.__dict__: - cfg.add_low_level_random_noise = False - # set the listening lab designator if "listening_lab" not in cfg.__dict__: cfg.listening_lab = "l" @@ -392,12 +384,17 @@ def generate_ismN_scene( y.metadata_files.insert(i, str(output_filename.with_suffix(f".{i}.csv"))) # append pre-amble and post-amble - preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True) + if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: + logger.info( + f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" + ) + if any([cfg.preamble, cfg.postamble]): + preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True) # add random noise - if cfg.add_low_level_random_noise: + if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 np.random.seed(SEED_RANDOM_NOISE) noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") @@ -415,8 +412,9 @@ def generate_ismN_scene( metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal - logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") - y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") + if "loudness" in cfg.__dict__: + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 5657cc89..653d5dee 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -80,25 +80,10 @@ def generate_omasa_items( ): """Generate OMASA items with metadata from FOA/HO2 and ISMn items based on scene description""" - # set the target level - if "loudness" not in cfg.__dict__: - cfg.loudness = -26 - # set the fs if "fs" not in cfg.__dict__: cfg.fs = 48000 - # set the pre-amble and post-amble - if "preamble" not in cfg.__dict__: 
- cfg.preamble = 0.0 - - if "postamble" not in cfg.__dict__: - cfg.postamble = 0.0 - - # set the pre-amble and post-amble - if "add_low_level_random_noise" not in cfg.__dict__: - cfg.add_low_level_random_noise = False - # set the listening lab designator if "listening_lab" not in cfg.__dict__: cfg.listening_lab = "l" @@ -439,12 +424,17 @@ def generate_OMASA_scene( y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv"))) # append pre-amble and post-amble - preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True) + if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: + logger.info( + f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" + ) + if any([cfg.preamble, cfg.postamble]): + preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True) # add random noise - if cfg.add_low_level_random_noise: + if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 np.random.seed(SEED_RANDOM_NOISE) noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") @@ -462,8 +452,9 @@ def generate_OMASA_scene( metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal - logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") - y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") + if "loudness" in cfg.__dict__: + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index c13a2e7b..9ef07794 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -78,25 +78,10 @@ def generate_osba_items( ): """Generate OSBA items from FOA/HOA2/HOA3 and ISMn items based on scene description""" - # set the target level - if "loudness" not in cfg.__dict__: - cfg.loudness = -26 - # set the fs if "fs" not in cfg.__dict__: cfg.fs = 48000 - # set the pre-amble and post-amble - if "preamble" not in cfg.__dict__: - cfg.preamble = 0.0 - - if "postamble" not in cfg.__dict__: - cfg.postamble = 0.0 - - # set the pre-amble and post-amble - if "add_low_level_random_noise" not in cfg.__dict__: - cfg.add_low_level_random_noise = False - # set the listening lab designator if "listening_lab" not in cfg.__dict__: cfg.listening_lab = "l" @@ -421,12 +406,17 @@ def generate_OSBA_scene( y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv"))) # append pre-amble and post-amble - preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to 
samples and ensure multiple of 20ms - metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True) + if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: + logger.info( + f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" + ) + if any([cfg.preamble, cfg.postamble]): + preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True) # add random noise - if cfg.add_low_level_random_noise: + if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 np.random.seed(SEED_RANDOM_NOISE) noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") @@ -444,8 +434,9 @@ def generate_OSBA_scene( metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal - logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") - y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") + if "loudness" in cfg.__dict__: + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py index 7b155bd5..03feecf0 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -78,10 +78,6 @@ def generate_stereo_items( ): """Generate STEREO items from mono items based on scene description""" - # set the target level - if "loudness" not in cfg.__dict__: - cfg.loudness = -26 - # set the fs if "fs" not in cfg.__dict__: cfg.fs = 48000 @@ -90,21 +86,10 @@ def generate_stereo_items( if "IR_fs" not in cfg.__dict__: cfg.IR_fs = 48000 - # set the pre-amble and post-amble - if "preamble" not in cfg.__dict__: - cfg.preamble = 0.0 - - if "postamble" not in cfg.__dict__: - cfg.postamble = 0.0 - # set the IR path if "IR_path" not in cfg.__dict__: cfg.IR_path = os.path.join(os.path.dirname(__file__), "IRs") - # set the pre-amble and post-amble - if "add_low_level_random_noise" not in cfg.__dict__: - cfg.add_low_level_random_noise = False - # set the listening lab designator if "listening_lab" not in cfg.__dict__: cfg.listening_lab = "l" @@ -326,12 +311,17 @@ def generate_stereo_scene( y.audio += x.audio # append pre-amble and post-amble - preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True) + if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: + logger.info( + f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" + ) + if any([cfg.preamble, cfg.postamble]): + preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to 
samples and ensure multiple of 20ms + y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True) # add random noise - if cfg.add_low_level_random_noise: + if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 np.random.seed(SEED_RANDOM_NOISE) noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") @@ -349,8 +339,9 @@ def generate_stereo_scene( y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal - logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") - y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="STEREO") + if "loudness" in cfg.__dict__: + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="STEREO") # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: -- GitLab From 28511fe4caf0be2b8b789b38bd09b239fb688b7a Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 26 Jun 2025 10:30:14 +0200 Subject: [PATCH 33/42] fix item path --- examples/ITEM_GENERATION_OSBA.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/ITEM_GENERATION_OSBA.yml b/examples/ITEM_GENERATION_OSBA.yml index b7e1400f..f7c33b49 100644 --- a/examples/ITEM_GENERATION_OSBA.yml +++ b/examples/ITEM_GENERATION_OSBA.yml @@ -140,7 +140,7 @@ scenes: "05": output: "out/VA_2tlks_1obj_music.wav" description: "Two talkers, one musical object over music background" - input: ["items_hoa2/bm7aa1s09.wav", "items_mono/untrimmed/f2s1a_Talker1.wav", "items_mono/untrimmed/f2s5a_Talker1.wav", "music/item_lxa3s3.48k.wav"] + input: ["items_hoa2/bm7aa1s09.wav", "items_mono/untrimmed/f2s1a_Talker1.wav", "items_mono/untrimmed/f2s5a_Talker1.wav", "items_mono/music/item_lxa3s3.48k.wav"] azimuth: [0, 20, -40, 45] elevation: [0, 0, 0, 70] level: [-36, -36, -26, -41] @@ -149,7 +149,7 @@ scenes: "06": output: "out/VA_2tlks_1obj_music.wav" description: "Two talkers, one musical object over music background" - input: ["items_hoa2/bm7aa1s11.wav", "items_mono/untrimmed/f5s10b_Talker1.wav", "items_mono/untrimmed/m1s4a_Talker1.wav", "music/item_lxa3s5.48k.wav"] + input: ["items_hoa2/bm7aa1s11.wav", "items_mono/untrimmed/f5s10b_Talker1.wav", "items_mono/untrimmed/m1s4a_Talker1.wav", "items_mono/music/item_lxa3s5.48k.wav"] azimuth: [0, 50, "180:1:360", -120] elevation: [0, 0, 45, 70] level: [-46, -26, -26, -41] @@ -158,7 +158,7 @@ scenes: "07": output: "out/VA_2tlks_1obj_music.wav" description: "Two talkers, one musical object over music background" - input: ["items_hoa2/bm7aa1s13.wav", "items_mono/untrimmed/m1s2b_Talker1.wav", "items_mono/untrimmed/f3s5a_Talker2.wav", "music/641692__theflyfishingfilmmaker__classical-violin-minor-10s-mono.wav"] + input: ["items_hoa2/bm7aa1s13.wav", "items_mono/untrimmed/m1s2b_Talker1.wav", "items_mono/untrimmed/f3s5a_Talker2.wav", "items_mono/music/641692__theflyfishingfilmmaker__classical-violin-minor-10s-mono.wav"] azimuth: [0, "80:1:20 + 360", "80:1:20 + 360", -30] elevation: [0, 10, 60, 70] level: [-36, -26, -26, -36] @@ -167,7 +167,7 @@ scenes: "08": output: "out/VA_2tlks_1obj_music.wav" description: "Two talkers, one musical object over music background" - input: ["items_hoa2/bm7aa1s15.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f2s4a_Talker1.wav", "music/item_lxa4s2.48k.wav"] + input: 
["items_hoa2/bm7aa1s15.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f2s4a_Talker1.wav", "items_mono/music/item_lxa4s2.48k.wav"] azimuth: [0, "60:1:0 + 360", "60:-1:120 - 360", 100] elevation: [0, 20, 50, 70] level: [-46, -26, -26, -41] -- GitLab From 26b84d774ab8dab5aa97b71693cae97a53ae1794 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 26 Jun 2025 10:30:47 +0200 Subject: [PATCH 34/42] fix printout --- .../generation/generate_ambi_items.py | 12 ++++++------ .../generation/generate_ismN_items.py | 12 ++++++------ .../generation/generate_omasa_items.py | 12 ++++++------ .../generation/generate_osba_items.py | 12 ++++++------ .../generation/generate_stereo_items.py | 12 ++++++------ 5 files changed, 30 insertions(+), 30 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_ambi_items.py b/ivas_processing_scripts/generation/generate_ambi_items.py index 24bf89e9..195600c8 100644 --- a/ivas_processing_scripts/generation/generate_ambi_items.py +++ b/ivas_processing_scripts/generation/generate_ambi_items.py @@ -311,12 +311,12 @@ def generate_ambi_scene( # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: - logger.info( - f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" - ) - if any([cfg.preamble, cfg.postamble]): - preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + if preamble != 0 or postamble != 0: + logger.info( + f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" + ) y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True) # add random noise diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index 559910e9..6c909e9e 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -385,12 +385,12 @@ def generate_ismN_scene( # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: - logger.info( - f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" - ) - if any([cfg.preamble, cfg.postamble]): - preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + if preamble != 0 or postamble != 0: + logger.info( + f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" + ) metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True) # add random noise diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py 
b/ivas_processing_scripts/generation/generate_omasa_items.py index 653d5dee..b9de7bc7 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -425,12 +425,12 @@ def generate_OMASA_scene( # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: - logger.info( - f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" - ) - if any([cfg.preamble, cfg.postamble]): - preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + if preamble != 0 or postamble != 0: + logger.info( + f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" + ) metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True) # add random noise diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index 9ef07794..5c20fc12 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -407,12 +407,12 @@ def generate_OSBA_scene( # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: - logger.info( - f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" - ) - if any([cfg.preamble, cfg.postamble]): - preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + if preamble != 0 or postamble != 0: + logger.info( + f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" + ) metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True) # add random noise diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py index 03feecf0..86477309 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -312,12 +312,12 @@ def generate_stereo_scene( # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: - logger.info( - f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" - ) - if any([cfg.preamble, cfg.postamble]): - preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + postamble = 
int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + if preamble != 0 or postamble != 0: + logger.info( + f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" + ) y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True) # add random noise -- GitLab From 1eb0cfedb22765f9ae87eeb17d803d5ce8e4944f Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 26 Jun 2025 10:50:43 +0200 Subject: [PATCH 35/42] Set SUPPRESS_CHUNK_WARNING_WAV_READ to False by default --- ivas_processing_scripts/audiotools/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ivas_processing_scripts/audiotools/constants.py b/ivas_processing_scripts/audiotools/constants.py index 925ed2a2..dfe21e1f 100755 --- a/ivas_processing_scripts/audiotools/constants.py +++ b/ivas_processing_scripts/audiotools/constants.py @@ -32,7 +32,7 @@ import numpy as np -SUPPRESS_CHUNK_WARNING_WAV_READ = True # suppress warning from .wav read() when chunk size is not a multiple of 2 +SUPPRESS_CHUNK_WARNING_WAV_READ = False # suppress warning from .wav read() when chunk size is not a multiple of 2 BINAURAL_AUDIO_FORMATS = { "BINAURAL": { -- GitLab From de1f23bf404108a782b579974d94b29d23cb43d7 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 26 Jun 2025 12:13:14 +0200 Subject: [PATCH 36/42] reformatting --- .../generation/generate_ambi_items.py | 60 +++++++++++++------ .../generation/generate_ismN_items.py | 40 +++++++++---- .../generation/generate_omasa_items.py | 40 +++++++++---- .../generation/generate_osba_items.py | 32 +++++++--- .../generation/generate_stereo_items.py | 56 ++++++++++++----- 5 files changed, 167 insertions(+), 61 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_ambi_items.py b/ivas_processing_scripts/generation/generate_ambi_items.py index 195600c8..3628369c 100644 --- a/ivas_processing_scripts/generation/generate_ambi_items.py +++ b/ivas_processing_scripts/generation/generate_ambi_items.py @@ -205,12 +205,12 @@ def generate_ambi_scene( source_file = ( scene["input"][i] if isinstance(scene["input"], list) else scene["input"] ) - IR_file = ( - scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] - ) + IR_file = scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] # get input filename and IR filename - input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) + input_filename = Path(source_file).parent / ( + cfg.use_input_prefix + Path(source_file).name + ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) # read the overlap length @@ -241,7 +241,9 @@ def generate_ambi_scene( else: level = -26 - logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds") + logger.info( + f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds" + ) # read source file x = audio.fromfile("MONO", input_filename) @@ -274,7 +276,9 @@ def generate_ambi_scene( # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: N_pad = int(frame_len - len(x.audio) % frame_len) - x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) + x.audio = audioarray.trim( + x.audio, x.fs, limits=[0, -N_pad], samples=True + ) # add the convolved FOA/HOA2/HOA3 audio source signal to the output signal if y.audio is None: @@ 
-283,7 +287,9 @@ def generate_ambi_scene( if source_shift < 0: # insert zeros to the new audio source signal to shift it right - y.audio = audioarray.trim_meta(y.audio, y.fs, limits=[source_shift, 0], samples=True) + y.audio = audioarray.trim_meta( + y.audio, y.fs, limits=[source_shift, 0], samples=True + ) else: offset = source_shift else: @@ -291,33 +297,47 @@ def generate_ambi_scene( delta_offset = source_shift - offset if delta_offset > 0: # insert zeros to the existing output signal to shift it right - y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_offset], samples=True) + y.audio = audioarray.trim( + y.audio, y.fs, limits=[0, -delta_offset], samples=True + ) offset = source_shift else: # insert zeros to the new audio source signal to shift it right - x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_offset], samples=True) + x.audio = audioarray.trim( + x.audio, x.fs, limits=[0, delta_offset], samples=True + ) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y.audio) if delta_length > 0: # pad zeros to the existing output signal - y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_length], samples=True) + y.audio = audioarray.trim( + y.audio, y.fs, limits=[0, -delta_length], samples=True + ) else: # pad zeros to the new audio source signal - x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_length], samples=True) + x.audio = audioarray.trim( + x.audio, x.fs, limits=[0, delta_length], samples=True + ) # superimpose y.audio += x.audio # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: - preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + preamble = int( + np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms + postamble = int( + np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms if preamble != 0 or postamble != 0: logger.info( f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" ) - y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True) + y.audio = audioarray.trim( + y.audio, y.fs, limits=[-preamble, -postamble], samples=True + ) # add random noise if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: @@ -333,9 +353,13 @@ def generate_ambi_scene( else: # do not change the length of the audio signal duration = len(y.audio) - duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms + duration = int( + np.floor(duration / frame_len) * frame_len + ) # ensure multiple of 20ms if len(y.audio) != duration: - y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True) + y.audio = audioarray.trim( + y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True + ) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: @@ -355,7 +379,9 @@ def generate_ambi_scene( binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) - logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") + logger.info( + f"-- Converting to BINAURAL output file: {binaural_output_filename}" + ) binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs 
convert_scenebased(y, binaudio) diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index 6c909e9e..646798fa 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -208,14 +208,18 @@ def generate_ismN_scene( # read azimuth and elevation information if "azimuth" in scene.keys(): source_azi = ( - scene["azimuth"][i] if isinstance(scene["azimuth"], list) else scene["azimuth"] + scene["azimuth"][i] + if isinstance(scene["azimuth"], list) + else scene["azimuth"] ) else: source_azi = 0.0 if "elevation" in scene.keys(): source_ele = ( - scene["elevation"][i] if isinstance(scene["elevation"], list) else scene["elevation"] + scene["elevation"][i] + if isinstance(scene["elevation"], list) + else scene["elevation"] ) else: source_ele = 0.0 @@ -223,7 +227,9 @@ def generate_ismN_scene( # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( - scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] + scene["shift"][i] + if isinstance(scene["shift"], list) + else scene["shift"] ) else: source_shift = 0.0 @@ -239,12 +245,16 @@ def generate_ismN_scene( # read the level if "level" in scene.keys(): level = ( - scene["level"][i] if isinstance(scene["level"], list) else scene["level"] + scene["level"][i] + if isinstance(scene["level"], list) + else scene["level"] ) else: level = -26 - logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") + logger.info( + f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds" + ) # read source file x = audio.fromtype("ISM1") @@ -271,7 +281,9 @@ def generate_ismN_scene( # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: N_pad = int(frame_len - len(x.audio) % frame_len) - x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) + x.audio = audioarray.trim( + x.audio, x.fs, limits=[0, -N_pad], samples=True + ) # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / frame_len) @@ -385,8 +397,12 @@ def generate_ismN_scene( # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: - preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + preamble = int( + np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms + postamble = int( + np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms if preamble != 0 or postamble != 0: logger.info( f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" @@ -407,7 +423,9 @@ def generate_ismN_scene( else: # do not change the length of the audio signal duration = len(y.audio) - duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms + duration = int( + np.floor(duration / frame_len) * frame_len + ) # ensure multiple of 20ms if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) @@ -430,7 +448,9 @@ def generate_ismN_scene( 
binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) - logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") + logger.info( + f"-- Converting to BINAURAL output file: {binaural_output_filename}" + ) binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_objectbased(y, binaudio) diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index b9de7bc7..be1b2bd7 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -203,14 +203,18 @@ def generate_OMASA_scene( # read azimuth and elevation information if "azimuth" in scene.keys(): source_azi = ( - scene["azimuth"][i] if isinstance(scene["azimuth"], list) else scene["azimuth"] + scene["azimuth"][i] + if isinstance(scene["azimuth"], list) + else scene["azimuth"] ) else: source_azi = 0.0 if "elevation" in scene.keys(): source_ele = ( - scene["elevation"][i] if isinstance(scene["elevation"], list) else scene["elevation"] + scene["elevation"][i] + if isinstance(scene["elevation"], list) + else scene["elevation"] ) else: source_ele = 0.0 @@ -218,7 +222,9 @@ def generate_OMASA_scene( # read the source shift length (in seconds) if "shift" in scene.keys(): source_shift = ( - scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"] + scene["shift"][i] + if isinstance(scene["shift"], list) + else scene["shift"] ) else: source_shift = 0.0 @@ -241,7 +247,9 @@ def generate_OMASA_scene( else: level = -26 - logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") + logger.info( + f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds" + ) # get the number of channels from the .wav file header wav_header = audiofile.parse_wave_header(input_filename) @@ -286,7 +294,9 @@ def generate_OMASA_scene( # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: N_pad = int(frame_len - len(x.audio) % frame_len) - x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) + x.audio = audioarray.trim( + x.audio, x.fs, limits=[0, -N_pad], samples=True + ) # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / frame_len) @@ -421,12 +431,18 @@ def generate_OMASA_scene( y.object_pos.extend(x.object_pos) # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) 
- y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv"))) + y.metadata_files.insert( + i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")) + ) # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: - preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + preamble = int( + np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms + postamble = int( + np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms if preamble != 0 or postamble != 0: logger.info( f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" @@ -447,7 +463,9 @@ def generate_OMASA_scene( else: # do not change the length of the audio signal duration = len(y.audio) - duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms + duration = int( + np.floor(duration / frame_len) * frame_len + ) # ensure multiple of 20ms if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) @@ -470,7 +488,9 @@ def generate_OMASA_scene( binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) - logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") + logger.info( + f"-- Converting to BINAURAL output file: {binaural_output_filename}" + ) binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_omasa(y, binaudio) diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index 5c20fc12..98e3c9d8 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -194,7 +194,9 @@ def generate_OSBA_scene( ) # get input filename - input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) + input_filename = Path(source_file).parent / ( + cfg.use_input_prefix + Path(source_file).name + ) # read azimuth and elevation information source_azi = ( @@ -236,7 +238,9 @@ def generate_OSBA_scene( else: level = -26 - logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") + logger.info( + f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds" + ) # get the number of channels from the .wav file header wav_header = audiofile.parse_wave_header(input_filename) @@ -281,7 +285,9 @@ def generate_OSBA_scene( # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: N_pad = int(frame_len - len(x.audio) % frame_len) - x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) + x.audio = audioarray.trim( + x.audio, x.fs, limits=[0, -N_pad], samples=True + ) # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / frame_len) @@ -403,12 +409,18 @@ def generate_OSBA_scene( y.object_pos.extend(x.object_pos) # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) 
- y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv"))) + y.metadata_files.insert( + i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")) + ) # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: - preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + preamble = int( + np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms + postamble = int( + np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms if preamble != 0 or postamble != 0: logger.info( f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" @@ -429,7 +441,9 @@ def generate_OSBA_scene( else: # do not change the length of the audio signal duration = len(y.audio) - duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms + duration = int( + np.floor(duration / frame_len) * frame_len + ) # ensure multiple of 20ms if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) @@ -452,7 +466,9 @@ def generate_OSBA_scene( binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) - logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") + logger.info( + f"-- Converting to BINAURAL output file: {binaural_output_filename}" + ) binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_osba(y, binaudio) diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py index 86477309..f8f6a872 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -211,12 +211,12 @@ def generate_stereo_scene( source_file = ( scene["input"][i] if isinstance(scene["input"], list) else scene["input"] ) - IR_file = ( - scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] - ) + IR_file = scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] # get input filename and IR filename - input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name) + input_filename = Path(source_file).parent / ( + cfg.use_input_prefix + Path(source_file).name + ) IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) # read the overlap length @@ -247,7 +247,9 @@ def generate_stereo_scene( else: level = -26 - logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds") + logger.info( + f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds" + ) # read source file x = audio.fromfile("MONO", input_filename) @@ -275,7 +277,9 @@ def generate_stereo_scene( # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: N_pad = int(frame_len - len(x.audio) % frame_len) - x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True) + x.audio = audioarray.trim( + x.audio, x.fs, limits=[0, -N_pad], samples=True + ) # add the convolved STEREO audio source signal to the output signal if y.audio is None: @@ -284,7 +288,9 @@ def 
generate_stereo_scene( if source_shift < 0: # insert zeros to the new audio source signal to shift it right - y.audio = audioarray.trim(y.audio, x.fs, limits=[source_shift, 0], samples=True) + y.audio = audioarray.trim( + y.audio, x.fs, limits=[source_shift, 0], samples=True + ) else: offset = source_shift else: @@ -292,33 +298,47 @@ def generate_stereo_scene( delta_offset = source_shift - offset if delta_offset > 0: # insert zeros to the existing output signal to shift it right - y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_offset], samples=True) + y.audio = audioarray.trim( + y.audio, y.fs, limits=[0, -delta_offset], samples=True + ) offset = source_shift else: # insert zeros to the new audio source signal to shift it right - x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_offset], samples=True) + x.audio = audioarray.trim( + x.audio, x.fs, limits=[0, delta_offset], samples=True + ) # adjust the length of the audio source signal delta_length = len(x.audio) - len(y.audio) if delta_length > 0: # pad zeros to the existing output signal - y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_length], samples=True) + y.audio = audioarray.trim( + y.audio, y.fs, limits=[0, -delta_length], samples=True + ) else: # pad zeros to the new audio source signal - x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_length], samples=True) + x.audio = audioarray.trim( + x.audio, x.fs, limits=[0, delta_length], samples=True + ) # superimpose y.audio += x.audio # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: - preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms - postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms + preamble = int( + np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms + postamble = int( + np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms if preamble != 0 or postamble != 0: logger.info( f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" ) - y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True) + y.audio = audioarray.trim( + y.audio, y.fs, limits=[-preamble, -postamble], samples=True + ) # add random noise if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: @@ -334,9 +354,13 @@ def generate_stereo_scene( else: # do not change the length of the audio signal duration = len(y.audio) - duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms + duration = int( + np.floor(duration / frame_len) * frame_len + ) # ensure multiple of 20ms if len(y.audio) != duration: - y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True) + y.audio = audioarray.trim( + y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True + ) # adjust the loudness of the output signal if "loudness" in cfg.__dict__: -- GitLab From 4c7472af29be1b0f67dcb093ba819a2aa71b2fa9 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 26 Jun 2025 12:19:47 +0200 Subject: [PATCH 37/42] rename generate_ambi_xxx to generate_sba_... 
--- ivas_processing_scripts/generation/__init__.py | 4 ++-- .../{generate_ambi_items.py => generate_sba_items.py} | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) rename ivas_processing_scripts/generation/{generate_ambi_items.py => generate_sba_items.py} (99%) diff --git a/ivas_processing_scripts/generation/__init__.py b/ivas_processing_scripts/generation/__init__.py index 5d8fd1a6..0b114163 100755 --- a/ivas_processing_scripts/generation/__init__.py +++ b/ivas_processing_scripts/generation/__init__.py @@ -39,7 +39,7 @@ from ivas_processing_scripts.constants import ( ) from ivas_processing_scripts.generation import ( config, - generate_ambi_items, + generate_sba_items, generate_ismN_items, generate_omasa_items, generate_osba_items, @@ -89,7 +89,7 @@ def main(args): generate_stereo_items.generate_stereo_items(cfg, logger) elif any(fmt in cfg.format for fmt in ["FOA", "HOA2", "HOA3"]): # generate FOA/HOA2/HOA3 items according to scene description - generate_ambi_items.generate_ambi_items(cfg, logger) + generate_sba_items.generate_sba_items(cfg, logger) elif "OMASA" in cfg.format: # generate OMASA items from FOA/HO2/HOA3 and MONO items according to scene description generate_omasa_items.generate_omasa_items(cfg, logger) diff --git a/ivas_processing_scripts/generation/generate_ambi_items.py b/ivas_processing_scripts/generation/generate_sba_items.py similarity index 99% rename from ivas_processing_scripts/generation/generate_ambi_items.py rename to ivas_processing_scripts/generation/generate_sba_items.py index 3628369c..9c03b505 100644 --- a/ivas_processing_scripts/generation/generate_ambi_items.py +++ b/ivas_processing_scripts/generation/generate_sba_items.py @@ -70,7 +70,7 @@ def replace_char_seq_with_string(str, char_seq, repl_str): return "".join(result) -def generate_ambi_items( +def generate_sba_items( cfg: config.TestConfig, logger: logging.Logger, ): @@ -147,7 +147,7 @@ def generate_ambi_items( cfg.multiprocessing = False apply_func_parallel( - generate_ambi_scene, + generate_sba_scene, zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)), type="mp" if cfg.multiprocessing else None, show_progress=None, @@ -156,7 +156,7 @@ def generate_ambi_items( return -def generate_ambi_scene( +def generate_sba_scene( scene_name: str, scene: dict, cfg: config.TestConfig, logger: logging.Logger ): """ -- GitLab From 9cb0a81dcf82db3b7662801b13e469ffa5e8953f Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 26 Jun 2025 12:57:53 +0200 Subject: [PATCH 38/42] black-related formatting changes --- ivas_processing_scripts/generation/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ivas_processing_scripts/generation/__init__.py b/ivas_processing_scripts/generation/__init__.py index 0b114163..36d2f6d3 100755 --- a/ivas_processing_scripts/generation/__init__.py +++ b/ivas_processing_scripts/generation/__init__.py @@ -39,10 +39,10 @@ from ivas_processing_scripts.constants import ( ) from ivas_processing_scripts.generation import ( config, - generate_sba_items, generate_ismN_items, generate_omasa_items, generate_osba_items, + generate_sba_items, generate_stereo_items, ) -- GitLab From d162c2c7fd5b3002d0bcf4863e893237fdc82e06 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 26 Jun 2025 13:30:07 +0200 Subject: [PATCH 39/42] black formatting --- ivas_processing_scripts/audiotools/audiofile.py | 7 ++++++- ivas_processing_scripts/audiotools/constants.py | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git 
a/ivas_processing_scripts/audiotools/audiofile.py b/ivas_processing_scripts/audiotools/audiofile.py index 8d8d1601..c4934a6f 100755 --- a/ivas_processing_scripts/audiotools/audiofile.py +++ b/ivas_processing_scripts/audiotools/audiofile.py @@ -80,8 +80,13 @@ def read( with catch_warnings(record=True) as warnings_list: fs, data = wav.read(filename) for w in warnings_list: - if SUPPRESS_CHUNK_WARNING_WAV_READ and "Chunk (non-data) not understood, skipping it." in str(w.message): + if ( + SUPPRESS_CHUNK_WARNING_WAV_READ + and "Chunk (non-data) not understood, skipping it." + in str(w.message) + ): continue + print(f"{filename} : {w.message} ( {w.filename}:{w.lineno} )") if data.dtype == np.int32: data = np.interp( diff --git a/ivas_processing_scripts/audiotools/constants.py b/ivas_processing_scripts/audiotools/constants.py index dfe21e1f..11f70e64 100755 --- a/ivas_processing_scripts/audiotools/constants.py +++ b/ivas_processing_scripts/audiotools/constants.py @@ -32,7 +32,10 @@ import numpy as np -SUPPRESS_CHUNK_WARNING_WAV_READ = False # suppress warning from .wav read() when chunk size is not a multiple of 2 +SUPPRESS_CHUNK_WARNING_WAV_READ = ( + False # suppress warning from .wav read() when chunk size is not a multiple of 2 +) + BINAURAL_AUDIO_FORMATS = { "BINAURAL": { -- GitLab From bc259dc0d1baa1dba926b25606de29b0bdd2ce70 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 26 Jun 2025 14:08:17 +0200 Subject: [PATCH 40/42] update year to 2022-2025 --- ivas_processing_scripts/generation/__init__.py | 4 ---- ivas_processing_scripts/generation/config.py | 4 ---- ivas_processing_scripts/generation/constants.py | 4 ---- ivas_processing_scripts/generation/generate_ismN_items.py | 2 +- ivas_processing_scripts/generation/generate_omasa_items.py | 2 +- ivas_processing_scripts/generation/generate_osba_items.py | 2 +- ivas_processing_scripts/generation/generate_sba_items.py | 2 +- ivas_processing_scripts/generation/generate_stereo_items.py | 2 +- 8 files changed, 5 insertions(+), 17 deletions(-) diff --git a/ivas_processing_scripts/generation/__init__.py b/ivas_processing_scripts/generation/__init__.py index d5958cb0..8a9dfb98 100755 --- a/ivas_processing_scripts/generation/__init__.py +++ b/ivas_processing_scripts/generation/__init__.py @@ -1,11 +1,7 @@ #!/usr/bin/env python3 # -<<<<<<< HEAD -# (C) 2022-2026 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -======= # (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, ->>>>>>> main # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other diff --git a/ivas_processing_scripts/generation/config.py b/ivas_processing_scripts/generation/config.py index ea9fa9a1..b61aa881 100644 --- a/ivas_processing_scripts/generation/config.py +++ b/ivas_processing_scripts/generation/config.py @@ -1,11 +1,7 @@ #!/usr/bin/env python3 # -<<<<<<< HEAD -# (C) 2022-2026 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -======= # (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, ->>>>>>> main # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other diff --git a/ivas_processing_scripts/generation/constants.py b/ivas_processing_scripts/generation/constants.py index 09602825..3bc6b82d 100644 --- a/ivas_processing_scripts/generation/constants.py +++ b/ivas_processing_scripts/generation/constants.py @@ -1,11 +1,7 @@ #!/usr/bin/env python3 # -<<<<<<< HEAD -# (C) 2022-2026 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -======= # (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, ->>>>>>> main # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index 646798fa..dcf76cad 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index be1b2bd7..ed48c37b 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index 98e3c9d8..815be0b5 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other diff --git a/ivas_processing_scripts/generation/generate_sba_items.py b/ivas_processing_scripts/generation/generate_sba_items.py index 9c03b505..6904f107 100644 --- a/ivas_processing_scripts/generation/generate_sba_items.py +++ b/ivas_processing_scripts/generation/generate_sba_items.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -# (C) 2022-2026 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py index f8f6a872..d6208096 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -# (C) 2022-2026 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -- GitLab From c29906864e896d04d977b7dc2713447b5ed3ec6d Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 26 Jun 2025 14:14:08 +0200 Subject: [PATCH 41/42] fix aut. merge issue --- ivas_processing_scripts/generation/__main__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ivas_processing_scripts/generation/__main__.py b/ivas_processing_scripts/generation/__main__.py index 45365159..fe0f58ae 100755 --- a/ivas_processing_scripts/generation/__main__.py +++ b/ivas_processing_scripts/generation/__main__.py @@ -1,11 +1,7 @@ #!/usr/bin/env python3 # -<<<<<<< HEAD -# (C) 2022-2026 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -======= # (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, ->>>>>>> main # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD.,
 # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
 # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
--
GitLab

From e575d8cb8a50ba64e10e7a9f0d8a4b33eed1ffd5 Mon Sep 17 00:00:00 2001
From: Vladimir Malenovsky
Date: Thu, 26 Jun 2025 14:49:28 +0200
Subject: [PATCH 42/42] update the documentation

---
 README.md | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 8e291a4b..a78877c7 100755
--- a/README.md
+++ b/README.md
@@ -55,21 +55,34 @@ In the following sections the only purpose of the curly brackets is to mark the
 
 ## P800
 The setup for a P800 test from the experiments folder consists of two steps:
-item generation and item processing. The two steps can be applied independent of each other.
+item generation and item processing. The two steps can be applied independently of each other.
 
 ### Item generation
-To set up the P800-{X} listening test (X = 1, 2, ...9) copy your mono input files to `experiments/selection/P800-{X}/gen_input/items_mono`.
-These files have to follow the naming scheme `{l}{LL}p0{X}{name_of_item}` where 'l' stands for the listening lab designator: a (Force Technology),
-b (HEAD acoustics), c (MQ University), d (Mesaqin.com), and 'LL' stands for the language: EN, GE, JP, MA, DK, FR.
+To facilitate the preparation of items for P800-{X} listening tests, samples in complex formats (STEREO, SBA, ISMn, OMASA, OSBA) can be generated from mono samples. To generate items, run the following command from the root of the repository:
 
-The impluse responses have to be copied to experiments/selection/P800-{X}/gen_input/IRs.
+```bash
+python generate_items.py --config path/to/scene_description_config_file.yml
+```
+
+The YAML configuration file (`scene_description_config_file.yml`) defines how individual mono files are spatially positioned and combined into the target format. For advanced formats such as OMASA or OSBA, note that additional SBA items may be required. Refer to the `examples/` folder for template `.yml` files demonstrating the expected structure and usage.
+
+Relative paths are resolved from the working directory, not from the location of the YAML file; use absolute paths if unsure. Avoid dots `.` in file names (e.g., use `item_xxa3s1.wav`, not `item.xx.a3s1.wav`). Windows users: use double backslashes `\\` in paths and add `.exe` to executable names if needed. Input and output files follow structured naming conventions that encode metadata such as lab, language, and speaker ID; these are explained in detail under *Filename conventions*.
+
+Each entry under `scenes:` describes one test item (see the example scene at the end of this section), specifying:
+
+* `output`: output file name
+* `description`: human-readable description
+* `input`: list of mono `.wav` files
+* `azimuth` / `elevation`: spatial placement (°)
+* `level`: loudness in dB
+* `shift`: timing offsets in seconds
+
+Dynamic positioning (e.g., `"-20:1.0:360"`) means the source will move over time, stepping every 20 ms.
 
-To generate the items run `python -m ivas_processing_scripts.generation experiments/selection/P800-{X}/config/item_gen_P800-{X}_{l}.yml` from the root folder of the repository.
-The resulting files can be found in `experiments/selection/P800-{X}/proc_input_{l}` sorted by category.
+The total duration of the output signal can be controlled using the `duration` field. The output signal may optionally be rendered to the BINAURAL format by specifying the `binaural_output` field.
 
-For P800-3 the input files for the processing are already provided by the listening lab. This means this step can be skipped.
-For tests with ISM input format (P800-6 and P800-7) no IRs are needed, only mono sentences
+Start by running a single scene to verify settings. Output includes both audio and optional metadata files. You can enable multiprocessing by setting `multiprocessing: true`.
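+
+For illustration, a minimal scene-description file might look like the sketch below. The per-scene keys mirror the fields listed above, and `duration`, `binaural_output` and `multiprocessing` are described in this section; however, the exact layout, the `format` key, the values, and the input file names (`talker1.wav`, `talker2.wav`) are hypothetical placeholders. The templates in the `examples/` folder remain the authoritative reference.
+
+```yaml
+# Sketch of a scene-description file; values and layout are illustrative only.
+format: OMASA            # hypothetical placement of the target-format key
+duration: 8.0            # total length of the output signal in seconds
+binaural_output: true    # optionally also render the item to BINAURAL
+multiprocessing: true    # process scenes in parallel
+
+scenes:
+  - output: item_xxa3s1.wav                # output file name
+    description: "one static and one moving talker"
+    input: [talker1.wav, talker2.wav]      # mono .wav sources
+    azimuth: [30, "-20:1.0:360"]           # first source fixed at 30°; second moves, stepping every 20 ms
+    elevation: [0, 15]                     # placement in degrees
+    level: [-26, -26]                      # loudness in dB
+    shift: [0.0, 1.5]                      # timing offsets in seconds
+```
+
+With a file like this, the generation command above should produce `item_xxa3s1.wav` and, if `binaural_output` is enabled, an additional binaural rendering.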
 
 ### Item processing
--
GitLab