From b0229483e57f1503e2908e56dfeba821e707d518 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Wed, 20 Aug 2025 19:19:11 +0200 Subject: [PATCH 1/4] support RIR convolution for SBA formats --- .../generation/generate_masa_items.py | 21 +-- .../generation/generate_omasa_items.py | 4 +- .../generation/generate_osba_items.py | 2 +- .../generation/generate_sba_items.py | 139 +++++++++++++++--- 4 files changed, 132 insertions(+), 34 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_masa_items.py b/ivas_processing_scripts/generation/generate_masa_items.py index 6ddac870..5ecf2358 100644 --- a/ivas_processing_scripts/generation/generate_masa_items.py +++ b/ivas_processing_scripts/generation/generate_masa_items.py @@ -246,22 +246,18 @@ def generate_MASA_scene( # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref - match = re.match( - r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] - ) + match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] - logger.error( - f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" - ) + logger.error(f"Unable to parse {scene_shift_str}. 
The specification of overlap or reference is incorrect!") sys.exit(-1) # calculate absolute shift of the source signal in seconds - source_shift = end_position[overlap_ref] - overlap + source_shift = end_position[overlap_ref] + overlap else: source_shift = 0.0 @@ -432,15 +428,14 @@ def generate_MASA_scene( # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) - duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) + duration = int( + np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len + ) # check if the current length of the output signal exceeds the duration if len(y_int.audio) > duration: y_int.audio = audioarray.trim( - y_int.audio, - y_int.fs, - limits=[0, len(y_int.audio) - duration], - samples=True, + y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True ) # adjust the loudness of the output signal @@ -525,7 +520,7 @@ def generate_MASA_scene( y_int.audio = audioarray.window(y_int.audio, y_int.fs, cfg.fade_in_out * 1000) # generate MASA metadata filename (should end with .met) - y.metadata_file = output_filename.with_suffix(".met") + y.metadata_file = output_filename.with_suffix(output_filename.suffix + ".met") # convert the intermediate SBA output signal to MASA format render_sba_to_masa(y_int, y) diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 0881c7ca..929fed63 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -474,7 +474,7 @@ def generate_OMASA_scene( # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) 
y_int.metadata_files.insert( - i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")) + i - 1, str(output_filename.with_suffix(output_filename.suffix + f".{i - 1}.csv")) ) # append pre-amble and post-amble @@ -520,7 +520,7 @@ def generate_OMASA_scene( y_int.audio = audioarray.window(y_int.audio, y_int.fs, cfg.fade_in_out * 1000) # generate and insert MASA metadata filename (should end with .met) - y.metadata_files.append(str(output_filename.with_suffix(".met"))) + y.metadata_files.append(str(output_filename.with_suffix(output_filename.suffix + ".met"))) # convert the intermediate OSBA object to OMASA object convert_osba(y_int, y) diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index 8d2ca0d8..7c1ee264 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -460,7 +460,7 @@ def generate_OSBA_scene( # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) 
y.metadata_files.insert( - i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")) + i - 1, str(output_filename.with_suffix(output_filename.suffix + f".{i - 1}.csv")) ) # append pre-amble and post-amble diff --git a/ivas_processing_scripts/generation/generate_sba_items.py b/ivas_processing_scripts/generation/generate_sba_items.py index 631d6165..297a332d 100644 --- a/ivas_processing_scripts/generation/generate_sba_items.py +++ b/ivas_processing_scripts/generation/generate_sba_items.py @@ -39,6 +39,7 @@ from pathlib import Path import numpy as np from ivas_processing_scripts.audiotools import audio, audioarray, audiofile +from ivas_processing_scripts.audiotools.convert.objectbased import convert_objectbased from ivas_processing_scripts.audiotools.convert.scenebased import convert_scenebased from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm from ivas_processing_scripts.audiotools.wrappers.reverb import ( @@ -208,13 +209,27 @@ def generate_sba_scene( source_file = ( scene["input"][i] if isinstance(scene["input"], list) else scene["input"] ) - IR_file = scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] - # get input filename and IR filename input_filename = Path(source_file).parent / ( cfg.use_input_prefix + Path(source_file).name ) - IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) + + # get input filename and IR filename + if "IR" in scene.keys(): + IR_file = scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] + IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) + else: + # read azimuth and elevation information + source_azi = ( + scene["azimuth"][i] + if isinstance(scene["azimuth"], list) + else scene["azimuth"] + ) + source_ele = ( + scene["elevation"][i] + if isinstance(scene["elevation"], list) + else scene["elevation"] + ) # read the source shift length (in seconds) if "shift" in scene.keys(): @@ -282,9 +297,14 @@ def generate_sba_scene( else: level = 
-26 - logger.info( - f"-- Convolving {source_file} with {IR_file} at {level} LKFS with shift of {source_shift_in_seconds} seconds" - ) + if "IR" in scene.keys(): + logger.info( + f"-- Convolving {source_file} with {IR_file} at {level} LKFS with shift of {source_shift_in_seconds} seconds" + ) + else: + logger.info( + f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LKFS with shift of {source_shift_in_seconds} seconds" + ) # read source file x = audio.fromfile("MONO", input_filename) @@ -301,17 +321,6 @@ def generate_sba_scene( x.audio = resampled_audio x.fs = cfg.fs - # read the IR file (!must be in target format!) - IR = audio.fromfile(cfg.format, IR_filename) - - # convolve MONO source audio with FOA/HOA2/HOA3 IR -> results in FOA/HOA2/HOA3 audio object - if cfg.format == "FOA": - x = reverb_foa(x, IR, mode=None) - elif cfg.format == "HOA2": - x = reverb_hoa2(x, IR, mode=None) - elif cfg.format == "HOA3": - x = reverb_hoa3(x, IR, mode=None) - # adjust the level of the FOA/HOA2/HOA3 signal if level is None: # do not change the level of the audio source signal @@ -331,6 +340,100 @@ def generate_sba_scene( x.audio, x.fs, limits=[0, -N_pad], samples=True ) + # get the number of frames (multiple of 20ms) + N_frames = int(len(x.audio) / frame_len) + + if "IR" in scene.keys(): + # read the IR file (!must be in target format!) 
+ IR = audio.fromfile(cfg.format, IR_filename) + + # convolve MONO source audio with FOA/HOA2/HOA3 IR -> results in FOA/HOA2/HOA3 audio object + if cfg.format == "FOA": + x = reverb_foa(x, IR, mode=None) + elif cfg.format == "HOA2": + x = reverb_hoa2(x, IR, mode=None) + elif cfg.format == "HOA3": + x = reverb_hoa3(x, IR, mode=None) + else: + # convert MONO to ISM1 + x_ism = audio.ObjectBasedAudio("ISM1") # ISM with 1 channel + x_ism.fs = cfg.fs + x_ism.audio = x.audio.copy() + + # convert azimuth information in case of moving object + if isinstance(source_azi, str): + if ":" in source_azi: + # convert into array (initial_value:step:stop_value) + start_str, step_str, stop_str = source_azi.split(":") + start = float(eval(start_str)) + step = float(eval(step_str)) + stop = float(eval(stop_str)) + azi = np.arange(start, stop, step) + + # adjust length to N_frames + if len(azi) > N_frames: + azi = azi[:N_frames] + elif len(azi) < N_frames: + azi = np.append(azi, np.full(N_frames - len(azi), azi[-1])) + else: + # replicate static azimuth value N_frames times + azi = np.repeat(float(eval(source_azi)), N_frames) + else: + # replicate static azimuth value N_frames times + azi = np.repeat(float(source_azi), N_frames) + + # convert azimuth from 0 .. 360 to -180 .. +180 + azi = (azi + 180) % 360 - 180 + + # check if azimuth is from -180 .. 
+180 + if any(azi > 180) or any(azi < -180): + logger.error( + f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}" + ) + + # convert elevation information in case of moving object + if isinstance(source_ele, str): + if ":" in source_ele: + # convert into array (initial_value:step:stop_value) + start_str, step_str, stop_str = source_ele.split(":") + start = float(eval(start_str)) + step = float(eval(step_str)) + stop = float(eval(stop_str)) + ele = np.arange(start, stop, step) + + # adjust length to N_frames + if len(ele) > N_frames: + ele = ele[:N_frames] + elif len(ele) < N_frames: + ele = np.append(ele, np.full(N_frames - len(ele), ele[-1])) + + else: + # replicate static elevation value N_frames times + ele = np.repeat(float(eval(source_ele)), N_frames) + else: + # replicate static elevation value N_frames times + ele = np.repeat(float(source_ele), N_frames) + + # wrap elevation angle to -90 .. +90 + ele = ((ele + 90) % 180) - 90 + + # check if elevation is from -90 .. 
+90 + if any(ele > 90) or any(ele < -90): + logger.error( + f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}" + ) + + # generate radius vector with all values equal to 1.0 + rad = np.ones(N_frames) + + # arrange all metadata fields column-wise into a matrix + x_ism.object_pos.append(np.column_stack((azi, ele, rad))) + + # convert ISM1 object to SBA + x_sba = audio.SceneBasedAudio(cfg.format) + convert_objectbased(x_ism, x_sba) + x = x_sba # replace x with the SBA object + # add the convolved FOA/HOA2/HOA3 audio source signal to the output signal if y.audio is None: # add source signal to the array of all source signals @@ -338,7 +441,7 @@ def generate_sba_scene( if source_shift > 0: # insert zeros to the new audio source signal to shift it right - y.audio = audioarray.trim_meta( + y.audio = audioarray.trim( y.audio, y.fs, limits=[-source_shift, 0], samples=True ) else: -- GitLab From d44605c80c329afce35cc18c349a87eb782997db Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 21 Aug 2025 09:03:29 +0200 Subject: [PATCH 2/4] small correction to suffix --- ivas_processing_scripts/generation/generate_ismN_items.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index 3c474309..8f9f292c 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -446,7 +446,7 @@ def generate_ismN_scene( y.object_pos.extend(x.object_pos) # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) 
- y.metadata_files.insert(i, str(output_filename.with_suffix(f".{i}.csv"))) + y.metadata_files.insert(i, str(output_filename.with_suffix(output_filename.suffix + f".{i}.csv"))) # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: -- GitLab From c57e1c57dbeb419b6d77283e051b6c485eca1efa Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 21 Aug 2025 09:05:26 +0200 Subject: [PATCH 3/4] formatting --- .../generation/generate_masa_items.py | 17 +++++++++++------ .../generation/generate_omasa_items.py | 11 +++++++++-- .../generation/generate_osba_items.py | 7 ++++++- .../generation/generate_sba_items.py | 4 +++- 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/ivas_processing_scripts/generation/generate_masa_items.py b/ivas_processing_scripts/generation/generate_masa_items.py index 5ecf2358..faba9112 100644 --- a/ivas_processing_scripts/generation/generate_masa_items.py +++ b/ivas_processing_scripts/generation/generate_masa_items.py @@ -246,14 +246,18 @@ def generate_MASA_scene( # of the reference signal (0-based index) if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]: # extract X and i_ref - match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]) + match = re.match( + r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i] + ) if match: overlap = float(match.group(1)) overlap_ref = int(match.group(2)) else: scene_shift_str = scene["shift"][i] - logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!") + logger.error( + f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!" 
+ ) sys.exit(-1) # calculate absolute shift of the source signal in seconds @@ -428,14 +432,15 @@ def generate_MASA_scene( # trim the output signal if the total duration exceeds X seconds if "duration" in cfg.__dict__: # convert from seconds to samples (ensure multiple of 20ms) - duration = int( - np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len - ) + duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len) # check if the current length of the output signal exceeds the duration if len(y_int.audio) > duration: y_int.audio = audioarray.trim( - y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True + y_int.audio, + y_int.fs, + limits=[0, len(y_int.audio) - duration], + samples=True, ) # adjust the loudness of the output signal diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index 929fed63..b8aba2b6 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -474,7 +474,12 @@ def generate_OMASA_scene( # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) 
y_int.metadata_files.insert( - i - 1, str(output_filename.with_suffix(output_filename.suffix + f".{i - 1}.csv")) + i - 1, + str( + output_filename.with_suffix( + output_filename.suffix + f".{i - 1}.csv" + ) + ), ) # append pre-amble and post-amble @@ -520,7 +525,9 @@ def generate_OMASA_scene( y_int.audio = audioarray.window(y_int.audio, y_int.fs, cfg.fade_in_out * 1000) # generate and insert MASA metadata filename (should end with .met) - y.metadata_files.append(str(output_filename.with_suffix(output_filename.suffix + ".met"))) + y.metadata_files.append( + str(output_filename.with_suffix(output_filename.suffix + ".met")) + ) # convert the intermediate OSBA object to OMASA object convert_osba(y_int, y) diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index 7c1ee264..8190f5ca 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -460,7 +460,12 @@ def generate_OSBA_scene( # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) 
y.metadata_files.insert( - i - 1, str(output_filename.with_suffix(output_filename.suffix + f".{i - 1}.csv")) + i - 1, + str( + output_filename.with_suffix( + output_filename.suffix + f".{i - 1}.csv" + ) + ), ) # append pre-amble and post-amble diff --git a/ivas_processing_scripts/generation/generate_sba_items.py b/ivas_processing_scripts/generation/generate_sba_items.py index 297a332d..22aba8ab 100644 --- a/ivas_processing_scripts/generation/generate_sba_items.py +++ b/ivas_processing_scripts/generation/generate_sba_items.py @@ -217,7 +217,9 @@ def generate_sba_scene( # get input filename and IR filename if "IR" in scene.keys(): IR_file = scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] - IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) + IR_filename = Path(IR_file).parent / ( + cfg.use_IR_prefix + Path(IR_file).name + ) else: # read azimuth and elevation information source_azi = ( -- GitLab From cee72a170eca9618e35572bf199d629e39fd6531 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 21 Aug 2025 09:49:37 +0200 Subject: [PATCH 4/4] formatting --- ivas_processing_scripts/generation/generate_ismN_items.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index 8f9f292c..948c6d48 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -446,7 +446,9 @@ def generate_ismN_scene( y.object_pos.extend(x.object_pos) # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) - y.metadata_files.insert(i, str(output_filename.with_suffix(output_filename.suffix + f".{i}.csv"))) + y.metadata_files.insert( + i, str(output_filename.with_suffix(output_filename.suffix + f".{i}.csv")) + ) # append pre-amble and post-amble if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: -- GitLab