Loading item_gen_configs/HOA2_CONFIG.yml 0 → 100644 +58 −0 Original line number Diff line number Diff line
---
################################################
# General configuration
################################################

### Output format
format: "HOA2"

### Output sampling rate in Hz needed for headerless audio files; default = 48000
fs: 48000

### IR sampling rate in Hz needed for headerless audio files; default = 48000
IR_fs: 48000

### Any relative paths will be interpreted relative to the working directory the script is called from!
### Usage of absolute paths is recommended.
### Do not use file names with dots "." in them! This is not supported, use "_" instead
### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions

### Input path to mono files
input_path: "./items_mono"

### Input path to HOA2 impulse response files, default = './ivas_processing_scripts/generation/IR'
IR_path: "./IRs"

### Output path for generated test items and metadata files
output_path: "./items_HOA2"

### Target loudness in LKFS; default = -26
loudness: -26

### Pre-amble and Post-amble length in seconds (default = 0.0)
preamble: 0.5
postamble: 1.0

### Flag for adding low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
add_low_level_random_noise: true

################################################
### Scene description
################################################

### Each scene must start with a unique scene tag (e.g. a1)
### Specify the mono source filename (the program will search for it in the input_path folder)
### Specify the HOA2 IR filename (the program will search for it in the IR_path folder)
### Specify the overlap length in seconds between consecutive sources (a single value per scene; negative value creates a gap)
### Note 1: use [val1, val2, ...]
# --- tail of item_gen_configs/HOA2_CONFIG.yml (page text sharing these lines, kept as comments) ---
# for multiple sources in a scene
# ### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames
#
# scenes:
#     a1:
#         name: "out.wav"
#         description: ""
#         source: ["fa1.wav", "ma1.wav"]
#         IR: ["IR_HOA2_env1/FreefieldFloor_TalkPos1_EigenHoA2_SinSweep_9chn.wav", "IR_HOA2_env1/FreefieldFloor_TalkPos2_EigenHoA2_SinSweep_9chn.wav"]
#         overlap: -0.2
#
# --- ivas_processing_scripts/audiotools/wrappers/reverb.py +65 −1 ---
# diff context @@ -238,7 +238,7 @@ def reverb_foa(   (function defined outside this view)
#         H = fft(foa_IR.audio, axis=0)
#         align = 1.0 / np.max(np.abs(H))
#
#     # convolve mono input with left and right IR
#     # convolve mono input with FOA IR
#     y_w = reverb(input, IR_w, align=align)
#     y_x = reverb(input, IR_x, align=align)
#     y_y = reverb(input, IR_y, align=align)
# diff context @@ -251,3 +251,67 @@ def reverb_foa(
#     y.audio = np.column_stack([y_w.audio, y_x.audio, y_y.audio, y_z.audio])
#
#     return y


def reverb_hoa2(
    input: Audio,
    hoa2_IR: Audio,
    align: Optional[float] = None,
) -> Audio:
    """
    Wrapper for the ITU-T reverb binary to convolve mono audio signal with an HOA2 impulse response

    Parameters
    ----------
    input: Audio
        Input audio signal
    hoa2_IR: Audio
        HOA2 impulse response; the first 9 columns of its audio array are used,
        one per output channel. The object itself is not modified.
    align: Optional[float]
        multiplicative factor passed to reverb() for every channel; if None, it is computed
        such that the maximum gain of the IR filter across all channels and frequencies is 0dB

    Returns
    -------
    output: Audio
        Convolved audio signal with HOA2 IR (9 channels)

    Raises
    ------
    ValueError
        If the impulse response audio is not a 2-D array with at least 9 channels
    """

    numchannels = 9  # HOA2 by definition

    # convert to float32 on a local array so that the caller's IR object stays untouched
    ir_audio = np.asarray(hoa2_IR.audio, dtype=np.float32)
    if ir_audio.ndim != 2 or ir_audio.shape[1] < numchannels:
        raise ValueError(
            f"HOA2 impulse response must have at least {numchannels} channels, got array of shape {ir_audio.shape}"
        )

    # calculate the scaling (multiplicative) factor such that the maximum gain of the IR filter across all frequencies is 0dB
    if align is None:
        H = fft(ir_audio, axis=0)
        align = 1.0 / np.max(np.abs(H))

    # single-channel IR container, re-used for every HOA2 channel
    IR = copy(hoa2_IR)
    IR.name = "MONO"
    IR.num_channels = 1

    ych = []
    for i in range(numchannels):
        # separate IR into each channel
        IR.audio = np.reshape(ir_audio[:, i], (-1, 1))

        # convolve mono input with channel IR
        ych.append(reverb(input, IR, align=align))

    # combine into hoa2 output
    y = copy(input)
    y.name = "HOA2"
    y.num_channels = numchannels
    y.audio = np.column_stack([ch.audio for ch in ych])

    return y


# --- ivas_processing_scripts/generation/process_hoa2_items.py 0 → 100644 +218 −0 ---
#!/usr/bin/env python3

#
#  (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
#  Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
#  Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
#  Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
#  contributors to this repository. All Rights Reserved.
#
#  This software is protected by copyright law and by international treaties.
#  The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
#  Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
#  Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
#  Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
#  contributors to this repository retain full ownership rights in their respective contributions in
#  the software. This notice grants no license of any kind, including but not limited to patent
#  license, nor is any license granted by implication, estoppel or otherwise.
#
#  Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
#  contributions.
#
#  This software is provided "AS IS", without any express or implied warranties. The software is in the
#  development stage. It is intended exclusively for experts who have experience with such software and
#  solely for the purpose of inspection.
#  All implied warranties of non-infringement, merchantability
#  and fitness for a particular purpose are hereby disclaimed and excluded.
#
#  Any dispute, controversy or claim arising under or in relation to providing this software shall be
#  submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
#  accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
#  the United Nations Convention on Contracts on the International Sales of Goods.
#

import logging
import os
from math import floor

import numpy as np

from ivas_processing_scripts.audiotools import audio, audiofile
from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness
from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_hoa2
from ivas_processing_scripts.generation import config

SEED_RANDOM_NOISE = 0


# function for converting nd numpy array to strings with 2 decimal digits
def csv_formatdata(data):
    """Yield every row of *data* as a list of strings formatted with 2 decimal digits."""
    for row in data:
        yield ["%0.2f" % v for v in row]


def _pad_tail(signal, n_rows):
    """Return *signal* with all-zero rows appended until it has *n_rows* rows."""
    n_missing = n_rows - signal.shape[0]
    if n_missing <= 0:
        return signal
    return np.vstack((signal, np.zeros((n_missing, signal.shape[1]))))


def generate_hoa2_items(
    cfg: config.TestConfig,
    logger: logging.Logger,
):
    """Generate HOA2 items from mono items based on scene description

    For every scene in cfg.scenes, each mono source is convolved with its HOA2 impulse
    response, scaled with the factor returned by get_loudness() for cfg.loudness, shifted
    according to the scene's overlap, zero-padded to a multiple of 20ms and summed into one
    HOA2 signal. Pre-amble, post-amble and the optional low-level noise are then added and
    the result is written into cfg.output_path under the scene's name.

    Parameters
    ----------
    cfg: config.TestConfig
        Test configuration; optional settings missing from it (loudness, fs, IR_fs, preamble,
        postamble, IR_path, add_low_level_random_noise) are set on this object to their defaults
    logger: logging.Logger
        Logger receiving the progress messages

    Raises
    ------
    ValueError
        If a scene has no source, has fewer IRs than sources, or requests an overlap that is
        longer than the signal generated so far
    """

    # get the number of scenes
    N_scenes = len(cfg.scenes)

    # set the default of every optional setting that is missing in the configuration
    defaults = {
        "loudness": -26,  # target level
        "fs": 48000,  # sampling rate of the source files
        "IR_fs": 48000,  # sampling rate of the IR files
        "preamble": 0.0,
        "postamble": 0.0,
        "IR_path": os.path.join(os.path.dirname(__file__), "IR"),
        "add_low_level_random_noise": False,
    }
    for key, value in defaults.items():
        if key not in cfg.__dict__:
            setattr(cfg, key, value)

    # repeat for all scenes
    for scene_name, scene in cfg.scenes.items():
        logger.info(
            f"Processing scene: {scene_name} out of {N_scenes} scenes, name: {scene['name']}"
        )

        # extract the audio sources and their IRs (a single entry may be given without a list)
        source_files = np.atleast_1d(scene["source"])
        IR_files = np.atleast_1d(scene["IR"])
        N_sources = len(source_files)
        if N_sources == 0:
            raise ValueError(f"Scene {scene_name}: no source file specified")
        if len(IR_files) < N_sources:
            raise ValueError(
                f"Scene {scene_name}: {N_sources} source(s) but only {len(IR_files)} IR(s) specified"
            )

        # read the overlap length
        source_overlap = float(scene["overlap"]) if "overlap" in scene else 0.0

        y = audio.SceneBasedAudio("HOA2")
        for i, (source_file, IR_file) in enumerate(zip(source_files, IR_files)):
            logger.info(f"Convolving {source_file} with {IR_file}")

            # read source file
            x = audio.fromfile(
                "MONO", os.path.join(cfg.input_path, source_file), fs=cfg.fs
            )

            # read the IR file
            IR = audio.fromfile(
                "HOA2", os.path.join(cfg.IR_path, IR_file), fs=cfg.IR_fs
            )

            # convolve with HOA2 IR
            x = reverb_hoa2(x, IR)

            # adjust the level of the HOA2 signal
            _, scale_factor, _ = get_loudness(x, cfg.loudness, "BINAURAL")
            x.audio *= scale_factor

            # shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap)
            # NOTE(review): an overlap of exactly 0.0 skips the shift, i.e. the sources are mixed
            # from a common start instead of being placed back to back - confirm this is intended
            if i > 0 and source_overlap != 0.0:
                # start from the length of the signal generated so far and add the shift
                N_delay = y.audio.shape[0] + int(-source_overlap * x.fs)
                if N_delay < 0:
                    raise ValueError(
                        f"Scene {scene_name}: overlap of {source_overlap} s is longer than the preceding signal"
                    )

                # insert all-zero preamble
                pre = np.zeros((N_delay, x.audio.shape[1]))
                x.audio = np.concatenate([pre, x.audio])

            # pad with zeros to ensure that the signal length is a multiple of 20ms
            N_frame = int(x.fs // 50)
            N_rest = x.audio.shape[0] % N_frame
            if N_rest != 0:
                # append (not prepend) the padding so that the start of the source, and hence
                # the configured overlap/gap, is not shifted
                post = np.zeros((N_frame - N_rest, x.audio.shape[1]))
                x.audio = np.concatenate([x.audio, post])

            # add source signal to the array of source signals
            y.fs = x.fs
            if y.audio is None:
                y.audio = x.audio
            else:
                # pad with zeros to have equal length of all source signals
                N_total = max(x.audio.shape[0], y.audio.shape[0])
                y.audio = _pad_tail(y.audio, N_total)

                # superimpose
                y.audio += _pad_tail(x.audio, N_total)

        # append pre-amble and post-amble to all sources
        if cfg.preamble != 0.0:
            # ensure that pre-amble is a multiple of 20ms
            N_pre = int(floor(cfg.preamble * 50) / 50 * y.fs)

            # insert all-zero preamble to all sources
            pre = np.zeros((N_pre, y.audio.shape[1]))
            y.audio = np.concatenate([pre, y.audio])

        if cfg.postamble != 0.0:
            # ensure that post-amble is a multiple of 20ms
            N_post = int(floor(cfg.postamble * 50) / 50 * y.fs)

            # append all-zero postamble to all sources
            post = np.zeros((N_post, y.audio.shape[1]))
            y.audio = np.concatenate([y.audio, post])

        # add random noise
        if cfg.add_low_level_random_noise:
            # create uniformly distributed noise between -4 and 4; a local legacy generator
            # seeded with the same value leaves the global numpy random state untouched
            rng = np.random.RandomState(SEED_RANDOM_NOISE)
            noise = rng.randint(low=-4, high=5, size=y.audio.shape).astype("float")

            # superimpose
            y.audio += noise

        # write the reverberated audio into output file
        output_filename = scene["name"]
        audiofile.write(
            os.path.join(cfg.output_path, output_filename), y.audio, y.fs
        )

    # !!!! TBD: replace all os.path.xxx operations with the Path object

    return
item_gen_configs/HOA2_CONFIG.yml 0 → 100644 +58 −0 Original line number Diff line number Diff line
---
################################################
# General configuration
################################################

### Output format
format: "HOA2"

### Output sampling rate in Hz needed for headerless audio files; default = 48000
fs: 48000

### IR sampling rate in Hz needed for headerless audio files; default = 48000
IR_fs: 48000

### Any relative paths will be interpreted relative to the working directory the script is called from!
### Usage of absolute paths is recommended.
### Do not use file names with dots "." in them! This is not supported, use "_" instead
### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions

### Input path to mono files
input_path: "./items_mono"

### Input path to HOA2 impulse response files, default = './ivas_processing_scripts/generation/IR'
IR_path: "./IRs"

### Output path for generated test items and metadata files
output_path: "./items_HOA2"

### Target loudness in LKFS; default = -26
loudness: -26

### Pre-amble and Post-amble length in seconds (default = 0.0)
preamble: 0.5
postamble: 1.0

### Flag for adding low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
add_low_level_random_noise: true

################################################
### Scene description
################################################

### Each scene must start with a unique scene tag (e.g. a1)
### Specify the mono source filename (the program will search for it in the input_path folder)
### Specify the HOA2 IR filename (the program will search for it in the IR_path folder)
### Specify the overlap length in seconds between consecutive sources (a single value per scene; negative value creates a gap)
### Note 1: use [val1, val2, ...] for multiple sources in a scene
### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames

scenes:
    a1:
        name: "out.wav"
        description: ""
        source: ["fa1.wav", "ma1.wav"]
        IR: ["IR_HOA2_env1/FreefieldFloor_TalkPos1_EigenHoA2_SinSweep_9chn.wav", "IR_HOA2_env1/FreefieldFloor_TalkPos2_EigenHoA2_SinSweep_9chn.wav"]
        overlap: -0.2
# --- ivas_processing_scripts/audiotools/wrappers/reverb.py +65 −1 ---
# diff context @@ -238,7 +238,7 @@ def reverb_foa(   (function defined outside this view)
#         H = fft(foa_IR.audio, axis=0)
#         align = 1.0 / np.max(np.abs(H))
#
#     # convolve mono input with left and right IR
#     # convolve mono input with FOA IR
#     y_w = reverb(input, IR_w, align=align)
#     y_x = reverb(input, IR_x, align=align)
#     y_y = reverb(input, IR_y, align=align)
# diff context @@ -251,3 +251,67 @@ def reverb_foa(
#     y.audio = np.column_stack([y_w.audio, y_x.audio, y_y.audio, y_z.audio])
#
#     return y


def reverb_hoa2(
    input: Audio,
    hoa2_IR: Audio,
    align: Optional[float] = None,
) -> Audio:
    """
    Wrapper for the ITU-T reverb binary to convolve mono audio signal with an HOA2 impulse response

    Parameters
    ----------
    input: Audio
        Input audio signal
    hoa2_IR: Audio
        HOA2 impulse response; the first 9 columns of its audio array are used,
        one per output channel. The object itself is not modified.
    align: Optional[float]
        multiplicative factor passed to reverb() for every channel; if None, it is computed
        such that the maximum gain of the IR filter across all channels and frequencies is 0dB

    Returns
    -------
    output: Audio
        Convolved audio signal with HOA2 IR (9 channels)

    Raises
    ------
    ValueError
        If the impulse response audio is not a 2-D array with at least 9 channels
    """

    numchannels = 9  # HOA2 by definition

    # convert to float32 on a local array so that the caller's IR object stays untouched
    ir_audio = np.asarray(hoa2_IR.audio, dtype=np.float32)
    if ir_audio.ndim != 2 or ir_audio.shape[1] < numchannels:
        raise ValueError(
            f"HOA2 impulse response must have at least {numchannels} channels, got array of shape {ir_audio.shape}"
        )

    # calculate the scaling (multiplicative) factor such that the maximum gain of the IR filter across all frequencies is 0dB
    if align is None:
        H = fft(ir_audio, axis=0)
        align = 1.0 / np.max(np.abs(H))

    # single-channel IR container, re-used for every HOA2 channel
    IR = copy(hoa2_IR)
    IR.name = "MONO"
    IR.num_channels = 1

    ych = []
    for i in range(numchannels):
        # separate IR into each channel
        IR.audio = np.reshape(ir_audio[:, i], (-1, 1))

        # convolve mono input with channel IR
        ych.append(reverb(input, IR, align=align))

    # combine into hoa2 output
    y = copy(input)
    y.name = "HOA2"
    y.num_channels = numchannels
    y.audio = np.column_stack([ch.audio for ch in ych])

    return y
ivas_processing_scripts/generation/process_hoa2_items.py 0 → 100644 +218 −0 Original line number Diff line number Diff line #!/usr/bin/env python3 # # (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other # contributors to this repository. All Rights Reserved. # # This software is protected by copyright law and by international treaties. # The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other # contributors to this repository retain full ownership rights in their respective contributions in # the software. This notice grants no license of any kind, including but not limited to patent # license, nor is any license granted by implication, estoppel or otherwise. # # Contributors are required to enter into the IVAS codec Public Collaboration agreement before making # contributions. # # This software is provided "AS IS", without any express or implied warranties. The software is in the # development stage. It is intended exclusively for experts who have experience with such software and # solely for the purpose of inspection. All implied warranties of non-infringement, merchantability # and fitness for a particular purpose are hereby disclaimed and excluded. 
#
#  Any dispute, controversy or claim arising under or in relation to providing this software shall be
#  submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
#  accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
#  the United Nations Convention on Contracts on the International Sales of Goods.
#

import logging
import os
from math import floor

import numpy as np

from ivas_processing_scripts.audiotools import audio, audiofile
from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness
from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_hoa2
from ivas_processing_scripts.generation import config

SEED_RANDOM_NOISE = 0


# function for converting nd numpy array to strings with 2 decimal digits
def csv_formatdata(data):
    """Yield every row of *data* as a list of strings formatted with 2 decimal digits."""
    for row in data:
        yield ["%0.2f" % v for v in row]


def _pad_tail(signal, n_rows):
    """Return *signal* with all-zero rows appended until it has *n_rows* rows."""
    n_missing = n_rows - signal.shape[0]
    if n_missing <= 0:
        return signal
    return np.vstack((signal, np.zeros((n_missing, signal.shape[1]))))


def generate_hoa2_items(
    cfg: config.TestConfig,
    logger: logging.Logger,
):
    """Generate HOA2 items from mono items based on scene description

    For every scene in cfg.scenes, each mono source is convolved with its HOA2 impulse
    response, scaled with the factor returned by get_loudness() for cfg.loudness, shifted
    according to the scene's overlap, zero-padded to a multiple of 20ms and summed into one
    HOA2 signal. Pre-amble, post-amble and the optional low-level noise are then added and
    the result is written into cfg.output_path under the scene's name.

    Parameters
    ----------
    cfg: config.TestConfig
        Test configuration; optional settings missing from it (loudness, fs, IR_fs, preamble,
        postamble, IR_path, add_low_level_random_noise) are set on this object to their defaults
    logger: logging.Logger
        Logger receiving the progress messages

    Raises
    ------
    ValueError
        If a scene has no source, has fewer IRs than sources, or requests an overlap that is
        longer than the signal generated so far
    """

    # get the number of scenes
    N_scenes = len(cfg.scenes)

    # set the default of every optional setting that is missing in the configuration
    defaults = {
        "loudness": -26,  # target level
        "fs": 48000,  # sampling rate of the source files
        "IR_fs": 48000,  # sampling rate of the IR files
        "preamble": 0.0,
        "postamble": 0.0,
        "IR_path": os.path.join(os.path.dirname(__file__), "IR"),
        "add_low_level_random_noise": False,
    }
    for key, value in defaults.items():
        if key not in cfg.__dict__:
            setattr(cfg, key, value)

    # repeat for all scenes
    for scene_name, scene in cfg.scenes.items():
        logger.info(
            f"Processing scene: {scene_name} out of {N_scenes} scenes, name: {scene['name']}"
        )

        # extract the audio sources and their IRs (a single entry may be given without a list)
        source_files = np.atleast_1d(scene["source"])
        IR_files = np.atleast_1d(scene["IR"])
        N_sources = len(source_files)
        if N_sources == 0:
            raise ValueError(f"Scene {scene_name}: no source file specified")
        if len(IR_files) < N_sources:
            raise ValueError(
                f"Scene {scene_name}: {N_sources} source(s) but only {len(IR_files)} IR(s) specified"
            )

        # read the overlap length
        source_overlap = float(scene["overlap"]) if "overlap" in scene else 0.0

        y = audio.SceneBasedAudio("HOA2")
        for i, (source_file, IR_file) in enumerate(zip(source_files, IR_files)):
            logger.info(f"Convolving {source_file} with {IR_file}")

            # read source file
            x = audio.fromfile(
                "MONO", os.path.join(cfg.input_path, source_file), fs=cfg.fs
            )

            # read the IR file
            IR = audio.fromfile(
                "HOA2", os.path.join(cfg.IR_path, IR_file), fs=cfg.IR_fs
            )

            # convolve with HOA2 IR
            x = reverb_hoa2(x, IR)

            # adjust the level of the HOA2 signal
            _, scale_factor, _ = get_loudness(x, cfg.loudness, "BINAURAL")
            x.audio *= scale_factor

            # shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap)
            # NOTE(review): an overlap of exactly 0.0 skips the shift, i.e. the sources are mixed
            # from a common start instead of being placed back to back - confirm this is intended
            if i > 0 and source_overlap != 0.0:
                # start from the length of the signal generated so far and add the shift
                N_delay = y.audio.shape[0] + int(-source_overlap * x.fs)
                if N_delay < 0:
                    raise ValueError(
                        f"Scene {scene_name}: overlap of {source_overlap} s is longer than the preceding signal"
                    )

                # insert all-zero preamble
                pre = np.zeros((N_delay, x.audio.shape[1]))
                x.audio = np.concatenate([pre, x.audio])

            # pad with zeros to ensure that the signal length is a multiple of 20ms
            N_frame = int(x.fs // 50)
            N_rest = x.audio.shape[0] % N_frame
            if N_rest != 0:
                # append (not prepend) the padding so that the start of the source, and hence
                # the configured overlap/gap, is not shifted
                post = np.zeros((N_frame - N_rest, x.audio.shape[1]))
                x.audio = np.concatenate([x.audio, post])

            # add source signal to the array of source signals
            y.fs = x.fs
            if y.audio is None:
                y.audio = x.audio
            else:
                # pad with zeros to have equal length of all source signals
                N_total = max(x.audio.shape[0], y.audio.shape[0])
                y.audio = _pad_tail(y.audio, N_total)

                # superimpose
                y.audio += _pad_tail(x.audio, N_total)

        # append pre-amble and post-amble to all sources
        if cfg.preamble != 0.0:
            # ensure that pre-amble is a multiple of 20ms
            N_pre = int(floor(cfg.preamble * 50) / 50 * y.fs)

            # insert all-zero preamble to all sources
            pre = np.zeros((N_pre, y.audio.shape[1]))
            y.audio = np.concatenate([pre, y.audio])

        if cfg.postamble != 0.0:
            # ensure that post-amble is a multiple of 20ms
            N_post = int(floor(cfg.postamble * 50) / 50 * y.fs)

            # append all-zero postamble to all sources
            post = np.zeros((N_post, y.audio.shape[1]))
            y.audio = np.concatenate([y.audio, post])

        # add random noise
        if cfg.add_low_level_random_noise:
            # create uniformly distributed noise between -4 and 4; a local legacy generator
            # seeded with the same value leaves the global numpy random state untouched
            rng = np.random.RandomState(SEED_RANDOM_NOISE)
            noise = rng.randint(low=-4, high=5, size=y.audio.shape).astype("float")

            # superimpose
            y.audio += noise

        # write the reverberated audio into output file
        output_filename = scene["name"]
        audiofile.write(
            os.path.join(cfg.output_path, output_filename), y.audio, y.fs
        )

    # !!!! TBD: replace all os.path.xxx operations with the Path object

    return