Loading item_generation_scripts/__init__.py +16 −3 Original line number Diff line number Diff line Loading @@ -40,7 +40,7 @@ from item_generation_scripts.constants import ( LOGGER_FORMAT, LOGGER_SUFFIX, ) from item_generation_scripts.processing import config, process_ism_items from item_generation_scripts.processing import config, process_ism_items, process_stereo_items from item_generation_scripts.utils import create_dir Loading Loading @@ -83,7 +83,7 @@ def main(args): # generate input items if cfg.format.startswith("ISM"): # generate ISM items according to scene description # generate ISM items with metadata according to scene description process_ism_items.generate_ism_items( cfg.format, cfg.loudness, Loading @@ -93,6 +93,19 @@ def main(args): logger, fs=cfg.fs ) elif cfg.format == "STEREO": # generate STEREO items according to scene description process_stereo_items.generate_stereo_items( cfg.format, cfg.loudness, cfg.input_path, cfg.IR_path, cfg.output_path, cfg.scenes, logger, fs=cfg.fs, IR_fs=cfg.IR_fs, ) # copy configuration to output directory with open(cfg.output_path.joinpath(f"{cfg.format}.yml"), "w") as f: Loading item_generation_scripts/audiotools/audiofile.py +4 −1 Original line number Diff line number Diff line Loading @@ -110,6 +110,7 @@ def write( filename: Union[str, Path], x: np.ndarray, fs: Optional[int] = 48000, dtype: Optional[str] = "int16", ) -> None: """ Write audio file (.pcm, .wav or .raw) Loading @@ -122,6 +123,8 @@ def write( Numpy 2D array of dimension: number of channels x number of samples fs: Optional[int] Sampling rate, required for .pcm or .raw input file, default = 48000 (Hz) dtype: Optional[str] Data type format required for .pcm or .raw input file, default = 'int16' Returns ------- Loading @@ -141,7 +144,7 @@ def write( x = x.astype(np.int16) wav.write(filename, fs, x) elif file_extension == ".pcm" or file_extension == ".raw": x = x.astype("int16").reshape(-1, 1) x = x.astype(dtype).reshape(-1, 1) x.tofile(filename) else: raise 
ValueError("Wrong input format. Use wav, pcm or raw") Loading item_generation_scripts/audiotools/wrappers/reverb.py 0 → 100644 +186 −0 Original line number Diff line number Diff line #!/usr/bin/env python3 # # (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other # contributors to this repository. All Rights Reserved. # # This software is protected by copyright law and by international treaties. # The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other # contributors to this repository retain full ownership rights in their respective contributions in # the software. This notice grants no license of any kind, including but not limited to patent # license, nor is any license granted by implication, estoppel or otherwise. # # Contributors are required to enter into the IVAS codec Public Collaboration agreement before making # contributions. # # This software is provided "AS IS", without any express or implied warranties. The software is in the # development stage. It is intended exclusively for experts who have experience with such software and # solely for the purpose of inspection. All implied warranties of non-infringement, merchantability # and fitness for a particular purpose are hereby disclaimed and excluded. 
# # Any dispute, controversy or claim arising under or in relation to providing this software shall be # submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in # accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and # the United Nations Convention on Contracts on the International Sales of Goods. # import os.path import numpy as np from scipy.fft import fft from copy import copy from pathlib import Path from tempfile import TemporaryDirectory from typing import Optional, Union from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES from item_generation_scripts.utils import find_binary, run from item_generation_scripts.audiotools.audio import Audio from item_generation_scripts.audiotools.audiofile import read, write from item_generation_scripts.audiotools.wrappers.filter import resample_itu def reverb( input: Audio, IR: Audio, align: Optional[float] = None, ) -> Audio: """ Wrapper for the ITU-T reverb binary to convolve mono audio signal with an impulse response Note: The 'reverb' binary tool expects that the IR file is written in the 32b IEEE Standard 754 floating-point representation. 
Parameters ---------- input: Audio Input audio signal IR: Audio Impulse response align: float multiplicative factor to apply to the reverberated sound in order to align its energy level with a second filePath to the output file Returns ------- output: Audio Convolved audio signal with IR """ # find binary if "reverb" in DEFAULT_CONFIG_BINARIES["binary_paths"]: binary = find_binary( DEFAULT_CONFIG_BINARIES["binary_paths"]["reverb"].name, binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["reverb"].parent, ) else: binary = find_binary("reverb") with TemporaryDirectory(dir="./tmp_reverb") as tmp_dir: tmp_dir = Path(tmp_dir) # resample input audio signal to that of the IR old_fs = None tmp_input = copy(input) if input.fs != IR.fs: old_fs = input.fs tmp_input.audio = resample_itu(tmp_input, IR.fs) tmp_input.fs = IR.fs # write input audio signal to temporary file in .pcm format tmp_input_file = tmp_dir.joinpath(f"tmp_reverbIn.pcm") write(tmp_input_file, tmp_input.audio, tmp_input.fs) # down-scale IR to prevent saturation # max_value = np.max(np.abs(IR.audio)) # if max_value > 1.0: # IR.audio = IR.audio / max_value # write IR to temporary file in .pcm format # note: the reverb tool expects 32b float format tmp_IR_file = tmp_dir.joinpath(f"tmp_IR.pcm") write(tmp_IR_file, IR.audio.astype("float32"), IR.fs, dtype="float32") # set up the 'reverb' command line cmd = [ str(binary), ] # append multiplicative factor, if provided if align: cmd.extend(["-align", str(align)]) # append temporary filenames tmp_output_file = tmp_dir.joinpath(f"tmp_reverbOut.pcm") cmd.extend([tmp_input_file, tmp_IR_file, tmp_output_file]) # run the 'reverb' command run(cmd) # read the reverberated output file output = copy(tmp_input) output.audio, _ = read(tmp_output_file, nchannels=1, fs=tmp_input.fs) # reverse the resampling if old_fs: output.audio = resample_itu(output, old_fs) output.fs = old_fs return output def reverb_stereo( input: Audio, stereo_IR: Audio, align: Optional[float] = None, ) -> 
Audio: """ Wrapper for the ITU-T reverb binary to convolve mono audio signal with a stereo impulse response Parameters ---------- input: Audio Input audio signal IR: Audio Impulse response align: float multiplicative factor to apply to the reverberated sound in order to align its energy level with the second file Returns ------- output: Audio Convolved audio signal with stereo IR """ # convert to float32 stereo_IR.audio = np.float32(stereo_IR.audio) # separate into left and right IR IR_left = copy(stereo_IR) IR_left.name = "MONO" IR_left.num_channels = 1 IR_left.audio = np.reshape(stereo_IR.audio[:,0], (-1, 1)) IR_right = copy(stereo_IR) IR_right.name = "MONO" IR_right.num_channels = 1 IR_right.audio = np.reshape(stereo_IR.audio[:,1], (-1, 1)) # calculate the scaling factor such that the maximum gain of the IR filter across all frequencies is 0dB if align is None: H = fft(stereo_IR.audio, axis=0) align = 1.0 / np.max(np.abs(H)) # stereo_IR.audio *= align # convolve mono input with left and right IR y_left = reverb(input, IR_left, align=align) y_right = reverb(input, IR_right, align=align) # combine into stereo output y = copy(input) y.name = "STEREO" y.num_channels = 2 y.audio = np.column_stack([y_left.audio, y_right.audio]) return y item_generation_scripts/config/STEREO_CONFIG.yml +80 −84 Original line number Diff line number Diff line Loading @@ -15,6 +15,9 @@ format: "STEREO" ### Output sampling rate in Hz needed for headerless audio files; default = 48000 fs: 48000 ### IR sampling rate in Hz needed for headerless audio files; default = 48000 IR_fs: 32000 ### Any relative paths will be interpreted relative to the working directory the script is called from! ### Usage of absolute paths is recommended. ### Do not use file names with dots "." in them! 
This is not supported, use "_" instead Loading @@ -24,7 +27,7 @@ fs: 48000 input_path: "./items_mono" ### Input path to stereo impulse response files input_path_IR: "./IR" IR_path: "./IR" ### Output path for generated test items and metadata files output_path: "./output" Loading @@ -39,268 +42,261 @@ loudness: -26 ### Each scene must start with the sceneN tag ### Specify the mono source filename (the program will search for it in the input_path folder) ### Specify azimuth and elevation for each input source ### Specify the stereo IR source filename (the program will search for it in the IR_path folder) ### Specify the delay in seconds for each input source ### Note 1: use [val1, val2, ...] for multiple sources in a scene ### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames ### Note 3: we're using right-handed coordinate system with azi = 0 pointing from the nose to the screen ### azimuth: float, [-180,180]; positive indicates left ### elevation: float, [-90,90]; positive indicates up ### distance: float, tbd: default: 1 ### spread: float, [0,360]; spread in angles from 0 ... 360˚ ### gain: float, [0,1] scenes: a1: name: "G1S1.wav" description: "Large anechoic room with AB microphone pickup." 
source: ["test_double.wav", "test_double.wav"] IR: ["LAABP03.L.IR32", "LAABP03.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["LAABP03.wav", "LAABP03.wav"] delay: [0, 0] a4: name: "G4S4.wav" description: "Large anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["LAABP04.L.IR32", "LAABP04.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["LAABP04.wav", "LAABP04.wav"] delay: [0, 0] a5: name: "G3S5.wav" description: "Large anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["LAABP05.L.IR32", "LAABP05.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["LAABP05.wav", "LAABP05.wav"] delay: [0, 0] a6: name: "G2S6.wav" description: "Large anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["LAABP06.L.IR32", "LAABP06.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["LAABP06.wav", "LAABP06.wav"] delay: [0, 0] b1: name: "G2S1.wav" description: "Small anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] b2: name: "G1S2.wav" description: "Small anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] b3: name: "G6S3.wav" description: "Small anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] b4: name: "G5S4.wav" description: "Small anechoic room with AB microphone pickup." 
source: ["test_double.wav", "test_double.wav"] IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] b5: name: "G4S5.wav" description: "Small anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] b6: name: "G3S6.wav" description: "Small anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] c1: name: "G3S1.wav" description: "Small anechoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 0] c2: name: "G2S2.wav" description: "Small anechoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 0] c3: name: "G1S3.wav" description: "Small anechoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 0] c4: name: "G6S4.wav" description: "Small anechoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 1] c5: name: "G5S5.wav" description: "Small anechoic room with MS microphone pickup." 
source: ["test_double.wav", "test_double.wav"] IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 0] c6: name: "G4S6.wav" description: "Small anechoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 0] d1: name: "G4S1.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] d2: name: "G3S2.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] d3: name: "G3S2.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] d4: name: "G1S4.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] d5: name: "G6S5.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] d6: name: "G5S6.wav" description: "Small echoic room with MS microphone pickup." 
source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] e1: name: "G5S1.wav" description: "Small echoic room with binaural microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] e2: name: "G4S2.wav" description: "Small echoic room with binaural microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] e3: name: "G3S3.wav" description: "Small echoic room with binaural microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] e4: name: "G2S4.wav" description: "Small echoic room with binaural microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] e5: name: "G1S5.wav" description: "Small echoic room with binaural microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] e6: name: "G6S6.wav" description: "Small echoic room with binaural microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] f1: name: "G6S1.wav" description: "Small echoic room with MS microphone pickup." 
source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] f2: name: "G5S2.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] f3: name: "G4S3.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] f4: name: "G3S4.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] f5: name: "G2S5.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] f6: name: "G1S6.wav" description: "Small echoic room with MS microphone pickup." 
source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] No newline at end of file item_generation_scripts/constants.py +1 −14 Original line number Diff line number Diff line Loading @@ -42,10 +42,9 @@ LOGGER_FORMAT = ( LOGGER_DATEFMT = "%m-%d %H:%M:%S" SUPPORTED_FORMATS = { "STEREO", "ISM1", "ISM2", "ISM3", "ISM4", } DEFAULT_CONFIG = { Loading @@ -54,18 +53,6 @@ DEFAULT_CONFIG = { "delete_tmp": False, } DEFAULT_CONFIG_ISM2 = { "format": "ISM2", "input_path": "./input", "output_path": "./output", # "cod": { # "bin": find_binary("IVAS_cod", raise_error=False), # }, # "dec": { # "bin": find_binary("IVAS_dec", raise_error=False), # }, } DEFAULT_CONFIG_BINARIES = { "binary_paths": get_binary_paths( Path(__file__).parent.joinpath("binary_paths.yml") Loading Loading
item_generation_scripts/__init__.py +16 −3 Original line number Diff line number Diff line Loading @@ -40,7 +40,7 @@ from item_generation_scripts.constants import ( LOGGER_FORMAT, LOGGER_SUFFIX, ) from item_generation_scripts.processing import config, process_ism_items from item_generation_scripts.processing import config, process_ism_items, process_stereo_items from item_generation_scripts.utils import create_dir Loading Loading @@ -83,7 +83,7 @@ def main(args): # generate input items if cfg.format.startswith("ISM"): # generate ISM items according to scene description # generate ISM items with metadata according to scene description process_ism_items.generate_ism_items( cfg.format, cfg.loudness, Loading @@ -93,6 +93,19 @@ def main(args): logger, fs=cfg.fs ) elif cfg.format == "STEREO": # generate STEREO items according to scene description process_stereo_items.generate_stereo_items( cfg.format, cfg.loudness, cfg.input_path, cfg.IR_path, cfg.output_path, cfg.scenes, logger, fs=cfg.fs, IR_fs=cfg.IR_fs, ) # copy configuration to output directory with open(cfg.output_path.joinpath(f"{cfg.format}.yml"), "w") as f: Loading
item_generation_scripts/audiotools/audiofile.py +4 −1 Original line number Diff line number Diff line Loading @@ -110,6 +110,7 @@ def write( filename: Union[str, Path], x: np.ndarray, fs: Optional[int] = 48000, dtype: Optional[str] = "int16", ) -> None: """ Write audio file (.pcm, .wav or .raw) Loading @@ -122,6 +123,8 @@ def write( Numpy 2D array of dimension: number of channels x number of samples fs: Optional[int] Sampling rate, required for .pcm or .raw input file, default = 48000 (Hz) dtype: Optional[str] Data type format required for .pcm or .raw input file, default = 'int16' Returns ------- Loading @@ -141,7 +144,7 @@ def write( x = x.astype(np.int16) wav.write(filename, fs, x) elif file_extension == ".pcm" or file_extension == ".raw": x = x.astype("int16").reshape(-1, 1) x = x.astype(dtype).reshape(-1, 1) x.tofile(filename) else: raise ValueError("Wrong input format. Use wav, pcm or raw") Loading
item_generation_scripts/audiotools/wrappers/reverb.py 0 → 100644 +186 −0 Original line number Diff line number Diff line #!/usr/bin/env python3 # # (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other # contributors to this repository. All Rights Reserved. # # This software is protected by copyright law and by international treaties. # The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, # Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., # Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, # Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other # contributors to this repository retain full ownership rights in their respective contributions in # the software. This notice grants no license of any kind, including but not limited to patent # license, nor is any license granted by implication, estoppel or otherwise. # # Contributors are required to enter into the IVAS codec Public Collaboration agreement before making # contributions. # # This software is provided "AS IS", without any express or implied warranties. The software is in the # development stage. It is intended exclusively for experts who have experience with such software and # solely for the purpose of inspection. All implied warranties of non-infringement, merchantability # and fitness for a particular purpose are hereby disclaimed and excluded. 
#
# Any dispute, controversy or claim arising under or in relation to providing this software shall be
# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
# the United Nations Convention on Contracts on the International Sales of Goods.
#

import os.path
import numpy as np
from scipy.fft import fft
from copy import copy
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional, Union

from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES
from item_generation_scripts.utils import find_binary, run
from item_generation_scripts.audiotools.audio import Audio
from item_generation_scripts.audiotools.audiofile import read, write
from item_generation_scripts.audiotools.wrappers.filter import resample_itu


def reverb(
    input: Audio,
    IR: Audio,
    align: Optional[float] = None,
) -> Audio:
    """
    Wrapper for the ITU-T reverb binary to convolve mono audio signal with an impulse response

    Note: The 'reverb' binary tool expects that the IR file is written in the
    32b IEEE Standard 754 floating-point representation.

    Parameters
    ----------
    input: Audio
        Input audio signal
    IR: Audio
        Impulse response
    align: Optional[float]
        Multiplicative factor to apply to the reverberated sound in order to align its
        energy level with a second file; passed to the binary as '-align <value>'.
        When None, the option is not passed at all.

    Returns
    -------
    output: Audio
        Convolved audio signal with IR, at the sampling rate of the input signal
    """

    # find binary
    if "reverb" in DEFAULT_CONFIG_BINARIES["binary_paths"]:
        binary = find_binary(
            DEFAULT_CONFIG_BINARIES["binary_paths"]["reverb"].name,
            binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["reverb"].parent,
        )
    else:
        binary = find_binary("reverb")

    # TemporaryDirectory() does not create its parent directory, so make sure it exists
    tmp_root = Path("./tmp_reverb")
    tmp_root.mkdir(parents=True, exist_ok=True)

    with TemporaryDirectory(dir=str(tmp_root)) as tmp_dir:
        tmp_dir = Path(tmp_dir)

        # resample input audio signal to that of the IR
        # (shallow copy: attributes are re-bound below, the caller's object is not modified)
        old_fs = None
        tmp_input = copy(input)
        if input.fs != IR.fs:
            old_fs = input.fs
            tmp_input.audio = resample_itu(tmp_input, IR.fs)
            tmp_input.fs = IR.fs

        # write input audio signal to temporary file in .pcm format
        tmp_input_file = tmp_dir.joinpath("tmp_reverbIn.pcm")
        write(tmp_input_file, tmp_input.audio, tmp_input.fs)

        # write IR to temporary file in .pcm format
        # note: the reverb tool expects 32b float format; the IR is written unscaled,
        #       the output level is controlled solely through 'align'
        tmp_IR_file = tmp_dir.joinpath("tmp_IR.pcm")
        write(tmp_IR_file, IR.audio.astype("float32"), IR.fs, dtype="float32")

        # set up the 'reverb' command line
        cmd = [str(binary)]

        # append multiplicative factor, if provided
        # (explicit None check so that a factor of 0.0 is not silently dropped)
        if align is not None:
            cmd.extend(["-align", str(align)])

        # append temporary filenames (as strings, consistent with the binary path)
        tmp_output_file = tmp_dir.joinpath("tmp_reverbOut.pcm")
        cmd.extend([str(tmp_input_file), str(tmp_IR_file), str(tmp_output_file)])

        # run the 'reverb' command
        run(cmd)

        # read the reverberated output file
        output = copy(tmp_input)
        output.audio, _ = read(tmp_output_file, nchannels=1, fs=tmp_input.fs)

        # reverse the resampling
        if old_fs is not None:
            output.audio = resample_itu(output, old_fs)
            output.fs = old_fs

    return output


def reverb_stereo(
    input: Audio,
    stereo_IR: Audio,
    align: Optional[float] = None,
) -> Audio:
    """
    Wrapper for the ITU-T reverb binary to convolve mono audio signal with a stereo impulse response

    The stereo IR is split into its left and right channel, the mono input is convolved
    with each of them by reverb() and the two results are stacked into a stereo signal.

    Parameters
    ----------
    input: Audio
        Input audio signal
    stereo_IR: Audio
        Stereo impulse response; its audio is indexed as [sample, channel]
        with channel 0 = left and channel 1 = right. The object is not modified.
    align: Optional[float]
        Multiplicative factor to apply to the reverberated sound in order to align its
        energy level with the second file. When None, it is derived from the IR such that
        the maximum gain of the IR filter across all frequencies and both channels is 0dB.

    Returns
    -------
    output: Audio
        Convolved audio signal with stereo IR
    """

    # convert to float32 locally - the caller's IR object must not be altered
    ir_audio = np.asarray(stereo_IR.audio, dtype=np.float32)

    # separate into left and right IR
    IR_left = copy(stereo_IR)
    IR_left.name = "MONO"
    IR_left.num_channels = 1
    IR_left.audio = np.reshape(ir_audio[:, 0], (-1, 1))

    IR_right = copy(stereo_IR)
    IR_right.name = "MONO"
    IR_right.num_channels = 1
    IR_right.audio = np.reshape(ir_audio[:, 1], (-1, 1))

    # calculate the scaling factor such that the maximum gain of the IR filter across all frequencies is 0dB
    if align is None:
        H = fft(ir_audio, axis=0)
        # plain Python float so that the command-line formatting does not depend on the numpy scalar type
        align = float(1.0 / np.max(np.abs(H)))

    # convolve mono input with left and right IR (same factor for both to keep the channel balance)
    y_left = reverb(input, IR_left, align=align)
    y_right = reverb(input, IR_right, align=align)

    # combine into stereo output
    y = copy(input)
    y.name = "STEREO"
    y.num_channels = 2
    y.audio = np.column_stack([y_left.audio, y_right.audio])

    return y
item_generation_scripts/config/STEREO_CONFIG.yml +80 −84 Original line number Diff line number Diff line Loading @@ -15,6 +15,9 @@ format: "STEREO" ### Output sampling rate in Hz needed for headerless audio files; default = 48000 fs: 48000 ### IR sampling rate in Hz needed for headerless audio files; default = 48000 IR_fs: 32000 ### Any relative paths will be interpreted relative to the working directory the script is called from! ### Usage of absolute paths is recommended. ### Do not use file names with dots "." in them! This is not supported, use "_" instead Loading @@ -24,7 +27,7 @@ fs: 48000 input_path: "./items_mono" ### Input path to stereo impulse response files input_path_IR: "./IR" IR_path: "./IR" ### Output path for generated test items and metadata files output_path: "./output" Loading @@ -39,268 +42,261 @@ loudness: -26 ### Each scene must start with the sceneN tag ### Specify the mono source filename (the program will search for it in the input_path folder) ### Specify azimuth and elevation for each input source ### Specify the stereo IR source filename (the program will search for it in the IR_path folder) ### Specify the delay in seconds for each input source ### Note 1: use [val1, val2, ...] for multiple sources in a scene ### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames ### Note 3: we're using right-handed coordinate system with azi = 0 pointing from the nose to the screen ### azimuth: float, [-180,180]; positive indicates left ### elevation: float, [-90,90]; positive indicates up ### distance: float, tbd: default: 1 ### spread: float, [0,360]; spread in angles from 0 ... 360˚ ### gain: float, [0,1] scenes: a1: name: "G1S1.wav" description: "Large anechoic room with AB microphone pickup." 
source: ["test_double.wav", "test_double.wav"] IR: ["LAABP01.L.IR32", "LAABP01.R.IR32"] delay: [0, 0] description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." source: ["test_single.wav", "test_single.wav"] IR: ["LEABP05.wav", "LEABP11.wav"] delay: [0, 3] a2: name: "G6S2.wav" description: "Large anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["LAABP02.L.IR32", "LAABP02.R.IR32"] delay: [0, 0] source: ["test_single.wav", "test_single.wav"] IR: ["LEABP05.wav", "LEABP11.wav"] delay: [0, 3] a3: name: "G5S3.wav" description: "Large anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["LAABP03.L.IR32", "LAABP03.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["LAABP03.wav", "LAABP03.wav"] delay: [0, 0] a4: name: "G4S4.wav" description: "Large anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["LAABP04.L.IR32", "LAABP04.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["LAABP04.wav", "LAABP04.wav"] delay: [0, 0] a5: name: "G3S5.wav" description: "Large anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["LAABP05.L.IR32", "LAABP05.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["LAABP05.wav", "LAABP05.wav"] delay: [0, 0] a6: name: "G2S6.wav" description: "Large anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["LAABP06.L.IR32", "LAABP06.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["LAABP06.wav", "LAABP06.wav"] delay: [0, 0] b1: name: "G2S1.wav" description: "Small anechoic room with AB microphone pickup." 
source: ["test_double.wav", "test_double.wav"] IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] b2: name: "G1S2.wav" description: "Small anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] b3: name: "G6S3.wav" description: "Small anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] b4: name: "G5S4.wav" description: "Small anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] b5: name: "G4S5.wav" description: "Small anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] b6: name: "G3S6.wav" description: "Small anechoic room with AB microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] c1: name: "G3S1.wav" description: "Small anechoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 0] c2: name: "G2S2.wav" description: "Small anechoic room with MS microphone pickup." 
 source: ["test_double.wav", "test_double.wav"] IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 0] c3: name: "G1S3.wav" description: "Small anechoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 0] c4: name: "G6S4.wav" description: "Small anechoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 1] c5: name: "G5S5.wav" description: "Small anechoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 0] c6: name: "G4S6.wav" description: "Small anechoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 0] d1: name: "G4S1.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] d2: name: "G3S2.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] d3: name: "G2S3.wav" description: "Small echoic room with MS microphone pickup." 
source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] d4: name: "G1S4.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] d5: name: "G6S5.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] d6: name: "G5S6.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] e1: name: "G5S1.wav" description: "Small echoic room with binaural microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] e2: name: "G4S2.wav" description: "Small echoic room with binaural microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] e3: name: "G3S3.wav" description: "Small echoic room with binaural microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] e4: name: "G2S4.wav" description: "Small echoic room with binaural microphone pickup." 
source: ["test_double.wav", "test_double.wav"] IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] e5: name: "G1S5.wav" description: "Small echoic room with binaural microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] e6: name: "G6S6.wav" description: "Small echoic room with binaural microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] f1: name: "G6S1.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] f2: name: "G5S2.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] f3: name: "G4S3.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] f4: name: "G3S4.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] f5: name: "G2S5.wav" description: "Small echoic room with MS microphone pickup." 
source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] f6: name: "G1S6.wav" description: "Small echoic room with MS microphone pickup." source: ["test_double.wav", "test_double.wav"] IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] source: ["test_single.wav", "test_single.wav"] IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] No newline at end of file
item_generation_scripts/constants.py +1 −14 Original line number Diff line number Diff line Loading @@ -42,10 +42,9 @@ LOGGER_FORMAT = ( LOGGER_DATEFMT = "%m-%d %H:%M:%S" SUPPORTED_FORMATS = { "STEREO", "ISM1", "ISM2", "ISM3", "ISM4", } DEFAULT_CONFIG = { Loading @@ -54,18 +53,6 @@ DEFAULT_CONFIG = { "delete_tmp": False, } DEFAULT_CONFIG_ISM2 = { "format": "ISM2", "input_path": "./input", "output_path": "./output", # "cod": { # "bin": find_binary("IVAS_cod", raise_error=False), # }, # "dec": { # "bin": find_binary("IVAS_dec", raise_error=False), # }, } DEFAULT_CONFIG_BINARIES = { "binary_paths": get_binary_paths( Path(__file__).parent.joinpath("binary_paths.yml") Loading