diff --git a/README.md b/README.md index f5ebd10e10405d904139989bbfce8ac913d3d984..e508ad3a00b3ef81ce4c7c4dfc0e24b8d58522ba 100755 --- a/README.md +++ b/README.md @@ -77,6 +77,8 @@ Each entry under `scenes:` describes one test item, specifying: - `azimuth` / `elevation`: spatial placement (°) - `level`: loudness in dB - `shift`: timing offsets in seconds +- `background`: background noise file (applicable to STEREO and SBA only) +- `background_level`: level of the background noise (applicable to STEREO and SBA only) Dynamic positioning (e.g., `"-20:1.0:360"`) means the source will move over time, stepping every 20 ms. @@ -84,6 +86,8 @@ The total duration of the output signal can be controlled using the `duration` f Start by running a single scene to verify settings. Output includes both audio and optional metadata files. You can enable multiprocessing by setting `multiprocessing: true`. +The addition of custom background noise at a specific level is supported for the STEREO and SBA formats only. For ISMs it's not applicable. For OMASA and OSBA formats, it is expected that the background noise is provided in the FOA/HOA2/HOA3 format as the first item in the `input` list. + ### Item processing The input has to be in the folder `experiments/selection/P800-{X}/proc_input_{l}`. If item generation is performed previous to this step, the corresponding files are already in the right folder. 
diff --git a/examples/ITEM_GENERATION_FOA.yml b/examples/ITEM_GENERATION_FOA.yml index 2287af4c67e7be64c7a7928d9a68afcd87e8d467..46cbe8456575aefa94c3c4307d9637e0f77ae103 100644 --- a/examples/ITEM_GENERATION_FOA.yml +++ b/examples/ITEM_GENERATION_FOA.yml @@ -95,6 +95,8 @@ use_output_prefix: "leee" ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up ### shift: time adjustment of the input signal (negative value delays the signal) +### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) +### background_level: normalized background noise loudness to X dB LKFS ### ### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder) ### Note 1: use brackets [val1, val2, ...] when specifying multiple values @@ -109,52 +111,60 @@ scenes: input: ["items_mono/untrimmed/f1s4b_Talker2.wav", "items_mono/untrimmed/f2s1a_Talker1.wav"] IR: ["IRs/IR_do_p04_e_01_01_FOA.wav", "IRs/IR_do_p04_e_02_01_FOA.wav"] shift: [0.0, -1.0] + background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" + background_level: -46 + + # "02": + # output: "out/s02.wav" + # description: "Car with AB microphone pickup, overlap between the talkers, car noise." + # input: ["items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/f2s3b_Talker1.wav"] + # IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"] + # shift: [0.0, +1.0] + # background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" + # background_level: -46 - "02": - output: "out/s02.wav" - description: "Car with AB microphone pickup, overlap between the talkers, car noise." 
- input: ["items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/f2s3b_Talker1.wav"] - IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"] - shift: [0.0, +1.0] + # "03": + # output: "out/s03.wav" + # description: "Car with AB microphone pickup, no overlap between the talkers, car noise." + # input: ["items_mono/untrimmed/f3s3a_Talker2.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"] + # IR: ["IRs/IR_do_p04_e_05_01_FOA.wav", "IRs/IR_do_p04_e_06_01_FOA.wav"] + # shift: [0.0, -1.0] - "03": - output: "out/s03.wav" - description: "Car with AB microphone pickup, no overlap between the talkers, car noise." - input: ["items_mono/untrimmed/f3s3a_Talker2.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"] - IR: ["IRs/IR_do_p04_e_05_01_FOA.wav", "IRs/IR_do_p04_e_06_01_FOA.wav"] - shift: [0.0, -1.0] + # "04": + # output: "out/s04.wav" + # description: "Car with AB microphone pickup, no overlap between the talkers, car noise." + # input: ["items_mono/untrimmed/f2s7b_Talker1.wav", "items_mono/untrimmed/f5s15a_Talker1.wav"] + # IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_08_01_FOA.wav"] + # shift: [0.0, -1.0] + # background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" + # background_level: -46 - "04": - output: "out/s04.wav" - description: "Car with AB microphone pickup, no overlap between the talkers, car noise." - input: ["items_mono/untrimmed/f2s7b_Talker1.wav", "items_mono/untrimmed/f5s15a_Talker1.wav"] - IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_08_01_FOA.wav"] - shift: [0.0, -1.0] + # "05": + # output: "out/s05.wav" + # description: "Car with AB microphone pickup, no overlap between the talkers, car noise." 
+ # input: ["items_mono/untrimmed/m2s15a_Talker2.wav", "items_mono/untrimmed/m1s4a_Talker1.wav"] + # IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"] + # shift: [0.0, -1.0] + # background: "items_background/Dolby_BG_do_p05_a_01_FOA.wav" + # background_level: -46 - "05": - output: "out/s05.wav" - description: "Car with AB microphone pickup, no overlap between the talkers, car noise." - input: ["items_mono/untrimmed/m2s15a_Talker2.wav", "items_mono/untrimmed/m1s4a_Talker1.wav"] - IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"] - shift: [0.0, -1.0] - - "06": - output: "out/s06.wav" - description: "Car with AB microphone pickup, no overlap between the talkers." - input: ["items_mono/untrimmed/m3s8a_Talker2.wav", "items_mono/untrimmed/m4s13a_Talker1.wav"] - IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"] - shift: [0.0, -1.0] + # "06": + # output: "out/s06.wav" + # description: "Car with AB microphone pickup, no overlap between the talkers." + # input: ["items_mono/untrimmed/m3s8a_Talker2.wav", "items_mono/untrimmed/m4s13a_Talker1.wav"] + # IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"] + # shift: [0.0, -1.0] - "07": - output: "out/s07.wav" - description: "Preliminary: Car with AB microphone pickup, no overlap between the talkers." - input: ["items_mono/untrimmed/f1s20a_Talker2.wav", "items_mono/untrimmed/f5s15b_Talker1.wav"] - IR: ["IRs/IR_do_p04_e_02_01_FOA.wav", "IRs/IR_do_p04_e_07_01_FOA.wav"] - shift: [0.0, -1.0] + # "07": + # output: "out/s07.wav" + # description: "Preliminary: Car with AB microphone pickup, no overlap between the talkers." + # input: ["items_mono/untrimmed/f1s20a_Talker2.wav", "items_mono/untrimmed/f5s15b_Talker1.wav"] + # IR: ["IRs/IR_do_p04_e_02_01_FOA.wav", "IRs/IR_do_p04_e_07_01_FOA.wav"] + # shift: [0.0, -1.0] - "08": - output: "out/s08.wav" - description: "Car with AB microphone pickup, overlap between the talkers." 
- input: ["items_mono/untrimmed/m2s6b_Talker2.wav", "items_mono/untrimmed/f5s14a_Talker1.wav"] - IR: ["IRs/IR_do_p04_e_08_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"] - shift: [0.0, +1.0] + # "08": + # output: "out/s08.wav" + # description: "Car with AB microphone pickup, overlap between the talkers." + # input: ["items_mono/untrimmed/m2s6b_Talker2.wav", "items_mono/untrimmed/f5s14a_Talker1.wav"] + # IR: ["IRs/IR_do_p04_e_08_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"] + # shift: [0.0, +1.0] diff --git a/examples/ITEM_GENERATION_STEREO.yml b/examples/ITEM_GENERATION_STEREO.yml index 14731b4b65858f5ab78fce3dfbe0178aede95fea..48c6aa61d28ed0a481d35fafd745f5c0e842ab15 100644 --- a/examples/ITEM_GENERATION_STEREO.yml +++ b/examples/ITEM_GENERATION_STEREO.yml @@ -95,6 +95,8 @@ provider: "g" ### azimuth: azimuth in the range [-180,180]; positive values point to the left ### elevation: elevation in the range [-90,90]; positive values indicate up ### shift: time adjustment of the input signal (negative value delays the signal) +### background: background noise filename (if used, the 'add_low_level_random_noise' parameter is ignored) +### background_level: normalized background noise loudness to X dB LKFS ### ### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder) ### Note 1: use brackets [val1, val2, ...] 
when specifying multiple values @@ -109,6 +111,8 @@ scenes: input: ["items_mono/untrimmed/f1s4b_Talker2.wav", "items_mono/untrimmed/f2s1a_Talker1.wav"] IR: ["IRs/Car_TalkPos1_Stereo_M5_SinSweep_2chn.wav", "IRs/Car_TalkPos2_Stereo_M5_SinSweep_2chn.wav"] shift: [0.0, -1.0] + background: "items_mono/347224__rayjensen__ambience-in-car_stereo.wav" + background_level: -66 "02": output: "out/a1s02.wav" @@ -116,6 +120,8 @@ scenes: input: ["items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/f2s3b_Talker1.wav"] IR: ["IRs/Car_TalkPos3_Stereo_M5_SinSweep_2chn.wav", "IRs/Car_TalkPos4_Stereo_M5_SinSweep_2chn.wav"] shift: [0.0, +1.0] + background: "items_mono/347224__rayjensen__ambience-in-car_stereo.wav" + background_level: -66 "03": output: "out/a1s03.wav" @@ -123,6 +129,8 @@ scenes: input: ["items_mono/untrimmed/f3s3a_Talker2.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"] IR: ["IRs/Car_TalkPos1_Stereo_M5_SinSweep_2chn.wav", "IRs/Car_TalkPos1_Stereo_M5_SinSweep_2chn.wav"] shift: [0.0, -1.0] + background: "items_mono/347224__rayjensen__ambience-in-car_stereo.wav" + background_level: -66 "04": output: "out/a1s04.wav" @@ -130,13 +138,17 @@ scenes: input: ["items_mono/untrimmed/f2s7b_Talker1.wav", "items_mono/untrimmed/f5s15a_Talker1.wav"] IR: ["IRs/FreeField_IR_Python_AB_20cm_Pos1.wav", "IRs/FreeField_IR_Python_AB_20cm_Pos2.wav"] shift: [0.0, -1.0] - + background: "items_mono/347224__rayjensen__ambience-in-car_stereo.wav" + background_level: -66 + "05": output: "out/a1s05.wav" description: "Car with AB microphone pickup, no overlap between the talkers, car noise." 
input: ["items_mono/untrimmed/m2s15a_Talker2.wav", "items_mono/untrimmed/m1s4a_Talker1.wav"] IR: ["IRs/FreeField_IR_Python_AB_20cm_Pos3.wav", "IRs/FreeField_IR_Python_AB_20cm_Pos4.wav"] shift: [0.0, -1.0] + background: "items_mono/347224__rayjensen__ambience-in-car_stereo.wav" + background_level: -66 "06": output: "out/a1s06.wav" diff --git a/ivas_processing_scripts/generation/generate_ismN_items.py b/ivas_processing_scripts/generation/generate_ismN_items.py index dcf76cad8d11e5c2eafd295776b4d78dce1b9bc6..bd92367eec65a53670beda6b9bd1cbaf659999a6 100644 --- a/ivas_processing_scripts/generation/generate_ismN_items.py +++ b/ivas_processing_scripts/generation/generate_ismN_items.py @@ -253,7 +253,7 @@ def generate_ismN_scene( level = -26 logger.info( - f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds" + f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LKFS with shift of {source_shift_in_seconds} seconds" ) # read source file @@ -431,7 +431,7 @@ def generate_ismN_scene( # adjust the loudness of the output signal if "loudness" in cfg.__dict__: - logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LKFS") y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") # apply fade-in and fade-out diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py index ed48c37b7ee3a7213bf0c0b868901dcf1e9a19d0..603a3593843d473ecee488b963d881ce161f0732 100644 --- a/ivas_processing_scripts/generation/generate_omasa_items.py +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -248,7 +248,7 @@ def generate_OMASA_scene( level = -26 logger.info( - f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds" + f"-- Encoding 
{source_file} at position(s) {source_azi},{source_ele} at {level} LKFS with shift of {source_shift_in_seconds} seconds" ) # get the number of channels from the .wav file header @@ -471,7 +471,7 @@ def generate_OMASA_scene( # adjust the loudness of the output signal if "loudness" in cfg.__dict__: - logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LKFS") y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") # apply fade-in and fade-out diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py index 815be0b563beda1b33dc37f6aef3af81f983af75..d2a71777980fbbe475c1f799bb332c8f641e9a0a 100644 --- a/ivas_processing_scripts/generation/generate_osba_items.py +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -239,7 +239,7 @@ def generate_OSBA_scene( level = -26 logger.info( - f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds" + f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LKFS with shift of {source_shift_in_seconds} seconds" ) # get the number of channels from the .wav file header @@ -449,7 +449,7 @@ def generate_OSBA_scene( # adjust the loudness of the output signal if "loudness" in cfg.__dict__: - logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LKFS") y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") # apply fade-in and fade-out diff --git a/ivas_processing_scripts/generation/generate_sba_items.py b/ivas_processing_scripts/generation/generate_sba_items.py index 1c5d5ba68f45d0bd3e618d26763e7e2d2c76e187..bdb40b1ba935a805c5904e58c54799bf21974635 100644 --- a/ivas_processing_scripts/generation/generate_sba_items.py +++ 
b/ivas_processing_scripts/generation/generate_sba_items.py @@ -242,7 +242,7 @@ def generate_sba_scene( level = -26 logger.info( - f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds" + f"-- Convolving {source_file} with {IR_file} at {level} LKFS with shift of {source_shift_in_seconds} seconds" ) # read source file @@ -339,13 +339,6 @@ def generate_sba_scene( y.audio, y.fs, limits=[-preamble, -postamble], samples=True ) - # add random noise - if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: - # create uniformly distributed noise between -4 and 4 - np.random.seed(SEED_RANDOM_NOISE) - noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") - y.audio += noise - # adjust the length of the output signal if "duration" in cfg.__dict__: # trim the output signal such that the total duration is X seconds @@ -363,9 +356,78 @@ def generate_sba_scene( # adjust the loudness of the output signal if "loudness" in cfg.__dict__: - logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LKFS") y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") + # add background noise in FOA/HOA2/HOA3 format + if "background" in scene.keys(): + # check if [] are used in the background noise file name + if isinstance(scene["background"], list): + # if so, use the first element + background_filename = scene["background"][0] + else: + background_filename = scene["background"] + + # read the background noise file + background_filename = Path(scene["background"]).parent / ( + cfg.use_input_prefix + Path(scene["background"]).name + ) + logger.info(f"-- Adding background noise from {background_filename}") + background = audio.fromfile(cfg.format, background_filename) + + # resample to the target fs if necessary + if background.fs != cfg.fs: + logger.warning( + f"Warning: Sample rate of the background 
noise is {background.fs} Hz and needs to be resampled to {cfg.fs}!" + ) + resampled_audio = audioarray.resample( + background.audio, background.fs, cfg.fs + ) + background.audio = resampled_audio + background.fs = cfg.fs + + # adjust the length of the background noise signal + if len(background.audio) != len(y.audio): + background.audio = audioarray.trim( + background.audio, + background.fs, + limits=[0, len(background.audio) - len(y.audio)], + samples=True, + ) + + # adjust the loudness of the background noise signal + if "background_level" in scene.keys(): + logger.info( + f"-- Rescaling background noise to target loudness: {scene['background_level']} LKFS" + ) + + # check if [] are used in the background level + if isinstance(scene["background_level"], list): + # if so, use the first element + scene["background_level"] = scene["background_level"][0] + + # convert to float if the background level was entered in string format + if not isinstance(scene["background_level"], (int, float)): + scene["background_level"] = float(scene["background_level"]) + else: + logger.warning( + "-- Warning: No target loudness for background noise specified, using default value of -26 LKFS" + ) + scene["background_level"] = -26 + background.audio, _ = loudness_norm( + background, scene["background_level"], loudness_format="STEREO", rms=True + ) + + # add the background noise to the output signal + y.audio += background.audio + elif ( + "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise + ): + # create uniformly distributed noise between -4 and 4 + np.random.seed(SEED_RANDOM_NOISE) + noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") + y.audio += noise + # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py 
b/ivas_processing_scripts/generation/generate_stereo_items.py index 8dc6d50c77968699e123a3b5507f8a58585dfb8c..92a689063f3ed5c66ea80520409959ab0be00898 100644 --- a/ivas_processing_scripts/generation/generate_stereo_items.py +++ b/ivas_processing_scripts/generation/generate_stereo_items.py @@ -248,7 +248,7 @@ def generate_stereo_scene( level = -26 logger.info( - f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds" + f"-- Convolving {source_file} with {IR_file} at {level} LKFS with shift of {source_shift_in_seconds} seconds" ) # read source file @@ -340,13 +340,6 @@ def generate_stereo_scene( y.audio, y.fs, limits=[-preamble, -postamble], samples=True ) - # add random noise - if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: - # create uniformly distributed noise between -4 and 4 - np.random.seed(SEED_RANDOM_NOISE) - noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") - y.audio += noise - # adjust the length of the output signal if "duration" in cfg.__dict__: # trim the output signal such that the total duration is X seconds @@ -364,9 +357,78 @@ def generate_stereo_scene( # adjust the loudness of the output signal if "loudness" in cfg.__dict__: - logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LKFS") y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="STEREO") + # add background noise in STEREO format + if "background" in scene.keys(): + # check if [] are used in the background noise file name + if isinstance(scene["background"], list): + # if so, use the first element + background_filename = scene["background"][0] + else: + background_filename = scene["background"] + + # read the background noise file + background_filename = Path(scene["background"]).parent / ( + cfg.use_input_prefix + Path(scene["background"]).name + ) + logger.info(f"-- Adding background 
noise from {background_filename}") + background = audio.fromfile("STEREO", background_filename) + + # resample to the target fs if necessary + if background.fs != cfg.fs: + logger.warning( + f"Warning: Sample rate of the background noise is {background.fs} Hz and needs to be resampled to {cfg.fs}!" + ) + resampled_audio = audioarray.resample( + background.audio, background.fs, cfg.fs + ) + background.audio = resampled_audio + background.fs = cfg.fs + + # adjust the length of the background noise signal + if len(background.audio) != len(y.audio): + background.audio = audioarray.trim( + background.audio, + background.fs, + limits=[0, len(background.audio) - len(y.audio)], + samples=True, + ) + + # adjust the loudness of the background noise signal + if "background_level" in scene.keys(): + logger.info( + f"-- Rescaling background noise to target loudness: {scene['background_level']} LKFS" + ) + + # check if [] are used in the background level + if isinstance(scene["background_level"], list): + # if so, use the first element + scene["background_level"] = scene["background_level"][0] + + # convert to float if the background level was entered in string format + if not isinstance(scene["background_level"], (int, float)): + scene["background_level"] = float(scene["background_level"]) + else: + logger.warning( + "-- Warning: No target loudness for background noise specified, using default value of -26 LKFS" + ) + scene["background_level"] = -26 + background.audio, _ = loudness_norm( + background, scene["background_level"], loudness_format="STEREO", rms=True + ) + + # add the background noise to the output signal + y.audio += background.audio + elif ( + "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise + ): + # create uniformly distributed noise between -4 and 4 + np.random.seed(SEED_RANDOM_NOISE) + noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") + y.audio += noise + # apply fade-in and fade-out if "fade_in_out" in 
cfg.__dict__ and cfg.fade_in_out > 0: logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds")