diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000000000000000000000000000000000000..0f2c5a5d8d230128e0d2a83b7332012fc017f656
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,4 @@
+[flake8]
+max-line-length = 88
+ignore = E203,E402,E501,E741,W503,W504
+exclude = .git,__pycache__,build,dist
\ No newline at end of file
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 31f4512cb32f514d115c9ba607d9bdbbf37357ea..2277f43a184f4c3b6eed5a812750294d79a8c249 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -121,7 +121,7 @@ lint:
     - linux
   allow_failure: true
   script:
-    - flake8 --max-line-length 88 --extend-ignore=E203,E402,E501,E741
+    - flake8 --config .flake8
 
 format:
   stage: analyze
diff --git a/README.md b/README.md
index 8e291a4b5686e828c75e2dd42d324c32b8b241ec..a78877c7c7e0d0671ba492df97f27b4d63bc1066 100755
--- a/README.md
+++ b/README.md
@@ -55,21 +55,34 @@ In the following sections the only purpose of the curly brackets is to mark the
 
 ## P800
 
 The setup for a P800 test from the experiments folder consists of two steps:
-item generation and item processing. The two steps can be applied independent of each other.
+item generation and item processing. The two steps can be applied independently of each other.
 
 ### Item generation
 
-To set up the P800-{X} listening test (X = 1, 2, ...9) copy your mono input files to `experiments/selection/P800-{X}/gen_input/items_mono`.
-These files have to follow the naming scheme `{l}{LL}p0{X}{name_of_item}` where 'l' stands for the listening lab designator: a (Force Technology),
-b (HEAD acoustics), c (MQ University), d (Mesaqin.com), and 'LL' stands for the language: EN, GE, JP, MA, DK, FR.
+To facilitate the preparation of items for P800-{X} listening tests, it is possible to generate samples of complex formats (STEREO, SBA, ISMn, OMASA, OSBA) from mono samples. To generate items, run the following command from the root of the repository:
 
-The impluse responses have to be copied to experiments/selection/P800-{X}/gen_input/IRs.
+```bash
+python generate_items.py --config path/to/scene_description_config_file.yml
+```
+
+The YAML configuration file (`scene_description_config_file.yml`) defines how individual mono files should be spatially positioned and combined into the target format. For advanced formats like OMASA or OSBA, note that additional SBA items may be required. Refer to the `examples/` folder for template `.yml` files demonstrating the expected structure and usage.
+
+Relative paths are resolved from the working directory (not the YAML file location). Use absolute paths if you're unsure. Avoid using dots `.` in file names (e.g., use `item_xxa3s1.wav`, not `item.xx.a3s1.wav`). Windows users: use double backslashes `\\` in paths and add `.exe` to executables if needed. Input and output files follow structured naming conventions to encode metadata like lab, language, speaker ID, etc. These are explained in detail in the example configuration files under *Filename conventions*.
+
+Each entry under `scenes:` describes one test item, specifying:
+
+* `output`: output file name
+* `description`: human-readable description
+* `input`: list of mono `.wav` files
+* `azimuth` / `elevation`: spatial placement (°)
+* `level`: loudness in dB
+* `shift`: timing offsets in seconds
+
+Dynamic positioning (e.g., `"-20:1.0:360"`) means the source will move over time, stepping every 20 ms; see the sketch below.
 
-To generate the items run `python -m ivas_processing_scripts.generation experiments/selection/P800-{X}/config/item_gen_P800-{X}_{l}.yml` from the root folder of the repository.
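+A minimal sketch (not the repository implementation) of expanding such a trajectory into one angle per 20 ms frame; wrap-around forms like `"60:1:0 + 360"` used in some examples are not handled here:
+
+```python
+def expand_trajectory(spec, num_frames):
+    """Expand "start:step:stop" into one angle per 20 ms frame."""
+    if not isinstance(spec, str):  # a plain number denotes a static source
+        return [float(spec)] * num_frames
+    start, step, stop = (float(v) for v in spec.split(":"))
+    angle, angles = start, []
+    for _ in range(num_frames):
+        angles.append(angle)
+        # advance by `step` degrees per frame until `stop` would be passed
+        if (step > 0 and angle + step <= stop) or (step < 0 and angle + step >= stop):
+            angle += step
+    return angles
+
+# 8 s at 50 frames/s: expand_trajectory("-20:1.0:360", 400)
+```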
-The resulting files can be found in `experiments/selection/P800-{X}/proc_input_{l}` sorted by category. +The total duration of the output signal can be controlled using the `duration` field. The output signal may optionally be rendered to the BINAURAL format by specifying the `binaural_output` field. -For P800-3 the input files for the processing are already provided by the listening lab. This means this step can be skipped. -For tests with ISM input format (P800-6 and P800-7) no IRs are needed, only mono sentences +Start by running a single scene to verify settings. Output includes both audio and optional metadata files. You can enable multiprocessing by setting `multiprocessing: true`. ### Item processing diff --git a/examples/ITEM_GENERATION_3ISM.yml b/examples/ITEM_GENERATION_3ISM.yml new file mode 100644 index 0000000000000000000000000000000000000000..e770cadf706b775682b8339677f64e863b0849b2 --- /dev/null +++ b/examples/ITEM_GENERATION_3ISM.yml @@ -0,0 +1,177 @@ +--- +################################################ +# Item generation - General configuration +################################################ + +### Any relative paths will be interpreted relative to the working directory the script is called from! +### Usage of absolute paths is recommended. +### Do not use file names with dots "." in them! This is not supported, use "_" instead +### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions + +### Output format +format: "ISM3" +# masa_tc: 2 # applicable only to OMASA format +# masa_dirs: 2 # applicable only to OMASA format +# sba_order: 2 # applicable only to OSBA format + +### Output sampling rate in Hz +fs: 48000 + +### Generate BINAURAL output (_BINAURAL will be appended to the output filename) +binaural_output: true + +### Normalize target loudness to X LKFS +# loudness: -26 + +### Apply pre-amble and post-amble in X seconds +preamble: 0.0 +postamble: 0.0 + +### Apply fade-in and fade-out of X seconds +fade_in_out: 0.5 + +### Trim the output such that the total duration is X seconds +duration: 8 + +### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence) +add_low_level_random_noise: true + +### Process with parallel streams +multiprocessing: False + +################################################ +### Item generation - Filename conventions +################################################ + +### Naming convention for the input mono files +### The input filenames are represented by: +### lLLeeettszz.wav +### where: +### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) +### LL stands for the language: JP, FR, GE, MA, DA, EN +### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09 +### tt stands for the talker ID: f1, f2, f3, m1, m2, m3 +### s stands for 'sample' and zz is the sample number; 01, ..., 14 + +### Naming convention for the generated output files +### The output filenames are represented by: +### leeeayszz.wav +### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by: +### leeeayszz.met for metadata-assisted spatial audio +### leeeayszz.wav.o.csv for object-based audio +### where: +### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) +### eee stands for the experiment designator: p01, p02, p04, p05, p06, 
p07, p08, p09
+### a stands for 'audio'
+### y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06
+### s stands for sample and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample)
+### o stands for the object number; 0, 1, 2, 3
+
+### File designators, defaults are "l" for listening lab, "EN" for language, "p07" for experiment and "g" for company
+listening_lab: "l"
+language: "EN"
+exp: "p01"
+provider: "va"
+
+### Insert prefix for all input filenames (default: "")
+### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment'
+### the number of consecutive letters defines the length of each field
+# use_input_prefix: "lLLeee"
+
+### Insert prefix for all output filenames (default: "")
+### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment'
+### the number of consecutive letters defines the length of each field
+# use_output_prefix: "leee"
+
+################################################
+### Item generation - Scene description
+################################################
+
+### Each scene shall be described using the following parameters/properties:
+### output: output filename
+### description: textual description of the scene
+### input: input filename(s)
+### azimuth: azimuth in the range [-180,180]; positive values point to the left
+### elevation: elevation in the range [-90,90]; positive values indicate up
+### shift: time adjustment of the input signal (negative value delays the signal)
+###
+### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder)
+### Note 1: use brackets [val1, val2, ...] when specifying multiple values
+### Note 2: use the "start:step:stop" notation for moving sources, where the step is applied per 20 ms frame
+### Note 3: we're using a right-handed coordinate system with azimuth = 0 pointing from the nose to the screen
+
+
+scenes:
+
+  "01":
+    output: "out/VA_3obj_2tlks_music1.wav"
+    description: "Two talkers sitting at a table, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
+    input: ["items_mono/untrimmed/f2s1a_Talker1.wav", "items_mono/untrimmed/m2s10a_Talker2.wav", "items_mono/music/Sc01.wav"]
+    azimuth: [20, -40, 45]
+    elevation: [0, 0, 70]
+    level: [-26, -26, -41]
+    shift: [-1.0, -2.0, 2.0]
+
+  "02":
+    output: "out/VA_3obj_2tlks_music2.wav"
+    description: "One talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
+    input: ["items_mono/untrimmed/f5s10b_Talker1.wav", "items_mono/untrimmed/m3s2b_Talker2.wav", "items_mono/music/Guitar1.wav"]
+    azimuth: [50, "180:1:120 + 360", -120]
+    elevation: [0, 45, 70]
+    level: [-26, -26, -41]
+    shift: [1.0, -2.0, -1.0]
+
+  "03":
+    output: "out/VA_3obj_2tlks_music3.wav"
+    description: "Two talkers walking side-by-side around the table, ~30% overlapping utterances."
+    input: ["items_mono/untrimmed/m1s2b_Talker1.wav", "items_mono/untrimmed/f3s5a_Talker2.wav", "items_mono/music/Track066.wav"]
+    azimuth: ["80:1:20 + 360", "80:1:20 + 360", -30]
+    elevation: [10, 60, 70]
+    level: [-26, -26, -41]
+    shift: [0.0, 0.0, 0.0]
+
+  "04":
+    output: "out/VA_3obj_2tlks_music4.wav"
+    description: "Two talkers walking around the table in opposite directions, ~30% overlapping utterances."
+ input: ["items_mono/untrimmed/m4s12b_Talker1.wav", "items_mono/untrimmed/f1s12b_Talker2.wav", "items_mono/music/Sample02.wav"] + azimuth: ["60:1:0 + 360", "60:-1:120 - 360", 100] + elevation: [20, 50, 70] + level: [-26, -26, -41] + shift: [0.0, 0.0, 0.0] + + "05": + output: "out/VA_3obj_3tlks_1.wav" + description: "Three static talkers, partially overlapping utterances." + input: ["items_mono/untrimmed/m4s12b_Talker1.wav", "items_mono/untrimmed/f1s12b_Talker2.wav", "items_mono/untrimmed/m3s1a_Talker2.wav"] + azimuth: [30, -45, 100] + elevation: [20, 20, 30] + level: [-26, -26, -26] + shift: [0.0, 0.0, -2.5] + + "06": + output: "out/VA_3obj_3tlks_2.wav" + description: "One walking talker, two static talkers, non-overlapping utterances." + input: ["items_mono/untrimmed/f2s5a_Talker1.wav", "items_mono/untrimmed/m2s16b_Talker2.wav", "items_mono/untrimmed/m3s8b_Talker2.wav"] + azimuth: ["-20:0.5:360", 60, -45] + elevation: [10, 10, 10] + level: [-26, -26, -26] + shift: [0.0, 0.0, -3.0] + + "07": + output: "out/VA_3obj_3tlks_3.wav" + description: "Two moving talkers, one static talker, partially overlapping utterances." + input: ["items_mono/untrimmed/f1s16b_Talker2.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"] + azimuth: [-90, "0:1:360", "0:-1:-360"] + elevation: [0, 30, 30] + level: [-26, -26, -26] + shift: [0.0, 0.0, -3.0] + + "08": + output: "out/VA_3obj_3tlks_4.wav" + description: "Three walking talkers, partially overlapping utterances." + input: ["items_mono/untrimmed/f5s15b_Talker1.wav", "items_mono/untrimmed/m3s1a_Talker2.wav", "items_mono/untrimmed/m2s17b_Talker2.wav"] + azimuth: ["-90:-1:-360", "-10:1.5:360", "70:1:360"] + elevation: [0, 20, 0] + level: [-26, -26, -26] + shift: [0.0, 0.0, -3.5] + diff --git a/examples/ITEM_GENERATION_FOA.yml b/examples/ITEM_GENERATION_FOA.yml new file mode 100644 index 0000000000000000000000000000000000000000..2287af4c67e7be64c7a7928d9a68afcd87e8d467 --- /dev/null +++ b/examples/ITEM_GENERATION_FOA.yml @@ -0,0 +1,160 @@ +--- +################################################ +# Item generation - General configuration +################################################ + +### Any relative paths will be interpreted relative to the working directory the script is called from! +### Usage of absolute paths is recommended. +### Do not use file names with dots "." in them! 
This is not supported, use "_" instead
+### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions
+
+### Output format
+format: "FOA"
+# masa_tc: 2 # applicable only to OMASA format
+# masa_dirs: 2 # applicable only to OMASA format
+# sba_order: 2 # applicable only to OSBA format
+
+### Output sampling rate in Hz
+fs: 48000
+
+### Generate BINAURAL output (_BINAURAL will be appended to the output filename)
+binaural_output: true
+
+### Normalize target loudness to X LKFS
+loudness: -26
+
+### Apply pre-amble and post-amble in X seconds
+preamble: 0.5
+postamble: 1.0
+
+### Apply fade-in and fade-out of X seconds
+fade_in_out: 0.5
+
+### Trim the output such that the total duration is X seconds
+duration: 8
+
+### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
+add_low_level_random_noise: False
+
+### Process with parallel streams
+multiprocessing: False
+
+################################################
+### Item generation - Filename conventions
+################################################
+
+### Naming convention for the input mono files
+### The input filenames are represented by:
+### lLLeeettszz.wav
+### where:
+### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com)
+### LL stands for the language: JP, FR, GE, MA, DA, EN
+### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
+### tt stands for the talker ID: f1, f2, f3, m1, m2, m3
+### s stands for 'sample' and zz is the sample number; 01, ..., 14
+
+### Naming convention for the generated output files
+### The output filenames are represented by:
+### leeeayszz.wav
+### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by:
+### leeeayszz.met for metadata-assisted spatial audio
+### leeeayszz.wav.o.csv for object-based audio
+### where:
+### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com)
+### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
+### a stands for 'audio'
+### y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06
+### s stands for sample and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample)
+### o stands for the object number; 0, 1, 2, 3
+
+### File designators, defaults are "l" for listening lab, "EN" for language, "p07" for experiment and "g" for company
+listening_lab: "b"
+language: "GE"
+exp: "p02"
+provider: "g"
+
+### Insert prefix for all input filenames (default: "")
+### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment'
+### the number of consecutive letters defines the length of each field
+# use_input_prefix: "lLLeee"
+
+### Insert prefix for all output filenames (default: "")
+### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment'
+### the number of consecutive letters defines the length of each field
+use_output_prefix: "leee"
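+### Example: with listening_lab "b" and exp "p02", "leee" expands to "bp02", so a scene output "out/s01.wav"
+### is presumably written as "out/bp02s01.wav" (cf. the prefix handling in generate_ismN_items.py)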
+
+################################################
+### Item generation - Scene description
+################################################
+
+### Each scene shall be described using the following parameters/properties:
+### output: output filename
+### description: textual description of the scene
+### input: input filename(s)
+### IR: filename(s) of the input IRs
+### azimuth: azimuth in the range [-180,180]; positive values point to the left
+### elevation: elevation in the range [-90,90]; positive values indicate up
+### shift: time adjustment of the input signal (negative value delays the signal)
+###
+### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder)
+### Note 1: use brackets [val1, val2, ...] when specifying multiple values
+### Note 2: use the "start:step:stop" notation for moving sources, where the step is applied per 20 ms frame
+### Note 3: we're using a right-handed coordinate system with azimuth = 0 pointing from the nose to the screen
+
+
+scenes:
+  "01":
+    output: "out/s01.wav"
+    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+    input: ["items_mono/untrimmed/f1s4b_Talker2.wav", "items_mono/untrimmed/f2s1a_Talker1.wav"]
+    IR: ["IRs/IR_do_p04_e_01_01_FOA.wav", "IRs/IR_do_p04_e_02_01_FOA.wav"]
+    shift: [0.0, -1.0]
+
+  "02":
+    output: "out/s02.wav"
+    description: "Car with AB microphone pickup, overlap between the talkers, car noise."
+    input: ["items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/f2s3b_Talker1.wav"]
+    IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"]
+    shift: [0.0, +1.0]
+
+  "03":
+    output: "out/s03.wav"
+    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+    input: ["items_mono/untrimmed/f3s3a_Talker2.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"]
+    IR: ["IRs/IR_do_p04_e_05_01_FOA.wav", "IRs/IR_do_p04_e_06_01_FOA.wav"]
+    shift: [0.0, -1.0]
+
+  "04":
+    output: "out/s04.wav"
+    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+    input: ["items_mono/untrimmed/f2s7b_Talker1.wav", "items_mono/untrimmed/f5s15a_Talker1.wav"]
+    IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_08_01_FOA.wav"]
+    shift: [0.0, -1.0]
+
+  "05":
+    output: "out/s05.wav"
+    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+    input: ["items_mono/untrimmed/m2s15a_Talker2.wav", "items_mono/untrimmed/m1s4a_Talker1.wav"]
+    IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"]
+    shift: [0.0, -1.0]
+
+  "06":
+    output: "out/s06.wav"
+    description: "Car with AB microphone pickup, no overlap between the talkers."
+    input: ["items_mono/untrimmed/m3s8a_Talker2.wav", "items_mono/untrimmed/m4s13a_Talker1.wav"]
+    IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"]
+    shift: [0.0, -1.0]
+
+  "07":
+    output: "out/s07.wav"
+    description: "Preliminary: Car with AB microphone pickup, no overlap between the talkers."
+    input: ["items_mono/untrimmed/f1s20a_Talker2.wav", "items_mono/untrimmed/f5s15b_Talker1.wav"]
+    IR: ["IRs/IR_do_p04_e_02_01_FOA.wav", "IRs/IR_do_p04_e_07_01_FOA.wav"]
+    shift: [0.0, -1.0]
+
+  "08":
+    output: "out/s08.wav"
+    description: "Car with AB microphone pickup, overlap between the talkers."
+ input: ["items_mono/untrimmed/m2s6b_Talker2.wav", "items_mono/untrimmed/f5s14a_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_08_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"] + shift: [0.0, +1.0] diff --git a/examples/ITEM_GENERATION_OMASA.yml b/examples/ITEM_GENERATION_OMASA.yml new file mode 100644 index 0000000000000000000000000000000000000000..1f631f3f372da3c59e4aa2c3687fdb6c10a3322e --- /dev/null +++ b/examples/ITEM_GENERATION_OMASA.yml @@ -0,0 +1,176 @@ +--- +################################################ +# Item generation - General configuration +################################################ + +### Any relative paths will be interpreted relative to the working directory the script is called from! +### Usage of absolute paths is recommended. +### Do not use file names with dots "." in them! This is not supported, use "_" instead +### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions + +### Output format +format: "OMASA" +masa_tc: 2 # applicable only to OMASA format +masa_dirs: 2 # applicable only to OMASA format +# sba_order: 2 # applicable only to OSBA format + +### Output sampling rate in Hz +fs: 48000 + +### Generate BINAURAL output (_BINAURAL will be appended to the output filename) +binaural_output: true + +### Normalize target loudness to X LKFS +# loudness: -26 + +### Apply pre-amble and post-amble in X seconds +preamble: 0.0 +postamble: 0.0 + +### Apply fade-in and fade-out of X seconds +fade_in_out: 0.5 + +### Trim the output such that the total duration is X seconds +duration: 8 + +### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence) +add_low_level_random_noise: true + +### Process with parallel streams +multiprocessing: False + +################################################ +### Item generation - Filename conventions +################################################ + +### Naming convention for the input mono files +### The input filenames are represented by: +### lLLeeettszz.wav +### where: +### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) +### LL stands for the language: JP, FR, GE, MA, DA, EN +### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09 +### tt stands for the talker ID: f1, f2, f3, m1, m2, m3 +### s stands for 'sample' and zz is the sample number; 01, ..., 14 + +### Naming convention for the generated output files +### The output filenames are represented by: +### leeeayszz.wav +### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by: +### leeeayszz.met for metadata-assisted spatial audio +### leeeayszz.wav.o.csv for object-based audio +### where: +### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) +### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09 +### a stands 'audio' +### y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06 +### s stands for sample and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample) +### o stands for the object number; 0, 1, 2, 3 + +### File designators, default is "l" for listening lab, "EN" for language, "p07" for experiment and "g" for company +listening_lab: "l" +language: "EN" +exp: "p01" +provider: "va" + +### Insert prefix for all input filenames (default: "") +### l 
+scenes:
+  "01":
+    output: "out/VA_3tlks_music_s01.wav"
+    description: "Three talkers over music background"
+    input: ["items_hoa2/bm7aa1s01.wav", "items_mono/untrimmed/m4s12b_Talker1.wav", "items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/m3s1a_Talker2.wav"]
+    azimuth: [0, 30, -45, 100]
+    elevation: [0, 20, 20, 30]
+    level: [-36, -26, -26, -26]
+    shift: [0.0, 0.0, 0.0, -2.0]
+
+  "02":
+    output: "out/VA_3tlks_music_s02.wav"
+    description: "Three talkers over music background"
+    input: ["items_hoa2/bm7aa1s03.wav", "items_mono/untrimmed/f2s5a_Talker1.wav", "items_mono/untrimmed/f5s10a_Talker1.wav", "items_mono/untrimmed/m3s8b_Talker2.wav"]
+    azimuth: [0, "-20:0.5:360", "60:-0.5:-360", 60]
+    elevation: [0, 10, 10, 10]
+    level: [-46, -26, -26, -26]
+    shift: [0.0, 0.0, -2.0, -2.5]
+
+  "03":
+    output: "out/VA_3tlks_music_s03.wav"
+    description: "Three talkers over music background"
+    input: ["items_hoa2/bm7aa1s05.wav", "items_mono/untrimmed/f1s16b_Talker2.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"]
+    azimuth: [0, -90, "0:1:360", "0:-1:-360"]
+    elevation: [0, 0, 30, 30]
+    level: [-36, -26, -26, -26]
+    shift: [0.0, 0.0, 0.0, -2.6]
+
+  "04":
+    output: "out/VA_3tlks_music_s04.wav"
+    description: "Three talkers over music background"
+    input: ["items_hoa2/bm7aa1s07.wav", "items_mono/untrimmed/f5s15b_Talker1.wav", "items_mono/untrimmed/m1s7a_Talker1.wav", "items_mono/untrimmed/m1s6b_Talker1.wav"]
+    azimuth: [0, "-90:-1:-360", "-10:1.5:360", "70:1:360"]
+    elevation: [0, 0, 20, 0]
+    level: [-46, -26, -36, -26]
+    shift: [0.0, -2.0, 0.0, -3.5]
+
+  "05":
+    output: "out/VA_2tlks_1obj_music_s05.wav"
+    description: "Two talkers, one musical object over music background"
+    input: ["items_hoa2/bm7aa1s09.wav", "items_mono/untrimmed/f2s1a_Talker1.wav", "items_mono/untrimmed/f2s5a_Talker1.wav", "items_mono/music/item_lxa3s3.48k.wav"]
+    azimuth: [0, 20, -40, 45]
+    elevation: [0, 0, 0, 70]
+    level: [-36, -36, -26, -41]
+    shift: [0.0, 0.0, -2.0, 0.0]
+
+  "06":
+    output: 
"out/VA_2tlks_1obj_music_s06.wav" + description: "Two talkers, one musical object over music background" + input: ["items_hoa2/bm7aa1s11.wav", "items_mono/untrimmed/f5s10b_Talker1.wav", "items_mono/untrimmed/m1s4a_Talker1.wav", "items_mono/music/item_lxa3s5.48k.wav"] + azimuth: [0, 50, "180:1:360", -120] + elevation: [0, 0, 45, 70] + level: [-46, -26, -26, -41] + shift: [0.0, 0.0, -2.5, 0.0] + + "07": + output: "out/VA_2tlks_1obj_music_s07.wav" + description: "Two talkers, one musical object over music background" + input: ["items_hoa2/bm7aa1s13.wav", "items_mono/untrimmed/m1s2b_Talker1.wav", "items_mono/untrimmed/f3s5a_Talker2.wav", "items_mono/music/641692__theflyfishingfilmmaker__classical-violin-minor-10s-mono.wav"] + azimuth: [0, "80:1:20 + 360", "80:1:20 + 360", -30] + elevation: [0, 10, 60, 70] + level: [-36, -26, -26, -36] + shift: [0.0, 0.0, 0.0, 0.0] + + "08": + output: "out/VA_2tlks_1obj_music_s08.wav" + description: "Two talkers, one musical object over music background" + input: ["items_hoa2/bm7aa1s15.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f2s4a_Talker1.wav", "items_mono/music/item_lxa4s2.48k.wav"] + azimuth: [0, "60:1:0 + 360", "60:-1:120 - 360", 100] + elevation: [0, 20, 50, 70] + level: [-46, -26, -26, -41] + shift: [0.0, 0.0, -1.0, -0.5] + + diff --git a/examples/ITEM_GENERATION_OSBA.yml b/examples/ITEM_GENERATION_OSBA.yml new file mode 100644 index 0000000000000000000000000000000000000000..f7c33b490f6edd62ffa2d1e1faf0b582bf614b88 --- /dev/null +++ b/examples/ITEM_GENERATION_OSBA.yml @@ -0,0 +1,176 @@ +--- +################################################ +# Item generation - General configuration +################################################ + +### Any relative paths will be interpreted relative to the working directory the script is called from! +### Usage of absolute paths is recommended. +### Do not use file names with dots "." in them! 
This is not supported, use "_" instead
+### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions
+
+### Output format
+format: "OSBA"
+# masa_tc: 2 # applicable only to OMASA format
+# masa_dirs: 2 # applicable only to OMASA format
+sba_order: 2 # applicable only to OSBA format
+
+### Output sampling rate in Hz
+fs: 48000
+
+### Generate BINAURAL output (_BINAURAL will be appended to the output filename)
+binaural_output: true
+
+### Normalize target loudness to X LKFS
+# loudness: -26
+
+### Apply pre-amble and post-amble in X seconds
+preamble: 0.0
+postamble: 0.0
+
+### Apply fade-in and fade-out of X seconds
+fade_in_out: 0.5
+
+### Trim the output such that the total duration is X seconds
+duration: 8
+
+### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
+add_low_level_random_noise: true
+
+### Process with parallel streams
+multiprocessing: False
+
+################################################
+### Item generation - Filename conventions
+################################################
+
+### Naming convention for the input mono files
+### The input filenames are represented by:
+### lLLeeettszz.wav
+### where:
+### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com)
+### LL stands for the language: JP, FR, GE, MA, DA, EN
+### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
+### tt stands for the talker ID: f1, f2, f3, m1, m2, m3
+### s stands for 'sample' and zz is the sample number; 01, ..., 14
+
+### Naming convention for the generated output files
+### The output filenames are represented by:
+### leeeayszz.wav
+### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by:
+### leeeayszz.met for metadata-assisted spatial audio
+### leeeayszz.wav.o.csv for object-based audio
+### where:
+### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com)
+### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
+### a stands for 'audio'
+### y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06
+### s stands for sample and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample)
+### o stands for the object number; 0, 1, 2, 3
+
+### File designators, defaults are "l" for listening lab, "EN" for language, "p07" for experiment and "g" for company
+listening_lab: "l"
+language: "EN"
+exp: "p01"
+provider: "va"
+
+### Insert prefix for all input filenames (default: "")
+### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment'
+### the number of consecutive letters defines the length of each field
+# use_input_prefix: "lLLeee"
+
+### Insert prefix for all output filenames (default: "")
+### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment'
+### the number of consecutive letters defines the length of each field
+# use_output_prefix: "leee"
+
+################################################
+### Item generation - Scene description
+################################################
+
+### Each scene shall be described using the following parameters/properties:
+### output: output filename
+### description: textual description of the scene
+### input: input filename(s)
+### azimuth: azimuth in the range [-180,180]; positive values point to the left
+### elevation: elevation in the range [-90,90]; positive values indicate up
+### shift: time adjustment of the input signal (negative value delays the signal)
+###
+### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder)
+### Note 1: use brackets [val1, val2, ...] when specifying multiple values
+### Note 2: use the "start:step:stop" notation for moving sources, where the step is applied per 20 ms frame
+### Note 3: we're using a right-handed coordinate system with azimuth = 0 pointing from the nose to the screen
+
+scenes:
+  "01":
+    output: "out/VA_3tlks_music_s01.wav"
+    description: "Three talkers over music background"
+    input: ["items_hoa2/bm7aa1s01.wav", "items_mono/untrimmed/m4s12b_Talker1.wav", "items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/m3s1a_Talker2.wav"]
+    azimuth: [0, 30, -45, 100]
+    elevation: [0, 20, 20, 30]
+    level: [-36, -26, -26, -26]
+    shift: [0.0, 0.0, 0.0, -2.0]
+
+  "02":
+    output: "out/VA_3tlks_music_s02.wav"
+    description: "Three talkers over music background"
+    input: ["items_hoa2/bm7aa1s03.wav", "items_mono/untrimmed/f2s5a_Talker1.wav", "items_mono/untrimmed/f5s10a_Talker1.wav", "items_mono/untrimmed/m3s8b_Talker2.wav"]
+    azimuth: [0, "-20:0.5:360", "60:-0.5:-360", 60]
+    elevation: [0, 10, 10, 10]
+    level: [-46, -26, -26, -26]
+    shift: [0.0, 0.0, -2.0, -2.5]
+
+  "03":
+    output: "out/VA_3tlks_music_s03.wav"
+    description: "Three talkers over music background"
+    input: ["items_hoa2/bm7aa1s05.wav", "items_mono/untrimmed/f1s16b_Talker2.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"]
+    azimuth: [0, -90, "0:1:360", "0:-1:-360"]
+    elevation: [0, 0, 30, 30]
+    level: [-36, -26, -26, -26]
+    shift: [0.0, 0.0, 0.0, -2.6]
+
+  "04":
+    output: "out/VA_3tlks_music_s04.wav"
+    description: "Three talkers over music background"
+    input: ["items_hoa2/bm7aa1s07.wav", "items_mono/untrimmed/f5s15b_Talker1.wav", "items_mono/untrimmed/m1s7a_Talker1.wav", "items_mono/untrimmed/m1s6b_Talker1.wav"]
+    azimuth: [0, "-90:-1:-360", "-10:1.5:360", "70:1:360"]
+    elevation: [0, 0, 20, 0]
+    level: [-46, -26, -36, -26]
+    shift: [0.0, -2.0, 0.0, -3.5]
+
+  "05":
+    output: "out/VA_2tlks_1obj_music_s05.wav"
+    description: "Two talkers, one musical object over music background"
+    input: ["items_hoa2/bm7aa1s09.wav", "items_mono/untrimmed/f2s1a_Talker1.wav", "items_mono/untrimmed/f2s5a_Talker1.wav", "items_mono/music/item_lxa3s3.48k.wav"]
+    azimuth: [0, 20, -40, 45]
+    elevation: [0, 0, 0, 70]
+    level: [-36, -36, -26, -41]
+    shift: [0.0, 0.0, -2.0, 0.0]
+
+  "06":
+    output: "out/VA_2tlks_1obj_music_s06.wav"
+    description: "Two talkers, one musical object over music background"
+    input: ["items_hoa2/bm7aa1s11.wav", "items_mono/untrimmed/f5s10b_Talker1.wav", "items_mono/untrimmed/m1s4a_Talker1.wav", "items_mono/music/item_lxa3s5.48k.wav"]
+    azimuth: [0, 50, "180:1:360", -120]
+    elevation: [0, 0, 45, 70]
+    level: [-46, -26, -26, -41]
+    shift: [0.0, 0.0, -2.5, 0.0]
+
+  "07":
+    output: "out/VA_2tlks_1obj_music_s07.wav"
+    description: "Two talkers, one musical object over music background"
+    input: ["items_hoa2/bm7aa1s13.wav", "items_mono/untrimmed/m1s2b_Talker1.wav", "items_mono/untrimmed/f3s5a_Talker2.wav", "items_mono/music/641692__theflyfishingfilmmaker__classical-violin-minor-10s-mono.wav"]
+    azimuth: [0, "80:1:20 + 360", "80:1:20 + 360", -30]
+    elevation: [0, 10, 60, 70]
+    level: [-36, -26, -26, -36]
+    shift: [0.0, 0.0, 0.0, 0.0]
+
+  "08":
+    output: "out/VA_2tlks_1obj_music_s08.wav"
+    description: "Two talkers, one musical object over music background"
+    input: ["items_hoa2/bm7aa1s15.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f2s4a_Talker1.wav", "items_mono/music/item_lxa4s2.48k.wav"]
+    azimuth: [0, "60:1:0 + 360", "60:-1:120 - 360", 100]
+    elevation: [0, 20, 50, 70]
+    level: [-46, -26, -26, -41]
+    shift: [0.0, 0.0, -1.0, -0.5]
+
+
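The `shift` values in these scene descriptions are snapped to whole 20 ms frames at generation time; a minimal Python sketch mirroring the rounding used in `generate_ismN_items.py` below, assuming 48 kHz output:

```python
def shift_to_samples(shift_s: float, fs: int = 48000) -> int:
    """Snap a scene 'shift' (seconds) to a whole number of 20 ms frames,
    rounding toward zero, and return it in samples."""
    frame_len = fs // 50                      # 960 samples = 20 ms at 48 kHz
    n_frames = int(shift_s * fs / frame_len)  # int() truncates toward zero
    return n_frames * frame_len

assert shift_to_samples(-2.5) == -120000      # exactly 125 frames
```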
diff --git a/examples/ITEM_GENERATION_STEREO.yml b/examples/ITEM_GENERATION_STEREO.yml
new file mode 100644
index 0000000000000000000000000000000000000000..14731b4b65858f5ab78fce3dfbe0178aede95fea
--- /dev/null
+++ b/examples/ITEM_GENERATION_STEREO.yml
@@ -0,0 +1,160 @@
+---
+################################################
+# Item generation - General configuration
+################################################
+
+### Any relative paths will be interpreted relative to the working directory the script is called from!
+### Usage of absolute paths is recommended.
+### Do not use file names with dots "." in them! This is not supported, use "_" instead
+### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions
+
+### Output format
+format: "STEREO"
+# masa_tc: 2 # applicable only to OMASA format
+# masa_dirs: 2 # applicable only to OMASA format
+# sba_order: 2 # applicable only to OSBA format
+
+### Output sampling rate in Hz
+fs: 48000
+
+### Generate BINAURAL output (_BINAURAL will be appended to the output filename)
+binaural_output: true
+
+### Normalize target loudness to X LKFS
+loudness: -26
+
+### Apply pre-amble and post-amble in X seconds
+preamble: 0.5
+postamble: 1.0
+
+### Apply fade-in and fade-out of X seconds
+fade_in_out: 0.5
+
+### Trim the output such that the total duration is X seconds
+duration: 8
+
+### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
+add_low_level_random_noise: true
+
+### Process with parallel streams
+multiprocessing: False
+
+################################################
+### Item generation - Filename conventions
+################################################
+
+### Naming convention for the input mono files
+### The input filenames are represented by:
+### lLLeeettszz.wav
+### where:
+### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com)
+### LL stands for the language: JP, FR, GE, MA, DA, EN
+### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
+### tt stands for the talker ID: f1, f2, f3, m1, m2, m3
+### s stands for 'sample' and zz is the sample number; 01, ..., 14
+
+### Naming convention for the generated output files
+### The output filenames are represented by:
+### leeeayszz.wav
+### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by:
+### leeeayszz.met for metadata-assisted spatial audio
+### leeeayszz.wav.o.csv for object-based audio
+### where:
+### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com)
+### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
+### a stands for 'audio'
+### y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06
+### s stands for sample and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample)
+### o stands for the object number; 0, 1, 2, 3
+
+### File designators, defaults are "l" for listening lab, "EN" for language, "p07" for experiment and "g" for company
+listening_lab: "b"
+language: "GE"
+exp: "p02"
+provider: "g"
+
+### Insert prefix for all input filenames (default: "")
+### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment'
+### the number of consecutive letters defines the length of each field
+# use_input_prefix: "lLLeee"
+
+### Insert prefix for all output filenames (default: "")
+### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment'
+### the number of consecutive letters defines the length of each field
+# use_output_prefix: "leee"
+
+################################################
+### Item generation - Scene description
+################################################
+
+### Each scene shall be described using the following parameters/properties:
+### output: output filename
+### description: textual description of the scene
+### input: input filename(s)
+### IR: filename(s) of the input IRs
+### azimuth: azimuth in the range [-180,180]; positive values point to the left
+### elevation: elevation in the range [-90,90]; positive values indicate up
+### shift: time adjustment of the input signal (negative value delays the signal)
+###
+### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder)
+### Note 1: use brackets [val1, val2, ...] when specifying multiple values
+### Note 2: use the "start:step:stop" notation for moving sources, where the step is applied per 20 ms frame
+### Note 3: we're using a right-handed coordinate system with azimuth = 0 pointing from the nose to the screen
+
+
+scenes:
+  "01":
+    output: "out/a1s01.wav"
+    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+    input: ["items_mono/untrimmed/f1s4b_Talker2.wav", "items_mono/untrimmed/f2s1a_Talker1.wav"]
+    IR: ["IRs/Car_TalkPos1_Stereo_M5_SinSweep_2chn.wav", "IRs/Car_TalkPos2_Stereo_M5_SinSweep_2chn.wav"]
+    shift: [0.0, -1.0]
+
+  "02":
+    output: "out/a1s02.wav"
+    description: "Car with AB microphone pickup, overlap between the talkers, car noise."
+    input: ["items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/f2s3b_Talker1.wav"]
+    IR: ["IRs/Car_TalkPos3_Stereo_M5_SinSweep_2chn.wav", "IRs/Car_TalkPos4_Stereo_M5_SinSweep_2chn.wav"]
+    shift: [0.0, +1.0]
+
+  "03":
+    output: "out/a1s03.wav"
+    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+    input: ["items_mono/untrimmed/f3s3a_Talker2.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"]
+    IR: ["IRs/Car_TalkPos1_Stereo_M5_SinSweep_2chn.wav", "IRs/Car_TalkPos1_Stereo_M5_SinSweep_2chn.wav"]
+    shift: [0.0, -1.0]
+
+  "04":
+    output: "out/a1s04.wav"
+    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+    input: ["items_mono/untrimmed/f2s7b_Talker1.wav", "items_mono/untrimmed/f5s15a_Talker1.wav"]
+    IR: ["IRs/FreeField_IR_Python_AB_20cm_Pos1.wav", "IRs/FreeField_IR_Python_AB_20cm_Pos2.wav"]
+    shift: [0.0, -1.0]
+
+  "05":
+    output: "out/a1s05.wav"
+    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
+ input: ["items_mono/untrimmed/m2s15a_Talker2.wav", "items_mono/untrimmed/m1s4a_Talker1.wav"] + IR: ["IRs/FreeField_IR_Python_AB_20cm_Pos3.wav", "IRs/FreeField_IR_Python_AB_20cm_Pos4.wav"] + shift: [0.0, -1.0] + + "06": + output: "out/a1s06.wav" + description: "Car with AB microphone pickup, no overlap between the talkers." + input: ["items_mono/untrimmed/m3s8a_Talker2.wav", "items_mono/untrimmed/m4s13a_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_01_01_FOA.wav", "IRs/IR_do_p04_e_02_01_FOA.wav"] + shift: [0.0, -1.0] + + "07": + output: "out/a1s07.wav" + description: "Preliminary: Car with AB microphone pickup, no overlap between the talkers." + input: ["items_mono/untrimmed/f1s20a_Talker2.wav", "items_mono/untrimmed/f5s15b_Talker1.wav"] + IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"] + shift: [0.0, -1.0] + + "08": + output: "out/a2s01.wav" + description: "Car with AB microphone pickup, overlap between the talkers." + input: ["items_mono/untrimmed/m2s6b_Talker2.wav", "items_mono/untrimmed/f5s14a_Talker1.wav"] + IR: ["IRs/IR_g_p01_a_07_00_stAB100.wav", "IRs/IR_g_p01_a_06_00_stAB100.wav"] + shift: [0.0, +1.0] diff --git a/ivas_processing_scripts/audiotools/audiofile.py b/ivas_processing_scripts/audiotools/audiofile.py index b882a2a92016ff9cc0a75ba921e6de83ceb74f8e..46f040a5ac60576e96b1eff733527a435ed4b8c1 100755 --- a/ivas_processing_scripts/audiotools/audiofile.py +++ b/ivas_processing_scripts/audiotools/audiofile.py @@ -40,7 +40,7 @@ import numpy as np import scipy.io.wavfile as wav from .audioarray import trim, window -from .constants import VERT_HOA_CHANNELS_ACN +from .constants import SUPPRESS_CHUNK_WARNING_WAV_READ, VERT_HOA_CHANNELS_ACN logger = logging.getLogger("__main__") logger.setLevel(logging.DEBUG) @@ -80,6 +80,13 @@ def read( with catch_warnings(record=True) as warnings_list: fs, data = wav.read(filename) for w in warnings_list: + if ( + SUPPRESS_CHUNK_WARNING_WAV_READ + and "Chunk (non-data) not understood, skipping it." 
+ in str(w.message) + ): + continue + print(f"{filename} : {w.message} ( {w.filename}:{w.lineno} )") if data.dtype == np.int32: data = np.interp( diff --git a/ivas_processing_scripts/audiotools/constants.py b/ivas_processing_scripts/audiotools/constants.py index cbe0aae41def2d8c89fcf8ae005372f43aed42a2..eed72ecf261b4a0746b5a86dcd9800cadfa7225e 100755 --- a/ivas_processing_scripts/audiotools/constants.py +++ b/ivas_processing_scripts/audiotools/constants.py @@ -32,6 +32,11 @@ import numpy as np +SUPPRESS_CHUNK_WARNING_WAV_READ = ( + False # suppress warning from .wav read() when chunk size is not a multiple of 2 +) + + BINAURAL_AUDIO_FORMATS = { "BINAURAL": { "num_channels": 2, diff --git a/ivas_processing_scripts/audiotools/wrappers/masaAnalyzer.py b/ivas_processing_scripts/audiotools/wrappers/masaAnalyzer.py index e3f2ffc9790d5d369a4ecc56ad395275f539864b..65f2048ecfbcf87309f23940e92d2c285addc845 100644 --- a/ivas_processing_scripts/audiotools/wrappers/masaAnalyzer.py +++ b/ivas_processing_scripts/audiotools/wrappers/masaAnalyzer.py @@ -72,6 +72,12 @@ def masaAnalyzer( else: binary = find_binary("masaAnalyzer") + # enforce metadata_out_filename to be a Path object + if metadata_out_filename is not None and not isinstance( + metadata_out_filename, Path + ): + metadata_out_filename = Path(metadata_out_filename) + if num_tcs not in [1, 2]: raise ValueError(f"Only 1 or 2 TCs supported, but {num_tcs} was given.") diff --git a/ivas_processing_scripts/audiotools/wrappers/masaRenderer.py b/ivas_processing_scripts/audiotools/wrappers/masaRenderer.py index 3a928614e73a852d5a1ba63ca0859457178e65d9..47ec8c6edc1c2a6abb572051baedc3b54b7974ea 100755 --- a/ivas_processing_scripts/audiotools/wrappers/masaRenderer.py +++ b/ivas_processing_scripts/audiotools/wrappers/masaRenderer.py @@ -83,11 +83,16 @@ def masaRenderer( output_mode = "-BINAURAL" num_channels = 2 + # enforce masa_metadata_file to be a Path object + masa_metadata_file = masa.metadata_file + if masa_metadata_file is not None and not isinstance(masa_metadata_file, Path): + masa_metadata_file = Path(masa_metadata_file) + cmd = [ str(binary), output_mode, "", # 2 -> inputPcm - str(masa.metadata_file.resolve()), + str(masa_metadata_file.resolve()), "", # 4 -> outputPcm ] diff --git a/ivas_processing_scripts/audiotools/wrappers/reverb.py b/ivas_processing_scripts/audiotools/wrappers/reverb.py index 86d225a1cce6d45ee70ab4abbd78788d80557a36..bfbe39eb8ce696008a73069128fb1358add6f062 100644 --- a/ivas_processing_scripts/audiotools/wrappers/reverb.py +++ b/ivas_processing_scripts/audiotools/wrappers/reverb.py @@ -38,6 +38,7 @@ from typing import Optional import numpy as np from scipy.fft import fft +from ivas_processing_scripts.audiotools import audio from ivas_processing_scripts.audiotools.audio import Audio from ivas_processing_scripts.audiotools.audiofile import read, write from ivas_processing_scripts.audiotools.wrappers.filter import resample_itu @@ -128,7 +129,9 @@ def reverb( output = copy(tmp_input) output.audio, _ = read(tmp_output_file, nchannels=1, fs=tmp_input.fs) - # reverse the resampling + # remove trailing part (to ensure that the length of the output is the same as the input) + output.audio = output.audio[: -(IR.audio.shape[0] - 1), :] + if old_fs: output.audio = resample_itu(output, old_fs) output.fs = old_fs @@ -183,9 +186,8 @@ def reverb_stereo( y_right = reverb(input, IR_right, align=align) # combine into stereo output - y = copy(input) - y.name = "STEREO" - y.num_channels = 2 + y = audio.fromtype("STEREO") + y.fs = input.fs y.audio 
= np.column_stack([y_left.audio, y_right.audio]) return y @@ -197,14 +199,14 @@ def reverb_foa( align: Optional[float] = None, ) -> Audio: """ - Wrapper for the ITU-T reverb binary to convolve mono audio signal with an FOA impulse response + Convolve mono audio signal with an FOA impulse response Parameters ---------- input: Audio Input audio signal - IR: Audio - Impulse response + foa_IR: Audio + FOA impulse response align: float multiplicative factor to apply to the reverberated sound in order to align its energy level with the second file @@ -249,10 +251,9 @@ def reverb_foa( y_y = reverb(input, IR_y, align=align) y_z = reverb(input, IR_z, align=align) - # combine into foa output - y = copy(input) - y.name = "FOA" - y.num_channels = 4 + # combine into FOA output + y = audio.fromtype("FOA") + y.fs = input.fs y.audio = np.column_stack([y_w.audio, y_x.audio, y_y.audio, y_z.audio]) return y @@ -264,14 +265,14 @@ def reverb_hoa2( align: Optional[float] = None, ) -> Audio: """ - Wrapper for the ITU-T reverb binary to convolve mono audio signal with an HOA2 impulse response + Convolve mono audio signal with an HOA2 impulse response Parameters ---------- input: Audio Input audio signal - IR: Audio - Impulse response + hoa2_IR: Audio + HOA2 impulse response align: float multiplicative factor to apply to the reverberated sound in order to align its energy level with the second file @@ -284,7 +285,7 @@ def reverb_hoa2( # convert to float32 hoa2_IR.audio = np.float32(hoa2_IR.audio) - numchannels = 9 # HOA2 by definition + numchannels = hoa2_IR.num_channels # calculate the scaling (multiplicative) factor such that the maximum gain of the IR filter across all frequencies is 0dB if align is None: @@ -297,26 +298,64 @@ def reverb_hoa2( ych = [] for i in range(numchannels): # separate IR into each channel - IR.audio = np.reshape(hoa2_IR.audio[:, i], (-1, 1)) + IR.audio = hoa2_IR.audio[:, [i]] # convolve mono input with channel IR ych.append(reverb(input, IR, align=align)) - # combine into hoa2 output - y = copy(input) - y.name = "HOA2" - y.num_channels = numchannels - y.audio = np.column_stack( - [ - ych[0].audio, - ych[1].audio, - ych[2].audio, - ych[3].audio, - ych[4].audio, - ych[5].audio, - ych[6].audio, - ych[7].audio, - ych[8].audio, - ] - ) + # combine into HOA2 output + y = audio.fromtype("HOA2") + y.fs = input.fs + y.audio = np.column_stack([ych[i].audio for i in range(numchannels)]) + + return y + + +def reverb_hoa3( + input: Audio, + hoa3_IR: Audio, + align: Optional[float] = None, +) -> Audio: + """ + Convolve mono audio signal with an HOA3 impulse response + + Parameters + ---------- + input: Audio + Input audio signal + hoa3_IR: Audio + HOA3 impulse response + align: float + multiplicative factor to apply to the reverberated sound in order to align its energy level with the second file + + Returns + ------- + output: Audio + Convolved audio signal with HOA3 IR + """ + + # convert to float32 + hoa3_IR.audio = np.float32(hoa3_IR.audio) + + numchannels = hoa3_IR.num_channels + + # calculate the scaling (multiplicative) factor such that the maximum gain of the IR filter across all frequencies is 0dB + if align is None: + H = fft(hoa3_IR.audio, axis=0) + align = 1.0 / np.max(np.abs(H)) + + IR = copy(hoa3_IR) + IR.name = "MONO" + IR.num_channels = 1 + ych = [] + for i in range(numchannels): + # separate IR into each channel + IR.audio = hoa3_IR.audio[:, [i]] + # convolve mono input with channel IR + ych.append(reverb(input, IR, align=align)) + + # combine into HOA3 output + y = 
audio.fromtype("HOA3") + y.fs = input.fs + y.audio = np.column_stack([ych[i].audio for i in range(numchannels)]) return y diff --git a/ivas_processing_scripts/generation/__init__.py b/ivas_processing_scripts/generation/__init__.py index 6dc5623ff532711eb2a19a1ef99a897fd32ea0f3..8a9dfb98fdd0f24a24ac36210068a1984f9563aa 100755 --- a/ivas_processing_scripts/generation/__init__.py +++ b/ivas_processing_scripts/generation/__init__.py @@ -32,8 +32,6 @@ import logging -import yaml - from ivas_processing_scripts.constants import ( LOGGER_DATEFMT, LOGGER_FORMAT, @@ -41,12 +39,12 @@ from ivas_processing_scripts.constants import ( ) from ivas_processing_scripts.generation import ( config, - process_ambi_items, - process_ism1_items, - process_ism2_items, - process_stereo_items, + generate_ismN_items, + generate_omasa_items, + generate_osba_items, + generate_sba_items, + generate_stereo_items, ) -from ivas_processing_scripts.utils import create_dir def logging_init(args, cfg): @@ -61,16 +59,12 @@ def logging_init(args, cfg): logger.addHandler(console_handler) # main log file - file_handler = logging.FileHandler( - cfg.output_path.joinpath(f"{cfg.format}{LOGGER_SUFFIX}"), mode="w" - ) + file_handler = logging.FileHandler(f"{cfg.format}{LOGGER_SUFFIX}", mode="w") file_handler.setFormatter(logging.Formatter(LOGGER_FORMAT, datefmt=LOGGER_DATEFMT)) file_handler.setLevel(logging.DEBUG if args.debug else logging.INFO) logger.addHandler(file_handler) logger.info(f"Processing item generation configuration file {args.config}") - logger.info(f"Input path: {cfg.input_path.absolute()}") - logger.info(f"Output path: {cfg.output_path.absolute()}") return logger @@ -80,32 +74,27 @@ def main(args): cfg = config.TestConfig(args.config) # create output directories for categories - for cat in range(1, 7): - create_dir(cfg.output_path.joinpath(f"cat{cat}")) + # for cat in range(1, 7): + # create_dir(cfg.output_path.joinpath(f"cat{cat}")) # set up logging logger = logging_init(args, cfg) - # make format a list - if not isinstance(cfg.format, list): - cfg.format = [cfg.format] - - # generate ISM and STEREO items - if "ISM1" in cfg.format: - # generate ISM1 items with metadata according to scene description - process_ism1_items.generate_ism1_items(cfg, logger) - elif "ISM2" in cfg.format: - # generate ISM2 items with metadata according to scene description - process_ism2_items.generate_ism2_items(cfg, logger) + # generate items in the requested format + if "ISM" in cfg.format: + # generate ISMn items from MONO items according to scene description + generate_ismN_items.generate_ismN_items(cfg, logger) elif "STEREO" in cfg.format: # generate STEREO items according to scene description - process_stereo_items.generate_stereo_items(cfg, logger) - elif "FOA" in cfg.format or "HOA2" in cfg.format: - # generate FOA/HOA2 items according to scene description - process_ambi_items.generate_ambi_items(cfg, logger) - - # copy configuration to output directory - with open(cfg.output_path.joinpath(f"{'_'.join(cfg.format)}.yml"), "w") as f: - yaml.safe_dump(cfg._yaml_dump, f) + generate_stereo_items.generate_stereo_items(cfg, logger) + elif any(fmt in cfg.format for fmt in ["FOA", "HOA2", "HOA3"]): + # generate FOA/HOA2/HOA3 items according to scene description + generate_sba_items.generate_sba_items(cfg, logger) + elif "OMASA" in cfg.format: + # generate OMASA items from FOA/HO2/HOA3 and MONO items according to scene description + generate_omasa_items.generate_omasa_items(cfg, logger) + elif "OSBA" in cfg.format: + # generate OSBA items from 
diff --git a/ivas_processing_scripts/generation/config.py b/ivas_processing_scripts/generation/config.py
index b59540a8c78a09e78a8405981c777f0ec2498c6e..b61aa881e6a39f7204669ec8c82f0d625631fd62 100644
--- a/ivas_processing_scripts/generation/config.py
+++ b/ivas_processing_scripts/generation/config.py
@@ -31,7 +31,6 @@
 #
 
 from copy import deepcopy
-from pathlib import Path
 
 import yaml
 
@@ -78,10 +77,6 @@ class TestConfig:
         # store the merged config for writing to file later
         self._yaml_dump = self._dump_yaml(cfg)
 
-        # convert to Path
-        self.input_path = Path(self.input_path)
-        self.output_path = Path(self.output_path)
-
     def _parse_yaml(self, filename):
         """parse configuration file"""
         with open(filename) as fp:
diff --git a/ivas_processing_scripts/generation/constants.py b/ivas_processing_scripts/generation/constants.py
index e5fbc4531d3fbf2fcb3212c6c9572c6c37500f9d..3bc6b82d9ec6dd7deff8bb070f45a68780cd397b 100644
--- a/ivas_processing_scripts/generation/constants.py
+++ b/ivas_processing_scripts/generation/constants.py
@@ -61,7 +61,6 @@ DEFAULT_CONFIG_BINARIES = {
 
 REQUIRED_KEYS = [
     "format",
-    "input_path",
-    "output_path",
+    "fs",
     "scenes",
 ]
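The `replace_char_seq_with_string` helper introduced below drives the designator expansion described in the YAML comments (`l` → listening lab, `LL` → language, `eee` → experiment). A quick illustration of the expected behavior, traced from the implementation below (not part of the repository):

```python
# expanding the input-filename prefix template "lLLeee" step by step
prefix = replace_char_seq_with_string("lLLeee", "l", "b")    # -> "bLLeee"
prefix = replace_char_seq_with_string(prefix, "LL", "GE")    # -> "bGEeee"
prefix = replace_char_seq_with_string(prefix, "eee", "p02")  # -> "bGEp02"
```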
+#
+# Any dispute, controversy or claim arising under or in relation to providing this software shall be
+# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+# the United Nations Convention on Contracts on the International Sales of Goods.
+#
+import logging
+from itertools import groupby, repeat
+from pathlib import Path
+
+import numpy as np
+
+from ivas_processing_scripts.audiotools import audio, audioarray, audiofile, metadata
+from ivas_processing_scripts.audiotools.convert.objectbased import convert_objectbased
+from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm
+from ivas_processing_scripts.generation import config
+from ivas_processing_scripts.utils import apply_func_parallel
+
+SEED_RANDOM_NOISE = 0
+
+
+# function for converting an N-D numpy array to strings with 2 decimal digits
+def csv_formatdata(data):
+    for row in data:
+        yield ["%0.2f" % v for v in row]
+
+
+# function for finding runs of the same character and replacing each run with another string
+# e.g. replace_char_seq_with_string("lLLeeettszz", "LL", "EN") returns "lENeeettszz"
+def replace_char_seq_with_string(s, char_seq, repl_str):
+    result = []
+
+    # find groups of consecutive letters
+    groups = ["".join(list(g)) for k, g in groupby(s)]
+
+    # limit the length of the replacement string by the length of the character sequence
+    repl_str = repl_str[: len(char_seq)]
+
+    # replace each occurrence of the sequence of characters
+    for g in groups:
+        if char_seq in g:
+            result.append(repl_str)
+        else:
+            result.append(g)
+
+    return "".join(result)
+
+
+# function for appending string to a filename before file extension
+def append_str_filename(filename, str_to_append):
+    p = Path(filename)
+    return p.parent / (p.stem + str_to_append + p.suffix)
+
+
+def generate_ismN_items(
+    cfg: config.TestConfig,
+    logger: logging.Logger,
+):
+    """Generate ISMN items with metadata from mono items based on scene description"""
+
+    # set the fs
+    if "fs" not in cfg.__dict__:
+        cfg.fs = 48000
+
+    # set the pre-amble and post-amble
+    if "preamble" not in cfg.__dict__:
+        cfg.preamble = 0.0
+
+    if "postamble" not in cfg.__dict__:
+        cfg.postamble = 0.0
+
+    # set the listening lab designator
+    if "listening_lab" not in cfg.__dict__:
+        cfg.listening_lab = "l"
+
+    # set the language designator
+    if "language" not in cfg.__dict__:
+        cfg.language = "EN"
+
+    # set the experiment designator
+    if "exp" not in cfg.__dict__:
+        cfg.exp = "p07"
+
+    # set the provider
+    if "provider" not in cfg.__dict__:
+        cfg.provider = "g"
+
+    # set the prefix for all input filenames
+    if "use_input_prefix" not in cfg.__dict__:
+        cfg.use_input_prefix = ""
+    else:
+        # replace file designators
+        cfg.use_input_prefix = replace_char_seq_with_string(
+            cfg.use_input_prefix, "l", cfg.listening_lab
+        )
+        cfg.use_input_prefix = replace_char_seq_with_string(
+            cfg.use_input_prefix, "LL", cfg.language
+        )
+        cfg.use_input_prefix = replace_char_seq_with_string(
+            cfg.use_input_prefix, "eee", cfg.exp
+        )
+
+    # set the prefix for all output filenames
+    if "use_output_prefix" not in cfg.__dict__:
+        cfg.use_output_prefix = ""
+    else:
+        # replace file designators
+        cfg.use_output_prefix = replace_char_seq_with_string(
+            cfg.use_output_prefix, "l", cfg.listening_lab
+        )
+        cfg.use_output_prefix = replace_char_seq_with_string(
+            cfg.use_output_prefix, "eee", cfg.exp
+        )
+
+    # set multiprocessing
+    if "multiprocessing" not in cfg.__dict__:
+        cfg.multiprocessing = False
+
+    apply_func_parallel(
generate_ismN_scene, + zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)), + type="mp" if cfg.multiprocessing else None, + show_progress=None, + ) + + return + + +def generate_ismN_scene( + scene_name: str, scene: dict, cfg: config.TestConfig, logger: logging.Logger +): + """ + Processes a single scene to generate N ISM items with metadata. + + Args: + scene_name (str): The name of the scene being processed. + scene (dict): A dictionary containing scene description, including source files, azimuth, elevation, and other parameters. + cfg (config.TestConfig): Configuration object containing settings for processing, such as input/output paths, sampling rate, and loudness levels. + logger (logging.Logger): Logger instance for logging information and errors. + + Expected Behavior: + - Reads audio source files and processes them based on the scene description. + - Generates metadata files and appends them to the ISM objects. + - Writes the processed audio and metadata to output files. + """ + + scenes = list(cfg.scenes.keys()) + logger.info( + f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" + ) + + # extract the number of audio sources + N_inputs = len(np.atleast_1d(scene["input"])) + + # initialize output dirs + ism_format = f"ISM{N_inputs}" + output_filename = Path(scene["output"]).parent / ( + cfg.use_output_prefix + Path(scene["output"]).name + ) + + dir_path = output_filename.parent + if dir_path and not dir_path.exists(): + dir_path.mkdir(parents=True, exist_ok=True) + + # initialize output ISM object + y = audio.ObjectBasedAudio(ism_format) + y.fs = cfg.fs + + # set the frame length + frame_len = int(cfg.fs / 50) + + # repeat for all source files + offset = 0 + for i in range(N_inputs): + # read input filename + source_file = ( + scene["input"][i] if isinstance(scene["input"], list) else scene["input"] + ) + + input_filename = Path(source_file).parent / ( + cfg.use_input_prefix + Path(source_file).name + ) + + # read azimuth and elevation information + if "azimuth" in scene.keys(): + source_azi = ( + scene["azimuth"][i] + if isinstance(scene["azimuth"], list) + else scene["azimuth"] + ) + else: + source_azi = 0.0 + + if "elevation" in scene.keys(): + source_ele = ( + scene["elevation"][i] + if isinstance(scene["elevation"], list) + else scene["elevation"] + ) + else: + source_ele = 0.0 + + # read the source shift length (in seconds) + if "shift" in scene.keys(): + source_shift = ( + scene["shift"][i] + if isinstance(scene["shift"], list) + else scene["shift"] + ) + else: + source_shift = 0.0 + + # convert overlap to samples and ensure it is a multiple of 20ms + source_shift_in_seconds = source_shift + source_shift = source_shift * cfg.fs + if source_shift >= 0: + source_shift = int(np.floor(source_shift / frame_len) * frame_len) + else: + source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + + # read the level + if "level" in scene.keys(): + level = ( + scene["level"][i] + if isinstance(scene["level"], list) + else scene["level"] + ) + else: + level = -26 + + logger.info( + f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds" + ) + + # read source file + x = audio.fromtype("ISM1") + x.audio, x.fs = audiofile.read(input_filename) + + # resample to the target fs if necessary + if x.fs != cfg.fs: + logger.warning( + f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to 
{cfg.fs}!"
+            )
+            resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs)
+            x.audio = resampled_audio
+            x.fs = cfg.fs
+
+        # adjust the level of the audio source file (need to convert to MONO first)
+        x_temp = audio.ChannelBasedAudio("MONO")  # create a temporary mono audio object
+        x_temp.audio = x.audio.copy()
+        x_temp.fs = x.fs
+        x_temp.audio, _ = loudness_norm(x_temp, level, loudness_format="MONO")
+        x.audio = x_temp.audio
+
+        # ensure the length of the audio source signal is a multiple of 20ms
+        if len(x.audio) % frame_len != 0:
+            # pad with zeros to ensure that the signal length is a multiple of 20ms
+            N_pad = int(frame_len - len(x.audio) % frame_len)
+            x.audio = audioarray.trim(
+                x.audio, x.fs, limits=[0, -N_pad], samples=True
+            )
+
+        # get the number of frames (multiple of 20ms)
+        N_frames = int(len(x.audio) / frame_len)
+
+        # convert azimuth information in case of moving object
+        if isinstance(source_azi, str):
+            if ":" in source_azi:
+                # convert into array (initial_value:step:stop_value)
+                start_str, step_str, stop_str = source_azi.split(":")
+                start = float(eval(start_str))
+                step = float(eval(step_str))
+                stop = float(eval(stop_str))
+                azi = np.arange(start, stop, step)
+
+                # adjust length to N_frames
+                if len(azi) > N_frames:
+                    azi = azi[:N_frames]
+                elif len(azi) < N_frames:
+                    azi = np.append(azi, np.full(N_frames - len(azi), azi[-1]))
+            else:
+                # replicate static azimuth value N_frames times
+                azi = np.repeat(float(eval(source_azi)), N_frames)
+        else:
+            # replicate static azimuth value N_frames times
+            azi = np.repeat(float(source_azi), N_frames)
+
+        # convert azimuth from 0 .. 360 to -180 .. +180
+        azi = (azi + 180) % 360 - 180
+
+        # check if azimuth is from -180 .. +180
+        if any(azi > 180) or any(azi < -180):
+            logger.error(
+                f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}"
+            )
+
+        # convert elevation information in case of moving object
+        if isinstance(source_ele, str):
+            if ":" in source_ele:
+                # convert into array (initial_value:step:stop_value)
+                start_str, step_str, stop_str = source_ele.split(":")
+                start = float(eval(start_str))
+                step = float(eval(step_str))
+                stop = float(eval(stop_str))
+                ele = np.arange(start, stop, step)
+
+                # adjust length to N_frames
+                if len(ele) > N_frames:
+                    ele = ele[:N_frames]
+                elif len(ele) < N_frames:
+                    ele = np.append(ele, np.full(N_frames - len(ele), ele[-1]))
+
+            else:
+                # replicate static elevation value N_frames times
+                ele = np.repeat(float(eval(source_ele)), N_frames)
+        else:
+            # replicate static elevation value N_frames times
+            ele = np.repeat(float(source_ele), N_frames)
+
+        # wrap elevation angle to -90 .. +90
+        ele = ((ele + 90) % 180) - 90
+
+        # check if elevation is from -90 ..
+90 + if any(ele > 90) or any(ele < -90): + logger.error( + f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}" + ) + + # generate radius vector with all values equal to 1.0 + rad = np.ones(N_frames) + + # arrange all metadata fields column-wise into a matrix + x.object_pos.append(np.column_stack((azi, ele, rad))) + + # copy new audio source signal to the ISMn object + if y.audio is None: + # add the first audio source signal to the array of all source signals + y.audio = x.audio.copy() + y.object_pos = x.object_pos.copy() + y.fs = x.fs + + if source_shift < 0: + # insert zeros to the new audio source signal to shift it right + metadata.trim_meta(y, limits=[source_shift, 0], samples=True) + else: + offset = source_shift + else: + # shift the beginning of the audio source signal + delta_offset = source_shift - offset + if delta_offset > 0: + # insert zeros to the previous ISM signal(s) to shift them right + metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True) + offset = source_shift + else: + # insert zeros to the new audio source signal to shift it right + metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) + + # adjust the length of the audio source signal + delta_length = len(x.audio) - len(y.audio) + if delta_length > 0: + # pad zeros to the previous ISM signal(s) + metadata.trim_meta(y, limits=[0, -delta_length], samples=True) + else: + # pad zeros to the new audio source signal + metadata.trim_meta(x, limits=[0, delta_length], samples=True) + + y.audio = np.append(y.audio, x.audio, axis=1) + y.object_pos.extend(x.object_pos) + + # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) + y.metadata_files.insert(i, str(output_filename.with_suffix(f".{i}.csv"))) + + # append pre-amble and post-amble + if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: + preamble = int( + np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms + postamble = int( + np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms + if preamble != 0 or postamble != 0: + logger.info( + f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" + ) + metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True) + + # add random noise + if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: + # create uniformly distributed noise between -4 and 4 + np.random.seed(SEED_RANDOM_NOISE) + noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") + y.audio += noise + + # adjust the length of the output signal + if "duration" in cfg.__dict__: + # trim the output signal such that the total duration is X seconds + duration = int(cfg.duration * cfg.fs) # convert to samples + else: + # do not change the length of the audio signal + duration = len(y.audio) + duration = int( + np.floor(duration / frame_len) * frame_len + ) # ensure multiple of 20ms + if len(y.audio) != duration: + metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) + + # adjust the loudness of the output signal + if "loudness" in cfg.__dict__: + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") + + # apply fade-in and fade-out + if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: + logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") + y.audio = 
audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) + + # write the ISMn output to .wav file in an interleaved format and ISM metadata in .csv files + audiofile.write(output_filename, y.audio, y.fs) + metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files) + + # convert to BINAURAL, if option was chosen + if cfg.binaural_output: + binaural_output_filename = output_filename.with_name( + output_filename.stem + "_BINAURAL" + output_filename.suffix + ) + logger.info( + f"-- Converting to BINAURAL output file: {binaural_output_filename}" + ) + binaudio = audio.fromtype("BINAURAL") + binaudio.fs = y.fs + convert_objectbased(y, binaudio) + audiofile.write( + binaural_output_filename, + binaudio.audio, + binaudio.fs, + ) diff --git a/ivas_processing_scripts/generation/generate_omasa_items.py b/ivas_processing_scripts/generation/generate_omasa_items.py new file mode 100644 index 0000000000000000000000000000000000000000..ed48c37b7ee3a7213bf0c0b868901dcf1e9a19d0 --- /dev/null +++ b/ivas_processing_scripts/generation/generate_omasa_items.py @@ -0,0 +1,501 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. 
+#
+
+import logging
+import sys
+from itertools import groupby, repeat
+from pathlib import Path
+
+import numpy as np
+
+from ivas_processing_scripts.audiotools import audio, audioarray, audiofile, metadata
+from ivas_processing_scripts.audiotools.convert.omasa import convert_omasa
+from ivas_processing_scripts.audiotools.convert.scenebased import render_sba_to_masa
+from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm
+from ivas_processing_scripts.generation import config
+from ivas_processing_scripts.utils import apply_func_parallel
+
+SEED_RANDOM_NOISE = 0
+
+
+# function for finding runs of the same character and replacing each run with another string
+def replace_char_seq_with_string(s, char_seq, repl_str):
+    result = []
+
+    # find groups of consecutive letters
+    groups = ["".join(list(g)) for k, g in groupby(s)]
+
+    # limit the length of the replacement string by the length of the character sequence
+    repl_str = repl_str[: len(char_seq)]
+
+    # replace each occurrence of the sequence of characters
+    for g in groups:
+        if char_seq in g:
+            result.append(repl_str)
+        else:
+            result.append(g)
+
+    return "".join(result)
+
+
+# function for appending string to a filename before file extension
+def append_str_filename(filename, str_to_append):
+    p = Path(filename)
+    # Combine the stem, the string to append, and the suffix
+    return p.parent / (p.stem + str_to_append + p.suffix)
+
+
+def generate_omasa_items(
+    cfg: config.TestConfig,
+    logger: logging.Logger,
+):
+    """Generate OMASA items with metadata from FOA/HOA2/HOA3 and ISMn items based on scene description"""
+
+    # set the fs
+    if "fs" not in cfg.__dict__:
+        cfg.fs = 48000
+
+    # set the listening lab designator
+    if "listening_lab" not in cfg.__dict__:
+        cfg.listening_lab = "l"
+
+    # set the language designator
+    if "language" not in cfg.__dict__:
+        cfg.language = "EN"
+
+    # set the experiment designator
+    if "exp" not in cfg.__dict__:
+        cfg.exp = "p07"
+
+    # set the provider
+    if "provider" not in cfg.__dict__:
+        cfg.provider = "g"
+
+    # set the prefix for all input filenames
+    if "use_input_prefix" not in cfg.__dict__:
+        cfg.use_input_prefix = ""
+    else:
+        # replace file designators
+        cfg.use_input_prefix = replace_char_seq_with_string(
+            cfg.use_input_prefix, "l", cfg.listening_lab
+        )
+        cfg.use_input_prefix = replace_char_seq_with_string(
+            cfg.use_input_prefix, "LL", cfg.language
+        )
+        cfg.use_input_prefix = replace_char_seq_with_string(
+            cfg.use_input_prefix, "eee", cfg.exp
+        )
+
+    # set the prefix for all output filenames
+    if "use_output_prefix" not in cfg.__dict__:
+        cfg.use_output_prefix = ""
+    else:
+        # replace file designators
+        cfg.use_output_prefix = replace_char_seq_with_string(
+            cfg.use_output_prefix, "l", cfg.listening_lab
+        )
+        cfg.use_output_prefix = replace_char_seq_with_string(
+            cfg.use_output_prefix, "eee", cfg.exp
+        )
+
+    # set multiprocessing
+    if "multiprocessing" not in cfg.__dict__:
+        cfg.multiprocessing = False
+
+    apply_func_parallel(
+        generate_OMASA_scene,
+        zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)),
+        type="mp" if cfg.multiprocessing else None,
+        show_progress=None,
+    )
+
+    return
+
+
+def generate_OMASA_scene(
+    scene_name: str, scene: dict, cfg: config.TestConfig, logger: logging.Logger
+):
+    """
+    Processes a single scene to generate OMASA items with metadata.
+
+    Args:
+        scene_name (str): The name of the scene being processed.
+ scene (dict): A dictionary containing scene description, including source files, azimuth, elevation, and other parameters. + cfg (config.TestConfig): Configuration object containing settings for processing, such as input/output paths, sampling rate, and loudness levels. + logger (logging.Logger): Logger instance for logging information and errors. + + Expected Behavior: + - Reads audio source files and processes them based on the scene description. + - Generates metadata files and appends them to the OMASA object. + - Writes the processed audio and metadata to output files. + - Handles various audio formats (e.g., MONO, FOA, HOA2) and applies transformations like loudness normalization, trimming, and padding. + """ + + scenes = list(cfg.scenes.keys()) + logger.info( + f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" + ) + + # extract the number of audio sources + N_inputs = len(np.atleast_1d(scene["input"])) + N_ISMs = N_inputs - 1 + + # get output filename + omasa_format = f"ISM{N_ISMs}MASA{cfg.masa_tc}DIR{cfg.masa_dirs}" + output_filename = Path(scene["output"]).parent / ( + cfg.use_output_prefix + Path(scene["output"]).name + ) + + # initialize output dirs + dir_path = output_filename.parent + if dir_path and not dir_path.exists(): + dir_path.mkdir(parents=True, exist_ok=True) + + # initialize output OMASA object + y = audio.OMASAAudio(omasa_format) + y.fs = cfg.fs + + # set the frame length + frame_len = int(cfg.fs / 50) + + # repeat for all source files + offset = 0 + for i in range(N_inputs): + # parse parameters from the scene description + source_file = ( + scene["input"][i] if isinstance(scene["input"], list) else scene["input"] + ) + + # get input filename + input_filename = Path(source_file).parent / ( + cfg.use_input_prefix + Path(source_file).name + ) + + # read azimuth and elevation information + if "azimuth" in scene.keys(): + source_azi = ( + scene["azimuth"][i] + if isinstance(scene["azimuth"], list) + else scene["azimuth"] + ) + else: + source_azi = 0.0 + + if "elevation" in scene.keys(): + source_ele = ( + scene["elevation"][i] + if isinstance(scene["elevation"], list) + else scene["elevation"] + ) + else: + source_ele = 0.0 + + # read the source shift length (in seconds) + if "shift" in scene.keys(): + source_shift = ( + scene["shift"][i] + if isinstance(scene["shift"], list) + else scene["shift"] + ) + else: + source_shift = 0.0 + + # convert overlap to samples and ensure it is a multiple of 20ms + source_shift_in_seconds = source_shift + source_shift = source_shift * cfg.fs + if source_shift >= 0: + source_shift = int(np.floor(source_shift / frame_len) * frame_len) + else: + source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + + # read the level + if "level" in scene.keys(): + level = ( + scene["level"][i] + if isinstance(scene["level"], list) + else scene["level"] + ) + else: + level = -26 + + logger.info( + f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds" + ) + + # get the number of channels from the .wav file header + wav_header = audiofile.parse_wave_header(input_filename) + N_channels = wav_header["channels"] + + if N_channels == 1: + fmt = "MONO" + elif N_channels == 2: + fmt = "STEREO" + elif N_channels == 4: + fmt = "FOA" + elif N_channels == 9: + fmt = "HOA2" + elif N_channels == 16: + fmt = "HOA3" + else: + logger.error( + f"Error: Input format of the source file with {N_channels} 
channels is not supported!"
+            )
+            sys.exit(-1)
+
+        # read source file
+        x = audio.fromfile(fmt, input_filename)
+
+        # resample to the target fs if necessary
+        if x.fs != cfg.fs:
+            logger.warning(
+                f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!"
+            )
+            resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs)
+            x.audio = resampled_audio
+            x.fs = cfg.fs
+
+        # adjust the level of the source file
+        if fmt in ["FOA", "HOA2", "HOA3"]:
+            x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")
+        else:
+            x.audio, _ = loudness_norm(x, level, loudness_format="MONO")
+
+        # ensure the length of the audio source signal is a multiple of 20ms
+        if len(x.audio) % frame_len != 0:
+            # pad with zeros to ensure that the signal length is a multiple of 20ms
+            N_pad = int(frame_len - len(x.audio) % frame_len)
+            x.audio = audioarray.trim(
+                x.audio, x.fs, limits=[0, -N_pad], samples=True
+            )
+
+        # get the number of frames (multiple of 20ms)
+        N_frames = int(len(x.audio) / frame_len)
+
+        # convert the input audio source signal to MASA or ISM
+        if fmt in ["FOA", "HOA2", "HOA3"]:
+            # convert FOA/HOA2/HOA3 to MASA
+            x_masa = audio.MetadataAssistedSpatialAudio(
+                f"MASA{cfg.masa_tc}DIR{cfg.masa_dirs}"
+            )
+            x_masa.fs = cfg.fs
+            # generate MASA metadata filename (should end with .met)
+            x_masa.metadata_file = output_filename.with_suffix(".met")
+            render_sba_to_masa(x, x_masa)
+            x = x_masa  # replace x with the MASA object
+        elif fmt == "MONO":
+            # convert MONO to ISM1
+            x_ism = audio.ObjectBasedAudio("ISM1")  # ISM with 1 channel
+            x_ism.fs = cfg.fs
+            x_ism.audio = x.audio.copy()
+
+            # convert azimuth information in case of moving object
+            if isinstance(source_azi, str):
+                if ":" in source_azi:
+                    # convert into array (initial_value:step:stop_value)
+                    start_str, step_str, stop_str = source_azi.split(":")
+                    start = float(eval(start_str))
+                    step = float(eval(step_str))
+                    stop = float(eval(stop_str))
+                    azi = np.arange(start, stop, step)
+
+                    # adjust length to N_frames
+                    if len(azi) > N_frames:
+                        azi = azi[:N_frames]
+                    elif len(azi) < N_frames:
+                        azi = np.append(azi, np.full(N_frames - len(azi), azi[-1]))
+                else:
+                    # replicate static azimuth value N_frames times
+                    azi = np.repeat(float(eval(source_azi)), N_frames)
+            else:
+                # replicate static azimuth value N_frames times
+                azi = np.repeat(float(source_azi), N_frames)
+
+            # convert azimuth from 0 .. 360 to -180 .. +180
+            azi = (azi + 180) % 360 - 180
+
+            # check if azimuth is from -180 .. +180
+            if any(azi > 180) or any(azi < -180):
+                logger.error(
+                    f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}"
+                )
+
+            # convert elevation information in case of moving object
+            if isinstance(source_ele, str):
+                if ":" in source_ele:
+                    # convert into array (initial_value:step:stop_value)
+                    start_str, step_str, stop_str = source_ele.split(":")
+                    start = float(eval(start_str))
+                    step = float(eval(step_str))
+                    stop = float(eval(stop_str))
+                    ele = np.arange(start, stop, step)
+
+                    # adjust length to N_frames
+                    if len(ele) > N_frames:
+                        ele = ele[:N_frames]
+                    elif len(ele) < N_frames:
+                        ele = np.append(ele, np.full(N_frames - len(ele), ele[-1]))
+
+                else:
+                    # replicate static elevation value N_frames times
+                    ele = np.repeat(float(eval(source_ele)), N_frames)
+            else:
+                # replicate static elevation value N_frames times
+                ele = np.repeat(float(source_ele), N_frames)
+
+            # wrap elevation angle to -90 .. +90
+            ele = ((ele + 90) % 180) - 90
+
+            # check if elevation is from -90 ..
+90 + if any(ele > 90) or any(ele < -90): + logger.error( + f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}" + ) + + # generate radius vector with all values equal to 1.0 + rad = np.ones(N_frames) + + # arrange all metadata fields column-wise into a matrix + x_ism.object_pos.append(np.column_stack((azi, ele, rad))) + + x = x_ism # replace x with the ISM object + + # copy new audio source signal to the OMASA object + if y.audio is None: + # add the first audio source signal (should be MASA) to the array of all source signals + y.audio = x.audio.copy() + + if "MASA" in x.name: + # if MASA, append metadata file to the OMASA object + y.metadata_files.append(x.metadata_file) + else: + # if ISM, append object position to the OMASA object + y.object_pos = x.object_pos.copy() + + if source_shift < 0: + # insert zeros to the new audio source signal to shift it right + metadata.trim_meta(y, limits=[source_shift, 0], samples=True) + else: + offset = source_shift + else: + # shift the beginning of the audio source signal + delta_offset = source_shift - offset + if delta_offset > 0: + # insert zeros to the previous ISM signal(s) to shift them right + metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True) + offset = source_shift + else: + # insert zeros to the new audio source signal to shift it right + metadata.trim_meta(x, limits=[delta_offset, 0], samples=True) + + # adjust the length of the audio source signal + delta_length = len(x.audio) - len(y.audio) + if delta_length > 0: + # pad zeros to the previous ISM signal(s) + metadata.trim_meta(y, limits=[0, -delta_length], samples=True) + else: + # pad zeros to the new audio source signal + metadata.trim_meta(x, limits=[0, delta_length], samples=True) + + # append ISM signal to the OMASA object (ISM comes first !!!) + y.audio = np.insert(y.audio, [i - 1], x.audio, axis=1) + y.object_pos.extend(x.object_pos) + + # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...) 
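+            # note: Path.with_suffix() below replaces the ".wav" extension, so the
+            # generated name is "<stem>.<n>.csv" rather than the "<name>.wav.<n>.csv"
+            # pattern mentioned above; if the ".wav.<n>.csv" form is required, the
+            # index and ".csv" would have to be appended to the full filename instead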
+ y.metadata_files.insert( + i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")) + ) + + # append pre-amble and post-amble + if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: + preamble = int( + np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms + postamble = int( + np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms + if preamble != 0 or postamble != 0: + logger.info( + f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" + ) + metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True) + + # add random noise + if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: + # create uniformly distributed noise between -4 and 4 + np.random.seed(SEED_RANDOM_NOISE) + noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") + y.audio += noise + + # adjust the length of the output signal + if "duration" in cfg.__dict__: + # trim the output signal such that the total duration is X seconds + duration = int(cfg.duration * cfg.fs) # convert to samples + else: + # do not change the length of the audio signal + duration = len(y.audio) + duration = int( + np.floor(duration / frame_len) * frame_len + ) # ensure multiple of 20ms + if len(y.audio) != duration: + metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) + + # adjust the loudness of the output signal + if "loudness" in cfg.__dict__: + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") + + # apply fade-in and fade-out + if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: + logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") + y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) + + # write the OMASA audio output to .wav file in an interleaved format and ISM metadata in .csv files + audiofile.write(output_filename, y.audio, y.fs) + metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files[:-1]) + + # convert to OMASA output to BINAURAL, if option was chosen + if cfg.binaural_output: + binaural_output_filename = output_filename.with_name( + output_filename.stem + "_BINAURAL" + output_filename.suffix + ) + logger.info( + f"-- Converting to BINAURAL output file: {binaural_output_filename}" + ) + binaudio = audio.fromtype("BINAURAL") + binaudio.fs = y.fs + convert_omasa(y, binaudio) + audiofile.write( + binaural_output_filename, + binaudio.audio, + binaudio.fs, + ) diff --git a/ivas_processing_scripts/generation/generate_osba_items.py b/ivas_processing_scripts/generation/generate_osba_items.py new file mode 100644 index 0000000000000000000000000000000000000000..815be0b563beda1b33dc37f6aef3af81f983af75 --- /dev/null +++ b/ivas_processing_scripts/generation/generate_osba_items.py @@ -0,0 +1,479 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. 
+# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository retain full ownership rights in their respective contributions in
+# the software. This notice grants no license of any kind, including but not limited to patent
+# license, nor is any license granted by implication, estoppel or otherwise.
+#
+# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
+# contributions.
+#
+# This software is provided "AS IS", without any express or implied warranties. The software is in the
+# development stage. It is intended exclusively for experts who have experience with such software and
+# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+# and fitness for a particular purpose are hereby disclaimed and excluded.
+#
+# Any dispute, controversy or claim arising under or in relation to providing this software shall be
+# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+# the United Nations Convention on Contracts on the International Sales of Goods.
+#
+
+import logging
+import sys
+from itertools import groupby, repeat
+from pathlib import Path
+
+import numpy as np
+
+from ivas_processing_scripts.audiotools import audio, audioarray, audiofile, metadata
+from ivas_processing_scripts.audiotools.convert.osba import convert_osba
+from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm
+from ivas_processing_scripts.generation import config
+from ivas_processing_scripts.utils import apply_func_parallel
+
+SEED_RANDOM_NOISE = 0
+
+
+# function for finding runs of the same character and replacing each run with another string
+def replace_char_seq_with_string(s, char_seq, repl_str):
+    result = []
+
+    # find groups of consecutive letters
+    groups = ["".join(list(g)) for k, g in groupby(s)]
+
+    # limit the length of the replacement string by the length of the character sequence
+    repl_str = repl_str[: len(char_seq)]
+
+    # replace each occurrence of the sequence of characters
+    for g in groups:
+        if char_seq in g:
+            result.append(repl_str)
+        else:
+            result.append(g)
+
+    return "".join(result)
+
+
+# function for appending string to a filename before file extension
+def append_str_filename(filename, str_to_append):
+    p = Path(filename)
+    return p.parent / (p.stem + str_to_append + p.suffix)
+
+
+def generate_osba_items(
+    cfg: config.TestConfig,
+    logger: logging.Logger,
+):
+    """Generate OSBA items from FOA/HOA2/HOA3 and ISMn items based on scene description"""
+
+    # set the fs
+    if "fs" not in cfg.__dict__:
+        cfg.fs = 48000
+
+    # set the listening lab designator
+    if "listening_lab" not in cfg.__dict__:
+        cfg.listening_lab = "l"
+
+    # set the language designator
+    if "language" not in cfg.__dict__:
+        cfg.language = "EN"
+
+    # set the experiment designator
+    if "exp" not in cfg.__dict__:
+        cfg.exp = "p07"
+
+    # set the provider
+    if "provider" not in cfg.__dict__:
+        cfg.provider = "g"
+
+    # set the prefix for all input filenames
+    if "use_input_prefix" not in
cfg.__dict__: + cfg.use_input_prefix = "" + else: + # replace file designators + cfg.use_input_prefix = replace_char_seq_with_string( + cfg.use_input_prefix, "l", cfg.listening_lab + ) + cfg.use_input_prefix = replace_char_seq_with_string( + cfg.use_input_prefix, "LL", cfg.language + ) + cfg.use_input_prefix = replace_char_seq_with_string( + cfg.use_input_prefix, "eee", cfg.exp + ) + + # set the prefix for all output filenames + if "use_output_prefix" not in cfg.__dict__: + cfg.use_output_prefix = "" + else: + # replace file designators + cfg.use_output_prefix = replace_char_seq_with_string( + cfg.use_output_prefix, "l", cfg.listening_lab + ) + cfg.use_output_prefix = replace_char_seq_with_string( + cfg.use_output_prefix, "eee", cfg.exp + ) + + # set multiprocessing + if "multiprocessing" not in cfg.__dict__: + cfg.multiprocessing = False + + apply_func_parallel( + generate_OSBA_scene, + zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)), + type="mp" if cfg.multiprocessing else None, + show_progress=None, + ) + + return + + +def generate_OSBA_scene( + scene_name: str, scene: dict, cfg: config.TestConfig, logger: logging.Logger +): + """ + Processes a single scene to generate OSBA item + + Args: + scene_name (str): The name of the scene being processed. + scene (dict): A dictionary containing scene description, including source files, azimuth, elevation, and other parameters. + cfg (config.TestConfig): Configuration object containing settings for processing, such as input/output paths, sampling rate, and loudness levels. + logger (logging.Logger): Logger instance for logging information and errors. + + Expected Behavior: + - Reads audio source files and processes them based on the scene description. + - Generates OSBA object. + - Writes the processed audio to output files. + - Handles various audio formats (e.g., FOA, HOA2, HOA3) and applies transformations like loudness normalization, trimming, and padding. 
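+
+    Illustrative scene entry (YAML; field names as read by this script, values are
+    examples only - the first input is expected to be the FOA/HOA2/HOA3 bed and the
+    remaining inputs MONO objects):
+
+        my_scene:
+          output: "osba_item_s01.wav"
+          input: ["background_hoa2.wav", "talker_f1_mono.wav"]
+          azimuth: [0, "-20:1.0:360"]
+          elevation: [0, 15]
+          level: [-26, -26]
+          shift: [0.0, 1.0]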
+ """ + + scenes = list(cfg.scenes.keys()) + logger.info( + f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" + ) + + # extract the number of audio sources + N_inputs = len(np.atleast_1d(scene["input"])) + N_ISMs = N_inputs - 1 + + # get OSBA format and output filename + osba_format = f"ISM{N_ISMs}SBA{cfg.sba_order}" + output_filename = Path(scene["output"]).parent / ( + cfg.use_output_prefix + Path(scene["output"]).name + ) + + # initialize output dirs + dir_path = output_filename.parent + if dir_path and not dir_path.exists(): + dir_path.mkdir(parents=True, exist_ok=True) + + # initialize output OSBA object + y = audio.OSBAAudio(osba_format) + y.fs = cfg.fs + + # set the frame length + frame_len = int(cfg.fs / 50) + + # repeat for all source files + offset = 0 + for i in range(N_inputs): + # parse parameters from the scene description + source_file = ( + scene["input"][i] if isinstance(scene["input"], list) else scene["input"] + ) + + # get input filename + input_filename = Path(source_file).parent / ( + cfg.use_input_prefix + Path(source_file).name + ) + + # read azimuth and elevation information + source_azi = ( + scene["azimuth"][i] + if isinstance(scene["azimuth"], list) + else scene["azimuth"] + ) + source_ele = ( + scene["elevation"][i] + if isinstance(scene["elevation"], list) + else scene["elevation"] + ) + + # read the overlap length + if "shift" in scene.keys(): + source_shift = ( + scene["shift"][i] + if isinstance(scene["shift"], list) + else scene["shift"] + ) + else: + source_shift = 0.0 + + # convert overlap to samples and ensure it is a multiple of 20ms + source_shift_in_seconds = source_shift + source_shift = source_shift * cfg.fs + if source_shift >= 0: + source_shift = int(np.floor(source_shift / frame_len) * frame_len) + else: + source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + + # read the level + if "level" in scene.keys(): + level = ( + scene["level"][i] + if isinstance(scene["level"], list) + else scene["level"] + ) + else: + level = -26 + + logger.info( + f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds" + ) + + # get the number of channels from the .wav file header + wav_header = audiofile.parse_wave_header(input_filename) + N_channels = wav_header["channels"] + + if N_channels == 1: + fmt = "MONO" + elif N_channels == 2: + fmt = "STEREO" + elif N_channels == 4: + fmt = "FOA" + elif N_channels == 9: + fmt = "HOA2" + elif N_channels == 16: + fmt = "HOA3" + else: + logger.error( + f"Error: Input format of the source file with {N_channels} channels is not supported!" + ) + sys.exit(-1) + + # read source file + x = audio.fromfile(fmt, input_filename) + + # resample to the target fs if necessary + if x.fs != cfg.fs: + logger.warning( + f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!" 
+            )
+            resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs)
+            x.audio = resampled_audio
+            x.fs = cfg.fs
+
+        # adjust the level of the source file
+        if fmt in ["FOA", "HOA2", "HOA3"]:
+            x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")
+        else:
+            x.audio, _ = loudness_norm(x, level, loudness_format="MONO")
+
+        # ensure the length of the audio source signal is a multiple of 20ms
+        if len(x.audio) % frame_len != 0:
+            # pad with zeros to ensure that the signal length is a multiple of 20ms
+            N_pad = int(frame_len - len(x.audio) % frame_len)
+            x.audio = audioarray.trim(
+                x.audio, x.fs, limits=[0, -N_pad], samples=True
+            )
+
+        # get the number of frames (multiple of 20ms)
+        N_frames = int(len(x.audio) / frame_len)
+
+        # convert the input MONO audio source signal to ISM1 object
+        if fmt == "MONO":
+            # convert MONO to ISM1
+            x_ism = audio.ObjectBasedAudio("ISM1")  # ISM with 1 channel
+            x_ism.fs = cfg.fs
+            x_ism.audio = x.audio.copy()
+
+            # convert azimuth information in case of moving object
+            if isinstance(source_azi, str):
+                if ":" in source_azi:
+                    # convert into array (initial_value:step:stop_value)
+                    start_str, step_str, stop_str = source_azi.split(":")
+                    start = float(eval(start_str))
+                    step = float(eval(step_str))
+                    stop = float(eval(stop_str))
+                    azi = np.arange(start, stop, step)
+
+                    # adjust length to N_frames
+                    if len(azi) > N_frames:
+                        azi = azi[:N_frames]
+                    elif len(azi) < N_frames:
+                        azi = np.append(azi, np.full(N_frames - len(azi), azi[-1]))
+                else:
+                    # replicate static azimuth value N_frames times
+                    azi = np.repeat(float(eval(source_azi)), N_frames)
+            else:
+                # replicate static azimuth value N_frames times
+                azi = np.repeat(float(source_azi), N_frames)
+
+            # convert azimuth from 0 .. 360 to -180 .. +180
+            azi = (azi + 180) % 360 - 180
+
+            # check if azimuth is from -180 .. +180
+            if any(azi > 180) or any(azi < -180):
+                logger.error(
+                    f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}"
+                )
+
+            # convert elevation information in case of moving object
+            if isinstance(source_ele, str):
+                if ":" in source_ele:
+                    # convert into array (initial_value:step:stop_value)
+                    start_str, step_str, stop_str = source_ele.split(":")
+                    start = float(eval(start_str))
+                    step = float(eval(step_str))
+                    stop = float(eval(stop_str))
+                    ele = np.arange(start, stop, step)
+
+                    # adjust length to N_frames
+                    if len(ele) > N_frames:
+                        ele = ele[:N_frames]
+                    elif len(ele) < N_frames:
+                        ele = np.append(ele, np.full(N_frames - len(ele), ele[-1]))
+
+                else:
+                    # replicate static elevation value N_frames times
+                    ele = np.repeat(float(eval(source_ele)), N_frames)
+            else:
+                # replicate static elevation value N_frames times
+                ele = np.repeat(float(source_ele), N_frames)
+
+            # wrap elevation angle to -90 .. +90
+            ele = ((ele + 90) % 180) - 90
+
+            # check if elevation is from -90 ..
+90
+            if any(ele > 90) or any(ele < -90):
+                logger.error(
+                    f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}"
+                )
+
+            # generate radius vector with all values equal to 1.0
+            rad = np.ones(N_frames)
+
+            # arrange all metadata fields column-wise into a matrix
+            x_ism.object_pos.append(np.column_stack((azi, ele, rad)))
+
+            x = x_ism  # replace x with the ISM object
+
+        # copy new audio source signal to the OSBA object
+        if y.audio is None:
+            # add the first audio source signal (should be FOA/HOA2/HOA3) to the array of all source signals
+            y.audio = x.audio.copy()
+
+            if fmt == "MONO":
+                # if ISM, append object position to the OSBA object
+                y.object_pos = x.object_pos.copy()
+
+            if source_shift < 0:
+                # insert zeros to the new audio source signal to shift it right
+                metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
+            else:
+                offset = source_shift
+        else:
+            # shift the beginning of the audio source signal
+            delta_offset = source_shift - offset
+            if delta_offset > 0:
+                # insert zeros to the previous ISM signal(s) to shift them right
+                metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True)
+                offset = source_shift
+            else:
+                # insert zeros to the new audio source signal to shift it right
+                metadata.trim_meta(x, limits=[delta_offset, 0], samples=True)
+
+            # adjust the length of the audio source signal
+            delta_length = len(x.audio) - len(y.audio)
+            if delta_length > 0:
+                # pad zeros to the previous ISM signal(s)
+                metadata.trim_meta(y, limits=[0, -delta_length], samples=True)
+            else:
+                # pad zeros to the new audio source signal
+                metadata.trim_meta(x, limits=[0, delta_length], samples=True)
+
+            # append ISM signal to the OSBA object (ISM comes first !!!)
+            y.audio = np.insert(y.audio, [i - 1], x.audio, axis=1)
+            y.object_pos.extend(x.object_pos)
+
+            # add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...)
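+            # note: Path.with_suffix() below replaces the ".wav" extension, so the
+            # generated name is "<stem>.<n>.csv" rather than the "<name>.wav.<n>.csv"
+            # pattern mentioned above; if the ".wav.<n>.csv" form is required, the
+            # index and ".csv" would have to be appended to the full filename instead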
+ y.metadata_files.insert( + i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")) + ) + + # append pre-amble and post-amble + if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__: + preamble = int( + np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms + postamble = int( + np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len + ) # convert to samples and ensure multiple of 20ms + if preamble != 0 or postamble != 0: + logger.info( + f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds" + ) + metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True) + + # add random noise + if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise: + # create uniformly distributed noise between -4 and 4 + np.random.seed(SEED_RANDOM_NOISE) + noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") + y.audio += noise + + # adjust the length of the output signal + if "duration" in cfg.__dict__: + # trim the output signal such that the total duration is X seconds + duration = int(cfg.duration * cfg.fs) # convert to samples + else: + # do not change the length of the audio signal + duration = len(y.audio) + duration = int( + np.floor(duration / frame_len) * frame_len + ) # ensure multiple of 20ms + if len(y.audio) != duration: + metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) + + # adjust the loudness of the output signal + if "loudness" in cfg.__dict__: + logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") + y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") + + # apply fade-in and fade-out + if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: + logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") + y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) + + # write the OSBA audio output to .wav file in an interleaved format and ISM metadata in .csv files + audiofile.write(output_filename, y.audio, y.fs) + metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files) + + # convert the OSBA output to BINAURAL, if option was chosen + if cfg.binaural_output: + binaural_output_filename = output_filename.with_name( + output_filename.stem + "_BINAURAL" + output_filename.suffix + ) + logger.info( + f"-- Converting to BINAURAL output file: {binaural_output_filename}" + ) + binaudio = audio.fromtype("BINAURAL") + binaudio.fs = y.fs + convert_osba(y, binaudio) + audiofile.write( + binaural_output_filename, + binaudio.audio, + binaudio.fs, + ) diff --git a/ivas_processing_scripts/generation/generate_sba_items.py b/ivas_processing_scripts/generation/generate_sba_items.py new file mode 100644 index 0000000000000000000000000000000000000000..6904f107245c3156e3c1dec2d59ef8d42a7d060a --- /dev/null +++ b/ivas_processing_scripts/generation/generate_sba_items.py @@ -0,0 +1,392 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. 
+# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository retain full ownership rights in their respective contributions in
+# the software. This notice grants no license of any kind, including but not limited to patent
+# license, nor is any license granted by implication, estoppel or otherwise.
+#
+# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
+# contributions.
+#
+# This software is provided "AS IS", without any express or implied warranties. The software is in the
+# development stage. It is intended exclusively for experts who have experience with such software and
+# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+# and fitness for a particular purpose are hereby disclaimed and excluded.
+#
+# Any dispute, controversy or claim arising under or in relation to providing this software shall be
+# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+# the United Nations Convention on Contracts on the International Sales of Goods.
+#
+
+import logging
+from itertools import groupby, repeat
+from pathlib import Path
+
+import numpy as np
+
+from ivas_processing_scripts.audiotools import audio, audioarray, audiofile
+from ivas_processing_scripts.audiotools.convert.scenebased import convert_scenebased
+from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm
+from ivas_processing_scripts.audiotools.wrappers.reverb import (
+    reverb_foa,
+    reverb_hoa2,
+    reverb_hoa3,
+)
+from ivas_processing_scripts.generation import config
+from ivas_processing_scripts.utils import apply_func_parallel
+
+SEED_RANDOM_NOISE = 0
+
+
+# function for finding runs of the same character and replacing each run with another string
+def replace_char_seq_with_string(s, char_seq, repl_str):
+    result = []
+
+    # find groups of consecutive letters
+    groups = ["".join(list(g)) for k, g in groupby(s)]
+
+    # limit the length of the replacement string by the length of the character sequence
+    repl_str = repl_str[: len(char_seq)]
+
+    # replace each occurrence of the sequence of characters
+    for g in groups:
+        if char_seq in g:
+            result.append(repl_str)
+        else:
+            result.append(g)
+
+    return "".join(result)
+
+
+def generate_sba_items(
+    cfg: config.TestConfig,
+    logger: logging.Logger,
+):
+    """Generate FOA/HOA2/HOA3 items from mono items based on scene description"""
+
+    # set the fs
+    if "fs" not in cfg.__dict__:
+        cfg.fs = 48000
+
+    # set the IR fs
+    if "IR_fs" not in cfg.__dict__:
+        cfg.IR_fs = 48000
+
+    # set the listening lab designator
+    if "listening_lab" not in cfg.__dict__:
+        cfg.listening_lab = "l"
+
+    # set the language designator
+    if "language" not in cfg.__dict__:
+        cfg.language = "EN"
+
+    # set the experiment designator
+    if "exp" not in cfg.__dict__:
+        cfg.exp = "p04"
+
+    # set the provider
+    if "provider" not in cfg.__dict__:
+        cfg.provider = "g"
+
+    # set the prefix for all input filenames
+    if "use_input_prefix" not in cfg.__dict__:
+        cfg.use_input_prefix = ""
+    else:
# replace file designators + cfg.use_input_prefix = replace_char_seq_with_string( + cfg.use_input_prefix, "l", cfg.listening_lab + ) + cfg.use_input_prefix = replace_char_seq_with_string( + cfg.use_input_prefix, "LL", cfg.language + ) + cfg.use_input_prefix = replace_char_seq_with_string( + cfg.use_input_prefix, "eee", cfg.exp + ) + + # set the prefix for all IR filenames + if "use_IR_prefix" not in cfg.__dict__: + cfg.use_IR_prefix = "" + else: + # replace file designators + cfg.use_IR_prefix = replace_char_seq_with_string( + cfg.use_IR_prefix, "p", cfg.provider + ) + cfg.use_IR_prefix = replace_char_seq_with_string( + cfg.use_IR_prefix, "LL", cfg.language + ) + cfg.use_IR_prefix = replace_char_seq_with_string( + cfg.use_IR_prefix, "eee", cfg.exp + ) + + # set the prefix for all output filenames + if "use_output_prefix" not in cfg.__dict__: + cfg.use_output_prefix = "" + else: + # replace file designators + cfg.use_output_prefix = replace_char_seq_with_string( + cfg.use_output_prefix, "l", cfg.listening_lab + ) + cfg.use_output_prefix = replace_char_seq_with_string( + cfg.use_output_prefix, "eee", cfg.exp + ) + + # set multiprocessing + if "multiprocessing" not in cfg.__dict__: + cfg.multiprocessing = False + + apply_func_parallel( + generate_sba_scene, + zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)), + type="mp" if cfg.multiprocessing else None, + show_progress=None, + ) + + return + + +def generate_sba_scene( + scene_name: str, scene: dict, cfg: config.TestConfig, logger: logging.Logger +): + """ + Processes a single scene to generate FOA/HOA2/HOA3 item. + + Args: + scene_name (str): The name of the scene being processed. + scene (dict): A dictionary containing scene description, including source files, azimuth, elevation, and other parameters. + cfg (config.TestConfig): Configuration object containing settings for processing, such as input/output paths, sampling rate, and loudness levels. + logger (logging.Logger): Logger instance for logging information and errors. + + Expected Behavior: + - Reads mono audio source files and processes them based on the scene description. + - Writes the processed FOA/HOA2/HOA3 audio to the output file. 
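+
+    Illustrative scene entry (YAML; field names as read by this script, values are
+    examples only - each IR must already be in the target FOA/HOA2/HOA3 format):
+
+        my_scene:
+          output: "sba_item_s01.wav"
+          input: ["talker_f1_mono.wav", "talker_m1_mono.wav"]
+          IR: ["room1_pos1_IR.wav", "room1_pos2_IR.wav"]
+          level: [-26, -26]
+          shift: [0.0, 4.0]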
+ """ + + scenes = list(cfg.scenes.keys()) + logger.info( + f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" + ) + + # extract the number of audio sources + N_inputs = len(np.atleast_1d(scene["input"])) + + # get the output filename + output_filename = Path(scene["output"]).parent / ( + cfg.use_output_prefix + Path(scene["output"]).name + ) + + # initialize output dirs + dir_path = output_filename.parent + if dir_path and not dir_path.exists(): + dir_path.mkdir(parents=True, exist_ok=True) + + # initialize output SBA object + y = audio.SceneBasedAudio(cfg.format) + y.fs = cfg.fs + + # set the frame length + frame_len = int(cfg.fs / 50) + + # repeat for all source files + offset = 0 + for i in range(N_inputs): + # parse parameters from the scene description + source_file = ( + scene["input"][i] if isinstance(scene["input"], list) else scene["input"] + ) + IR_file = scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"] + + # get input filename and IR filename + input_filename = Path(source_file).parent / ( + cfg.use_input_prefix + Path(source_file).name + ) + IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name) + + # read the overlap length + if "shift" in scene.keys(): + source_shift = ( + scene["shift"][i] + if isinstance(scene["shift"], list) + else scene["shift"] + ) + else: + source_shift = 0.0 + + # convert overlap to samples and ensure it is a multiple of 20ms + source_shift_in_seconds = source_shift + source_shift = source_shift * cfg.fs + if source_shift >= 0: + source_shift = int(np.floor(source_shift / frame_len) * frame_len) + else: + source_shift = int(np.ceil(source_shift / frame_len) * frame_len) + + # read the level + if "level" in scene.keys(): + level = ( + scene["level"][i] + if isinstance(scene["level"], list) + else scene["level"] + ) + else: + level = -26 + + logger.info( + f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds" + ) + + # read source file + x = audio.fromfile("MONO", input_filename) + + # resample to the target fs if necessary + if x.fs != cfg.fs: + logger.warning( + f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!" + ) + resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs) + x.audio = resampled_audio + x.fs = cfg.fs + + # read the IR file (!must be in target format!) 
+        IR = audio.fromfile(cfg.format, IR_filename)
+
+        # convolve MONO source audio with FOA/HOA2/HOA3 IR -> results in FOA/HOA2/HOA3 audio object
+        if cfg.format == "FOA":
+            x = reverb_foa(x, IR)
+        elif cfg.format == "HOA2":
+            x = reverb_hoa2(x, IR)
+        elif cfg.format == "HOA3":
+            x = reverb_hoa3(x, IR)
+
+        # adjust the level of the FOA/HOA2/HOA3 signal
+        x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")
+
+        # ensure the length of the audio source signal is a multiple of 20ms
+        if len(x.audio) % frame_len != 0:
+            # pad with zeros such that the signal length is a multiple of 20ms
+            N_pad = int(frame_len - len(x.audio) % frame_len)
+            x.audio = audioarray.trim(
+                x.audio, x.fs, limits=[0, -N_pad], samples=True
+            )
+
+        # add the convolved FOA/HOA2/HOA3 audio source signal to the output signal
+        if y.audio is None:
+            # add source signal to the array of all source signals
+            y.audio = x.audio.copy()
+
+            if source_shift < 0:
+                # insert zeros to the new audio source signal to shift it right
+                y.audio = audioarray.trim(
+                    y.audio, y.fs, limits=[source_shift, 0], samples=True
+                )
+            else:
+                offset = source_shift
+        else:
+            # shift the beginning of the audio source signal
+            delta_offset = source_shift - offset
+            if delta_offset > 0:
+                # insert zeros to the existing output signal to shift it right
+                y.audio = audioarray.trim(
+                    y.audio, y.fs, limits=[0, -delta_offset], samples=True
+                )
+                offset = source_shift
+            else:
+                # insert zeros to the new audio source signal to shift it right
+                x.audio = audioarray.trim(
+                    x.audio, x.fs, limits=[0, delta_offset], samples=True
+                )
+
+            # adjust the length of the audio source signal
+            delta_length = len(x.audio) - len(y.audio)
+            if delta_length > 0:
+                # pad zeros to the existing output signal
+                y.audio = audioarray.trim(
+                    y.audio, y.fs, limits=[0, -delta_length], samples=True
+                )
+            else:
+                # pad zeros to the new audio source signal
+                x.audio = audioarray.trim(
+                    x.audio, x.fs, limits=[0, delta_length], samples=True
+                )
+
+            # superimpose
+            y.audio += x.audio
+
+    # append pre-amble and post-amble
+    if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__:
+        preamble_in_seconds = getattr(cfg, "preamble", 0.0)
+        postamble_in_seconds = getattr(cfg, "postamble", 0.0)
+        preamble = int(
+            np.floor(preamble_in_seconds * cfg.fs / frame_len) * frame_len
+        )  # convert to samples and ensure a multiple of 20ms
+        postamble = int(
+            np.floor(postamble_in_seconds * cfg.fs / frame_len) * frame_len
+        )  # convert to samples and ensure a multiple of 20ms
+        if preamble != 0 or postamble != 0:
+            logger.info(
+                f"-- Adding pre-amble of {preamble_in_seconds} seconds and post-amble of {postamble_in_seconds} seconds"
+            )
+            y.audio = audioarray.trim(
+                y.audio, y.fs, limits=[-preamble, -postamble], samples=True
+            )
+
+    # add low-level random noise
+    if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise:
+        # create uniformly distributed integer noise between -4 and 4
+        np.random.seed(SEED_RANDOM_NOISE)
+        noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")
+        y.audio += noise
+
+    # adjust the length of the output signal
+    if "duration" in cfg.__dict__:
+        # trim the output signal such that the total duration is X seconds
+        duration = int(cfg.duration * cfg.fs)  # convert to samples
+    else:
+        # do not change the length of the audio signal
+        duration = len(y.audio)
+    duration = int(
+        np.floor(duration / frame_len) * frame_len
+    )  # ensure a multiple of 20ms
+    if len(y.audio) != duration:
+        y.audio = audioarray.trim(
+            y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True
+        )
+
+    # adjust the loudness of the output signal
+    if "loudness" in cfg.__dict__:
+        logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS")
+        y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL")
+
+    # apply fade-in and fade-out
+    if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0:
+        logger.info(f"-- Applying fade-in and fade-out of {cfg.fade_in_out} seconds")
+        y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000)
+
+    # write the FOA/HOA2/HOA3 audio signal into the output file
+    audiofile.write(output_filename, y.audio, y.fs)
+
+    # convert to BINAURAL, if the option was chosen
+    if getattr(cfg, "binaural_output", False):
+        binaural_output_filename = output_filename.with_name(
+            output_filename.stem + "_BINAURAL" + output_filename.suffix
+        )
+        logger.info(
+            f"-- Converting to BINAURAL output file: {binaural_output_filename}"
+        )
+        binaudio = audio.fromtype("BINAURAL")
+        binaudio.fs = y.fs
+        convert_scenebased(y, binaudio)
+        audiofile.write(
+            binaural_output_filename,
+            binaudio.audio,
+            binaudio.fs,
+        )
diff --git a/ivas_processing_scripts/generation/generate_stereo_items.py b/ivas_processing_scripts/generation/generate_stereo_items.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6208096b5848aedc8f0eb26b526f1fc975aeb06
--- /dev/null
+++ b/ivas_processing_scripts/generation/generate_stereo_items.py
@@ -0,0 +1,376 @@
+#!/usr/bin/env python3
+
+#
+# (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository. All Rights Reserved.
+#
+# This software is protected by copyright law and by international treaties.
+# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository retain full ownership rights in their respective contributions in
+# the software. This notice grants no license of any kind, including but not limited to patent
+# license, nor is any license granted by implication, estoppel or otherwise.
+#
+# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
+# contributions.
+#
+# This software is provided "AS IS", without any express or implied warranties. The software is in the
+# development stage. It is intended exclusively for experts who have experience with such software and
+# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+# and fitness for a particular purpose are hereby disclaimed and excluded.
+#
+# Any dispute, controversy or claim arising under or in relation to providing this software shall be
+# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+# the United Nations Convention on Contracts on the International Sales of Goods.
+#
+
+import logging
+import os
+from itertools import groupby, repeat
+from pathlib import Path
+
+import numpy as np
+
+from ivas_processing_scripts.audiotools import audio, audioarray, audiofile
+from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm
+from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_stereo
+from ivas_processing_scripts.generation import config
+from ivas_processing_scripts.utils import apply_func_parallel
+
+SEED_RANDOM_NOISE = 0
+
+
+# function for converting an nd numpy array to strings with 2 decimal digits
+def csv_formatdata(data):
+    for row in data:
+        yield ["%0.2f" % v for v in row]
+
+
+# function for finding runs of the same character and replacing them with another string
+def replace_char_seq_with_string(s, char_seq, repl_str):
+    result = []
+
+    # find groups of consecutive letters
+    groups = ["".join(list(g)) for k, g in groupby(s)]
+
+    # limit the length of the replacement string to the length of the character sequence
+    repl_str = repl_str[: len(char_seq)]
+
+    # replace each occurrence of the character sequence
+    for g in groups:
+        if char_seq in g:
+            result.append(repl_str)
+        else:
+            result.append(g)
+
+    return "".join(result)
+
+
+def generate_stereo_items(
+    cfg: config.TestConfig,
+    logger: logging.Logger,
+):
+    """Generate STEREO items from mono items based on the scene description"""
+
+    # set the fs
+    if "fs" not in cfg.__dict__:
+        cfg.fs = 48000
+
+    # set the IR fs
+    if "IR_fs" not in cfg.__dict__:
+        cfg.IR_fs = 48000
+
+    # set the IR path
+    if "IR_path" not in cfg.__dict__:
+        cfg.IR_path = os.path.join(os.path.dirname(__file__), "IRs")
+
+    # set the listening lab designator
+    if "listening_lab" not in cfg.__dict__:
+        cfg.listening_lab = "l"
+
+    # set the language designator
+    if "language" not in cfg.__dict__:
+        cfg.language = "EN"
+
+    # set the experiment designator
+    if "exp" not in cfg.__dict__:
+        cfg.exp = "p01"
+
+    # set the provider
+    if "provider" not in cfg.__dict__:
+        cfg.provider = "g"
+
+    # set the prefix for all input filenames
+    if "use_input_prefix" not in cfg.__dict__:
+        cfg.use_input_prefix = ""
+    else:
+        # replace file designators
+        cfg.use_input_prefix = replace_char_seq_with_string(
+            cfg.use_input_prefix, "l", cfg.listening_lab
+        )
+        cfg.use_input_prefix = replace_char_seq_with_string(
+            cfg.use_input_prefix, "LL", cfg.language
+        )
+        cfg.use_input_prefix = replace_char_seq_with_string(
+            cfg.use_input_prefix, "eee", cfg.exp
+        )
+
+    # set the prefix for all IR filenames
+    if "use_IR_prefix" not in cfg.__dict__:
+        cfg.use_IR_prefix = ""
+    else:
+        # replace file designators
+        cfg.use_IR_prefix = replace_char_seq_with_string(
+            cfg.use_IR_prefix, "p", cfg.provider
+        )
+        cfg.use_IR_prefix = replace_char_seq_with_string(
+            cfg.use_IR_prefix, "LL", cfg.language
+        )
+        cfg.use_IR_prefix = replace_char_seq_with_string(
+            cfg.use_IR_prefix, "eee", cfg.exp
+        )
+
+    # set the prefix for all output filenames
+    if "use_output_prefix" not in cfg.__dict__:
+        cfg.use_output_prefix = ""
+    else:
+        # replace file designators
+        cfg.use_output_prefix = replace_char_seq_with_string(
+            cfg.use_output_prefix, "l", cfg.listening_lab
+        )
+        cfg.use_output_prefix = replace_char_seq_with_string(
+            cfg.use_output_prefix, "eee", cfg.exp
+        )
+
+    # set multiprocessing
+    if "multiprocessing" not in cfg.__dict__:
+        cfg.multiprocessing = False
+
+    apply_func_parallel(
+        generate_stereo_scene,
+        zip(cfg.scenes.keys(), cfg.scenes.values(), repeat(cfg), repeat(logger)),
+        type="mp" if cfg.multiprocessing else None,
+        show_progress=None,
+    )
+
+    return
+
+
+def generate_stereo_scene(
+    scene_name: str, scene: dict, cfg: config.TestConfig, logger: logging.Logger
+):
+    """
+    Processes a single scene to generate a STEREO item.
+
+    Args:
+        scene_name (str): The name of the scene being processed.
+        scene (dict): A dictionary containing the scene description, including source files, IRs, shifts and levels.
+        cfg (config.TestConfig): Configuration object containing settings for processing, such as input/output paths, sampling rate, and loudness levels.
+        logger (logging.Logger): Logger instance for logging information and errors.
+
+    Expected Behavior:
+        - Reads mono audio source files and processes them based on the scene description.
+        - Writes the processed STEREO audio to the output file.
+    """
+
+    scenes = list(cfg.scenes.keys())
+    logger.info(
+        f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}"
+    )
+
+    # extract the number of audio sources
+    N_inputs = len(np.atleast_1d(scene["input"]))
+
+    # get the output filename
+    output_filename = Path(scene["output"]).parent / (
+        cfg.use_output_prefix + Path(scene["output"]).name
+    )
+
+    # initialize output dirs
+    dir_path = output_filename.parent
+    if dir_path and not dir_path.exists():
+        dir_path.mkdir(parents=True, exist_ok=True)
+
+    # initialize output STEREO object
+    y = audio.ChannelBasedAudio(cfg.format)
+    y.fs = cfg.fs
+
+    # set the frame length (20 ms)
+    frame_len = int(cfg.fs / 50)
+
+    # repeat for all source files
+    offset = 0
+    for i in range(N_inputs):
+        # parse parameters from the scene description
+        source_file = (
+            scene["input"][i] if isinstance(scene["input"], list) else scene["input"]
+        )
+        IR_file = scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"]
+
+        # get input filename and IR filename
+        input_filename = Path(source_file).parent / (
+            cfg.use_input_prefix + Path(source_file).name
+        )
+        IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)
+
+        # read the shift value (controls the overlap between consecutive sources)
+        if "shift" in scene:
+            source_shift = (
+                scene["shift"][i]
+                if isinstance(scene["shift"], list)
+                else scene["shift"]
+            )
+        else:
+            source_shift = 0.0
+
+        # convert the shift to samples and ensure it is a multiple of 20ms
+        source_shift_in_seconds = source_shift
+        source_shift = source_shift * cfg.fs
+        if source_shift >= 0:
+            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
+        else:
+            source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
+
+        # read the level
+        if "level" in scene:
+            level = (
+                scene["level"][i]
+                if isinstance(scene["level"], list)
+                else scene["level"]
+            )
+        else:
+            level = -26
+
+        logger.info(
+            f"-- Convolving {source_file} with {IR_file} at {level} LUFS with a shift of {source_shift_in_seconds} seconds"
+        )
+
+        # read the source file
+        x = audio.fromfile("MONO", input_filename)
+
+        # resample to the target fs if necessary
+        if x.fs != cfg.fs:
+            logger.warning(
+                f"Sample rate of the audio source is {x.fs} Hz; resampling to {cfg.fs} Hz!"
+            )
+            x.audio = audioarray.resample(x.audio, x.fs, cfg.fs)
+            x.fs = cfg.fs
+
+        # read the IR file (must be in STEREO format!)
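+        # NOTE: the IR must be a two-channel (STEREO) file; a mono IR here
+        # would produce the wrong channel count in the convolution below.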
+        IR = audio.fromfile("STEREO", IR_filename)
+
+        # convolve MONO source audio with STEREO IR -> results in STEREO audio object
+        x = reverb_stereo(x, IR)
+
+        # adjust the level of the STEREO signal
+        x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")
+
+        # ensure the length of the audio source signal is a multiple of 20ms
+        if len(x.audio) % frame_len != 0:
+            # pad with zeros such that the signal length is a multiple of 20ms
+            N_pad = int(frame_len - len(x.audio) % frame_len)
+            x.audio = audioarray.trim(
+                x.audio, x.fs, limits=[0, -N_pad], samples=True
+            )
+
+        # add the convolved STEREO audio source signal to the output signal
+        if y.audio is None:
+            # add source signal to the array of all source signals
+            y.audio = x.audio.copy()
+
+            if source_shift < 0:
+                # insert zeros to the new audio source signal to shift it right
+                y.audio = audioarray.trim(
+                    y.audio, y.fs, limits=[source_shift, 0], samples=True
+                )
+            else:
+                offset = source_shift
+        else:
+            # shift the beginning of the audio source signal
+            delta_offset = source_shift - offset
+            if delta_offset > 0:
+                # insert zeros to the existing output signal to shift it right
+                y.audio = audioarray.trim(
+                    y.audio, y.fs, limits=[0, -delta_offset], samples=True
+                )
+                offset = source_shift
+            else:
+                # insert zeros to the new audio source signal to shift it right
+                x.audio = audioarray.trim(
+                    x.audio, x.fs, limits=[0, delta_offset], samples=True
+                )
+
+            # adjust the length of the audio source signal
+            delta_length = len(x.audio) - len(y.audio)
+            if delta_length > 0:
+                # pad zeros to the existing output signal
+                y.audio = audioarray.trim(
+                    y.audio, y.fs, limits=[0, -delta_length], samples=True
+                )
+            else:
+                # pad zeros to the new audio source signal
+                x.audio = audioarray.trim(
+                    x.audio, x.fs, limits=[0, delta_length], samples=True
+                )
+
+            # superimpose
+            y.audio += x.audio
+
+    # append pre-amble and post-amble
+    if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__:
+        preamble_in_seconds = getattr(cfg, "preamble", 0.0)
+        postamble_in_seconds = getattr(cfg, "postamble", 0.0)
+        preamble = int(
+            np.floor(preamble_in_seconds * cfg.fs / frame_len) * frame_len
+        )  # convert to samples and ensure a multiple of 20ms
+        postamble = int(
+            np.floor(postamble_in_seconds * cfg.fs / frame_len) * frame_len
+        )  # convert to samples and ensure a multiple of 20ms
+        if preamble != 0 or postamble != 0:
+            logger.info(
+                f"-- Adding pre-amble of {preamble_in_seconds} seconds and post-amble of {postamble_in_seconds} seconds"
+            )
+            y.audio = audioarray.trim(
+                y.audio, y.fs, limits=[-preamble, -postamble], samples=True
+            )
+
+    # add low-level random noise
+    if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise:
+        # create uniformly distributed integer noise between -4 and 4
+        np.random.seed(SEED_RANDOM_NOISE)
+        noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")
+        y.audio += noise
+
+    # adjust the length of the output signal
+    if "duration" in cfg.__dict__:
+        # trim the output signal such that the total duration is X seconds
+        duration = int(cfg.duration * cfg.fs)  # convert to samples
+    else:
+        # do not change the length of the audio signal
+        duration = len(y.audio)
+    duration = int(
+        np.floor(duration / frame_len) * frame_len
+    )  # ensure a multiple of 20ms
+    if len(y.audio) != duration:
+        y.audio = audioarray.trim(
+            y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True
+        )
+
+    # adjust the loudness of the output signal
+    if "loudness" in cfg.__dict__:
+        logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS")
+        y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="STEREO")
+
+    # apply fade-in and fade-out
+    if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0:
+        logger.info(f"-- Applying fade-in and fade-out of {cfg.fade_in_out} seconds")
+        y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000)
+
+    # write the STEREO audio signal into the output file
+    audiofile.write(output_filename, y.audio, y.fs)
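+
+
+# Minimal usage sketch (illustrative only, kept as a comment; it assumes a YAML
+# scene-description config has been loaded into a config.TestConfig object --
+# the exact loader API is defined in ivas_processing_scripts.generation.config,
+# and the constructor call below is hypothetical):
+#
+#   import logging
+#   from ivas_processing_scripts.generation import config
+#   from ivas_processing_scripts.generation.generate_stereo_items import (
+#       generate_stereo_items,
+#   )
+#
+#   logging.basicConfig(level=logging.INFO)
+#   cfg = config.TestConfig("path/to/scene_description_config_file.yml")
+#   generate_stereo_items(cfg, logging.getLogger(__name__))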