diff --git a/experiments/selection/P800-6/config/item_gen_P800-6.yml b/experiments/selection/P800-6/config/item_gen_P800-6.yml index 1cddf5bc2e630ef739e6034299d79498c2b7ed9e..c4321aba426aec26b6467e6f8b64aca259bf24b7 100644 --- a/experiments/selection/P800-6/config/item_gen_P800-6.yml +++ b/experiments/selection/P800-6/config/item_gen_P800-6.yml @@ -31,11 +31,12 @@ postamble: 1.0 add_low_level_random_noise: true ### File designators, default is "l" for listening lab, "EN" for language, "p06" for exp and "g" for provider -listening_lab: "l" -language: "EN" +listening_lab: "a" +language: "JP" exp: "p06" provider: "g" + ### Use prefix for all input filenames (default: "") ### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'exp' designator (the number of consecutive letters define the length of the field) use_input_prefix: "lLLeee" @@ -92,333 +93,332 @@ scenes: source: ["m1s01.wav", "m1s07.wav"] azimuth: 0 elevation: 0 - overlap: -1.0 + overlap: -0.5 - cat1_2: + cat1_2: name: "cat1/a1s02.wav" - description: "Talker sitting at a table" - source: ["f3s02.wav", "f3s08.wav"] - azimuth: 60 - elevation: 0 - overlap: -1.0 - - cat1_3: + description: "Standing talker." + source: ["m1s02.wav", "m1s08.wav"] + azimuth: 180 + elevation: 35 + overlap: -0.5 + + cat1_3: name: "cat1/a1s03.wav" - description: "Talker sitting at a table" - source: ["m3s03.wav", "m3s09.wav"] - azimuth: 120 + description: "Smaller talker (child) walking around a table." + source: ["m1s03.wav", "m1s09.wav"] + azimuth: "120:1:120+360" elevation: 0 - overlap: -1.0 + overlap: -0.5 - cat1_4: + cat1_4: name: "cat1/a1s04.wav" - description: "Talker sitting at a table" - source: ["f2s04.wav", "f2s10.wav"] - azimuth: 180 - elevation: 0 - overlap: -1.0 + description: "Talker walking around the table." + source: ["m1s04.wav", "m1s10.wav"] + azimuth: "180:-1:180-360" + elevation: 35 + overlap: -0.5 - cat1_5: + cat1_5: name: "cat1/a1s05.wav" - description: "Talker sitting at a table" - source: ["m2s05.wav", "m2s11.wav"] - azimuth: 240 - elevation: 0 - overlap: -1.0 + description: "Elevation displacement." + source: ["m1s05.wav", "m1s11.wav"] + azimuth: 120 + elevation: "-90:0.3:90" + overlap: -0.5 - cat1_6: + cat1_6: name: "cat1/a1s06.wav" - description: "Talker sitting at a table" - source: ["f1s06.wav", "f1s12.wav"] - azimuth: 300 - elevation: 0 - overlap: -1.0 - - cat1_7: + description: "Azimuth and elevation displacement." + source: ["m1s06.wav", "m1s12.wav"] + azimuth: "0:0.5:0+180" + elevation: "35:-0.2:-35" + overlap: -0.5 + + cat1_7: name: "cat1/a1s07.wav" - description: "Preliminary: Talker sitting at a table" - source: ["f1s13.wav", "f1s14.wav"] - azimuth: 0 - elevation: 0 - overlap: -1.0 + description: "Preliminary: Standing talker." + source: ["m1s13.wav", "m1s14.wav"] + azimuth: 180 + elevation: 35 + overlap: -0.5 - cat2_1: + cat2_1: name: "cat2/a2s01.wav" description: "Standing talker." source: ["f1s01.wav", "f1s07.wav"] azimuth: 120 elevation: 35 - overlap: -1.0 - - cat2_2: + overlap: -0.5 + + cat2_2: name: "cat2/a2s02.wav" - description: "Standing talker." - source: ["m1s02.wav", "m1s08.wav"] - azimuth: 180 - elevation: 35 - overlap: -1.0 - - cat2_3: + description: "Smaller talker (child) walking around a table." + source: ["f1s02.wav", "f1s08.wav"] + azimuth: "60:1:60+360" + elevation: 0 + overlap: -0.5 + + cat2_3: name: "cat2/a2s03.wav" - description: "Standing talker." - source: ["f3s03.wav", "f3s09.wav"] - azimuth: 240 + description: "Talker walking around the table." + source: ["f1s03.wav", "f1s09.wav"] + azimuth: "120:-1:120-360" elevation: 35 - overlap: -1.0 - - cat2_4: + overlap: -0.5 + + cat2_4: name: "cat2/a2s04.wav" - description: "Standing talker." - source: ["m3s04.wav", "m3s10.wav"] - azimuth: 300 - elevation: 35 - overlap: -1.0 + description: "Elevation displacement." + source: ["f1s04.wav", "f1s10.wav"] + azimuth: 60 + elevation: "-90:0.3:90" + overlap: -0.5 - cat2_5: + cat2_5: name: "cat2/a2s05.wav" - description: "Standing talker." - source: ["f2s05.wav", "f2s11.wav"] - azimuth: 0 - elevation: 35 - overlap: -1.0 + description: "Azimuth and elevation displacement." + source: ["f1s05.wav", "f1s11.wav"] + azimuth: "300:0.5:300+180" + elevation: "35:-0.2:-35" + overlap: -0.5 - cat2_6: + cat2_6: name: "cat2/a2s06.wav" - description: "Standing talker." - source: ["m2s06.wav", "m2s12.wav"] - azimuth: 60 - elevation: 35 - overlap: -1.0 - - cat2_7: + description: "Talker sitting at a table" + source: ["f1s06.wav", "f1s12.wav"] + azimuth: 300 + elevation: 0 + overlap: -0.5 + + cat2_7: name: "cat2/a2s07.wav" - description: "Preliminary: Standing talker." - source: ["m1s13.wav", "m1s14.wav"] - azimuth: 180 - elevation: 35 - overlap: -1.0 + description: "Preliminary: Talker sitting at a table" + source: ["f1s13.wav", "f1s14.wav"] + azimuth: 0 + elevation: 0 + overlap: -0.5 - cat3_1: + cat3_1: name: "cat3/a3s01.wav" description: "Smaller talker (child) walking around a table." source: ["m2s01.wav", "m2s07.wav"] azimuth: "0:1:360" elevation: 0 - overlap: -1.0 + overlap: -0.5 - cat3_2: + cat3_2: name: "cat3/a3s02.wav" - description: "Smaller talker (child) walking around a table." - source: ["f1s02.wav", "f1s08.wav"] - azimuth: "60:1:60+360" - elevation: 0 - overlap: -1.0 - - cat3_3: + description: "Talker walking around the table." + source: ["m2s02.wav", "m2s08.wav"] + azimuth: "60:-1:60-360" + elevation: 35 + overlap: -0.5 + + cat3_3: name: "cat3/a3s03.wav" - description: "Smaller talker (child) walking around a table." - source: ["m1s03.wav", "m1s09.wav"] - azimuth: "120:1:120+360" - elevation: 0 - overlap: -1.0 - - cat3_4: + description: "Elevation displacement." + source: ["m2s03.wav", "m2s09.wav"] + azimuth: 0 + elevation: "-90:0.3:90" + overlap: -0.5 + + cat3_4: name: "cat3/a3s04.wav" - description: "Smaller talker (child) walking around a table." - source: ["f3s04.wav", "f3s10.wav"] - azimuth: "180:1:180+360" - elevation: 0 - overlap: -1.0 - - cat3_5: + description: "Azimuth and elevation displacement." + source: ["m2s04.wav", "m2s10.wav"] + azimuth: "240:0.5:240+180" + elevation: "35:-0.2:-35" + overlap: -0.5 + + cat3_5: name: "cat3/a3s05.wav" - description: "Smaller talker (child) walking around a table." - source: ["m3s05.wav", "m3s11.wav"] - azimuth: "240:1:240+360" + description: "Talker sitting at a table" + source: ["m2s05.wav", "m2s11.wav"] + azimuth: 240 elevation: 0 - overlap: -1.0 - - cat3_6: + overlap: -0.5 + + cat3_6: name: "cat3/a3s06.wav" - description: "Smaller talker (child) walking around a table." - source: ["f2s06.wav", "f2s12.wav"] - azimuth: "300:1:300+360" - elevation: 0 - overlap: -1.0 - + description: "Standing talker." + source: ["m2s06.wav", "m2s12.wav"] + azimuth: 60 + elevation: 35 + overlap: -0.5 + cat3_7: name: "cat3/a3s07.wav" - description: "Preliminary: Smaller talker (child) walking around a table." - source: ["f2s13.wav", "f2s14.wav"] - azimuth: "120:1:120+360" - elevation: 0 - overlap: -1.0 - - cat4_1: + description: "Preliminary: Talker walking around the table." + source: ["m2s13.wav", "m2s14.wav"] + azimuth: "180:-1:180-360" + elevation: 35 + overlap: -0.5 + + cat4_1: name: "cat4/a4s01.wav" description: "Talker walking around the table." source: ["f2s01.wav", "f2s07.wav"] azimuth: "0:-1:-360" elevation: 35 - overlap: -1.0 - - cat4_2: + overlap: -0.5 + + cat4_2: name: "cat4/a4s02.wav" - description: "Talker walking around the table." - source: ["m2s02.wav", "m2s08.wav"] - azimuth: "60:-1:60-360" - elevation: 35 - overlap: -1.0 - - cat4_3: + description: "Elevation displacement." + source: ["f2s02.wav", "f2s08.wav"] + azimuth: 300 + elevation: "-90:0.3:90" + overlap: -0.5 + + cat4_3: name: "cat4/a4s03.wav" - description: "Talker walking around the table." - source: ["f1s03.wav", "f1s09.wav"] - azimuth: "120:-1:120-360" - elevation: 35 - overlap: -1.0 - - cat4_4: + description: "Azimuth and elevation displacement." + source: ["f2s03.wav", "f2s09.wav"] + azimuth: "180:0.5:180+180" + elevation: "35:-0.2:-35" + overlap: -0.5 + + cat4_4: name: "cat4/a4s04.wav" - description: "Talker walking around the table." - source: ["m1s04.wav", "m1s10.wav"] - azimuth: "180:-1:180-360" - elevation: 35 - overlap: -1.0 - - cat4_5: + description: "Talker sitting at a table" + source: ["f2s04.wav", "f2s10.wav"] + azimuth: 180 + elevation: 0 + overlap: -0.5 + + cat4_5: name: "cat4/a4s05.wav" - description: "Talker walking around the table." - source: ["f3s05.wav", "f3s11.wav"] - azimuth: "240:-1:240-360" + description: "Standing talker." + source: ["f2s05.wav", "f2s11.wav"] + azimuth: 0 elevation: 35 - overlap: -1.0 - - cat4_6: - name: "cat4/a4s06.wav" - description: "Talker walking around the table." - source: ["m3s06.wav", "m3s12.wav"] - azimuth: "300:-1:300-360" - elevation: 35 - overlap: -1.0 + overlap: -0.5 + cat4_6: + name: "cat4/a4s06.wav" + description: "Smaller talker (child) walking around a table." + source: ["f2s06.wav", "f2s12.wav"] + azimuth: "300:1:300+360" + elevation: 0 + overlap: -0.5 + cat4_7: name: "cat4/a4s07.wav" - description: "Preliminary: Talker walking around the table." - source: ["m2s13.wav", "m2s14.wav"] - azimuth: "180:-1:180-360" - elevation: 35 - overlap: -1.0 - - cat5_1: + description: "Preliminary: Smaller talker (child) walking around a table." + source: ["f2s13.wav", "f2s14.wav"] + azimuth: "120:1:120+360" + elevation: 0 + overlap: -0.5 + + cat5_1: name: "cat5/a5s01.wav" description: "Elevation displacement." source: ["m3s01.wav", "m3s07.wav"] azimuth: 240 - elevation: "-90:0.5:90" - overlap: -1.0 - - cat5_2: + elevation: "-90:0.3:90" + overlap: -0.5 + + cat5_2: name: "cat5/a5s02.wav" - description: "Elevation displacement." - source: ["f2s02.wav", "f2s08.wav"] - azimuth: 300 - elevation: 0 - overlap: -1.0 - - cat5_3: + description: "Azimuth and elevation displacement." + source: ["m3s02.wav", "m3s08.wav"] + azimuth: "120:0.5:120+180" + elevation: "35:-0.2:-35" + overlap: -0.5 + + cat5_3: name: "cat5/a5s03.wav" - description: "Elevation displacement." - source: ["m2s03.wav", "m2s09.wav"] - azimuth: 0 - elevation: "-90:0.5:90" - overlap: -1.0 - - cat5_4: + description: "Talker sitting at a table" + source: ["m3s03.wav", "m3s09.wav"] + azimuth: 120 + elevation: 0 + overlap: -0.5 + + cat5_4: name: "cat5/a5s04.wav" - description: "Elevation displacement." - source: ["f1s04.wav", "f1s10.wav"] - azimuth: 60 - elevation: "-90:0.5:90" - overlap: -1.0 - - cat5_5: + description: "Standing talker." + source: ["m3s04.wav", "m3s10.wav"] + azimuth: 300 + elevation: 35 + overlap: -0.5 + + cat5_5: name: "cat5/a5s05.wav" - description: "Elevation displacement." - source: ["m1s05.wav", "m1s11.wav"] - azimuth: 120 - elevation: "-90:0.5:90" - overlap: -1.0 - - cat5_6: + description: "Smaller talker (child) walking around a table." + source: ["m3s05.wav", "m3s11.wav"] + azimuth: "240:1:240+360" + elevation: 0 + overlap: -0.5 + + cat5_6: name: "cat5/a5s06.wav" - description: "Elevation displacement." - source: ["f3s06.wav", "f3s12.wav"] - azimuth: 180 - elevation: "-90:0.5:90" - overlap: -1.0 - + description: "Talker walking around the table." + source: ["m3s06.wav", "m3s12.wav"] + azimuth: "300:-1:300-360" + elevation: 35 + overlap: -0.5 + cat5_7: name: "cat5/a5s07.wav" - description: "Preliminary: Elevation displacement." - source: ["f3s13.wav", "f3s14.wav"] - azimuth: 120 - elevation: "-90:0.5:90" - overlap: -1.0 - - cat6_1: + description: "Preliminary: Azimuth and elevation displacement." + source: ["m3s13.wav", "m3s14.wav"] + azimuth: "0:0.5:0+180" + elevation: "35:-0.2:-35" + overlap: -0.5 + + cat6_1: name: "cat6/a6s01.wav" description: "Azimuth and elevation displacement." source: ["f3s01.wav", "f3s07.wav"] azimuth: "60:0.5:60+180" elevation: "35:-0.2:-35" - overlap: -1.0 - - cat6_2: + overlap: -0.5 + + cat6_2: name: "cat6/a6s02.wav" - description: "Azimuth and elevation displacement." - source: ["m3s02.wav", "m3s08.wav"] - azimuth: "120:0.5:120+180" - elevation: "35:-0.2:-35" - overlap: -1.0 - - cat6_3: + description: "Talker sitting at a table" + source: ["f3s02.wav", "f3s08.wav"] + azimuth: 60 + elevation: 0 + overlap: -0.5 + + cat6_3: name: "cat6/a6s03.wav" - description: "Azimuth and elevation displacement." - source: ["f2s03.wav", "f2s09.wav"] - azimuth: "180:0.5:180+180" - elevation: "35:-0.2:-35" - overlap: -1.0 - - cat6_4: + description: "Standing talker." + source: ["f3s03.wav", "f3s09.wav"] + azimuth: 240 + elevation: 35 + overlap: -0.5 + + cat6_4: name: "cat6/a6s04.wav" - description: "Azimuth and elevation displacement." - source: ["m2s04.wav", "m2s10.wav"] - azimuth: "240:0.5:240+180" - elevation: "35:-0.2:-35" - overlap: -1.0 - - cat6_5: + description: "Smaller talker (child) walking around a table." + source: ["f3s04.wav", "f3s10.wav"] + azimuth: "180:1:180+360" + elevation: 0 + overlap: -0.5 + + cat6_5: name: "cat6/a6s05.wav" - description: "Azimuth and elevation displacement." - source: ["f1s05.wav", "f1s11.wav"] - azimuth: "300:0.5:300+180" - elevation: "35:-0.2:-35" - overlap: -1.0 - - cat6_6: + description: "Talker walking around the table." + source: ["f3s05.wav", "f3s11.wav"] + azimuth: "240:-1:240-360" + elevation: 35 + overlap: -0.5 + + cat6_6: name: "cat6/a6s06.wav" - description: "Azimuth and elevation displacement." - source: ["m1s06.wav", "m1s12.wav"] - azimuth: "0:0.5:0+180" - elevation: "35:-0.2:-35" - overlap: -1.0 + description: "Elevation displacement." + source: ["f3s06.wav", "f3s12.wav"] + azimuth: 180 + elevation: "-90:0.3:90" + overlap: -0.5 cat6_7: name: "cat6/a6s07.wav" - description: "Preliminary: Azimuth and elevation displacement." - source: ["m3s13.wav", "m3s14.wav"] - azimuth: "0:0.5:0+180" - elevation: "35:-0.2:-35" - overlap: -1.0 - \ No newline at end of file + description: "Preliminary: Elevation displacement." + source: ["f3s13.wav", "f3s14.wav"] + azimuth: 120 + elevation: "-90:0.3:90" + overlap: -0.5 diff --git a/experiments/selection/P800-7/config/item_gen_P800-7.yml b/experiments/selection/P800-7/config/item_gen_P800-7.yml index 6a5f7a2ed8b7812058d7a4b5cdcda6152a9ab489..ec79e31b976460fecd15b6e69f82a7f065db1c78 100644 --- a/experiments/selection/P800-7/config/item_gen_P800-7.yml +++ b/experiments/selection/P800-7/config/item_gen_P800-7.yml @@ -31,8 +31,8 @@ postamble: 1.0 add_low_level_random_noise: true ### File designators, default is "l" for listening lab, "EN" for language, "p07" for exp and "g" for provider -listening_lab: "l" -language: "EN" +listening_lab: "a" +language: "DK" exp: "p07" provider: "g" @@ -95,106 +95,106 @@ scenes: cat1_2: name: "cat1/a1s02.wav" - description: "Two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." - source: ["f3s08.wav", "m1s08.wav"] - azimuth: [50, 350] - elevation: [0, 0] - overlap: -1.0 + description: "Two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." + source: ["m1s02.wav", "f1s02.wav"] + azimuth: [10, 110] + elevation: [35, 35] + overlap: 1.0 cat1_3: name: "cat1/a1s03.wav" - description: "Two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." - source: ["f2s09.wav", "m3s09.wav"] - azimuth: [40, 290] - elevation: [0, 0] - overlap: -1.0 - + description: "One talker sitting at a table, second talker standing beside the table, non-overlapping utterances." + source: ["m1s03.wav", "f1s03.wav"] + azimuth: [20, 170] + elevation: [0, 45] + overlap: -1.0 + cat1_4: name: "cat1/a1s04.wav" - description: "Two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." - source: ["f1s10.wav", "m2s10.wav"] - azimuth: [30, 230] - elevation: [15, 15] - overlap: -1.0 - + description: "One talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." + source: ["m1s04.wav", "f1s04.wav"] + azimuth: [200, "30:-1:-270"] + elevation: [0, 45] + overlap: 1.0 + cat1_5: name: "cat1/a1s05.wav" - description: "Two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." - source: ["m3s05.wav", "f3s05.wav"] - azimuth: [20, 170] - elevation: [15, 15] - overlap: -1.0 - + description: "Two talkers walking side-by-side around the table, ~30% overlapping utterances" + source: ["m1s05.wav", "f1s05.wav"] + azimuth: ["-20:-1:-320", "-20:-1:-320"] + elevation: [45, 45] + overlap: 1.0 + cat1_6: name: "cat1/a1s06.wav" - description: "Two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." - source: ["m2s06.wav", "f2s06.wav"] - azimuth: [10, 110] - elevation: [15, 15] - overlap: -1.0 - - cat1_7: + description: "Two talkers walking around the table in opposite directions, non-overlapping utterances." + source: ["m1s06.wav", "f1s06.wav"] + azimuth: ["120:1:60 + 360", "120:-1:180 - 360"] + elevation: [30, 30] + overlap: -1.0 + + cat1_7: name: "cat1/a1s07.wav" description: "Preliminary: Two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." source: ["m1s13.wav", "f1s13.wav"] azimuth: [0, 50] elevation: [0, 0] overlap: -1.0 - + cat2_1: name: "cat2/a2s01.wav" description: "Two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." source: ["m2s01.wav", "f2s01.wav"] azimuth: [20, 170] - elevation: [30, 30] + elevation: [35, 35] overlap: 1.0 - + cat2_2: name: "cat2/a2s02.wav" - description: "Two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." - source: ["m1s02.wav", "f1s02.wav"] - azimuth: [10, 110] - elevation: [30, 30] - overlap: 1.0 - + description: "One talker sitting at a table, second talker standing beside the table, non-overlapping utterances." + source: ["m2s02.wav", "f2s02.wav"] + azimuth: [30, 230] + elevation: [0, 45] + overlap: -1.0 + cat2_3: name: "cat2/a2s03.wav" - description: "Two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." - source: ["f3s09.wav", "m1s09.wav"] - azimuth: [0, 50] - elevation: [30, 30] - overlap: 1.0 - + description: "One talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." + source: ["m2s03.wav", "f2s03.wav"] + azimuth: [250, "-20:-1:-320"] + elevation: [0, 45] + overlap: 1.0 + cat2_4: name: "cat2/a2s04.wav" - description: "Two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." - source: ["f2s10.wav", "m3s10.wav"] - azimuth: [50, 350] - elevation: [60, 60] - overlap: 1.0 + description: "Two talkers walking side-by-side around the table, ~30% overlapping utterances" + source: ["m2s04.wav", "f2s04.wav"] + azimuth: ["-70:-1:-10 - 360", "-70:-1:-10 - 360"] + elevation: [45, 45] + overlap: 1.0 cat2_5: name: "cat2/a2s05.wav" - description: "Two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." - source: ["f1s11.wav", "m2s11.wav"] - azimuth: [40, 290] - elevation: [60, 60] - overlap: 1.0 + description: "Two talkers walking around the table in opposite directions, non-overlapping utterances." + source: ["m2s05.wav", "f2s05.wav"] + azimuth: ["180:1:120 + 360", "180:-1:-120"] + elevation: [30, 30] + overlap: -1.0 cat2_6: name: "cat2/a2s06.wav" - description: "Two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." - source: ["m3s06.wav", "f3s06.wav"] - azimuth: [30, 230] - elevation: [60, 60] - overlap: 1.0 + description: "Two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." + source: ["m2s06.wav", "f2s06.wav"] + azimuth: [10, 110] + elevation: [0, 0] + overlap: -1.0 cat2_7: name: "cat2/a2s07.wav" description: "Preliminary: Two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." - source: ["f2s13.wav", "m2s13.wav"] + source: ["m2s13.wav", "f2s13.wav"] azimuth: [10, 110] - elevation: [30, 30] + elevation: [35, 35] overlap: 1.0 cat3_1: @@ -202,49 +202,49 @@ scenes: description: "One talker sitting at a table, second talker standing beside the table, non-overlapping utterances." source: ["m3s01.wav", "f3s01.wav"] azimuth: [40, 290] - elevation: [0, 60] + elevation: [0, 45] overlap: -1.0 cat3_2: name: "cat3/a3s02.wav" - description: "One talker sitting at a table, second talker standing beside the table, non-overlapping utterances." - source: ["m2s02.wav", "f2s02.wav"] - azimuth: [30, 230] - elevation: [0, 60] - overlap: -1.0 - + description: "One talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." + source: ["m3s02.wav", "f3s02.wav"] + azimuth: [300, "-70:-1:-10 - 360"] + elevation: [0, 45] + overlap: 1.0 + cat3_3: name: "cat3/a3s03.wav" - description: "One talker sitting at a table, second talker standing beside the table, non-overlapping utterances." - source: ["m1s03.wav", "f1s03.wav"] - azimuth: [20, 170] - elevation: [0, 60] - overlap: -1.0 - + description: "Two talkers walking side-by-side around the table, ~30% overlapping utterances" + source: ["m3s03.wav", "f3s03.wav"] + azimuth: ["180:1:120 + 360", "180:1:120 + 360"] + elevation: [45, 45] + overlap: 1.0 + cat3_4: name: "cat3/a3s04.wav" - description: "One talker sitting at a table, second talker standing beside the table, non-overlapping utterances." - source: ["f3s10.wav", "m1s10.wav"] - azimuth: [10, 110] - elevation: [0, 60] - overlap: -1.0 - + description: "Two talkers walking around the table in opposite directions, non-overlapping utterances." + source: ["m3s04.wav", "f3s04.wav"] + azimuth: ["240:1:180 + 360", "240:-1:-60"] + elevation: [30, 30] + overlap: -1.0 + cat3_5: name: "cat3/a3s05.wav" - description: "One talker sitting at a table, second talker standing beside the table, non-overlapping utterances." - source: ["f2s11.wav", "m3s11.wav"] - azimuth: [0, 50] - elevation: [0, 60] - overlap: -1.0 - + description: "Two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." + source: ["m3s05.wav", "f3s05.wav"] + azimuth: [20, 170] + elevation: [0, 0] + overlap: -1.0 + cat3_6: name: "cat3/a3s06.wav" - description: "One talker sitting at a table, second talker standing beside the table, non-overlapping utterances." - source: ["f1s12.wav", "m2s12.wav"] - azimuth: [50, 350] - elevation: [0, 60] - overlap: -1.0 - + description: "Two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." + source: ["m3s06.wav", "f3s06.wav"] + azimuth: [30, 230] + elevation: [35, 35] + overlap: 1.0 + cat3_7: name: "cat3/a3s07.wav" description: "Preliminary: One talker sitting at a table, second talker standing beside the table, non-overlapping utterances." @@ -252,172 +252,172 @@ scenes: azimuth: [20, 170] elevation: [0, 60] overlap: -1.0 - + cat4_1: name: "cat4/a4s01.wav" description: "One talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." source: ["f1s07.wav", "m2s07.wav"] azimuth: [50, "180:1:120 + 360"] - elevation: [0, 60] - overlap: 1.0 - + elevation: [0, 45] + overlap: 1.0 + cat4_2: name: "cat4/a4s02.wav" - description: "One talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." - source: ["m3s02.wav", "f3s02.wav"] - azimuth: [300, "-70:-1:-10 - 360"] - elevation: [0, 60] - overlap: 1.0 - + description: "Two talkers walking side-by-side around the table, ~30% overlapping utterances" + source: ["f1s08.wav", "m2s08.wav"] + azimuth: ["130:1:70 + 360", "130:1:70 + 360"] + elevation: [45, 45] + overlap: 1.0 + cat4_3: name: "cat4/a4s03.wav" - description: "One talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." - source: ["m2s03.wav", "f2s03.wav"] - azimuth: [250, "-20:-1:-320"] - elevation: [0, 60] - overlap: 1.0 - + description: "Two talkers walking around the table in opposite directions, non-overlapping utterances." + source: ["f1s09.wav", "m2s09.wav"] + azimuth: ["300:1:240 + 360", "300:-1:0"] + elevation: [30, 30] + overlap: -1.0 + cat4_4: name: "cat4/a4s04.wav" - description: "One talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." - source: ["m1s04.wav", "f1s04.wav"] - azimuth: [200, "30:-1:-270"] - elevation: [0, 60] - overlap: 1.0 - + description: "Two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." + source: ["f1s10.wav", "m2s10.wav"] + azimuth: [30, 230] + elevation: [0, 0] + overlap: -1.0 + cat4_5: name: "cat4/a4s05.wav" - description: "One talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." - source: ["f3s11.wav", "m1s11.wav"] - azimuth: [150, "80:1:20 + 360"] - elevation: [0, 60] - overlap: 1.0 - + description: "Two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." + source: ["f1s11.wav", "m2s11.wav"] + azimuth: [40, 290] + elevation: [35, 35] + overlap: 1.0 + cat4_6: name: "cat4/a4s06.wav" - description: "One talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." - source: ["f2s12.wav", "m3s12.wav"] - azimuth: [100, "130:1:70 + 360"] + description: "One talker sitting at a table, second talker standing beside the table, non-overlapping utterances." + source: ["f1s12.wav", "m2s12.wav"] + azimuth: [50, 350] elevation: [0, 60] - overlap: 1.0 - + overlap: -1.0 + cat4_7: name: "cat4/a4s07.wav" description: "Preliminary: One talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." - source: ["f1s14.wav", "m1s14.wav"] + source: ["f1s14.wav", "m2s14.wav"] azimuth: [200, "30:-1:-270"] - elevation: [0, 60] + elevation: [0, 45] overlap: 1.0 - + cat5_1: name: "cat5/a5s01.wav" description: "Two talkers walking side-by-side around the table, ~30% overlapping utterances" source: ["f2s07.wav", "m3s07.wav"] azimuth: ["80:1:20 + 360", "80:1:20 + 360"] - elevation: [10, 60] + elevation: [45, 45] overlap: 1.0 - + cat5_2: name: "cat5/a5s02.wav" - description: "Two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["f1s08.wav", "m2s08.wav"] - azimuth: ["130:1:70 + 360", "130:1:70 + 360"] - elevation: [10, 60] - overlap: 1.0 - + description: "Two talkers walking around the table in opposite directions, non-overlapping utterances." + source: ["f2s08.wav", "m3s08.wav"] + azimuth: ["0:1:300", "0:-1:60 - 360"] + elevation: [30, 30] + overlap: -1.0 + cat5_3: name: "cat5/a5s03.wav" - description: "Two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["m3s03.wav", "f3s03.wav"] - azimuth: ["180:1:120 + 360", "180:1:120 + 360"] - elevation: [10, 60] - overlap: 1.0 - + description: "Two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." + source: ["f2s09.wav", "m3s09.wav"] + azimuth: [40, 290] + elevation: [0, 0] + overlap: -1.0 + cat5_4: name: "cat5/a5s04.wav" - description: "Two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["m2s04.wav", "f2s04.wav"] - azimuth: ["-70:-1:-10 - 360", "-70:-1:-10 - 360"] - elevation: [10, 60] - overlap: 1.0 - + description: "Two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." + source: ["f2s10.wav", "m3s10.wav"] + azimuth: [50, 350] + elevation: [35, 35] + overlap: 1.0 + cat5_5: name: "cat5/a5s05.wav" - description: "Two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["m1s05.wav", "f1s05.wav"] - azimuth: ["-20:-1:-320", "-20:-1:-320"] - elevation: [10, 60] - overlap: 1.0 - + description: "One talker sitting at a table, second talker standing beside the table, non-overlapping utterances." + source: ["f2s11.wav", "m3s11.wav"] + azimuth: [0, 50] + elevation: [0, 45] + overlap: -1.0 + cat5_6: name: "cat5/a5s06.wav" - description: "Two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["f3s12.wav", "m1s12.wav"] - azimuth: ["30:-1:-270", "30:-1:-270"] - elevation: [10, 60] - overlap: 1.0 - + description: "One talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." + source: ["f2s12.wav", "m3s12.wav"] + azimuth: [100, "130:1:70 + 360"] + elevation: [0, 45] + overlap: 1.0 + cat5_7: name: "cat5/a5s07.wav" description: "Preliminary: Two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["m2s14.wav", "f2s14.wav"] + source: ["f2s14.wav", "m3s14.wav"] azimuth: ["-20:-1:-320", "-20:-1:-320"] - elevation: [10, 60] + elevation: [45, 45] overlap: 1.0 - + cat6_1: name: "cat6/a6s01.wav" description: "Two talkers walking around the table in opposite directions, non-overlapping utterances." source: ["f3s07.wav", "m1s07.wav"] azimuth: ["60:1:0 + 360", "60:-1:120 - 360"] - elevation: [20, 50] - overlap: -1.0 - + elevation: [30, 30] + overlap: -1.0 + cat6_2: name: "cat6/a6s02.wav" - description: "Two talkers walking around the table in opposite directions, non-overlapping utterances." - source: ["f2s08.wav", "m3s08.wav"] - azimuth: ["0:1:300", "0:-1:60 - 360"] - elevation: [20, 50] - overlap: -1.0 - + description: "Two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." + source: ["f3s08.wav", "m1s08.wav"] + azimuth: [50, 350] + elevation: [0, 0] + overlap: -1.0 + cat6_3: name: "cat6/a6s03.wav" - description: "Two talkers walking around the table in opposite directions, non-overlapping utterances." - source: ["f1s09.wav", "m2s09.wav"] - azimuth: ["300:1:240 + 360", "300:-1:0"] - elevation: [20, 50] - overlap: -1.0 + description: "Two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." + source: ["f3s09.wav", "m1s09.wav"] + azimuth: [0, 50] + elevation: [35, 35] + overlap: 1.0 cat6_4: name: "cat6/a6s04.wav" - description: "Two talkers walking around the table in opposite directions, non-overlapping utterances." - source: ["m3s04.wav", "f3s04.wav"] - azimuth: ["240:1:180 + 360", "240:-1:-60"] - elevation: [20, 50] - overlap: -1.0 - + description: "One talker sitting at a table, second talker standing beside the table, non-overlapping utterances." + source: ["f3s10.wav", "m1s10.wav"] + azimuth: [10, 110] + elevation: [0, 45] + overlap: -1.0 + cat6_5: name: "cat6/a6s05.wav" - description: "Two talkers walking around the table in opposite directions, non-overlapping utterances." - source: ["m2s05.wav", "f2s05.wav"] - azimuth: ["180:1:120 + 360", "180:-1:-120"] - elevation: [20, 50] - overlap: -1.0 - + description: "One talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." + source: ["f3s11.wav", "m1s11.wav"] + azimuth: [150, "80:1:20 + 360"] + elevation: [0, 45] + overlap: 1.0 + cat6_6: name: "cat6/a6s06.wav" - description: "Two talkers walking around the table in opposite directions, non-overlapping utterances." - source: ["m1s06.wav", "f1s06.wav"] - azimuth: ["120:1:60 + 360", "120:-1:180 - 360"] - elevation: [20, 50] - overlap: -1.0 + description: "Two talkers walking side-by-side around the table, ~30% overlapping utterances" + source: ["f3s12.wav", "m1s12.wav"] + azimuth: ["30:-1:-270", "30:-1:-270"] + elevation: [45, 45] + overlap: 1.0 cat6_7: name: "cat6/a6s07.wav" description: "Preliminary: Two talkers walking around the table in opposite directions, non-overlapping utterances." - source: ["f3s14.wav", "m3s14.wav"] + source: ["f3s14.wav", "m1s14.wav"] azimuth: ["120:1:60 + 360", "120:-1:180 - 360"] - elevation: [20, 50] + elevation: [30, 30] overlap: -1.0 \ No newline at end of file diff --git a/ivas_processing_scripts/generation/process_ism1_items.py b/ivas_processing_scripts/generation/process_ism1_items.py index f6d14b1c44f80c2802103fe03385768fdd411205..6c45ad65e1286457286edebf20cd3efceba7927a 100644 --- a/ivas_processing_scripts/generation/process_ism1_items.py +++ b/ivas_processing_scripts/generation/process_ism1_items.py @@ -234,64 +234,85 @@ def generate_ism1_scene( y.audio.resize(x.audio.shape, refcheck=False) y.audio += x.audio + # append pre-amble and post-amble to all sources + if cfg.preamble != 0.0: + # ensure that pre-amble is a multiple of 20ms + N_pre = int(floor(cfg.preamble * 50) / 50 * y.fs) + + # insert all-zero preamble to all sources + pre = np.zeros((N_pre, y.audio.shape[1])) + y.audio = np.concatenate([pre, y.audio]) + + if cfg.postamble != 0.0: + # ensure that post-amble is a multiple of 20ms + N_post = int(floor(cfg.postamble * 50) / 50 * y.fs) + + # append all-zero postamble to all sources + post = np.zeros((N_post, y.audio.shape[1])) + y.audio = np.concatenate([y.audio, post]) + + # add random noise + if cfg.add_low_level_random_noise: + # create uniformly distributed noise between -4 and 4 + np.random.seed(SEED_RANDOM_NOISE) + noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") + + # superimpose + y.audio += noise + # process azimuth and elevation source_azi = scene["azimuth"] source_ele = scene["elevation"] N_frames = int(len(y.audio) / y.fs * 50) - # read azimuth information and create array + # read azimuth information and convert to an array if isinstance(source_azi, str): if ":" in source_azi: + # start with the initial azimuth value and apply step N_frames times source_azi = source_azi.split(":") azi = np.arange( float(eval(source_azi[0])), - float(eval(source_azi[2])), + float(eval(source_azi[0])) + N_frames * float(eval(source_azi[1])), float(eval(source_azi[1])), ) else: - azi = np.array(float(eval(source_azi)), ndmin=1)[:N_frames] + # replicate static azimuth value N_frames times + azi = np.repeat(float(eval(source_azi)), N_frames) else: - azi = np.array(source_azi, ndmin=1)[:N_frames] - - # ensure that azimuth array has N_frames values - if len(azi) > N_frames: - # cut the array of azimuth values - azi = azi[:N_frames] - elif len(azi) < N_frames: - # replicate the last azimuth value - azi = np.append(azi, np.full(N_frames - len(azi), azi[-1])) + # replicate static azimuth value N_frames times + azi = np.repeat(float(source_azi), N_frames) # convert azimuth from 0 .. 360 to -180 .. +180 azi = (azi + 180) % 360 - 180 - # check if azimuth is from -180 .. +180 + # check, if azimuth is from -180 .. +180 if any(azi > 180) or any(azi < -180): logger.error( f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}" ) - # read elevation information and create array + # read elevation information and convert to an array if isinstance(source_ele, str): if ":" in source_ele: + # convert into array (initial_value:step:stop_value) + # note: the stop_value value is +-90 degrees depending on the sign of the step source_ele = source_ele.split(":") ele = np.arange( float(eval(source_ele[0])), - float(eval(source_ele[2])), + np.sign(float(eval(source_ele[1]))) * 90, float(eval(source_ele[1])), - ) + )[:N_frames] + + # repeat the last elevation value, if array is shorter than N_frames + if len(ele) < N_frames: + ele = np.append(ele, np.full(N_frames - len(ele), ele[-1])) else: - ele = np.array(float(eval(source_ele)), ndmin=1)[:N_frames] + # replicate static elevation value N_frames times + ele = np.repeat(float(eval(source_ele)), N_frames) else: - ele = np.array(source_ele, ndmin=1)[:N_frames] - - # ensure that elevation array has N_frames values - if len(ele) > N_frames: - # cut the array of elevation values - ele = ele[:N_frames] - elif len(ele) < N_frames: - # replicate the last elevation - ele = np.append(ele, np.full(N_frames - len(ele), ele[-1])) + # replicate static elevation value N_frames times + ele = np.repeat(float(source_ele), N_frames) # check if elevation is from -90 .. +90 if any(ele > 90) or any(ele < -90): @@ -299,49 +320,8 @@ def generate_ism1_scene( f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}" ) - # additional metadata (default values) - radius = np.ones(N_frames) - spread = np.zeros(N_frames) - gain = np.ones(N_frames) - # arrange all metadata fields column-wise into a matrix - y_meta = np.column_stack((azi, ele, radius, spread, gain)) - - # append pre-amble and post-amble to all sources - if cfg.preamble != 0.0: - # ensure that pre-amble is a multiple of 20ms - N_pre = int(floor(cfg.preamble * 50) / 50 * y.fs) - - # insert all-zero preamble to all sources - pre = np.zeros((N_pre, y.audio.shape[1])) - y.audio = np.concatenate([pre, y.audio]) - - # insert neutral position as a pre-amble to all sources - N_pre = int(N_pre / frame_len) - pre = np.tile([0.00, 0.00, 1.00, 0.00, 1.00], (N_pre, 1)) - y_meta = np.concatenate([pre, y_meta], axis=0) - - if cfg.postamble != 0.0: - # ensure that post-amble is a multiple of 20ms - N_post = int(floor(cfg.postamble * 50) / 50 * y.fs) - - # append all-zero postamble to all sources - post = np.zeros((N_post, y.audio.shape[1])) - y.audio = np.concatenate([y.audio, post]) - - # append neutral position as a post-amble to all sources - N_post = int(N_post / frame_len) - post = np.tile([0.00, 0.00, 1.00, 0.00, 1.00], (N_post, 1)) - y_meta = np.concatenate([y_meta, post], axis=0) - - # add random noise - if cfg.add_low_level_random_noise: - # create uniformly distributed noise between -4 and 4 - np.random.seed(SEED_RANDOM_NOISE) - noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float") - - # superimpose - y.audio += noise + y_meta = np.column_stack((azi, ele)) # write ISM audio stream to the output file audiofile.write( diff --git a/ivas_processing_scripts/generation/process_ism2_items.py b/ivas_processing_scripts/generation/process_ism2_items.py index c1c0964533427c97320e553eec600021e6233ee7..2f3fc0c09024e6a9a0fc0701854c839ff28b4ed2 100644 --- a/ivas_processing_scripts/generation/process_ism2_items.py +++ b/ivas_processing_scripts/generation/process_ism2_items.py @@ -215,73 +215,6 @@ def generate_ism2_scene( _, scale_factor, _ = get_loudness(x, cfg.loudness, "MONO") x.audio *= scale_factor - # read azimuth information and create array - if isinstance(source_azi, str): - if ":" in source_azi: - source_azi = source_azi.split(":") - azi = np.arange( - float(eval(source_azi[0])), - float(eval(source_azi[2])), - float(eval(source_azi[1])), - ) - else: - azi = np.array(float(eval(source_azi)), ndmin=1)[:N_frames] - else: - azi = np.array(source_azi, ndmin=1)[:N_frames] - - # ensure that azimuth array has N_frames values - if len(azi) > N_frames: - # cut the array of azimuth values - azi = azi[:N_frames] - elif len(azi) < N_frames: - # replicate the last azimuth - azi = np.append(azi, np.full(N_frames - len(azi), azi[-1])) - - # convert azimuth from 0 .. 360 to -180 .. +180 - azi = (azi + 180) % 360 - 180 - - # check if azimuth is from -180 .. +180 - if any(azi > 180) or any(azi < -180): - logger.error( - f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}" - ) - - # read elevation information and create array - if isinstance(source_ele, str): - if ":" in source_ele: - source_ele = source_ele.split(":") - ele = np.arange( - float(eval(source_ele[0])), - float(eval(source_ele[2])), - float(eval(source_ele[1])), - ) - else: - ele = np.array(float(eval(source_ele)), ndmin=1)[:N_frames] - else: - ele = np.array(source_ele, ndmin=1)[:N_frames] - - # ensure that elevation array has N_frames values - if len(ele) > N_frames: - # cut the array of elevation values - ele = ele[:N_frames] - elif len(ele) < N_frames: - # replicate the last elevation - ele = np.append(ele, np.full(N_frames - len(ele), ele[-1])) - - # check if elevation is from -90 .. +90 - if any(ele > 90) or any(ele < -90): - logger.error( - f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}" - ) - - # additional metadata (default values) - radius = np.ones(N_frames) - spread = np.zeros(N_frames) - gain = np.ones(N_frames) - - # arrange all metadata fields column-wise into a matrix - x_meta = np.column_stack((azi, ele, radius, spread, gain)) - # shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap) if i > 0: # get the length of the first source file @@ -294,12 +227,6 @@ def generate_ism2_scene( pre = np.zeros((N_delay, x.audio.shape[1])) x.audio = np.concatenate([pre, x.audio]) - # insert neutral position as a pre-amble - N_delay = int(N_delay / frame_len) - # use neutral position for padding - pre = np.tile([0.00, 0.00, 1.00, 0.00, 1.00], (N_delay, 1)) - x_meta = np.concatenate([pre, x_meta]) - # pad with zeros to ensure that the signal length is a multiple of 20ms if len(x.audio) % frame_len != 0: # pad the source signal @@ -307,13 +234,6 @@ def generate_ism2_scene( post = np.zeros((N_pad, x.audio.shape[1])) x.audio = np.concatenate([x.audio, post]) - # pad the metadata - N_pad = int(len(x.audio) / frame_len) - len(x_meta) - if N_pad > 0: - # use neutral position for padding - post = np.tile([0.00, 0.00, 1.00, 0.00, 1.00], (N_pad, 1)) - x_meta = np.concatenate([x_meta, post]) - # add source signal to the array of all source signals y.fs = x.fs if y.audio is None: @@ -340,35 +260,6 @@ def generate_ism2_scene( ) y.audio = np.hstack((y.audio, x.audio)) - # add metadata to the array of all metadata - # make sure x_meta is a 3d array - x_meta = x_meta[np.newaxis, :] - if y_meta is None: - y_meta = x_meta - else: - N_srcs = y_meta.shape[0] - N_meta_features = y_meta.shape[2] - - # append the last position of the metadata to have equal length of all metadata - if x_meta.shape[1] > y_meta.shape[1]: - N_delta = x_meta.shape[1] - y_meta.shape[1] - # reshape to 2d array - y_meta = y_meta.reshape(y_meta.shape[1], -1) - # repeat last row N_delta times and append to the array - y_meta = np.vstack((y_meta, np.tile(y_meta[-1, :], (N_delta, 1)))) - # reshape back to 3d array - y_meta = y_meta.reshape(N_srcs, -1, N_meta_features) - elif y_meta.shape[1] > x_meta.shape[1]: - N_delta = y_meta.shape[1] - x_meta.shape[1] - # reshape to 2d array - x_meta = x_meta.reshape(x_meta.shape[1], -1) - # repeat last row N_delta times and append to the array - x_meta = np.vstack((x_meta, np.tile(x_meta[-1, :], (N_delta, 1)))) - # reshape back to 3d array - x_meta = np.expand_dims(x_meta, axis=0) - - y_meta = np.concatenate([y_meta, x_meta]) - # append pre-amble and post-amble to all sources if cfg.preamble != 0.0: # ensure that pre-amble is a multiple of 20ms @@ -378,11 +269,6 @@ def generate_ism2_scene( pre = np.zeros((N_pre, y.audio.shape[1])) y.audio = np.concatenate([pre, y.audio]) - # insert neutral position as a pre-amble to all sources - N_pre = int(N_pre / frame_len) - pre = np.tile([0.00, 0.00, 1.00, 0.00, 1.00], (y_meta.shape[0], N_pre, 1)) - y_meta = np.concatenate([pre, y_meta], axis=1) - if cfg.postamble != 0.0: # ensure that post-mable is a multiple of 20ms N_post = int(floor(cfg.postamble * 50) / 50 * y.fs) @@ -391,11 +277,6 @@ def generate_ism2_scene( post = np.zeros((N_post, y.audio.shape[1])) y.audio = np.concatenate([y.audio, post]) - # append neutral position as a post-amble to all sources - N_post = int(N_post / frame_len) - post = np.tile([0.00, 0.00, 1.00, 0.00, 1.00], (y_meta.shape[0], N_post, 1)) - y_meta = np.concatenate([y_meta, post], axis=1) - # add random noise if cfg.add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 @@ -405,6 +286,85 @@ def generate_ism2_scene( # superimpose y.audio += noise + # create metadata files + for i in range(N_sources): + # parse metadata parameters from the scene description + source_azi = ( + scene["azimuth"][i] + if isinstance(scene["azimuth"], list) + else scene["azimuth"] + ) + source_ele = ( + scene["elevation"][i] + if isinstance(scene["elevation"], list) + else scene["elevation"] + ) + + N_frames = int(len(y.audio) / y.fs * 50) + + # read azimuth information and convert to an array + if isinstance(source_azi, str): + if ":" in source_azi: + # start with the initial azimuth value and apply step N_frames times + source_azi = source_azi.split(":") + azi = np.arange( + float(eval(source_azi[0])), + float(eval(source_azi[0])) + N_frames * float(eval(source_azi[1])), + float(eval(source_azi[1])), + ) + else: + # replicate static azimuth value N_frames times + azi = np.repeat(float(eval(source_azi)), N_frames) + else: + # replicate static azimuth value N_frames times + azi = np.repeat(float(source_azi), N_frames) + + # convert azimuth from 0 .. 360 to -180 .. +180 + azi = (azi + 180) % 360 - 180 + + # check if azimuth is from -180 .. +180 + if any(azi > 180) or any(azi < -180): + logger.error( + f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}" + ) + + # read elevation information and convert to an array + if isinstance(source_ele, str): + if ":" in source_ele: + # convert into array (initial_value:step:stop_value) + # note: the stop_value value is +-90 degrees depending on the sign of the step + source_ele = source_ele.split(":") + ele = np.arange( + float(eval(source_ele[0])), + np.sign(float(eval(source_ele[1]))) * 90, + float(eval(source_ele[1])), + )[:N_frames] + + # repeat the last elevation value, if array is shorter than N_frames + if len(ele) < N_frames: + ele = np.append(ele, np.full(N_frames - len(ele), ele[-1])) + else: + # replicate static elevation value N_frames times + ele = np.repeat(float(eval(source_ele)), N_frames) + else: + # replicate static elevation value N_frames times + ele = np.repeat(float(source_ele), N_frames) + + # check if elevation is from -90 .. +90 + if any(ele > 90) or any(ele < -90): + logger.error( + f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}" + ) + + # arrange all metadata fields column-wise into a matrix + x_meta = np.column_stack((azi, ele)) + + x_meta = x_meta[np.newaxis, :] + if y_meta is None: + y_meta = x_meta + else: + y_meta = np.concatenate([y_meta, x_meta]) + # write individual ISM audio streams to the output file in an interleaved format audiofile.write( os.path.join(