Loading item_generation_scripts/config/ISM1_CONFIG.yml +0 −36 Original line number Diff line number Diff line Loading @@ -54,7 +54,6 @@ scenes: source: "test_single.wav" azimuth: 0 elevation: 0 delay: 0 a2: name: "G6S2.wav" Loading @@ -62,7 +61,6 @@ scenes: source: "test_single.wav" azimuth: 60 elevation: 0 delay: 0 a3: name: "G5S3.wav" Loading @@ -70,7 +68,6 @@ scenes: source: "test_single.wav" azimuth: 120 elevation: 0 delay: 0 a4: name: "G4S4.wav" Loading @@ -78,7 +75,6 @@ scenes: source: "test_single.wav" azimuth: 180 elevation: 0 delay: 0 a5: name: "G3S5.wav" Loading @@ -86,7 +82,6 @@ scenes: source: "test_single.wav" azimuth: 240 elevation: 0 delay: 0 a6: name: "G2S6.wav" Loading @@ -94,7 +89,6 @@ scenes: source: "test_single.wav" azimuth: 300 elevation: 0 delay: 0 b1: name: "G2S1.wav" Loading @@ -102,7 +96,6 @@ scenes: source: "test_single.wav" azimuth: 120 elevation: 35 delay: 0 b2: name: "G1S2.wav" Loading @@ -110,7 +103,6 @@ scenes: source: "test_single.wav" azimuth: 180 elevation: 35 delay: 0 b3: name: "G6S3.wav" Loading @@ -118,7 +110,6 @@ scenes: source: "test_single.wav" azimuth: 240 elevation: 35 delay: 0 b4: name: "G5S4.wav" Loading @@ -126,7 +117,6 @@ scenes: source: "test_single.wav" azimuth: 300 elevation: 35 delay: 0 b5: name: "G4S5.wav" Loading @@ -134,7 +124,6 @@ scenes: source: "test_single.wav" azimuth: 0 elevation: 35 delay: 0 b6: name: "G3S6.wav" Loading @@ -142,7 +131,6 @@ scenes: source: "test_single.wav" azimuth: 60 elevation: 35 delay: 0 c1: name: "G3S1.wav" Loading @@ -150,7 +138,6 @@ scenes: source: "test_single.wav" azimuth: "0:1:360" elevation: 0 delay: 0 c2: name: "G2S2.wav" Loading @@ -158,7 +145,6 @@ scenes: source: "test_single.wav" azimuth: "60:1:60+360" elevation: 0 delay: 0 c3: name: "G1S3.wav" Loading @@ -166,7 +152,6 @@ scenes: source: "test_single.wav" azimuth: "120:1:120+360" elevation: 0 delay: 0 c4: name: "G6S4.wav" Loading @@ -174,7 +159,6 @@ scenes: source: "test_single.wav" azimuth: "180:1:180+360" elevation: 0 
delay: 0 c5: name: "G5S5.wav" Loading @@ -182,7 +166,6 @@ scenes: source: "test_single.wav" azimuth: "240:1:240+360" elevation: 0 delay: 0 c6: name: "G4S6.wav" Loading @@ -190,7 +173,6 @@ scenes: source: "test_single.wav" azimuth: "300:1:300+360" elevation: 0 delay: 0 d1: name: "G4S1.wav" Loading @@ -198,7 +180,6 @@ scenes: source: "test_single.wav" azimuth: "0:-1:-360" elevation: 35 delay: 0 d2: name: "G3S2.wav" Loading @@ -206,7 +187,6 @@ scenes: source: "test_single.wav" azimuth: "60:-1:60-360" elevation: 35 delay: 0 d3: name: "G2S3.wav" Loading @@ -214,7 +194,6 @@ scenes: source: "test_single.wav" azimuth: "120:-1:120-360" elevation: 35 delay: 0 d4: name: "G1S4.wav" Loading @@ -222,7 +201,6 @@ scenes: source: "test_single.wav" azimuth: "180:-1:180-360" elevation: 35 delay: 0 d5: name: "G6S5.wav" Loading @@ -230,7 +208,6 @@ scenes: source: "test_single.wav" azimuth: "240:-1:240-360" elevation: 35 delay: 0 d6: name: "G5S6.wav" Loading @@ -238,7 +215,6 @@ scenes: source: "test_single.wav" azimuth: "300:-1:300-360" elevation: 35 delay: 0 e1: name: "G5S1.wav" Loading @@ -246,7 +222,6 @@ scenes: source: "test_single.wav" azimuth: 240 elevation: "-90:0.5:90" delay: 0 e2: name: "G4S2.wav" Loading @@ -254,7 +229,6 @@ scenes: source: "test_single.wav" azimuth: 300 elevation: 0 delay: 0 e3: name: "G3S3.wav" Loading @@ -262,7 +236,6 @@ scenes: source: "test_single.wav" azimuth: 0 elevation: "-90:0.5:90" delay: 0 e4: name: "G2S4.wav" Loading @@ -270,7 +243,6 @@ scenes: source: "test_single.wav" azimuth: 60 elevation: "-90:0.5:90" delay: 0 e5: name: "G1S5.wav" Loading @@ -278,7 +250,6 @@ scenes: source: "test_single.wav" azimuth: 120 elevation: "-90:0.5:90" delay: 0 e6: name: "G6S6.wav" Loading @@ -286,7 +257,6 @@ scenes: source: "test_single.wav" azimuth: 180 elevation: "-90:0.5:90" delay: 0 f1: name: "G6S1.wav" Loading @@ -294,7 +264,6 @@ scenes: source: "test_single.wav" azimuth: "60:0.5:60+180" elevation: "35:-0.2:-35" delay: 0 f2: name: "G5S2.wav" Loading @@ -302,7 
+271,6 @@ scenes: source: "test_single.wav" azimuth: "120:0.5:120+180" elevation: "35:-0.2:-35" delay: 0 f3: name: "G4S3.wav" Loading @@ -310,7 +278,6 @@ scenes: source: "test_single.wav" azimuth: "180:0.5:180+180" elevation: "35:-0.2:-35" delay: 0 f4: name: "G3S4.wav" Loading @@ -318,7 +285,6 @@ scenes: source: "test_single.wav" azimuth: "240:0.5:240+180" elevation: "35:-0.2:-35" delay: 0 f5: name: "G2S5.wav" Loading @@ -326,7 +292,6 @@ scenes: source: "test_single.wav" azimuth: "300:0.5:300+180" elevation: "35:-0.2:-35" delay: 0 f6: name: "G1S6.wav" Loading @@ -334,5 +299,4 @@ scenes: source: "test_single.wav" azimuth: "0:0.5:0+180" elevation: "35:-0.2:-35" delay: 0 No newline at end of file item_generation_scripts/config/ISM2_CONFIG.yml +73 −72 Original line number Diff line number Diff line Loading @@ -37,6 +37,7 @@ loudness: -26 ### Each scene must start with the sceneN tag ### Specify the mono source filename (the program will search for it in the input_path folder) ### Specify azimuth and elevation for each input source ### Specify the delay in seconds for each input source ### Note 1: use [val1, val2, ...] for multiple sources in a scene ### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames Loading @@ -51,288 +52,288 @@ scenes: a1: name: "G1S1.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." source: ["f2s5a_Talker1.wav", "m2s16b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [0, 50] elevation: [0, 0] delay: [0, 0] delay: [0, 1] a2: name: "G6S2.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." 
source: ["f5s10a_Talker1.wav", "m3s2a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [50, 350] elevation: [0, 0] delay: [0, 0] delay: [0, 1] a3: name: "G5S3.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." source: ["f2s5a_Talker1.wav", "m2s16b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [40, 290] elevation: [0, 0] delay: [0, 0] delay: [0, 1] a4: name: "G4S4.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." source: ["m4s11b_Talker1.wav", "f1s4b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [30, 230] elevation: [15, 15] delay: [0, 0] delay: [0, 1] a5: name: "G3S5.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." source: ["m1s4a_Talker1.wav", "f3s3a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [20, 170] elevation: [15, 15] delay: [0, 0] delay: [0, 1] a6: name: "G2S6.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." source: ["f5s10a_Talker1.wav", "m3s2a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [10, 110] elevation: [15, 15] delay: [0, 0] delay: [0, 1] b1: name: "G2S1.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." source: ["f5s10b_Talker1.wav", "m3s2b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [20, 170] elevation: [30, 30] delay: [0, 0] delay: [0, 1] b2: name: "G1S2.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." 
source: ["f2s1a_Talker1.wav", "m2s10a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [10, 110] elevation: [30, 30] delay: [0, 0] delay: [0, 1] b3: name: "G6S3.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." source: ["f5s10b_Talker1.wav", "m3s2b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [0, 50] elevation: [30, 30] delay: [0, 0] delay: [0, 1] b4: name: "G5S4.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." source: ["f2s1a_Talker1.wav", "m2s10a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [50, 350] elevation: [60, 60] delay: [0, 0] delay: [0, 1] b5: name: "G4S5.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." source: ["m4s11a_Talker1.wav", "f1s6a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [40, 290] elevation: [60, 60] delay: [0, 0] delay: [0, 1] b6: name: "G3S6.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." source: ["m1s2b_Talker1.wav", "f3s5a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [30, 230] elevation: [60, 60] delay: [0, 0] delay: [0, 1] c1: name: "G3S1.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." source: ["m1s6b_Talker1.wav", "f3s5b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [40, 290] elevation: [0, 60] delay: [0, 0] delay: [0, 1] c2: name: "G2S2.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." 
source: ["f5s14a_Talker1.wav", "m3s8a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [30, 230] elevation: [0, 60] delay: [0, 0] delay: [0, 1] c3: name: "G1S3.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." source: ["f2s6a_Talker1.wav", "m2s13a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [20, 170] elevation: [0, 60] delay: [0, 0] delay: [0, 1] c4: name: "G6S4.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." source: ["f5s14a_Talker1.wav", "m3s8a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [10, 110] elevation: [0, 60] delay: [0, 0] delay: [0, 1] c5: name: "G5S5.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." source: ["f2s6a_Talker1.wav", "m2s13a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [0, 50] elevation: [0, 60] delay: [0, 0] delay: [0, 1] c6: name: "G4S6.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." source: ["m4s13a_Talker1.wav", "f1s20a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [50, 350] elevation: [0, 60] delay: [0, 0] delay: [0, 1] d1: name: "G4S1.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." source: ["m4s12b_Talker1.wav", "f1s12b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [50, "180:1:120 + 360"] elevation: [0, 60] delay: [0, 0] delay: [0, 1] d2: name: "G3S2.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." 
source: ["m1s12a_Talker1.wav", "f3s20a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [300, "-70:-1:-10 - 360"] elevation: [0, 60] delay: [0, 0] delay: [0, 1] d3: name: "G2S3.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." source: ["f5s15b_Talker1.wav", "m3s1a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [250, "-20:-1:-320"] elevation: [0, 60] delay: [0, 0] delay: [0, 1] d4: name: "G1S4.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." source: ["f2s3b_Talker1.wav", "m2s15a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [200, "30:-1:-270"] elevation: [0, 60] delay: [0, 0] delay: [0, 1] d5: name: "G6S5.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." source: ["f5s15b_Talker1.wav", "m3s1a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [150, "80:1:20 + 360"] elevation: [0, 60] delay: [0, 0] delay: [0, 1] d6: name: "G5S6.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." 
source: ["f2s3b_Talker1.wav", "m2s15a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [100, "130:1:70 + 360"] elevation: [0, 60] delay: [0, 0] delay: [0, 1] e1: name: "G5S1.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" source: ["f2s4a_Talker1.wav", "m2s17b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["80:1:20 + 360", "80:1:20 + 360"] elevation: [10, 60] delay: [0, 0] delay: [0, 1] e2: name: "G4S2.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" source: ["m4s16a_Talker1.wav", "f1s16b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["130:1:70 + 360", "130:1:70 + 360"] elevation: [10, 60] delay: [0, 0] delay: [0, 1] e3: name: "G3S3.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" source: ["m1s16b_Talker1.wav", "f3s10b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["180:1:120 + 360", "180:1:120 + 360"] elevation: [10, 60] delay: [0, 0] delay: [0, 1] e4: name: "G2S4.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" source: ["f5s19a_Talker1.wav", "m3s1b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["-70:-1:-10 - 360", "-70:-1:-10 - 360"] elevation: [10, 60] delay: [0, 0] delay: [0, 1] e5: name: "G1S5.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" source: ["f2s4a_Talker1.wav", "m2s17b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["-20:-1:-320", "-20:-1:-320"] elevation: [10, 60] delay: [0, 0] delay: [0, 1] e6: name: "G6S6.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" source: ["f5s19a_Talker1.wav", "m3s1b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["30:-1:-270", "30:-1:-270"] elevation: [10, 60] delay: 
[0, 0] delay: [0, 1] f1: name: "G6S1.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." source: ["f5s15a_Talker1.wav", "m3s8b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["60:1:0 + 360", "60:-1:120 - 360"] elevation: [20, 50] delay: [0, 0] delay: [0, 1] f2: name: "G5S2.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." source: ["f2s7b_Talker1.wav", "m2s6b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["0:1:300", "0:-1:60 - 360"] elevation: [20, 50] delay: [0, 0] delay: [0, 1] f3: name: "G4S3.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." source: ["m4s14a_Talker1.wav", "f1s7a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["300:1:240 + 360", "300:-1:0"] elevation: [20, 50] delay: [0, 0] delay: [0, 1] f4: name: "G3S4.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." source: ["m1s7a_Talker1.wav", "f3s7a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["240:1:180 + 360", "240:-1:-60"] elevation: [20, 50] delay: [0, 0] delay: [0, 1] f5: name: "G2S5.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." source: ["f5s15a_Talker1.wav", "m3s8b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["180:1:120 + 360", "180:-1:-120"] elevation: [20, 50] delay: [0, 0] delay: [0, 1] f6: name: "G1S6.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." 
source: ["f2s7b_Talker1.wav", "m2s6b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["120:1:60 + 360", "120:-1:180 - 360"] elevation: [20, 50] delay: [0, 0] delay: [0, 1] No newline at end of file item_generation_scripts/processing/process_ism_items.py +7 −2 Original line number Diff line number Diff line Loading @@ -72,10 +72,15 @@ def generate_ism_items( y = None y_meta = None for i in range(N_sources): # parse parameters from the scene description source_file = np.atleast_1d(scene["source"])[i] source_azi = np.atleast_1d(scene["azimuth"])[i] source_ele = np.atleast_1d(scene["elevation"])[i] if 'delay' in scene.keys(): source_delay = np.atleast_1d(scene["delay"])[i] else: source_delay = np.array([0]) logger.info( f"Encoding {source_file} at position(s) {source_azi},{source_ele}" Loading Loading
item_generation_scripts/config/ISM1_CONFIG.yml +0 −36 Original line number Diff line number Diff line Loading @@ -54,7 +54,6 @@ scenes: source: "test_single.wav" azimuth: 0 elevation: 0 delay: 0 a2: name: "G6S2.wav" Loading @@ -62,7 +61,6 @@ scenes: source: "test_single.wav" azimuth: 60 elevation: 0 delay: 0 a3: name: "G5S3.wav" Loading @@ -70,7 +68,6 @@ scenes: source: "test_single.wav" azimuth: 120 elevation: 0 delay: 0 a4: name: "G4S4.wav" Loading @@ -78,7 +75,6 @@ scenes: source: "test_single.wav" azimuth: 180 elevation: 0 delay: 0 a5: name: "G3S5.wav" Loading @@ -86,7 +82,6 @@ scenes: source: "test_single.wav" azimuth: 240 elevation: 0 delay: 0 a6: name: "G2S6.wav" Loading @@ -94,7 +89,6 @@ scenes: source: "test_single.wav" azimuth: 300 elevation: 0 delay: 0 b1: name: "G2S1.wav" Loading @@ -102,7 +96,6 @@ scenes: source: "test_single.wav" azimuth: 120 elevation: 35 delay: 0 b2: name: "G1S2.wav" Loading @@ -110,7 +103,6 @@ scenes: source: "test_single.wav" azimuth: 180 elevation: 35 delay: 0 b3: name: "G6S3.wav" Loading @@ -118,7 +110,6 @@ scenes: source: "test_single.wav" azimuth: 240 elevation: 35 delay: 0 b4: name: "G5S4.wav" Loading @@ -126,7 +117,6 @@ scenes: source: "test_single.wav" azimuth: 300 elevation: 35 delay: 0 b5: name: "G4S5.wav" Loading @@ -134,7 +124,6 @@ scenes: source: "test_single.wav" azimuth: 0 elevation: 35 delay: 0 b6: name: "G3S6.wav" Loading @@ -142,7 +131,6 @@ scenes: source: "test_single.wav" azimuth: 60 elevation: 35 delay: 0 c1: name: "G3S1.wav" Loading @@ -150,7 +138,6 @@ scenes: source: "test_single.wav" azimuth: "0:1:360" elevation: 0 delay: 0 c2: name: "G2S2.wav" Loading @@ -158,7 +145,6 @@ scenes: source: "test_single.wav" azimuth: "60:1:60+360" elevation: 0 delay: 0 c3: name: "G1S3.wav" Loading @@ -166,7 +152,6 @@ scenes: source: "test_single.wav" azimuth: "120:1:120+360" elevation: 0 delay: 0 c4: name: "G6S4.wav" Loading @@ -174,7 +159,6 @@ scenes: source: "test_single.wav" azimuth: "180:1:180+360" elevation: 0 delay: 0 c5: 
name: "G5S5.wav" Loading @@ -182,7 +166,6 @@ scenes: source: "test_single.wav" azimuth: "240:1:240+360" elevation: 0 delay: 0 c6: name: "G4S6.wav" Loading @@ -190,7 +173,6 @@ scenes: source: "test_single.wav" azimuth: "300:1:300+360" elevation: 0 delay: 0 d1: name: "G4S1.wav" Loading @@ -198,7 +180,6 @@ scenes: source: "test_single.wav" azimuth: "0:-1:-360" elevation: 35 delay: 0 d2: name: "G3S2.wav" Loading @@ -206,7 +187,6 @@ scenes: source: "test_single.wav" azimuth: "60:-1:60-360" elevation: 35 delay: 0 d3: name: "G2S3.wav" Loading @@ -214,7 +194,6 @@ scenes: source: "test_single.wav" azimuth: "120:-1:120-360" elevation: 35 delay: 0 d4: name: "G1S4.wav" Loading @@ -222,7 +201,6 @@ scenes: source: "test_single.wav" azimuth: "180:-1:180-360" elevation: 35 delay: 0 d5: name: "G6S5.wav" Loading @@ -230,7 +208,6 @@ scenes: source: "test_single.wav" azimuth: "240:-1:240-360" elevation: 35 delay: 0 d6: name: "G5S6.wav" Loading @@ -238,7 +215,6 @@ scenes: source: "test_single.wav" azimuth: "300:-1:300-360" elevation: 35 delay: 0 e1: name: "G5S1.wav" Loading @@ -246,7 +222,6 @@ scenes: source: "test_single.wav" azimuth: 240 elevation: "-90:0.5:90" delay: 0 e2: name: "G4S2.wav" Loading @@ -254,7 +229,6 @@ scenes: source: "test_single.wav" azimuth: 300 elevation: 0 delay: 0 e3: name: "G3S3.wav" Loading @@ -262,7 +236,6 @@ scenes: source: "test_single.wav" azimuth: 0 elevation: "-90:0.5:90" delay: 0 e4: name: "G2S4.wav" Loading @@ -270,7 +243,6 @@ scenes: source: "test_single.wav" azimuth: 60 elevation: "-90:0.5:90" delay: 0 e5: name: "G1S5.wav" Loading @@ -278,7 +250,6 @@ scenes: source: "test_single.wav" azimuth: 120 elevation: "-90:0.5:90" delay: 0 e6: name: "G6S6.wav" Loading @@ -286,7 +257,6 @@ scenes: source: "test_single.wav" azimuth: 180 elevation: "-90:0.5:90" delay: 0 f1: name: "G6S1.wav" Loading @@ -294,7 +264,6 @@ scenes: source: "test_single.wav" azimuth: "60:0.5:60+180" elevation: "35:-0.2:-35" delay: 0 f2: name: "G5S2.wav" Loading @@ -302,7 +271,6 @@ scenes: 
source: "test_single.wav" azimuth: "120:0.5:120+180" elevation: "35:-0.2:-35" delay: 0 f3: name: "G4S3.wav" Loading @@ -310,7 +278,6 @@ scenes: source: "test_single.wav" azimuth: "180:0.5:180+180" elevation: "35:-0.2:-35" delay: 0 f4: name: "G3S4.wav" Loading @@ -318,7 +285,6 @@ scenes: source: "test_single.wav" azimuth: "240:0.5:240+180" elevation: "35:-0.2:-35" delay: 0 f5: name: "G2S5.wav" Loading @@ -326,7 +292,6 @@ scenes: source: "test_single.wav" azimuth: "300:0.5:300+180" elevation: "35:-0.2:-35" delay: 0 f6: name: "G1S6.wav" Loading @@ -334,5 +299,4 @@ scenes: source: "test_single.wav" azimuth: "0:0.5:0+180" elevation: "35:-0.2:-35" delay: 0 No newline at end of file
item_generation_scripts/config/ISM2_CONFIG.yml +73 −72 Original line number Diff line number Diff line Loading @@ -37,6 +37,7 @@ loudness: -26 ### Each scene must start with the sceneN tag ### Specify the mono source filename (the program will search for it in the input_path folder) ### Specify azimuth and elevation for each input source ### Specify the delay in seconds for each input source ### Note 1: use [val1, val2, ...] for multiple sources in a scene ### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames Loading @@ -51,288 +52,288 @@ scenes: a1: name: "G1S1.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." source: ["f2s5a_Talker1.wav", "m2s16b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [0, 50] elevation: [0, 0] delay: [0, 0] delay: [0, 1] a2: name: "G6S2.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." source: ["f5s10a_Talker1.wav", "m3s2a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [50, 350] elevation: [0, 0] delay: [0, 0] delay: [0, 1] a3: name: "G5S3.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." source: ["f2s5a_Talker1.wav", "m2s16b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [40, 290] elevation: [0, 0] delay: [0, 0] delay: [0, 1] a4: name: "G4S4.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." 
source: ["m4s11b_Talker1.wav", "f1s4b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [30, 230] elevation: [15, 15] delay: [0, 0] delay: [0, 1] a5: name: "G3S5.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." source: ["m1s4a_Talker1.wav", "f3s3a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [20, 170] elevation: [15, 15] delay: [0, 0] delay: [0, 1] a6: name: "G2S6.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." source: ["f5s10a_Talker1.wav", "m3s2a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [10, 110] elevation: [15, 15] delay: [0, 0] delay: [0, 1] b1: name: "G2S1.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." source: ["f5s10b_Talker1.wav", "m3s2b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [20, 170] elevation: [30, 30] delay: [0, 0] delay: [0, 1] b2: name: "G1S2.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." source: ["f2s1a_Talker1.wav", "m2s10a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [10, 110] elevation: [30, 30] delay: [0, 0] delay: [0, 1] b3: name: "G6S3.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." source: ["f5s10b_Talker1.wav", "m3s2b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [0, 50] elevation: [30, 30] delay: [0, 0] delay: [0, 1] b4: name: "G5S4.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." 
source: ["f2s1a_Talker1.wav", "m2s10a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [50, 350] elevation: [60, 60] delay: [0, 0] delay: [0, 1] b5: name: "G4S5.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." source: ["m4s11a_Talker1.wav", "f1s6a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [40, 290] elevation: [60, 60] delay: [0, 0] delay: [0, 1] b6: name: "G3S6.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." source: ["m1s2b_Talker1.wav", "f3s5a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [30, 230] elevation: [60, 60] delay: [0, 0] delay: [0, 1] c1: name: "G3S1.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." source: ["m1s6b_Talker1.wav", "f3s5b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [40, 290] elevation: [0, 60] delay: [0, 0] delay: [0, 1] c2: name: "G2S2.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." source: ["f5s14a_Talker1.wav", "m3s8a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [30, 230] elevation: [0, 60] delay: [0, 0] delay: [0, 1] c3: name: "G1S3.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." source: ["f2s6a_Talker1.wav", "m2s13a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [20, 170] elevation: [0, 60] delay: [0, 0] delay: [0, 1] c4: name: "G6S4.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." 
source: ["f5s14a_Talker1.wav", "m3s8a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [10, 110] elevation: [0, 60] delay: [0, 0] delay: [0, 1] c5: name: "G5S5.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." source: ["f2s6a_Talker1.wav", "m2s13a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [0, 50] elevation: [0, 60] delay: [0, 0] delay: [0, 1] c6: name: "G4S6.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." source: ["m4s13a_Talker1.wav", "f1s20a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [50, 350] elevation: [0, 60] delay: [0, 0] delay: [0, 1] d1: name: "G4S1.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." source: ["m4s12b_Talker1.wav", "f1s12b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [50, "180:1:120 + 360"] elevation: [0, 60] delay: [0, 0] delay: [0, 1] d2: name: "G3S2.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." source: ["m1s12a_Talker1.wav", "f3s20a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [300, "-70:-1:-10 - 360"] elevation: [0, 60] delay: [0, 0] delay: [0, 1] d3: name: "G2S3.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." source: ["f5s15b_Talker1.wav", "m3s1a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [250, "-20:-1:-320"] elevation: [0, 60] delay: [0, 0] delay: [0, 1] d4: name: "G1S4.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." 
source: ["f2s3b_Talker1.wav", "m2s15a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [200, "30:-1:-270"] elevation: [0, 60] delay: [0, 0] delay: [0, 1] d5: name: "G6S5.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." source: ["f5s15b_Talker1.wav", "m3s1a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [150, "80:1:20 + 360"] elevation: [0, 60] delay: [0, 0] delay: [0, 1] d6: name: "G5S6.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." source: ["f2s3b_Talker1.wav", "m2s15a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: [100, "130:1:70 + 360"] elevation: [0, 60] delay: [0, 0] delay: [0, 1] e1: name: "G5S1.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" source: ["f2s4a_Talker1.wav", "m2s17b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["80:1:20 + 360", "80:1:20 + 360"] elevation: [10, 60] delay: [0, 0] delay: [0, 1] e2: name: "G4S2.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" source: ["m4s16a_Talker1.wav", "f1s16b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["130:1:70 + 360", "130:1:70 + 360"] elevation: [10, 60] delay: [0, 0] delay: [0, 1] e3: name: "G3S3.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" source: ["m1s16b_Talker1.wav", "f3s10b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["180:1:120 + 360", "180:1:120 + 360"] elevation: [10, 60] delay: [0, 0] delay: [0, 1] e4: name: "G2S4.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" source: ["f5s19a_Talker1.wav", "m3s1b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["-70:-1:-10 - 360", "-70:-1:-10 - 360"] 
elevation: [10, 60] delay: [0, 0] delay: [0, 1] e5: name: "G1S5.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" source: ["f2s4a_Talker1.wav", "m2s17b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["-20:-1:-320", "-20:-1:-320"] elevation: [10, 60] delay: [0, 0] delay: [0, 1] e6: name: "G6S6.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" source: ["f5s19a_Talker1.wav", "m3s1b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["30:-1:-270", "30:-1:-270"] elevation: [10, 60] delay: [0, 0] delay: [0, 1] f1: name: "G6S1.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." source: ["f5s15a_Talker1.wav", "m3s8b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["60:1:0 + 360", "60:-1:120 - 360"] elevation: [20, 50] delay: [0, 0] delay: [0, 1] f2: name: "G5S2.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." source: ["f2s7b_Talker1.wav", "m2s6b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["0:1:300", "0:-1:60 - 360"] elevation: [20, 50] delay: [0, 0] delay: [0, 1] f3: name: "G4S3.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." source: ["m4s14a_Talker1.wav", "f1s7a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["300:1:240 + 360", "300:-1:0"] elevation: [20, 50] delay: [0, 0] delay: [0, 1] f4: name: "G3S4.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." 
source: ["m1s7a_Talker1.wav", "f3s7a_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["240:1:180 + 360", "240:-1:-60"] elevation: [20, 50] delay: [0, 0] delay: [0, 1] f5: name: "G2S5.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." source: ["f5s15a_Talker1.wav", "m3s8b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["180:1:120 + 360", "180:-1:-120"] elevation: [20, 50] delay: [0, 0] delay: [0, 1] f6: name: "G1S6.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." source: ["f2s7b_Talker1.wav", "m2s6b_Talker2.wav"] source: ["test_double.wav", "test_double.wav"] azimuth: ["120:1:60 + 360", "120:-1:180 - 360"] elevation: [20, 50] delay: [0, 0] delay: [0, 1] No newline at end of file
item_generation_scripts/processing/process_ism_items.py +7 −2 Original line number Diff line number Diff line Loading @@ -72,10 +72,15 @@ def generate_ism_items( y = None y_meta = None for i in range(N_sources): # parse parameters from the scene description source_file = np.atleast_1d(scene["source"])[i] source_azi = np.atleast_1d(scene["azimuth"])[i] source_ele = np.atleast_1d(scene["elevation"])[i] if 'delay' in scene.keys(): source_delay = np.atleast_1d(scene["delay"])[i] else: source_delay = np.array([0]) logger.info( f"Encoding {source_file} at position(s) {source_azi},{source_ele}" Loading