support delay of mono items to create some overlap (086b2309) · Commits · IVAS Codec Public Collaboration / IVAS Processing Scripts

item_generation_scripts/config/ISM1_CONFIG.yml

+0 −36

Original line number	Diff line number	Diff line
		@@ -54,7 +54,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 0
		elevation: 0
		delay: 0

		a2:
		name: "G6S2.wav"
		@@ -62,7 +61,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 60
		elevation: 0
		delay: 0

		a3:
		name: "G5S3.wav"
		@@ -70,7 +68,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 120
		elevation: 0
		delay: 0

		a4:
		name: "G4S4.wav"
		@@ -78,7 +75,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 180
		elevation: 0
		delay: 0

		a5:
		name: "G3S5.wav"
		@@ -86,7 +82,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 240
		elevation: 0
		delay: 0

		a6:
		name: "G2S6.wav"
		@@ -94,7 +89,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 300
		elevation: 0
		delay: 0

		b1:
		name: "G2S1.wav"
		@@ -102,7 +96,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 120
		elevation: 35
		delay: 0

		b2:
		name: "G1S2.wav"
		@@ -110,7 +103,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 180
		elevation: 35
		delay: 0

		b3:
		name: "G6S3.wav"
		@@ -118,7 +110,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 240
		elevation: 35
		delay: 0

		b4:
		name: "G5S4.wav"
		@@ -126,7 +117,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 300
		elevation: 35
		delay: 0

		b5:
		name: "G4S5.wav"
		@@ -134,7 +124,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 0
		elevation: 35
		delay: 0

		b6:
		name: "G3S6.wav"
		@@ -142,7 +131,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 60
		elevation: 35
		delay: 0

		c1:
		name: "G3S1.wav"
		@@ -150,7 +138,6 @@ scenes:
		source: "test_single.wav"
		azimuth: "0:1:360"
		elevation: 0
		delay: 0

		c2:
		name: "G2S2.wav"
		@@ -158,7 +145,6 @@ scenes:
		source: "test_single.wav"
		azimuth: "60:1:60+360"
		elevation: 0
		delay: 0

		c3:
		name: "G1S3.wav"
		@@ -166,7 +152,6 @@ scenes:
		source: "test_single.wav"
		azimuth: "120:1:120+360"
		elevation: 0
		delay: 0

		c4:
		name: "G6S4.wav"
		@@ -174,7 +159,6 @@ scenes:
		source: "test_single.wav"
		azimuth: "180:1:180+360"
		elevation: 0
		delay: 0

		c5:
		name: "G5S5.wav"
		@@ -182,7 +166,6 @@ scenes:
		source: "test_single.wav"
		azimuth: "240:1:240+360"
		elevation: 0
		delay: 0

		c6:
		name: "G4S6.wav"
		@@ -190,7 +173,6 @@ scenes:
		source: "test_single.wav"
		azimuth: "300:1:300+360"
		elevation: 0
		delay: 0

		d1:
		name: "G4S1.wav"
		@@ -198,7 +180,6 @@ scenes:
		source: "test_single.wav"
		azimuth: "0:-1:-360"
		elevation: 35
		delay: 0

		d2:
		name: "G3S2.wav"
		@@ -206,7 +187,6 @@ scenes:
		source: "test_single.wav"
		azimuth: "60:-1:60-360"
		elevation: 35
		delay: 0

		d3:
		name: "G2S3.wav"
		@@ -214,7 +194,6 @@ scenes:
		source: "test_single.wav"
		azimuth: "120:-1:120-360"
		elevation: 35
		delay: 0

		d4:
		name: "G1S4.wav"
		@@ -222,7 +201,6 @@ scenes:
		source: "test_single.wav"
		azimuth: "180:-1:180-360"
		elevation: 35
		delay: 0

		d5:
		name: "G6S5.wav"
		@@ -230,7 +208,6 @@ scenes:
		source: "test_single.wav"
		azimuth: "240:-1:240-360"
		elevation: 35
		delay: 0

		d6:
		name: "G5S6.wav"
		@@ -238,7 +215,6 @@ scenes:
		source: "test_single.wav"
		azimuth: "300:-1:300-360"
		elevation: 35
		delay: 0

		e1:
		name: "G5S1.wav"
		@@ -246,7 +222,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 240
		elevation: "-90:0.5:90"
		delay: 0

		e2:
		name: "G4S2.wav"
		@@ -254,7 +229,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 300
		elevation: 0
		delay: 0

		e3:
		name: "G3S3.wav"
		@@ -262,7 +236,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 0
		elevation: "-90:0.5:90"
		delay: 0

		e4:
		name: "G2S4.wav"
		@@ -270,7 +243,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 60
		elevation: "-90:0.5:90"
		delay: 0

		e5:
		name: "G1S5.wav"
		@@ -278,7 +250,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 120
		elevation: "-90:0.5:90"
		delay: 0

		e6:
		name: "G6S6.wav"
		@@ -286,7 +257,6 @@ scenes:
		source: "test_single.wav"
		azimuth: 180
		elevation: "-90:0.5:90"
		delay: 0

		f1:
		name: "G6S1.wav"
		@@ -294,7 +264,6 @@ scenes:
		source: "test_single.wav"
		azimuth: "60:0.5:60+180"
		elevation: "35:-0.2:-35"
		delay: 0

		f2:
		name: "G5S2.wav"
		@@ -302,7 +271,6 @@ scenes:
		source: "test_single.wav"
		azimuth: "120:0.5:120+180"
		elevation: "35:-0.2:-35"
		delay: 0

		f3:
		name: "G4S3.wav"
		@@ -310,7 +278,6 @@ scenes:
		source: "test_single.wav"
		azimuth: "180:0.5:180+180"
		elevation: "35:-0.2:-35"
		delay: 0

		f4:
		name: "G3S4.wav"
		@@ -318,7 +285,6 @@ scenes:
		source: "test_single.wav"
		azimuth: "240:0.5:240+180"
		elevation: "35:-0.2:-35"
		delay: 0

		f5:
		name: "G2S5.wav"
		@@ -326,7 +292,6 @@ scenes:
		source: "test_single.wav"
		azimuth: "300:0.5:300+180"
		elevation: "35:-0.2:-35"
		delay: 0

		f6:
		name: "G1S6.wav"
		@@ -334,5 +299,4 @@ scenes:
		source: "test_single.wav"
		azimuth: "0:0.5:0+180"
		elevation: "35:-0.2:-35"
		delay: 0

		No newline at end of file

item_generation_scripts/config/ISM2_CONFIG.yml

+73 −72

Original line number	Diff line number	Diff line
		@@ -37,6 +37,7 @@ loudness: -26
		### Each scene must start with the sceneN tag
		### Specify the mono source filename (the program will search for it in the input_path folder)
		### Specify azimuth and elevation for each input source
		### Specify the delay in seconds for each input source
		### Note 1: use [val1, val2, ...] for multiple sources in a scene
		### Note 2: use the "start:step:stop" notation for moving sources, where the step is applied once per 20 ms frame

		@@ -51,288 +52,288 @@ scenes:
		a1:
		name: "G1S1.wav"
		description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
		source: ["f2s5a_Talker1.wav", "m2s16b_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [0, 50]
		elevation: [0, 0]
		delay: [0, 0]
		delay: [0, 1]

		a2:
		name: "G6S2.wav"
		description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
		source: ["f5s10a_Talker1.wav", "m3s2a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [50, 350]
		elevation: [0, 0]
		delay: [0, 0]
		delay: [0, 1]

		a3:
		name: "G5S3.wav"
		description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
		source: ["f2s5a_Talker1.wav", "m2s16b_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [40, 290]
		elevation: [0, 0]
		delay: [0, 0]
		delay: [0, 1]

		a4:
		name: "G4S4.wav"
		description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
		source: ["m4s11b_Talker1.wav", "f1s4b_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [30, 230]
		elevation: [15, 15]
		delay: [0, 0]
		delay: [0, 1]

		a5:
		name: "G3S5.wav"
		description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
		source: ["m1s4a_Talker1.wav", "f3s3a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [20, 170]
		elevation: [15, 15]
		delay: [0, 0]
		delay: [0, 1]

		a6:
		name: "G2S6.wav"
		description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
		source: ["f5s10a_Talker1.wav", "m3s2a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [10, 110]
		elevation: [15, 15]
		delay: [0, 0]
		delay: [0, 1]

		b1:
		name: "G2S1.wav"
		description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
		source: ["f5s10b_Talker1.wav", "m3s2b_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [20, 170]
		elevation: [30, 30]
		delay: [0, 0]
		delay: [0, 1]

		b2:
		name: "G1S2.wav"
		description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
		source: ["f2s1a_Talker1.wav", "m2s10a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [10, 110]
		elevation: [30, 30]
		delay: [0, 0]
		delay: [0, 1]

		b3:
		name: "G6S3.wav"
		description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
		source: ["f5s10b_Talker1.wav", "m3s2b_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [0, 50]
		elevation: [30, 30]
		delay: [0, 0]
		delay: [0, 1]

		b4:
		name: "G5S4.wav"
		description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
		source: ["f2s1a_Talker1.wav", "m2s10a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [50, 350]
		elevation: [60, 60]
		delay: [0, 0]
		delay: [0, 1]

		b5:
		name: "G4S5.wav"
		description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
		source: ["m4s11a_Talker1.wav", "f1s6a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [40, 290]
		elevation: [60, 60]
		delay: [0, 0]
		delay: [0, 1]

		b6:
		name: "G3S6.wav"
		description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
		source: ["m1s2b_Talker1.wav", "f3s5a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [30, 230]
		elevation: [60, 60]
		delay: [0, 0]
		delay: [0, 1]

		c1:
		name: "G3S1.wav"
		description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
		source: ["m1s6b_Talker1.wav", "f3s5b_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [40, 290]
		elevation: [0, 60]
		delay: [0, 0]
		delay: [0, 1]

		c2:
		name: "G2S2.wav"
		description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
		source: ["f5s14a_Talker1.wav", "m3s8a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [30, 230]
		elevation: [0, 60]
		delay: [0, 0]
		delay: [0, 1]

		c3:
		name: "G1S3.wav"
		description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
		source: ["f2s6a_Talker1.wav", "m2s13a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [20, 170]
		elevation: [0, 60]
		delay: [0, 0]
		delay: [0, 1]

		c4:
		name: "G6S4.wav"
		description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
		source: ["f5s14a_Talker1.wav", "m3s8a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [10, 110]
		elevation: [0, 60]
		delay: [0, 0]
		delay: [0, 1]

		c5:
		name: "G5S5.wav"
		description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
		source: ["f2s6a_Talker1.wav", "m2s13a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [0, 50]
		elevation: [0, 60]
		delay: [0, 0]
		delay: [0, 1]

		c6:
		name: "G4S6.wav"
		description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
		source: ["m4s13a_Talker1.wav", "f1s20a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [50, 350]
		elevation: [0, 60]
		delay: [0, 0]
		delay: [0, 1]

		d1:
		name: "G4S1.wav"
		description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
		source: ["m4s12b_Talker1.wav", "f1s12b_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [50, "180:1:120 + 360"]
		elevation: [0, 60]
		delay: [0, 0]
		delay: [0, 1]

		d2:
		name: "G3S2.wav"
		description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
		source: ["m1s12a_Talker1.wav", "f3s20a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [300, "-70:-1:-10 - 360"]
		elevation: [0, 60]
		delay: [0, 0]
		delay: [0, 1]

		d3:
		name: "G2S3.wav"
		description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
		source: ["f5s15b_Talker1.wav", "m3s1a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [250, "-20:-1:-320"]
		elevation: [0, 60]
		delay: [0, 0]
		delay: [0, 1]

		d4:
		name: "G1S4.wav"
		description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
		source: ["f2s3b_Talker1.wav", "m2s15a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [200, "30:-1:-270"]
		elevation: [0, 60]
		delay: [0, 0]
		delay: [0, 1]

		d5:
		name: "G6S5.wav"
		description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
		source: ["f5s15b_Talker1.wav", "m3s1a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [150, "80:1:20 + 360"]
		elevation: [0, 60]
		delay: [0, 0]
		delay: [0, 1]

		d6:
		name: "G5S6.wav"
		description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
		source: ["f2s3b_Talker1.wav", "m2s15a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: [100, "130:1:70 + 360"]
		elevation: [0, 60]
		delay: [0, 0]
		delay: [0, 1]

		e1:
		name: "G5S1.wav"
		description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
		source: ["f2s4a_Talker1.wav", "m2s17b_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: ["80:1:20 + 360", "80:1:20 + 360"]
		elevation: [10, 60]
		delay: [0, 0]
		delay: [0, 1]

		e2:
		name: "G4S2.wav"
		description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
		source: ["m4s16a_Talker1.wav", "f1s16b_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: ["130:1:70 + 360", "130:1:70 + 360"]
		elevation: [10, 60]
		delay: [0, 0]
		delay: [0, 1]

		e3:
		name: "G3S3.wav"
		description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
		source: ["m1s16b_Talker1.wav", "f3s10b_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: ["180:1:120 + 360", "180:1:120 + 360"]
		elevation: [10, 60]
		delay: [0, 0]
		delay: [0, 1]

		e4:
		name: "G2S4.wav"
		description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
		source: ["f5s19a_Talker1.wav", "m3s1b_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: ["-70:-1:-10 - 360", "-70:-1:-10 - 360"]
		elevation: [10, 60]
		delay: [0, 0]
		delay: [0, 1]

		e5:
		name: "G1S5.wav"
		description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
		source: ["f2s4a_Talker1.wav", "m2s17b_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: ["-20:-1:-320", "-20:-1:-320"]
		elevation: [10, 60]
		delay: [0, 0]
		delay: [0, 1]

		e6:
		name: "G6S6.wav"
		description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
		source: ["f5s19a_Talker1.wav", "m3s1b_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: ["30:-1:-270", "30:-1:-270"]
		elevation: [10, 60]
		delay: [0, 0]
		delay: [0, 1]

		f1:
		name: "G6S1.wav"
		description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
		source: ["f5s15a_Talker1.wav", "m3s8b_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: ["60:1:0 + 360", "60:-1:120 - 360"]
		elevation: [20, 50]
		delay: [0, 0]
		delay: [0, 1]

		f2:
		name: "G5S2.wav"
		description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
		source: ["f2s7b_Talker1.wav", "m2s6b_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: ["0:1:300", "0:-1:60 - 360"]
		elevation: [20, 50]
		delay: [0, 0]
		delay: [0, 1]

		f3:
		name: "G4S3.wav"
		description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
		source: ["m4s14a_Talker1.wav", "f1s7a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: ["300:1:240 + 360", "300:-1:0"]
		elevation: [20, 50]
		delay: [0, 0]
		delay: [0, 1]

		f4:
		name: "G3S4.wav"
		description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
		source: ["m1s7a_Talker1.wav", "f3s7a_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: ["240:1:180 + 360", "240:-1:-60"]
		elevation: [20, 50]
		delay: [0, 0]
		delay: [0, 1]

		f5:
		name: "G2S5.wav"
		description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
		source: ["f5s15a_Talker1.wav", "m3s8b_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: ["180:1:120 + 360", "180:-1:-120"]
		elevation: [20, 50]
		delay: [0, 0]
		delay: [0, 1]

		f6:
		name: "G1S6.wav"
		description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
		source: ["f2s7b_Talker1.wav", "m2s6b_Talker2.wav"]
		source: ["test_double.wav", "test_double.wav"]
		azimuth: ["120:1:60 + 360", "120:-1:180 - 360"]
		elevation: [20, 50]
		delay: [0, 0]
		delay: [0, 1]

		No newline at end of file

item_generation_scripts/processing/process_ism_items.py

+7 −2

Original line number	Diff line number	Diff line
		@@ -72,10 +72,15 @@ def generate_ism_items(
		y = None
		y_meta = None
		for i in range(N_sources):

		# parse parameters from the scene description
		source_file = np.atleast_1d(scene["source"])[i]
		source_azi = np.atleast_1d(scene["azimuth"])[i]
		source_ele = np.atleast_1d(scene["elevation"])[i]
		if 'delay' in scene.keys():
		source_delay = np.atleast_1d(scene["delay"])[i]
		else:
		source_delay = np.array([0])

		logger.info(
		f"Encoding {source_file} at position(s) {source_azi},{source_ele}"