examples of scene description files (.yml) for item generation scripts (ade4f928) · Commits · IVAS Codec Public Collaboration / IVAS Processing Scripts

examples/ITEM_GENERATION_3ISM.yml

0 → 100644

+171 −0

Original line number	Diff line number	Diff line
		---
		################################################
		# Item generation - General configuration
		################################################

		### Any relative paths will be interpreted relative to the working directory the script is called from!
		### Usage of absolute paths is recommended.
		### Do not use file names with dots "." in them! This is not supported, use "_" instead
		### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions

		### Output format
		format: "ISM3"
		# masa_tc: 2
		# masa_dirs: 2
		# sba_order: 2

		### Output sampling rate in Hz
		fs: 48000

		### Generate BINAURAL output (_BINAURAL will be appended to the output filename)
		binaural_output: true

		### Normalize target loudness to X LKFS
		# loudness: -26

		### Pre-amble and post-amble durations in seconds
		preamble: 0.0
		postamble: 0.0

		### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
		add_low_level_random_noise: true

		### Process with parallel streams
		### (booleans are written in canonical lowercase form, matching the other switches in this file)
		multiprocessing: false

		################################################
		### Item generation - Filename conventions
		################################################

		### Naming convention for the input mono files
		### The input filenames are represented by:
		### lLLeeettszz.wav
		### where:
		### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com)
		### LL stands for the language: JP, FR, GE, MA, DA, EN
		### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
		### tt stands for the talker ID: f1, f2, f3, m1, m2, m3
		### s stands for 'sample' and zz is the sample number; 01, ..., 14

		### Naming convention for the generated output files
		### The output filenames are represented by:
		### leeeayszz.wav
		### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by:
		### leeeayszz.met for metadata-assisted spatial audio
		### leeeayszz.wav.o.csv for object-based audio
		### where:
		### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com)
		### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
		### a stands for 'audio'
		### y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06
		### s stands for sample and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample)
		### o stands for the object number; 0, 1, 2, 3

		### File designators. Defaults: "l" (listening lab), "EN" (language), "p07" (experiment), "g" (company)
		listening_lab: 'l'
		language: 'EN'
		exp: 'p01'
		provider: 'va'

		### Prefix inserted before every input filename (default: "")
		### Pattern letters: l = 'listening_lab' designator, L = 'language', e = 'experiment';
		### the number of consecutive letters defines the length of each field
		# use_input_prefix: "lLLeee"

		### Prefix inserted before every output filename (default: "")
		### Pattern letters: l = 'listening_lab' designator, L = 'language', e = 'experiment';
		### the number of consecutive letters defines the length of each field
		# use_output_prefix: "leee"

		################################################
		### Item generation - Scene description
		################################################

		### Each scene shall be described using the following parameters/properties:
		### output: output filename
		### description: textual description of the scene
		### input: input filename(s)
		### azimuth: azimuth in the range [-180,180]; positive values point to the left
		### elevation: elevation in the range [-90,90]; positive values indicate up
		### shift: time adjustment of the input signal (negative value delays the signal)
		###
		### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder)
		### Note 1: use brackets [val1, val2, ...] when specifying multiple values
		### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames
		### Note 3: we're using right-handed coordinate system with azimuth = 0 pointing from the nose to the screen


		### Scene definitions, keyed by a quoted scene ID (quoting keeps "01" a string rather than a number).
		### All properties are nested under their scene ID; in these examples every list
		### (azimuth, elevation, level, shift) carries one entry per file listed in `input`.
		scenes:

		  "01":
		    output: "out/VA_3obj_2tlks_music1.wav"
		    description: "Two talkers sitting at a table, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
		    input: ["items_mono/untrimmed/f2s1a_Talker1.wav", "items_mono/untrimmed/m2s10a_Talker2.wav", "items_mono/music/Sc01.wav"]
		    azimuth: [20, -40, 45]
		    elevation: [0, 0, 70]
		    level: [-26, -26, -41]
		    shift: [0.0, 0.0, 0.0]

		  "02":
		    output: "out/VA_3obj_2tlks_music2.wav"
		    description: "One talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
		    input: ["items_mono/untrimmed/f5s10b_Talker1.wav", "items_mono/untrimmed/m3s2b_Talker2.wav", "items_mono/music/Guitar1.wav"]
		    azimuth: [50, "180:1:120 + 360", -120]
		    elevation: [0, 45, 70]
		    level: [-26, -26, -41]
		    shift: [0.0, 0.0, 0.0]

		  "03":
		    output: "out/VA_3obj_2tlks_music3.wav"
		    description: "Two talkers walking side-by-side around the table, ~30% overlapping utterances."
		    input: ["items_mono/untrimmed/m1s2b_Talker1.wav", "items_mono/untrimmed/f3s5a_Talker2.wav", "items_mono/music/Track066.wav"]
		    azimuth: ["80:1:20 + 360", "80:1:20 + 360", -30]
		    elevation: [10, 60, 70]
		    level: [-26, -26, -41]
		    shift: [0.0, 0.0, 0.0]

		  "04":
		    output: "out/VA_3obj_2tlks_music4.wav"
		    description: "Two talkers walking around the table in opposite directions, ~30% overlapping utterances."
		    input: ["items_mono/untrimmed/m4s12b_Talker1.wav", "items_mono/untrimmed/f1s12b_Talker2.wav", "items_mono/music/Sample02.wav"]
		    azimuth: ["60:1:0 + 360", "60:-1:120 - 360", 100]
		    elevation: [20, 50, 70]
		    level: [-26, -26, -41]
		    shift: [0.0, 0.0, 0.0]

		  "05":
		    output: "out/VA_3obj_3tlks_1.wav"
		    description: "Three static talkers, partially overlapping utterances."
		    input: ["items_mono/untrimmed/m4s12b_Talker1.wav", "items_mono/untrimmed/f1s12b_Talker2.wav", "items_mono/untrimmed/m3s1a_Talker2.wav"]
		    azimuth: [30, -45, 100]
		    elevation: [20, 20, 30]
		    level: [-26, -26, -26]
		    shift: [0.0, 0.0, -2.5]

		  "06":
		    output: "out/VA_3obj_3tlks_2.wav"
		    description: "One walking talker, two static talkers, non-overlapping utterances."
		    input: ["items_mono/untrimmed/f2s5a_Talker1.wav", "items_mono/untrimmed/m2s16b_Talker2.wav", "items_mono/untrimmed/m3s8b_Talker2.wav"]
		    azimuth: ["-20:0.5:360", 60, -45]
		    elevation: [10, 10, 10]
		    level: [-26, -26, -26]
		    shift: [0.0, 0.0, -3.0]

		  "07":
		    output: "out/VA_3obj_3tlks_3.wav"
		    description: "Two moving talkers, one static talker, partially overlapping utterances."
		    input: ["items_mono/untrimmed/f1s16b_Talker2.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"]
		    azimuth: [-90, "0:1:360", "0:-1:-360"]
		    elevation: [0, 30, 30]
		    level: [-26, -26, -26]
		    shift: [0.0, 0.0, -3.0]

		  "08":
		    output: "out/VA_3obj_3tlks_4.wav"
		    description: "Three walking talkers, partially overlapping utterances."
		    input: ["items_mono/untrimmed/f5s15b_Talker1.wav", "items_mono/untrimmed/m3s1a_Talker2.wav", "items_mono/untrimmed/m2s17b_Talker2.wav"]
		    azimuth: ["-90:-1:-360", "-10:1.5:360", "70:1:360"]
		    elevation: [0, 20, 0]
		    level: [-26, -26, -26]
		    shift: [0.0, 0.0, -3.5]

examples/ITEM_GENERATION_FOA.yml

0 → 100644

+154 −0

Original line number	Diff line number	Diff line
		---
		################################################
		# Item generation - General configuration
		################################################

		### Any relative paths will be interpreted relative to the working directory the script is called from!
		### Usage of absolute paths is recommended.
		### Do not use file names with dots "." in them! This is not supported, use "_" instead
		### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions

		### Output format
		format: "FOA"
		# masa_tc: 2
		# masa_dirs: 2
		# sba_order: 2

		### Output sampling rate in Hz
		fs: 48000

		### Generate BINAURAL output (_BINAURAL will be appended to the output filename)
		binaural_output: true

		### Normalize target loudness to X LKFS
		loudness: -26

		### Pre-amble and post-amble durations in seconds
		preamble: 0.5
		postamble: 1.0

		### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
		### (booleans are written in canonical lowercase form, matching `binaural_output` above)
		add_low_level_random_noise: false

		### Process with parallel streams
		multiprocessing: false

		################################################
		### Item generation - Filename conventions
		################################################

		### Naming convention for the input mono files
		### The input filenames are represented by:
		### lLLeeettszz.wav
		### where:
		### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com)
		### LL stands for the language: JP, FR, GE, MA, DA, EN
		### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
		### tt stands for the talker ID: f1, f2, f3, m1, m2, m3
		### s stands for 'sample' and zz is the sample number; 01, ..., 14

		### Naming convention for the generated output files
		### The output filenames are represented by:
		### leeeayszz.wav
		### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by:
		### leeeayszz.met for metadata-assisted spatial audio
		### leeeayszz.wav.o.csv for object-based audio
		### where:
		### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com)
		### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
		### a stands for 'audio'
		### y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06
		### s stands for sample and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample)
		### o stands for the object number; 0, 1, 2, 3

		### File designators. Defaults: "l" (listening lab), "EN" (language), "p07" (experiment), "g" (company)
		listening_lab: 'b'
		language: 'GE'
		exp: 'p02'
		provider: 'g'

		### Prefix inserted before every input filename (default: "")
		### Pattern letters: l = 'listening_lab' designator, L = 'language', e = 'experiment';
		### the number of consecutive letters defines the length of each field
		# use_input_prefix: "lLLeee"

		### Prefix inserted before every output filename (default: "")
		### Pattern letters: l = 'listening_lab' designator, L = 'language', e = 'experiment';
		### the number of consecutive letters defines the length of each field
		use_output_prefix: 'leee'

		################################################
		### Item generation - Scene description
		################################################

		### Each scene shall be described using the following parameters/properties:
		### output: output filename
		### description: textual description of the scene
		### input: input filename(s)
		### IR: filenames(s) of the input IRs
		### azimuth: azimuth in the range [-180,180]; positive values point to the left
		### elevation: elevation in the range [-90,90]; positive values indicate up
		### shift: time adjustment of the input signal (negative value delays the signal)
		###
		### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder)
		### Note 1: use brackets [val1, val2, ...] when specifying multiple values
		### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames
		### Note 3: we're using right-handed coordinate system with azimuth = 0 pointing from the nose to the screen


		### Scene definitions, keyed by a quoted scene ID (quoting keeps "01" a string rather than a number).
		### All properties are nested under their scene ID; in these examples `IR` and `shift`
		### carry one entry per file listed in `input`.
		scenes:
		  "01":
		    output: "out/s01.wav"
		    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
		    input: ["items_mono/untrimmed/f1s4b_Talker2.wav", "items_mono/untrimmed/f2s1a_Talker1.wav"]
		    IR: ["IRs/IR_do_p04_e_01_01_FOA.wav", "IRs/IR_do_p04_e_02_01_FOA.wav"]
		    shift: [0.0, -1.0]

		  "02":
		    output: "out/s02.wav"
		    description: "Car with AB microphone pickup, overlap between the talkers, car noise."
		    input: ["items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/f2s3b_Talker1.wav"]
		    IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"]
		    shift: [0.0, +1.0]

		  "03":
		    output: "out/s03.wav"
		    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
		    input: ["items_mono/untrimmed/f3s3a_Talker2.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"]
		    IR: ["IRs/IR_do_p04_e_05_01_FOA.wav", "IRs/IR_do_p04_e_06_01_FOA.wav"]
		    shift: [0.0, -1.0]

		  "04":
		    output: "out/s04.wav"
		    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
		    input: ["items_mono/untrimmed/f2s7b_Talker1.wav", "items_mono/untrimmed/f5s15a_Talker1.wav"]
		    IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_08_01_FOA.wav"]
		    shift: [0.0, -1.0]

		  "05":
		    output: "out/s05.wav"
		    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
		    input: ["items_mono/untrimmed/m2s15a_Talker2.wav", "items_mono/untrimmed/m1s4a_Talker1.wav"]
		    IR: ["IRs/IR_do_p04_e_07_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"]
		    shift: [0.0, -1.0]

		  "06":
		    output: "out/s06.wav"
		    description: "Car with AB microphone pickup, no overlap between the talkers."
		    input: ["items_mono/untrimmed/m3s8a_Talker2.wav", "items_mono/untrimmed/m4s13a_Talker1.wav"]
		    IR: ["IRs/IR_do_p04_e_03_01_FOA.wav", "IRs/IR_do_p04_e_01_01_FOA.wav"]
		    shift: [0.0, -1.0]

		  "07":
		    output: "out/s07.wav"
		    description: "Preliminary: Car with AB microphone pickup, no overlap between the talkers."
		    input: ["items_mono/untrimmed/f1s20a_Talker2.wav", "items_mono/untrimmed/f5s15b_Talker1.wav"]
		    IR: ["IRs/IR_do_p04_e_02_01_FOA.wav", "IRs/IR_do_p04_e_07_01_FOA.wav"]
		    shift: [0.0, -1.0]

		  "08":
		    output: "out/s08.wav"
		    description: "Car with AB microphone pickup, overlap between the talkers."
		    input: ["items_mono/untrimmed/m2s6b_Talker2.wav", "items_mono/untrimmed/f5s14a_Talker1.wav"]
		    IR: ["IRs/IR_do_p04_e_08_01_FOA.wav", "IRs/IR_do_p04_e_04_01_FOA.wav"]
		    shift: [0.0, +1.0]

examples/ITEM_GENERATION_OMASA.yml

0 → 100644

+170 −0

Original line number	Diff line number	Diff line
		---
		################################################
		# Item generation - General configuration
		################################################

		### Any relative paths will be interpreted relative to the working directory the script is called from!
		### Usage of absolute paths is recommended.
		### Do not use file names with dots "." in them! This is not supported, use "_" instead
		### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions

		### Output format
		format: "OMASA"
		masa_tc: 2
		masa_dirs: 2
		# sba_order: 2

		### Output sampling rate in Hz
		fs: 48000

		### Generate BINAURAL output (_BINAURAL will be appended to the output filename)
		binaural_output: true

		### Normalize target loudness to X LKFS
		# loudness: -26

		### Pre-amble and post-amble durations in seconds
		preamble: 0.0
		postamble: 0.0

		### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
		add_low_level_random_noise: true

		### Process with parallel streams
		### (booleans are written in canonical lowercase form, matching the other switches in this file)
		multiprocessing: false

		################################################
		### Item generation - Filename conventions
		################################################

		### Naming convention for the input mono files
		### The input filenames are represented by:
		### lLLeeettszz.wav
		### where:
		### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com)
		### LL stands for the language: JP, FR, GE, MA, DA, EN
		### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
		### tt stands for the talker ID: f1, f2, f3, m1, m2, m3
		### s stands for 'sample' and zz is the sample number; 01, ..., 14

		### Naming convention for the generated output files
		### The output filenames are represented by:
		### leeeayszz.wav
		### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by:
		### leeeayszz.met for metadata-assisted spatial audio
		### leeeayszz.wav.o.csv for object-based audio
		### where:
		### l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com)
		### eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
		### a stands for 'audio'
		### y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06
		### s stands for sample and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample)
		### o stands for the object number; 0, 1, 2, 3

		### File designators. Defaults: "l" (listening lab), "EN" (language), "p07" (experiment), "g" (company)
		listening_lab: 'l'
		language: 'EN'
		exp: 'p01'
		provider: 'va'

		### Prefix inserted before every input filename (default: "")
		### Pattern letters: l = 'listening_lab' designator, L = 'language', e = 'experiment';
		### the number of consecutive letters defines the length of each field
		# use_input_prefix: "lLLeee"

		### Prefix inserted before every output filename (default: "")
		### Pattern letters: l = 'listening_lab' designator, L = 'language', e = 'experiment';
		### the number of consecutive letters defines the length of each field
		# use_output_prefix: "leee"

		################################################
		### Item generation - Scene description
		################################################

		### Each scene shall be described using the following parameters/properties:
		### output: output filename
		### description: textual description of the scene
		### input: input filename(s)
		### azimuth: azimuth in the range [-180,180]; positive values point to the left
		### elevation: elevation in the range [-90,90]; positive values indicate up
		### shift: time adjustment of the input signal (negative value delays the signal)
		###
		### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder)
		### Note 1: use brackets [val1, val2, ...] when specifying multiple values
		### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames
		### Note 3: we're using right-handed coordinate system with azimuth = 0 pointing from the nose to the screen

		### Scene definitions, keyed by a quoted scene ID (quoting keeps "01" a string rather than a number).
		### All properties are nested under their scene ID; in these examples every list
		### (azimuth, elevation, level, shift) carries one entry per file listed in `input`.
		### Each scene has its own `output` path so that scenes do not share a result file.
		scenes:
		  "01":
		    output: "out/VA_3tlks_music1.wav"
		    description: "Three talkers over music background"
		    input: ["items_hoa2/bm7aa1s01.wav", "items_mono/untrimmed/m4s12b_Talker1.wav", "items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/m3s1a_Talker2.wav"]
		    azimuth: [0, 30, -45, 100]
		    elevation: [0, 20, 20, 30]
		    level: [-36, -26, -26, -26]
		    shift: [0.0, 0.0, 0.0, -2.0]

		  "02":
		    output: "out/VA_3tlks_music2.wav"
		    description: "Three talkers over music background"
		    input: ["items_hoa2/bm7aa1s03.wav", "items_mono/untrimmed/f2s5a_Talker1.wav", "items_mono/untrimmed/f5s10a_Talker1.wav", "items_mono/untrimmed/m3s8b_Talker2.wav"]
		    azimuth: [0, "-20:0.5:360", "60:-0.5:-360", 60]
		    elevation: [0, 10, 10, 10]
		    level: [-46, -26, -26, -26]
		    shift: [0.0, 0.0, -2.0, -2.5]

		  "03":
		    output: "out/VA_3tlks_music3.wav"
		    description: "Three talkers over music background"
		    input: ["items_hoa2/bm7aa1s05.wav", "items_mono/untrimmed/f1s16b_Talker2.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"]
		    azimuth: [0, -90, "0:1:360", "0:-1:-360"]
		    elevation: [0, 0, 30, 30]
		    level: [-36, -26, -26, -26]
		    shift: [0.0, 0.0, 0.0, -2.6]

		  "04":
		    output: "out/VA_3tlks_music4.wav"
		    description: "Three talkers over music background"
		    input: ["items_hoa2/bm7aa1s07.wav", "items_mono/untrimmed/f5s15b_Talker1.wav", "items_mono/untrimmed/m1s7a_Talker1.wav", "items_mono/untrimmed/m1s6b_Talker1.wav"]
		    azimuth: [0, "-90:-1:-360", "-10:1.5:360", "70:1:360"]
		    elevation: [0, 0, 20, 0]
		    level: [-46, -26, -36, -26]
		    shift: [0.0, -2.0, 0.0, -3.5]

		  "05":
		    output: "out/VA_2tlks_1obj_music1.wav"
		    description: "Two talkers, one musical object over music background"
		    input: ["items_hoa2/bm7aa1s09.wav", "items_mono/untrimmed/f2s1a_Talker1.wav", "items_mono/untrimmed/f2s5a_Talker1.wav", "music/item_lxa3s3.48k.wav"]
		    azimuth: [0, 20, -40, 45]
		    elevation: [0, 0, 0, 70]
		    level: [-36, -36, -26, -41]
		    shift: [0.0, 0.0, -2.0, 0.0]

		  "06":
		    output: "out/VA_2tlks_1obj_music2.wav"
		    description: "Two talkers, one musical object over music background"
		    input: ["items_hoa2/bm7aa1s11.wav", "items_mono/untrimmed/f5s10b_Talker1.wav", "items_mono/untrimmed/m1s4a_Talker1.wav", "music/item_lxa3s5.48k.wav"]
		    azimuth: [0, 50, "180:1:360", -120]
		    elevation: [0, 0, 45, 70]
		    level: [-46, -26, -26, -41]
		    shift: [0.0, 0.0, -2.5, 0.0]

		  "07":
		    output: "out/VA_2tlks_1obj_music3.wav"
		    description: "Two talkers, one musical object over music background"
		    input: ["items_hoa2/bm7aa1s13.wav", "items_mono/untrimmed/m1s2b_Talker1.wav", "items_mono/untrimmed/f3s5a_Talker2.wav", "music/641692__theflyfishingfilmmaker__classical-violin-minor-10s-mono.wav"]
		    azimuth: [0, "80:1:20 + 360", "80:1:20 + 360", -30]
		    elevation: [0, 10, 60, 70]
		    level: [-36, -26, -26, -36]
		    shift: [0.0, 0.0, 0.0, 0.0]

		  "08":
		    output: "out/VA_2tlks_1obj_music4.wav"
		    description: "Two talkers, one musical object over music background"
		    input: ["items_hoa2/bm7aa1s15.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f2s4a_Talker1.wav", "music/item_lxa4s2.48k.wav"]
		    azimuth: [0, "60:1:0 + 360", "60:-1:120 - 360", 100]
		    elevation: [0, 20, 50, 70]
		    level: [-46, -26, -26, -41]
		    shift: [0.0, 0.0, -1.0, -0.5]

examples/ITEM_GENERATION_OSBA.yml

0 → 100644

+170 −0

File added.

Preview size limit exceeded, changes collapsed.

examples/ITEM_GENERATION_STEREO.yml

0 → 100644

+154 −0

File added.

Preview size limit exceeded, changes collapsed.