Commit ade4f928 authored by Vladimir Malenovsky's avatar Vladimir Malenovsky
Browse files

examples of scene description files (.yml) for item generation scripts

parent 0774f358
Loading
Loading
Loading
Loading
+171 −0
Original line number Diff line number Diff line
---
################################################
# Item generation - General configuration
################################################

### Any relative paths will be interpreted relative to the working directory the script is called from!
### Usage of absolute paths is recommended.
### Do not use file names with dots "." in them! This is not supported, use "_" instead
### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions

### Output format
format: "ISM3"
### Additional options, disabled in this example
# masa_tc: 2
# masa_dirs: 2
# sba_order: 2

### Output sampling rate in Hz
fs: 48000

### Generate BINAURAL output (_BINAURAL will be appended to the output filename)
binaural_output: true

### Normalize target loudness to X LKFS (disabled in this example)
# loudness: -26

### Apply pre-amble and post-amble in X seconds
preamble: 0.0
postamble: 0.0

### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
add_low_level_random_noise: true

### Process with parallel streams (true/false)
multiprocessing: false

################################################
### Item generation - Filename conventions
################################################

### Naming convention for the input mono files
### The input filenames are represented by:
###   lLLeeettszz.wav
### where: 
###   l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) 
###   LL stands for the language: JP, FR, GE, MA, DA, EN
###   eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
###   tt stands for the talker ID: f1, f2, f3, m1, m2, m3
###   s stands for 'sample' and zz is the sample number; 01, ..., 14

### Naming convention for the generated output files
### The output filenames are represented by:
###   leeeayszz.wav
### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by:
###   leeeayszz.met for metadata-assisted spatial audio
###   leeeayszz.wav.o.csv for object-based audio
### where: 
###   l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) 
###   eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
###   a stands for 'audio'
###   y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06
###   s stands for sample and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample)
###   o stands for the object number; 0, 1, 2, 3

### File designators, default is "l" for listening lab, "EN" for language, "p07" for experiment and "g" for company
### In this example, "exp" and "provider" are set to values other than the stated defaults.
listening_lab: "l"
language: "EN"
exp: "p01"
provider: "va"

### Insert prefix for all input filenames (default: "")
### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment' 
### the number of consecutive letters defines the length of each field
# use_input_prefix: "lLLeee"

### Insert prefix for all output filenames (default: "")
### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment' 
### the number of consecutive letters defines the length of each field
# use_output_prefix: "leee"

################################################
### Item generation - Scene description
################################################

### Each scene shall be described using the following parameters/properties:
###   output:      output filename
###   description: textual description of the scene
###   input:       input filename(s)
###   azimuth:     azimuth in the range [-180,180]; positive values point to the left
###   elevation:   elevation in the range [-90,90]; positive values indicate up
###   shift:       time adjustment of the input signal (negative value delays the signal)
###
### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder)
### Note 1: use brackets [val1, val2, ...] when specifying multiple values 
### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames
### Note 3: we're using right-handed coordinate system with azimuth = 0 pointing from the nose to the screen


scenes:
  # Every scene below lists three input files. The azimuth, elevation, level
  # and shift lists each carry three entries as well (presumably index-aligned
  # with `input` - confirm against the generation script).

  "01":
    output: "out/VA_3obj_2tlks_music1.wav"
    description: "Two talkers sitting at a table, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
    input:
      - "items_mono/untrimmed/f2s1a_Talker1.wav"
      - "items_mono/untrimmed/m2s10a_Talker2.wav"
      - "items_mono/music/Sc01.wav"
    azimuth: [20, -40, 45]
    elevation: [0, 0, 70]
    level: [-26, -26, -41]
    shift: [0.0, 0.0, 0.0]

  "02":
    output: "out/VA_3obj_2tlks_music2.wav"
    description: "One talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
    input:
      - "items_mono/untrimmed/f5s10b_Talker1.wav"
      - "items_mono/untrimmed/m3s2b_Talker2.wav"
      - "items_mono/music/Guitar1.wav"
    azimuth: [50, "180:1:120 + 360", -120]
    elevation: [0, 45, 70]
    level: [-26, -26, -41]
    shift: [0.0, 0.0, 0.0]

  "03":
    output: "out/VA_3obj_2tlks_music3.wav"
    description: "Two talkers walking side-by-side around the table, ~30% overlapping utterances."
    input:
      - "items_mono/untrimmed/m1s2b_Talker1.wav"
      - "items_mono/untrimmed/f3s5a_Talker2.wav"
      - "items_mono/music/Track066.wav"
    azimuth: ["80:1:20 + 360", "80:1:20 + 360", -30]
    elevation: [10, 60, 70]
    level: [-26, -26, -41]
    shift: [0.0, 0.0, 0.0]

  "04":
    output: "out/VA_3obj_2tlks_music4.wav"
    description: "Two talkers walking around the table in opposite directions, ~30% overlapping utterances."
    input:
      - "items_mono/untrimmed/m4s12b_Talker1.wav"
      - "items_mono/untrimmed/f1s12b_Talker2.wav"
      - "items_mono/music/Sample02.wav"
    azimuth: ["60:1:0 + 360", "60:-1:120 - 360", 100]
    elevation: [20, 50, 70]
    level: [-26, -26, -41]
    shift: [0.0, 0.0, 0.0]

  "05":
    output: "out/VA_3obj_3tlks_1.wav"
    description: "Three static talkers, partially overlapping utterances."
    input:
      - "items_mono/untrimmed/m4s12b_Talker1.wav"
      - "items_mono/untrimmed/f1s12b_Talker2.wav"
      - "items_mono/untrimmed/m3s1a_Talker2.wav"
    azimuth: [30, -45, 100]
    elevation: [20, 20, 30]
    level: [-26, -26, -26]
    shift: [0.0, 0.0, -2.5]

  "06":
    output: "out/VA_3obj_3tlks_2.wav"
    description: "One walking talker, two static talkers, non-overlapping utterances."
    input:
      - "items_mono/untrimmed/f2s5a_Talker1.wav"
      - "items_mono/untrimmed/m2s16b_Talker2.wav"
      - "items_mono/untrimmed/m3s8b_Talker2.wav"
    azimuth: ["-20:0.5:360", 60, -45]
    elevation: [10, 10, 10]
    level: [-26, -26, -26]
    shift: [0.0, 0.0, -3.0]

  "07":
    output: "out/VA_3obj_3tlks_3.wav"
    description: "Two moving talkers, one static talker, partially overlapping utterances."
    input:
      - "items_mono/untrimmed/f1s16b_Talker2.wav"
      - "items_mono/untrimmed/m4s16a_Talker1.wav"
      - "items_mono/untrimmed/f3s10b_Talker2.wav"
    azimuth: [-90, "0:1:360", "0:-1:-360"]
    elevation: [0, 30, 30]
    level: [-26, -26, -26]
    shift: [0.0, 0.0, -3.0]

  "08":
    output: "out/VA_3obj_3tlks_4.wav"
    description: "Three walking talkers, partially overlapping utterances."
    input:
      - "items_mono/untrimmed/f5s15b_Talker1.wav"
      - "items_mono/untrimmed/m3s1a_Talker2.wav"
      - "items_mono/untrimmed/m2s17b_Talker2.wav"
    azimuth: ["-90:-1:-360", "-10:1.5:360", "70:1:360"]
    elevation: [0, 20, 0]
    level: [-26, -26, -26]
    shift: [0.0, 0.0, -3.5]
+154 −0
Original line number Diff line number Diff line
---
################################################
# Item generation - General configuration
################################################

### Any relative paths will be interpreted relative to the working directory the script is called from!
### Usage of absolute paths is recommended.
### Do not use file names with dots "." in them! This is not supported, use "_" instead
### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions

### Output format
format: "FOA"
### Additional options, disabled in this example
# masa_tc: 2
# masa_dirs: 2
# sba_order: 2

### Output sampling rate in Hz
fs: 48000

### Generate BINAURAL output (_BINAURAL will be appended to the output filename)
binaural_output: true

### Normalize target loudness to X LKFS
loudness: -26

### Apply pre-amble and post-amble in X seconds
preamble: 0.5
postamble: 1.0

### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
add_low_level_random_noise: false

### Process with parallel streams (true/false)
multiprocessing: false

################################################
### Item generation - Filename conventions
################################################

### Naming convention for the input mono files
### The input filenames are represented by:
###   lLLeeettszz.wav
### where: 
###   l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) 
###   LL stands for the language: JP, FR, GE, MA, DA, EN
###   eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
###   tt stands for the talker ID: f1, f2, f3, m1, m2, m3
###   s stands for 'sample' and zz is the sample number; 01, ..., 14

### Naming convention for the generated output files
### The output filenames are represented by:
###   leeeayszz.wav
### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by:
###   leeeayszz.met for metadata-assisted spatial audio
###   leeeayszz.wav.o.csv for object-based audio
### where: 
###   l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) 
###   eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
###   a stands for 'audio'
###   y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06
###   s stands for sample and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample)
###   o stands for the object number; 0, 1, 2, 3

### File designators, default is "l" for listening lab, "EN" for language, "p07" for experiment and "g" for company
### In this example, the lab, language and experiment designators are set to non-default values; "provider" keeps the stated default.
listening_lab: "b"
language: "GE"
exp: "p02"
provider: "g"

### Insert prefix for all input filenames (default: "")
### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment' 
### the number of consecutive letters defines the length of each field
# use_input_prefix: "lLLeee"

### Insert prefix for all output filenames (default: "")
### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment'
### the number of consecutive letters defines the length of each field
use_output_prefix: "leee"

################################################
### Item generation - Scene description
################################################

### Each scene shall be described using the following parameters/properties:
###   output:      output filename
###   description: textual description of the scene
###   input:       input filename(s)
###   IR:          filenames(s) of the input IRs 
###   azimuth:     azimuth in the range [-180,180]; positive values point to the left
###   elevation:   elevation in the range [-90,90]; positive values indicate up
###   shift:       time adjustment of the input signal (negative value delays the signal)
###
### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder)
### Note 1: use brackets [val1, val2, ...] when specifying multiple values 
### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames
### Note 3: we're using right-handed coordinate system with azimuth = 0 pointing from the nose to the screen


scenes:
  # Every scene below lists two input files, two IR files and two shift
  # values (presumably index-aligned with `input` - confirm against the
  # generation script).

  "01":
    output: "out/s01.wav"
    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
    input:
      - "items_mono/untrimmed/f1s4b_Talker2.wav"
      - "items_mono/untrimmed/f2s1a_Talker1.wav"
    IR:
      - "IRs/IR_do_p04_e_01_01_FOA.wav"
      - "IRs/IR_do_p04_e_02_01_FOA.wav"
    shift: [0.0, -1.0]

  "02":
    output: "out/s02.wav"
    description: "Car with AB microphone pickup, overlap between the talkers, car noise."
    input:
      - "items_mono/untrimmed/f1s6a_Talker2.wav"
      - "items_mono/untrimmed/f2s3b_Talker1.wav"
    IR:
      - "IRs/IR_do_p04_e_03_01_FOA.wav"
      - "IRs/IR_do_p04_e_04_01_FOA.wav"
    shift: [0.0, +1.0]

  "03":
    output: "out/s03.wav"
    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
    input:
      - "items_mono/untrimmed/f3s3a_Talker2.wav"
      - "items_mono/untrimmed/f3s10b_Talker2.wav"
    IR:
      - "IRs/IR_do_p04_e_05_01_FOA.wav"
      - "IRs/IR_do_p04_e_06_01_FOA.wav"
    shift: [0.0, -1.0]

  "04":
    output: "out/s04.wav"
    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
    input:
      - "items_mono/untrimmed/f2s7b_Talker1.wav"
      - "items_mono/untrimmed/f5s15a_Talker1.wav"
    IR:
      - "IRs/IR_do_p04_e_07_01_FOA.wav"
      - "IRs/IR_do_p04_e_08_01_FOA.wav"
    shift: [0.0, -1.0]

  "05":
    output: "out/s05.wav"
    description: "Car with AB microphone pickup, no overlap between the talkers, car noise."
    input:
      - "items_mono/untrimmed/m2s15a_Talker2.wav"
      - "items_mono/untrimmed/m1s4a_Talker1.wav"
    IR:
      - "IRs/IR_do_p04_e_07_01_FOA.wav"
      - "IRs/IR_do_p04_e_01_01_FOA.wav"
    shift: [0.0, -1.0]

  "06":
    output: "out/s06.wav"
    description: "Car with AB microphone pickup, no overlap between the talkers."
    input:
      - "items_mono/untrimmed/m3s8a_Talker2.wav"
      - "items_mono/untrimmed/m4s13a_Talker1.wav"
    IR:
      - "IRs/IR_do_p04_e_03_01_FOA.wav"
      - "IRs/IR_do_p04_e_01_01_FOA.wav"
    shift: [0.0, -1.0]

  "07":
    output: "out/s07.wav"
    description: "Preliminary: Car with AB microphone pickup, no overlap between the talkers."
    input:
      - "items_mono/untrimmed/f1s20a_Talker2.wav"
      - "items_mono/untrimmed/f5s15b_Talker1.wav"
    IR:
      - "IRs/IR_do_p04_e_02_01_FOA.wav"
      - "IRs/IR_do_p04_e_07_01_FOA.wav"
    shift: [0.0, -1.0]

  "08":
    output: "out/s08.wav"
    description: "Car with AB microphone pickup, overlap between the talkers."
    input:
      - "items_mono/untrimmed/m2s6b_Talker2.wav"
      - "items_mono/untrimmed/f5s14a_Talker1.wav"
    IR:
      - "IRs/IR_do_p04_e_08_01_FOA.wav"
      - "IRs/IR_do_p04_e_04_01_FOA.wav"
    shift: [0.0, +1.0]
+170 −0
Original line number Diff line number Diff line
---
################################################
# Item generation - General configuration
################################################

### Any relative paths will be interpreted relative to the working directory the script is called from!
### Usage of absolute paths is recommended.
### Do not use file names with dots "." in them! This is not supported, use "_" instead
### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions

### Output format
format: "OMASA"
masa_tc: 2
masa_dirs: 2
### Additional option, disabled in this example
# sba_order: 2

### Output sampling rate in Hz
fs: 48000

### Generate BINAURAL output (_BINAURAL will be appended to the output filename)
binaural_output: true

### Normalize target loudness to X LKFS (disabled in this example)
# loudness: -26

### Apply pre-amble and post-amble in X seconds
preamble: 0.0
postamble: 0.0

### Add low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
add_low_level_random_noise: true

### Process with parallel streams (true/false)
multiprocessing: false

################################################
### Item generation - Filename conventions
################################################

### Naming convention for the input mono files
### The input filenames are represented by:
###   lLLeeettszz.wav
### where: 
###   l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) 
###   LL stands for the language: JP, FR, GE, MA, DA, EN
###   eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
###   tt stands for the talker ID: f1, f2, f3, m1, m2, m3
###   s stands for 'sample' and zz is the sample number; 01, ..., 14

### Naming convention for the generated output files
### The output filenames are represented by:
###   leeeayszz.wav
### The filenames of the accompanying output metadata files (applicable to metadata-assisted spatial audio, object-based audio) are represented by:
###   leeeayszz.met for metadata-assisted spatial audio
###   leeeayszz.wav.o.csv for object-based audio
### where: 
###   l stands for the listening lab designator: a (Force Technology), b (HEAD acoustics), c (MQ University), d (Mesaqin.com) 
###   eee stands for the experiment designator: p01, p02, p04, p05, p06, p07, p08, p09
###   a stands for 'audio'
###   y is the per-experiment category according to IVAS-8a: 01, 02, 03, 04, 05, 06
###   s stands for sample and zz is the sample number; 01, 02, 03, 04, 05, 06, 07 (07 is the preliminary sample)
###   o stands for the object number; 0, 1, 2, 3

### File designators, default is "l" for listening lab, "EN" for language, "p07" for experiment and "g" for company
### In this example, "exp" and "provider" are set to values other than the stated defaults.
listening_lab: "l"
language: "EN"
exp: "p01"
provider: "va"

### Insert prefix for all input filenames (default: "")
### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment' 
### the number of consecutive letters defines the length of each field
# use_input_prefix: "lLLeee"

### Insert prefix for all output filenames (default: "")
### l stands for the 'listening_lab' designator, L stands for the 'language', e stands for the 'experiment' 
### the number of consecutive letters defines the length of each field
# use_output_prefix: "leee"

################################################
### Item generation - Scene description
################################################

### Each scene shall be described using the following parameters/properties:
###   output:      output filename
###   description: textual description of the scene
###   input:       input filename(s)
###   azimuth:     azimuth in the range [-180,180]; positive values point to the left
###   elevation:   elevation in the range [-90,90]; positive values indicate up
###   shift:       time adjustment of the input signal (negative value delays the signal)
###
### Note 0: you can use relative paths in filenames (the program assumes that the root directory is the parent directory of the ivas_processing_scripts subfolder)
### Note 1: use brackets [val1, val2, ...] when specifying multiple values 
### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames
### Note 3: we're using right-handed coordinate system with azimuth = 0 pointing from the nose to the screen

scenes:
    # Every scene below lists four input files: the first comes from
    # items_hoa2/, the remaining three from items_mono/ or music/. The azimuth,
    # elevation, level and shift lists each carry four entries (presumably
    # index-aligned with `input` - confirm against the generation script).
    #
    # Each scene writes to its own output file. The output names are numbered
    # so that no two scenes share the same target path.

    "01":
        output: "out/VA_3tlks_music1.wav"
        description: "Three talkers over music background"
        input: ["items_hoa2/bm7aa1s01.wav", "items_mono/untrimmed/m4s12b_Talker1.wav", "items_mono/untrimmed/f1s6a_Talker2.wav", "items_mono/untrimmed/m3s1a_Talker2.wav"]
        azimuth: [0, 30, -45, 100]
        elevation: [0, 20, 20, 30]
        level: [-36, -26, -26, -26]
        shift: [0.0, 0.0, 0.0, -2.0]

    "02":
        output: "out/VA_3tlks_music2.wav"
        description: "Three talkers over music background"
        input: ["items_hoa2/bm7aa1s03.wav", "items_mono/untrimmed/f2s5a_Talker1.wav", "items_mono/untrimmed/f5s10a_Talker1.wav", "items_mono/untrimmed/m3s8b_Talker2.wav"]
        azimuth: [0, "-20:0.5:360", "60:-0.5:-360", 60]
        elevation: [0, 10, 10, 10]
        level: [-46, -26, -26, -26]
        shift: [0.0, 0.0, -2.0, -2.5]

    "03":
        output: "out/VA_3tlks_music3.wav"
        description: "Three talkers over music background"
        input: ["items_hoa2/bm7aa1s05.wav", "items_mono/untrimmed/f1s16b_Talker2.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f3s10b_Talker2.wav"]
        azimuth: [0, -90, "0:1:360", "0:-1:-360"]
        elevation: [0, 0, 30, 30]
        level: [-36, -26, -26, -26]
        shift: [0.0, 0.0, 0.0, -2.6]

    "04":
        output: "out/VA_3tlks_music4.wav"
        description: "Three talkers over music background"
        input: ["items_hoa2/bm7aa1s07.wav", "items_mono/untrimmed/f5s15b_Talker1.wav", "items_mono/untrimmed/m1s7a_Talker1.wav", "items_mono/untrimmed/m1s6b_Talker1.wav"]
        azimuth: [0, "-90:-1:-360", "-10:1.5:360", "70:1:360"]
        elevation: [0, 0, 20, 0]
        level: [-46, -26, -36, -26]
        shift: [0.0, -2.0, 0.0, -3.5]

    "05":
        output: "out/VA_2tlks_1obj_music1.wav"
        description: "Two talkers, one musical object over music background"
        input: ["items_hoa2/bm7aa1s09.wav", "items_mono/untrimmed/f2s1a_Talker1.wav", "items_mono/untrimmed/f2s5a_Talker1.wav", "music/item_lxa3s3.48k.wav"]
        azimuth: [0, 20, -40, 45]
        elevation: [0, 0, 0, 70]
        level: [-36, -36, -26, -41]
        shift: [0.0, 0.0, -2.0, 0.0]

    "06":
        output: "out/VA_2tlks_1obj_music2.wav"
        description: "Two talkers, one musical object over music background"
        input: ["items_hoa2/bm7aa1s11.wav", "items_mono/untrimmed/f5s10b_Talker1.wav", "items_mono/untrimmed/m1s4a_Talker1.wav", "music/item_lxa3s5.48k.wav"]
        azimuth: [0, 50, "180:1:360", -120]
        elevation: [0, 0, 45, 70]
        level: [-46, -26, -26, -41]
        shift: [0.0, 0.0, -2.5, 0.0]

    "07":
        output: "out/VA_2tlks_1obj_music3.wav"
        description: "Two talkers, one musical object over music background"
        input: ["items_hoa2/bm7aa1s13.wav", "items_mono/untrimmed/m1s2b_Talker1.wav", "items_mono/untrimmed/f3s5a_Talker2.wav", "music/641692__theflyfishingfilmmaker__classical-violin-minor-10s-mono.wav"]
        azimuth: [0, "80:1:20 + 360", "80:1:20 + 360", -30]
        elevation: [0, 10, 60, 70]
        level: [-36, -26, -26, -36]
        shift: [0.0, 0.0, 0.0, 0.0]

    "08":
        output: "out/VA_2tlks_1obj_music4.wav"
        description: "Two talkers, one musical object over music background"
        input: ["items_hoa2/bm7aa1s15.wav", "items_mono/untrimmed/m4s16a_Talker1.wav", "items_mono/untrimmed/f2s4a_Talker1.wav", "music/item_lxa4s2.48k.wav"]
        azimuth: [0, "60:1:0 + 360", "60:-1:120 - 360", 100]
        elevation: [0, 20, 50, 70]
        level: [-46, -26, -26, -41]
        shift: [0.0, 0.0, -1.0, -0.5]
        
+170 −0

File added.

Preview size limit exceeded, changes collapsed.

+154 −0

File added.

Preview size limit exceeded, changes collapsed.