Commit 086b2309 authored by Vladimir Malenovsky's avatar Vladimir Malenovsky
Browse files

support delay of mono items to create some overlap

parent 81628b69
Loading
Loading
Loading
Loading
Loading
+0 −36
Original line number Diff line number Diff line
@@ -54,7 +54,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 0 
        elevation: 0 
        delay: 0 
        
    a2: 
        name: "G6S2.wav"
@@ -62,7 +61,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 60 
        elevation: 0 
        delay: 0 
        
    a3: 
        name: "G5S3.wav"
@@ -70,7 +68,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 120 
        elevation: 0 
        delay: 0 

    a4: 
        name: "G4S4.wav"
@@ -78,7 +75,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 180 
        elevation: 0 
        delay: 0 

    a5: 
        name: "G3S5.wav"
@@ -86,7 +82,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 240 
        elevation: 0 
        delay: 0 

    a6: 
        name: "G2S6.wav"
@@ -94,7 +89,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 300 
        elevation: 0 
        delay: 0 

    b1: 
        name: "G2S1.wav"
@@ -102,7 +96,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 120 
        elevation: 35 
        delay: 0 
 
    b2: 
        name: "G1S2.wav"
@@ -110,7 +103,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 180 
        elevation: 35 
        delay: 0 
 
    b3: 
        name: "G6S3.wav"
@@ -118,7 +110,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 240 
        elevation: 35 
        delay: 0 
 
    b4: 
        name: "G5S4.wav"
@@ -126,7 +117,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 300 
        elevation: 35 
        delay: 0 

    b5: 
        name: "G4S5.wav"
@@ -134,7 +124,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 0 
        elevation: 35 
        delay: 0 

    b6: 
        name: "G3S6.wav"
@@ -142,7 +131,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 60 
        elevation: 35 
        delay: 0 

    c1: 
        name: "G3S1.wav"
@@ -150,7 +138,6 @@ scenes:
        source: "test_single.wav"
        azimuth: "0:1:360"
        elevation: 0 
        delay: 0 

    c2: 
        name: "G2S2.wav"
@@ -158,7 +145,6 @@ scenes:
        source: "test_single.wav"
        azimuth: "60:1:60+360" 
        elevation: 0 
        delay: 0 
  
    c3: 
        name: "G1S3.wav"
@@ -166,7 +152,6 @@ scenes:
        source: "test_single.wav"
        azimuth: "120:1:120+360" 
        elevation: 0 
        delay: 0 
  
    c4: 
        name: "G6S4.wav"
@@ -174,7 +159,6 @@ scenes:
        source: "test_single.wav"
        azimuth: "180:1:180+360" 
        elevation: 0 
        delay: 0 
  
    c5: 
        name: "G5S5.wav"
@@ -182,7 +166,6 @@ scenes:
        source: "test_single.wav"
        azimuth: "240:1:240+360"
        elevation: 0 
        delay: 0 
  
    c6: 
        name: "G4S6.wav"
@@ -190,7 +173,6 @@ scenes:
        source: "test_single.wav"
        azimuth: "300:1:300+360" 
        elevation: 0 
        delay: 0 
 
    d1: 
        name: "G4S1.wav"
@@ -198,7 +180,6 @@ scenes:
        source: "test_single.wav"
        azimuth: "0:-1:-360"
        elevation: 35 
        delay: 0 
        
    d2: 
        name: "G3S2.wav"
@@ -206,7 +187,6 @@ scenes:
        source: "test_single.wav"
        azimuth: "60:-1:60-360" 
        elevation: 35 
        delay: 0 
        
    d3: 
        name: "G2S3.wav"
@@ -214,7 +194,6 @@ scenes:
        source: "test_single.wav"
        azimuth: "120:-1:120-360" 
        elevation: 35 
        delay: 0 
 
    d4: 
        name: "G1S4.wav"
@@ -222,7 +201,6 @@ scenes:
        source: "test_single.wav"
        azimuth: "180:-1:180-360" 
        elevation: 35 
        delay: 0 
 
    d5: 
        name: "G6S5.wav"
@@ -230,7 +208,6 @@ scenes:
        source: "test_single.wav"
        azimuth: "240:-1:240-360"
        elevation: 35 
        delay: 0 
 
    d6: 
        name: "G5S6.wav"
@@ -238,7 +215,6 @@ scenes:
        source: "test_single.wav"
        azimuth: "300:-1:300-360" 
        elevation: 35
        delay: 0 
 
    e1: 
        name: "G5S1.wav"
@@ -246,7 +222,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 240 
        elevation: "-90:0.5:90" 
        delay: 0 
 
    e2: 
        name: "G4S2.wav"
@@ -254,7 +229,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 300 
        elevation: 0 
        delay: 0 
        
    e3: 
        name: "G3S3.wav"
@@ -262,7 +236,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 0 
        elevation: "-90:0.5:90"  
        delay: 0 
  
    e4: 
        name: "G2S4.wav"
@@ -270,7 +243,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 60 
        elevation: "-90:0.5:90"  
        delay: 0 
  
    e5: 
        name: "G1S5.wav"
@@ -278,7 +250,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 120 
        elevation: "-90:0.5:90"  
        delay: 0 
  
    e6: 
        name: "G6S6.wav"
@@ -286,7 +257,6 @@ scenes:
        source: "test_single.wav"
        azimuth: 180 
        elevation: "-90:0.5:90"  
        delay: 0 
 
    f1: 
        name: "G6S1.wav"
@@ -294,7 +264,6 @@ scenes:
        source: "test_single.wav"
        azimuth: "60:0.5:60+180" 
        elevation: "35:-0.2:-35"
        delay: 0 
 
    f2: 
        name: "G5S2.wav"
@@ -302,7 +271,6 @@ scenes:
        source: "test_single.wav"
        azimuth: "120:0.5:120+180" 
        elevation: "35:-0.2:-35" 
        delay: 0 
  
    f3: 
        name: "G4S3.wav"
@@ -310,7 +278,6 @@ scenes:
        source: "test_single.wav"
        azimuth: "180:0.5:180+180" 
        elevation: "35:-0.2:-35" 
        delay: 0 
  
    f4: 
        name: "G3S4.wav"
@@ -318,7 +285,6 @@ scenes:
        source: "test_single.wav"
        azimuth: "240:0.5:240+180" 
        elevation: "35:-0.2:-35"
        delay: 0 
  
    f5: 
        name: "G2S5.wav"
@@ -326,7 +292,6 @@ scenes:
        source: "test_single.wav"
        azimuth: "300:0.5:300+180" 
        elevation: "35:-0.2:-35" 
        delay: 0 
  
    f6: 
        name: "G1S6.wav"
@@ -334,5 +299,4 @@ scenes:
        source: "test_single.wav"
        azimuth: "0:0.5:0+180" 
        elevation: "35:-0.2:-35" 
        delay: 0 
  
 No newline at end of file
+73 −72
Original line number Diff line number Diff line
@@ -37,6 +37,7 @@ loudness: -26
### Each scene must start with the sceneN tag
### Specify the mono source filename (the program will search for it in the input_path folder)
### Specify azimuth and elevation for each input source
### Specify the delay in seconds for each input source
### Note 1: use [val1, val2, ...] for multiple sources in a scene
### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames

@@ -51,288 +52,288 @@ scenes:
    a1: 
        name: "G1S1.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["f2s5a_Talker1.wav", "m2s16b_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [0, 50]
        elevation: [0, 0]
        delay: [0, 0]
        delay: [0, 1]
        
    a2: 
        name: "G6S2.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["f5s10a_Talker1.wav", "m3s2a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [50, 350]
        elevation: [0, 0]
        delay: [0, 0]
        delay: [0, 1]
        
    a3: 
        name: "G5S3.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["f2s5a_Talker1.wav", "m2s16b_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [40, 290]
        elevation: [0, 0]
        delay: [0, 0]
        delay: [0, 1]

    a4: 
        name: "G4S4.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["m4s11b_Talker1.wav", "f1s4b_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [30, 230]
        elevation: [15, 15]
        delay: [0, 0]
        delay: [0, 1]

    a5: 
        name: "G3S5.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["m1s4a_Talker1.wav", "f3s3a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [20, 170]
        elevation: [15, 15]
        delay: [0, 0]
        delay: [0, 1]

    a6: 
        name: "G2S6.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["f5s10a_Talker1.wav", "m3s2a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [10, 110]
        elevation: [15, 15]
        delay: [0, 0]
        delay: [0, 1]

    b1: 
        name: "G2S1.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["f5s10b_Talker1.wav", "m3s2b_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [20, 170]
        elevation: [30, 30]
        delay: [0, 0]
        delay: [0, 1]
 
    b2: 
        name: "G1S2.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["f2s1a_Talker1.wav", "m2s10a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [10, 110]
        elevation: [30, 30]
        delay: [0, 0]
        delay: [0, 1]
 
    b3: 
        name: "G6S3.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["f5s10b_Talker1.wav", "m3s2b_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [0, 50]
        elevation: [30, 30]
        delay: [0, 0]
        delay: [0, 1]
 
    b4: 
        name: "G5S4.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["f2s1a_Talker1.wav", "m2s10a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [50, 350]
        elevation: [60, 60]
        delay: [0, 0] 
        delay: [0, 1] 

    b5: 
        name: "G4S5.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["m4s11a_Talker1.wav", "f1s6a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [40, 290]
        elevation: [60, 60]
        delay: [0, 0] 
        delay: [0, 1] 

    b6: 
        name: "G3S6.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["m1s2b_Talker1.wav", "f3s5a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [30, 230]
        elevation: [60, 60]
        delay: [0, 0] 
        delay: [0, 1] 

    c1: 
        name: "G3S1.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["m1s6b_Talker1.wav", "f3s5b_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [40, 290]
        elevation: [0, 60]
        delay: [0, 0] 
        delay: [0, 1] 

    c2: 
        name: "G2S2.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["f5s14a_Talker1.wav", "m3s8a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [30, 230]
        elevation: [0, 60]
        delay: [0, 0] 
        delay: [0, 1] 
  
    c3: 
        name: "G1S3.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["f2s6a_Talker1.wav", "m2s13a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [20, 170]
        elevation: [0, 60]
        delay: [0, 0]   
        delay: [0, 1]   
  
    c4: 
        name: "G6S4.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["f5s14a_Talker1.wav", "m3s8a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [10, 110]
        elevation: [0, 60]
        delay: [0, 0]     
        delay: [0, 1]     
  
    c5: 
        name: "G5S5.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["f2s6a_Talker1.wav", "m2s13a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [0, 50]
        elevation: [0, 60]
        delay: [0, 0]     
        delay: [0, 1]     
  
    c6: 
        name: "G4S6.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["m4s13a_Talker1.wav", "f1s20a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [50, 350]
        elevation: [0, 60]
        delay: [0, 0]      
        delay: [0, 1]      
 
    d1: 
        name: "G4S1.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["m4s12b_Talker1.wav", "f1s12b_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [50, "180:1:120 + 360"]
        elevation: [0, 60]
        delay: [0, 0]   
        delay: [0, 1]   
        
    d2: 
        name: "G3S2.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["m1s12a_Talker1.wav", "f3s20a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [300, "-70:-1:-10 - 360"]
        elevation: [0, 60]
        delay: [0, 0]   
        delay: [0, 1]   
        
    d3: 
        name: "G2S3.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["f5s15b_Talker1.wav", "m3s1a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [250, "-20:-1:-320"]
        elevation: [0, 60]
        delay: [0, 0]          
        delay: [0, 1]          
 
    d4: 
        name: "G1S4.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["f2s3b_Talker1.wav", "m2s15a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [200, "30:-1:-270"]
        elevation: [0, 60]
        delay: [0, 0]  
        delay: [0, 1]  
 
    d5: 
        name: "G6S5.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["f5s15b_Talker1.wav", "m3s1a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [150, "80:1:20 + 360"]
        elevation: [0, 60]
        delay: [0, 0]   
        delay: [0, 1]   
 
    d6: 
        name: "G5S6.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["f2s3b_Talker1.wav", "m2s15a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: [100, "130:1:70 + 360"]
        elevation: [0, 60]
        delay: [0, 0]   
        delay: [0, 1]   
 
    e1: 
        name: "G5S1.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["f2s4a_Talker1.wav", "m2s17b_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: ["80:1:20 + 360", "80:1:20 + 360"]
        elevation: [10, 60]
        delay: [0, 0]
        delay: [0, 1]
 
    e2: 
        name: "G4S2.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["m4s16a_Talker1.wav", "f1s16b_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: ["130:1:70 + 360", "130:1:70 + 360"]
        elevation: [10, 60]
        delay: [0, 0]    
        delay: [0, 1]    
        
    e3: 
        name: "G3S3.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["m1s16b_Talker1.wav", "f3s10b_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: ["180:1:120 + 360", "180:1:120 + 360"]
        elevation: [10, 60]
        delay: [0, 0]            
        delay: [0, 1]            
  
    e4: 
        name: "G2S4.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["f5s19a_Talker1.wav", "m3s1b_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: ["-70:-1:-10 - 360", "-70:-1:-10 - 360"]
        elevation: [10, 60]
        delay: [0, 0]    
        delay: [0, 1]    
  
    e5: 
        name: "G1S5.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["f2s4a_Talker1.wav", "m2s17b_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: ["-20:-1:-320", "-20:-1:-320"]
        elevation: [10, 60]
        delay: [0, 0]   
        delay: [0, 1]   
  
    e6: 
        name: "G6S6.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["f5s19a_Talker1.wav", "m3s1b_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: ["30:-1:-270", "30:-1:-270"]
        elevation: [10, 60]
        delay: [0, 0]     
        delay: [0, 1]     
 
    f1: 
        name: "G6S1.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["f5s15a_Talker1.wav", "m3s8b_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: ["60:1:0 + 360", "60:-1:120 - 360"]
        elevation: [20, 50]
        delay: [0, 0]    
        delay: [0, 1]    
 
    f2: 
        name: "G5S2.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["f2s7b_Talker1.wav", "m2s6b_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: ["0:1:300", "0:-1:60 - 360"]
        elevation: [20, 50]
        delay: [0, 0]   
        delay: [0, 1]   
  
    f3: 
        name: "G4S3.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["m4s14a_Talker1.wav", "f1s7a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: ["300:1:240 + 360", "300:-1:0"]
        elevation: [20, 50]
        delay: [0, 0]     
        delay: [0, 1]     
  
    f4: 
        name: "G3S4.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["m1s7a_Talker1.wav", "f3s7a_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: ["240:1:180 + 360", "240:-1:-60"]
        elevation: [20, 50]
        delay: [0, 0]  
        delay: [0, 1]  
  
    f5: 
        name: "G2S5.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["f5s15a_Talker1.wav", "m3s8b_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: ["180:1:120 + 360", "180:-1:-120"]
        elevation: [20, 50]
        delay: [0, 0]    
        delay: [0, 1]    
  
    f6: 
        name: "G1S6.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["f2s7b_Talker1.wav", "m2s6b_Talker2.wav"]
        source: ["test_double.wav", "test_double.wav"]
        azimuth: ["120:1:60 + 360", "120:-1:180 - 360"]
        elevation: [20, 50]
        delay: [0, 0]      
        delay: [0, 1]      
  
 No newline at end of file
+7 −2
Original line number Diff line number Diff line
@@ -72,10 +72,15 @@ def generate_ism_items(
        y = None
        y_meta = None
        for i in range(N_sources):
        
            # parse parameters from the scene description
            source_file = np.atleast_1d(scene["source"])[i]
            source_azi = np.atleast_1d(scene["azimuth"])[i]
            source_ele = np.atleast_1d(scene["elevation"])[i]
            if 'delay' in scene.keys():
                source_delay = np.atleast_1d(scene["delay"])[i]
            else:
                source_delay = np.array([0])
            
            logger.info(
                f"Encoding {source_file} at position(s) {source_azi},{source_ele}"