Commit 5d8959b5 authored by malenovsky's avatar malenovsky
Browse files

Merge branch 'item_generation_scripts' into 'main'

Item generation scripts

See merge request !32
parents 7075ceac bfa648d4
Loading
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -6,7 +6,6 @@ venv/
.vscode/
.idea/
.DS_Store
*.wav
!tests/data/**/*.wav
*.pcm
*.bs
+303 −0
Original line number Diff line number Diff line
---
################################################
# General configuration
################################################

### Output format
format: "ISM1"

### Output sampling rate in Hz needed for headerless audio files; default = 48000
fs: 48000

### Any relative paths will be interpreted relative to the working directory the script is called from!
### Usage of absolute paths is recommended.
### Do not use file names with dots "." in them! This is not supported, use "_" instead
### For Windows users: please use a double backslash '\\' in paths and add '.exe' to executable definitions

### Input path to mono files
input_path: "./items_mono"

### Output path for generated test items and metadata files
output_path: "./items_ISM1"

### Target loudness in LKFS; default = null (no loudness normalization applied)
loudness: -26

### Pre-amble and Post-amble length in seconds (default = 0.0)
preamble: 0.5
postamble: 0.5

### Flag for adding low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
add_low_level_random_noise: true


################################################
### Scene description
################################################

### Each scene must start with a unique scene tag (e.g. a1, b2)
### Specify the mono source filename (the program will search for it in the input_path folder)
### Specify azimuth and elevation for each input source
### Note 1: use [val1, val2, ...] for multiple sources in a scene
### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames

### Note 3: we're using a right-handed coordinate system with azi = 0 pointing from the nose to the screen
### azimuth: float, [-180,180]; positive indicates left
### elevation: float, [-90,90]; positive indicates up
### distance: float, tbd: default: 1
### spread: float, [0,360]; spread angle in degrees (0 ... 360°)
### gain: float, [0,1]

scenes:
    a1: 
        name: "G1S1.wav"
        description: "Talker sitting at a table"
        source: "test_single.wav"
        azimuth: 0 
        elevation: 0 
        
    a2: 
        name: "G6S2.wav"
        description: "Talker sitting at a table"
        source: "test_single.wav"
        azimuth: 60 
        elevation: 0 
        
    a3: 
        name: "G5S3.wav"
        description: "Talker sitting at a table"
        source: "test_single.wav"
        azimuth: 120 
        elevation: 0 

    a4: 
        name: "G4S4.wav"
        description: "Talker sitting at a table"
        source: "test_single.wav"
        azimuth: 180 
        elevation: 0 

    a5: 
        name: "G3S5.wav"
        description: "Talker sitting at a table"
        source: "test_single.wav"
        azimuth: 240 
        elevation: 0 

    a6: 
        name: "G2S6.wav"
        description: "Talker sitting at a table"
        source: "test_single.wav"
        azimuth: 300 
        elevation: 0 

    b1: 
        name: "G2S1.wav"
        description: "standing talker."
        source: "test_single.wav"
        azimuth: 120 
        elevation: 35 
 
    b2: 
        name: "G1S2.wav"
        description: "standing talker."
        source: "test_single.wav"
        azimuth: 180 
        elevation: 35 
 
    b3: 
        name: "G6S3.wav"
        description: "standing talker."
        source: "test_single.wav"
        azimuth: 240 
        elevation: 35 
 
    b4: 
        name: "G5S4.wav"
        description: "standing talker."
        source: "test_single.wav"
        azimuth: 300 
        elevation: 35 

    b5: 
        name: "G4S5.wav"
        description: "standing talker."
        source: "test_single.wav"
        azimuth: 0 
        elevation: 35 

    b6: 
        name: "G3S6.wav"
        description: "standing talker."
        source: "test_single.wav"
        azimuth: 60 
        elevation: 35 

    c1: 
        name: "G3S1.wav"
        description: "Smaller talker (child) walking around a table."
        source: "test_single.wav"
        azimuth: "0:1:360"
        elevation: 0 

    c2: 
        name: "G2S2.wav"
        description: "Smaller talker (child) walking around a table."
        source: "test_single.wav"
        azimuth: "60:1:60+360" 
        elevation: 0 
  
    c3: 
        name: "G1S3.wav"
        description: "Smaller talker (child) walking around a table."
        source: "test_single.wav"
        azimuth: "120:1:120+360" 
        elevation: 0 
  
    c4: 
        name: "G6S4.wav"
        description: "Smaller talker (child) walking around a table."
        source: "test_single.wav"
        azimuth: "180:1:180+360" 
        elevation: 0 
  
    c5: 
        name: "G5S5.wav"
        description: "Smaller talker (child) walking around a table."
        source: "test_single.wav"
        azimuth: "240:1:240+360"
        elevation: 0 
  
    c6: 
        name: "G4S6.wav"
        description: "Smaller talker (child) walking around a table."
        source: "test_single.wav"
        azimuth: "300:1:300+360" 
        elevation: 0 
 
    d1: 
        name: "G4S1.wav"
        description: "Talker walking around the table."
        source: "test_single.wav"
        azimuth: "0:-1:-360"
        elevation: 35 
        
    d2: 
        name: "G3S2.wav"
        description: "Talker walking around the table."
        source: "test_single.wav"
        azimuth: "60:-1:60-360" 
        elevation: 35 
        
    d3: 
        name: "G2S3.wav"
        description: "Talker walking around the table."
        source: "test_single.wav"
        azimuth: "120:-1:120-360" 
        elevation: 35 
 
    d4: 
        name: "G1S4.wav"
        description: "Talker walking around the table."
        source: "test_single.wav"
        azimuth: "180:-1:180-360" 
        elevation: 35 
 
    d5: 
        name: "G6S5.wav"
        description: "Talker walking around the table."
        source: "test_single.wav"
        azimuth: "240:-1:240-360"
        elevation: 35 
 
    d6: 
        name: "G5S6.wav"
        description: "Talker walking around the table."
        source: "test_single.wav"
        azimuth: "300:-1:300-360" 
        elevation: 35
 
    e1: 
        name: "G5S1.wav"
        description: "Elevation displacement."
        source: "test_single.wav"
        azimuth: 240 
        elevation: "-90:0.5:90" 
 
    # Scene e2: source at a fixed azimuth, sweeping in elevation.
    e2:
        name: "G4S2.wav"
        description: "Elevation displacement."
        source: "test_single.wav"
        azimuth: 300
        # Moving elevation ("start:step:stop"), matching the other
        # "Elevation displacement." scenes (e1, e3-e6); was a static 0.
        elevation: "-90:0.5:90"
        
    e3: 
        name: "G3S3.wav"
        description: "Elevation displacement."
        source: "test_single.wav"
        azimuth: 0 
        elevation: "-90:0.5:90"  
  
    e4: 
        name: "G2S4.wav"
        description: "Elevation displacement."
        source: "test_single.wav"
        azimuth: 60 
        elevation: "-90:0.5:90"  
  
    e5: 
        name: "G1S5.wav"
        description: "Elevation displacement."
        source: "test_single.wav"
        azimuth: 120 
        elevation: "-90:0.5:90"  
  
    e6: 
        name: "G6S6.wav"
        description: "Elevation displacement."
        source: "test_single.wav"
        azimuth: 180 
        elevation: "-90:0.5:90"  
 
    f1: 
        name: "G6S1.wav"
        description: "Azimuth and elevation displacement."
        source: "test_single.wav"
        azimuth: "60:0.5:60+180" 
        elevation: "35:-0.2:-35"
 
    f2: 
        name: "G5S2.wav"
        description: "Azimuth and elevation displacement."
        source: "test_single.wav"
        azimuth: "120:0.5:120+180" 
        elevation: "35:-0.2:-35" 
  
    f3: 
        name: "G4S3.wav"
        description: "Azimuth and elevation displacement."
        source: "test_single.wav"
        azimuth: "180:0.5:180+180" 
        elevation: "35:-0.2:-35" 
  
    f4: 
        name: "G3S4.wav"
        description: "Azimuth and elevation displacement."
        source: "test_single.wav"
        azimuth: "240:0.5:240+180" 
        elevation: "35:-0.2:-35"
  
    f5: 
        name: "G2S5.wav"
        description: "Azimuth and elevation displacement."
        source: "test_single.wav"
        azimuth: "300:0.5:300+180" 
        elevation: "35:-0.2:-35" 
  
    f6: 
        name: "G1S6.wav"
        description: "Azimuth and elevation displacement."
        source: "test_single.wav"
        azimuth: "0:0.5:0+180" 
        elevation: "35:-0.2:-35" 
  
 No newline at end of file
+339 −0
Original line number Diff line number Diff line
---
################################################
# General configuration
################################################

### Output format
format: "ISM2"

### Output sampling rate in Hz needed for headerless audio files; default = 48000
fs: 48000

### Any relative paths will be interpreted relative to the working directory the script is called from!
### Usage of absolute paths is recommended.
### Do not use file names with dots "." in them! This is not supported, use "_" instead
### For Windows users: please use a double backslash '\\' in paths and add '.exe' to executable definitions

### Input path to mono files
input_path: "./items_mono"

### Output path for generated test items and metadata files
output_path: "./items_ISM2"

### Target loudness in LKFS; default = null (no loudness normalization applied)
loudness: -26

### Pre-amble and Post-amble length in seconds (default = 0.0)
preamble: 1.0
postamble: 1.0

### Flag for adding low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
add_low_level_random_noise: true

################################################
### Scene description
################################################

### Each scene must start with a unique scene tag (e.g. a1, b2)
### Specify the mono source filename (the program will search for it in the input_path folder)
### Specify azimuth and elevation for each input source
### Specify the overlap length in seconds for each input source (negative value creates a gap)
### Note 1: use [val1, val2, ...] for multiple sources in a scene
### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames

### Note 3: we're using a right-handed coordinate system with azi = 0 pointing from the nose to the screen
### azimuth: float, [-180,180]; positive indicates left
### elevation: float, [-90,90]; positive indicates up
### distance: float, tbd: default: 1
### spread: float, [0,360]; spread angle in degrees (0 ... 360°)
### gain: float, [0,1]

scenes:
    a1: 
        name: "G1S1.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [0, 50]
        elevation: [0, 0]
        overlap: -1.0
        
    a2: 
        name: "G6S2.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [50, 350]
        elevation: [0, 0]
        overlap: -1.0
        
    a3: 
        name: "G5S3.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [40, 290]
        elevation: [0, 0]
        overlap: -1.0

    a4: 
        name: "G4S4.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [30, 230]
        elevation: [15, 15]
        overlap: -1.0

    a5: 
        name: "G3S5.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [20, 170]
        elevation: [15, 15]
        overlap: -1.0

    a6: 
        name: "G2S6.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [10, 110]
        elevation: [15, 15]
        overlap: -1.0

    b1: 
        name: "G2S1.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [20, 170]
        elevation: [30, 30]
        overlap: 1.0
 
    b2: 
        name: "G1S2.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [10, 110]
        elevation: [30, 30]
        overlap: 1.0
 
    b3: 
        name: "G6S3.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [0, 50]
        elevation: [30, 30]
        overlap: 1.0
 
    b4: 
        name: "G5S4.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [50, 350]
        elevation: [60, 60]
        overlap: 1.0 

    b5: 
        name: "G4S5.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [40, 290]
        elevation: [60, 60]
        overlap: 1.0 

    b6: 
        name: "G3S6.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [30, 230]
        elevation: [60, 60]
        overlap: 1.0 

    c1: 
        name: "G3S1.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [40, 290]
        elevation: [0, 60]
        overlap: -1.0 

    c2: 
        name: "G2S2.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [30, 230]
        elevation: [0, 60]
        overlap: -1.0 
  
    c3: 
        name: "G1S3.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [20, 170]
        elevation: [0, 60]
        overlap: -1.0   
  
    c4: 
        name: "G6S4.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [10, 110]
        elevation: [0, 60]
        overlap: -1.0     
  
    c5: 
        name: "G5S5.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [0, 50]
        elevation: [0, 60]
        overlap: -1.0     
  
    c6: 
        name: "G4S6.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [50, 350]
        elevation: [0, 60]
        overlap: -1.0      
 
    d1: 
        name: "G4S1.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [50, "180:1:120 + 360"]
        elevation: [0, 60]
        overlap: 1.0   
        
    d2: 
        name: "G3S2.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [300, "-70:-1:-10 - 360"]
        elevation: [0, 60]
        overlap: 1.0   
        
    d3: 
        name: "G2S3.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [250, "-20:-1:-320"]
        elevation: [0, 60]
        overlap: 1.0          
 
    d4: 
        name: "G1S4.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [200, "30:-1:-270"]
        elevation: [0, 60]
        overlap: 1.0  
 
    d5: 
        name: "G6S5.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [150, "80:1:20 + 360"]
        elevation: [0, 60]
        overlap: 1.0   
 
    d6: 
        name: "G5S6.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [100, "130:1:70 + 360"]
        elevation: [0, 60]
        overlap: 1.0   
 
    e1: 
        name: "G5S1.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["80:1:20 + 360", "80:1:20 + 360"]
        elevation: [10, 60]
        overlap: 1.0
 
    e2: 
        name: "G4S2.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["130:1:70 + 360", "130:1:70 + 360"]
        elevation: [10, 60]
        overlap: 1.0    
        
    e3: 
        name: "G3S3.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["180:1:120 + 360", "180:1:120 + 360"]
        elevation: [10, 60]
        overlap: 1.0            
  
    e4: 
        name: "G2S4.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["-70:-1:-10 - 360", "-70:-1:-10 - 360"]
        elevation: [10, 60]
        overlap: 1.0    
  
    e5: 
        name: "G1S5.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["-20:-1:-320", "-20:-1:-320"]
        elevation: [10, 60]
        overlap: 1.0   
  
    e6: 
        name: "G6S6.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["30:-1:-270", "30:-1:-270"]
        elevation: [10, 60]
        overlap: 1.0     
 
    f1: 
        name: "G6S1.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["60:1:0 + 360", "60:-1:120 - 360"]
        elevation: [20, 50]
        overlap: -1.0    
 
    f2: 
        name: "G5S2.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["0:1:300", "0:-1:60 - 360"]
        elevation: [20, 50]
        overlap: -1.0   
  
    f3: 
        name: "G4S3.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["300:1:240 + 360", "300:-1:0"]
        elevation: [20, 50]
        overlap: -1.0     
  
    f4: 
        name: "G3S4.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["240:1:180 + 360", "240:-1:-60"]
        elevation: [20, 50]
        overlap: -1.0  
  
    f5: 
        name: "G2S5.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["180:1:120 + 360", "180:-1:-120"]
        elevation: [20, 50]
        overlap: -1.0    
  
    f6: 
        name: "G1S6.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["120:1:60 + 360", "120:-1:180 - 360"]
        elevation: [20, 50]
        overlap: -1.0      
  
 No newline at end of file
+303 −0

File added.

Preview size limit exceeded, changes collapsed.

+5 −2
Original line number Diff line number Diff line
@@ -112,6 +112,7 @@ def write(
    filename: Union[str, Path],
    x: np.ndarray,
    fs: Optional[int] = 48000,
    dtype: Optional[str] = "int16",
) -> None:
    """
    Write audio file (.pcm, .wav or .raw)
@@ -124,6 +125,8 @@ def write(
        Numpy 2D array of dimension: number of channels x number of samples
    fs: Optional[int]
        Sampling rate, required for .pcm or .raw input file, default = 48000 (Hz)
    dtype: Optional[str]
        Data type format required for .pcm or .raw input file, default = 'int16'

    Returns
    -------
@@ -140,10 +143,10 @@ def write(
        x = np.clip(x, np.iinfo(np.int16).min, np.iinfo(np.int16).max)

    if file_extension == ".wav":
        x = x.astype(np.int16)
        x = x.astype(dtype)
        wav.write(filename, fs, x)
    elif file_extension == ".pcm" or file_extension == ".raw":
        x = x.astype("int16").reshape(-1, 1)
        x = x.astype(dtype).reshape(-1, 1)
        x.tofile(filename)
    else:
        raise ValueError("Wrong input format. Use wav, pcm or raw")
Loading