Commit f41efcb8 authored by Vladimir Malenovsky's avatar Vladimir Malenovsky
Browse files

support for +- overlap in STEREO items, expect trimmed sentences, support for...

support for +- overlap in STEREO items, expect trimmed sentences, support for low-level random noise addition
parent 8a6542d4
Loading
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -110,6 +110,7 @@ def main(args):
            IR_fs=cfg.IR_fs,
            preamble=cfg.preamble,
            postamble=cfg.postamble,
            add_low_level_random_noise=cfg.add_low_level_random_noise,
        )
        
    # copy configuration to output directory
+68 −61
Original line number Diff line number Diff line
@@ -35,6 +35,13 @@ output_path: "./items_STEREO"
### Target loudness in LKFS; default = null (no loudness normalization applied)
loudness: -26

### Pre-amble and Post-amble length in seconds (default = 0.0)
preamble: 0.5
postamble: 0.5

### Flag for adding low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
add_low_level_random_noise: true


################################################
### Scene description
@@ -43,7 +50,7 @@ loudness: -26
### Each scene must start with the sceneN tag
### Specify the mono source filename (the program will search for it in the input_path folder)
### Specify the stereo IR source filename (the program will search for it in the input_path_IR folder)
### Specify the delay in seconds for each input source
### Specify the overlap length in seconds for each input source (negative value creates a gap)
### Note 1: use [val1, val2, ...] for multiple sources in a scene
### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames

@@ -51,252 +58,252 @@ scenes:
    a1: 
        name: "G1S1.wav"
        description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["LEABP04.wav", "LEABP11.wav"]
        delay: [0, 3]
        overlap: 0.5
        
    a2: 
        name: "G6S2.wav"
        description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["LEABP05.wav", "LEABP11.wav"]
        delay: [0, 3]
        overlap: 0.5
        
    a3: 
        name: "G5S3.wav"
        description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["LEABP06.wav", "LEABP11.wav"]
        delay: [0, 3]
        overlap: 0.5

    a4: 
        name: "G4S4.wav"
        description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["LEABP05.wav", "LEABP10.wav"]
        delay: [0, 1.5]
        overlap: -0.5

    a5: 
        name: "G3S5.wav"
        description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["LEABP05.wav", "LEABP11.wav"]
        delay: [0, 1.5]
        overlap: -0.5

    a6: 
        name: "G2S6.wav"
        description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["LEABP05.wav", "LEABP12.wav"]
        delay: [0, 1.5]
        overlap: -0.5

    b1: 
        name: "G2S1.wav"
        description: "Two speakers sitting at oval table side by side in a large anechoic conference room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["LAABP05.wav", "LAABP06.wav"]
        delay: [0, 35]
        overlap: -0.5
 
    b2: 
        name: "G1S2.wav"
        description: "Two speakers sitting at oval table side by side in a large anechoic conference room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["LAABP07.wav", "LAABP08.wav"]
        delay: [0, 3]
        overlap: 0.5
 
    b3: 
        name: "G6S3.wav"
        description: "Two speakers sitting at oval table side by side in a large anechoic conference room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["LAABP09.wav", "LAABP10.wav"]
        delay: [0, 3]
        overlap: 0.5
 
    b4: 
        name: "G5S4.wav"
        description: "Two speakers sitting at oval table side by side in a large anechoic conference room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["LAABP11.wav", "LAABP12.wav"]
        delay: [0, 1.5] 
        overlap: -0.5 

    b5: 
        name: "G4S5.wav"
        description: "Two speakers sitting at oval table side by side in a large anechoic conference room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["LAABP01.wav", "LAABP02.wav"]
        delay: [0, 1.5] 
        overlap: -0.5 

    b6: 
        name: "G3S6.wav"
        description: "Two speakers sitting at oval table side by side in a large anechoic conference room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["LAABP03.wav", "LAABP04.wav"]
        delay: [0, 1.5] 
        overlap: -0.5 

    c1: 
        name: "G3S1.wav"
        description: "One talker sitting at table in a small anechoic conference room."
        source: ["test_single.wav"]
        IR: ["SAMSP01.wav"]
        delay: [0] 
        overlap: -0.5

    c2: 
        name: "G2S2.wav"
        description: "One talker sitting at table in a small anechoic conference room."
        source: ["test_single.wav"]
        IR: ["SAMSP04.wav"]
        delay: [0] 
        overlap: -0.5
  
    c3: 
        name: "G1S3.wav"
        description: "One talker sitting at table in a small anechoic conference room."
        source: ["test_single.wav"]
        IR: ["SAMSP07.wav"]
        delay: [0] 
        overlap: -0.5
  
    c4: 
        name: "G6S4.wav"
        description: "One talker sitting at table in a small echoic conference room."
        source: ["test_single.wav"]
        IR: ["SEABP01.wav"]
        delay: [0] 
        overlap: -0.5
  
    c5: 
        name: "G5S5.wav"
        description: "One talker sitting at table in a small echoic conference room."
        source: ["test_single.wav"]
        IR: ["SEABP03.wav"]
        delay: [0] 
        overlap: -0.5
  
    c6: 
        name: "G4S6.wav"
        description: "One talker sitting at table in a small echoic conference room."
        source: ["test_single.wav"]
        IR: ["SEABP06.wav"]
        delay: [0] 
        overlap: -0.5
 
    d1: 
        name: "G4S1.wav"
        description: "One talker sitting at table in a small anechoic conference room."
        source: ["test_single.wav"]
        IR: ["SEBIP01.wav"]
        delay: [0]   
        overlap: -0.5  
        
    d2: 
        name: "G3S2.wav"
        description: "One talker sitting at table in a small anechoic conference room."
        source: ["test_single.wav"]
        IR: ["SEBIP04.wav"]
        delay: [0]   
        overlap: -0.5  
        
    d3: 
        name: "G2S3.wav"
        description: "One talker sitting at table in a small anechoic conference room."
        source: ["test_single.wav"]
        IR: ["SEBIP07.wav"]
        delay: [0]   
        overlap: -0.5  
 
    d4: 
        name: "G1S4.wav"
        description: "One talker sitting at table in a small echoic conference room."
        source: ["test_single.wav"]
        IR: ["SEBIP07.wav"]
        delay: [0]   
        overlap: -0.5  
 
    d5: 
        name: "G6S5.wav"
        description: "One talker sitting at table in a small echoic conference room."
        source: ["test_single.wav"]
        IR: ["SEBIP07.wav"]
        delay: [0]   
        overlap: -0.5  
 
    d6: 
        name: "G5S6.wav"
        description: "One talker sitting at table in a small echoic conference room."
        source: ["test_single.wav"]
        IR: ["SEBIP07.wav"]
        delay: [0]   
        overlap: -0.5  
 
    e1: 
        name: "G5S1.wav"
        description: "Two talkers sitting in a room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["SEMSP01.wav", "SEMSP03.wav"]
        delay: [0, 3]
        overlap: 0.5
 
    e2: 
        name: "G4S2.wav"
        description: "Two talkers sitting in a room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["SEMSP01.wav", "SEMSP05.wav"]
        delay: [0, 3]
        overlap: 0.5
        
    e3: 
        name: "G3S3.wav"
        description: "Two talkers sitting in a room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["SEMSP01.wav", "SEMSP07.wav"]
        delay: [0, 3]
        overlap: 0.5
  
    e4: 
        name: "G2S4.wav"
        description: "Two talkers sitting in a room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["SEMSP03.wav", "SEMSP04.wav"]
        delay: [0, 1.5]
        overlap: -0.5
  
    e5: 
        name: "G1S5.wav"
        description: "Two talkers sitting in a room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["SEMSP05.wav", "SEMSP07.wav"]
        delay: [0, 1.5]
        overlap: -0.5
  
    e6: 
        name: "G6S6.wav"
        description: "Two talkers sitting in a room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["SEMSP06.wav", "SEMSP02.wav"]
        delay: [0, 1.5]
        overlap: -0.5
 
    f1: 
        name: "G6S1.wav"
        description: "Two talkers sitting in a room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["SEBIP05.wav", "SEBIP01.wav"]
        delay: [0, 3]
        overlap: 0.5
 
    f2: 
        name: "G5S2.wav"
        description: "Two talkers sitting in a room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["SEBIP07.wav", "SEBIP01.wav"]
        delay: [0, 3]
        overlap: 0.5
  
    f3: 
        name: "G4S3.wav"
        description: "Two talkers sitting in a room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["SEBIP04.wav", "SEBIP01.wav"]
        delay: [0, 3]
        overlap: 0.5
  
    f4: 
        name: "G3S4.wav"
        description: "Two talkers sitting in a room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["SEBIP02.wav", "SEBIP06.wav"]
        delay: [0, 1.5]
        overlap: -0.5
  
    f5: 
        name: "G2S5.wav"
        description: "Two talkers sitting in a room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["SEBIP02.wav", "SEBIP06.wav"]
        delay: [0, 1.5]
        overlap: -0.5
  
    f6: 
        name: "G1S6.wav"
        description: "Two talkers sitting in a room."
        source: ["test_single.wav", "test_single.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        IR: ["SEBIP03.wav", "SEBIP04.wav"]
        delay: [0, 1.5]
        overlap: -0.5
  
 No newline at end of file
+1 −4
Original line number Diff line number Diff line
@@ -100,9 +100,6 @@ def generate_ism_items(
            # read source file
            x = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs)
 
            ############### DEBUG ############33
            # x.audio = x.audio[:-10]

            # get the number of frames (multiple of 20ms)
            N_frames = int(len(x.audio) / x.fs * 50)
            
+65 −20
Original line number Diff line number Diff line
@@ -40,11 +40,12 @@ from copy import copy
import numpy as np
from math import floor


from item_generation_scripts.audiotools import audio, audiofile
from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness
from item_generation_scripts.audiotools.wrappers.reverb import reverb_stereo

SEED_RANDOM_NOISE = 0


# function for converting nd numpy array to strings with 2 decimal digits
def csv_formatdata(data):
@@ -62,6 +63,9 @@ def generate_stereo_items(
    logger: logging.Logger,
    fs: Optional[int] = 48000,
    IR_fs: Optional[int] = 48000,
    preamble: Optional[float] = 0.0,
    postamble: Optional[float] = 0.0,
    add_low_level_random_noise: Optional[bool] = False,
):
    """Generate STEREO items from mono items based on scene description"""

@@ -77,16 +81,18 @@ def generate_stereo_items(
        # read the IR (check if stereo or two mono files were provided)
        source_IR = np.atleast_1d(scene["IR"])
        
        # read the overlap length
        if 'overlap' in scene.keys():
            source_overlap = float(scene["overlap"])
        else:
            source_overlap = 0.0

        y = audio.ChannelBasedAudio("STEREO")
        for i in range(N_sources):
        
            # parse parameters from the scene description
            source_file = np.atleast_1d(scene["source"])[i]
            IR_file = np.atleast_1d(scene["IR"])[i]
            if 'delay' in scene.keys():
                source_delay = np.atleast_1d(scene["delay"])[i]
            else:
                source_delay = np.array([0])
            
            logger.info(
                f"Convolving {source_file} with {source_IR}"
@@ -98,35 +104,46 @@ def generate_stereo_items(
            # get the number of frames (multiple of 20ms)
            N_frames = int(len(x.audio) / x.fs * 50)
            
            # trim the source signal to align to 20ms boundary
            N_trim = int(N_frames * x.fs / 50)
            x.audio = x.audio[:N_trim]

            # read the IR file
            IR = audio.fromfile("STEREO", os.path.join(IR_path, IR_file), fs=IR_fs)
                 
            # delay the source file
            if source_delay > 0:
            # convolve with stereo IR
            x_rev = reverb_stereo(x, IR)
            
            # adjust the level of the stereo signal
            _, scale_factor = get_loudness(x_rev, target_level, "STEREO")
            x_rev.audio *= scale_factor
            
            # shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap)
            if i > 0 and source_overlap != 0.0:
                # get the length of the first source file
                N_delay = len(y.audio[:,0])
                
                # add the shift
                N_delay += int(source_overlap * x.fs)
            
                # ensure delay is a multiple of 20ms
                N_delay = int(floor(source_delay * 50) / 50 * x.fs)
                # N_delay = int(floor(source_shift * 50) / 50 * x.fs)
            
                # insert all-zero preamble
                pre = np.zeros((N_delay, x.audio.shape[1]))
                x.audio = np.concatenate([pre, x.audio])
                
            # convolve with stereo IR
            x_rev = reverb_stereo(x, IR)
            # pad with zeros to ensure that the signal length is a multiple of 20ms  
            N_frame = x.fs / 50
            if len(x.audio) % N_frame != 0:
                N_pad = int(N_frame - len(x.audio) % N_frame)
                
            # adjust the level of the stereo signal
            _, scale_factor = get_loudness(x_rev, target_level, "STEREO")
            x_rev.audio *= scale_factor
                # insert all-zero preamble
                pre = np.zeros((N_pad, x.audio.shape[1]))
                x.audio = np.concatenate([pre, x.audio])
               
            # add source signal to the array of source signals
            y.fs = x.fs
            if y.audio is None:
                y.audio = x_rev.audio
            else:
                # append zeros to have equal length of all source signals
                # pad with zeros to have equal length of all source signals
                if x_rev.audio.shape[0] > y.audio.shape[0]:
                    y.audio = np.vstack((y.audio, np.zeros((x_rev.audio.shape[0] - y.audio.shape[0], y.audio.shape[1]))))
                elif y.audio.shape[0] > x_rev.audio.shape[0]:
@@ -135,6 +152,34 @@ def generate_stereo_items(
                # superimpose 
                y.audio += x_rev.audio

        # append pre-amble and post-amble to all sources
        if preamble != 0.0:
            # ensure that pre-amble is a multiple of 20ms
            N_pre = int(floor(preamble * 50) / 50 * y.fs)
            
            # insert all-zero preamble to all sources
            pre = np.zeros((N_pre, y.audio.shape[1]))
            y.audio = np.concatenate([pre, y.audio])
        
        if postamble != 0.0:
            # ensure that post-amble is a multiple of 20ms
            N_post = int(floor(postamble * 50) / 50 * y.fs)
            
            # append all-zero postamble to all sources
            post = np.zeros((N_post, y.audio.shape[1]))
            y.audio = np.concatenate([y.audio, post])
            
        # add random noise
        if add_low_level_random_noise:
            # create uniformly distributed noise between -4 and 4
            np.random.seed(SEED_RANDOM_NOISE)
            noise = np.random.randint(
                low=-4, high=5, size=y.audio.shape
            ).astype("float")
            
            # superimpose
            y.audio += noise
            
        # write the reverberated audio into output file
        output_filename = scene["name"]
        audiofile.write(