Commit 8a6542d4 authored by Vladimir Malenovsky's avatar Vladimir Malenovsky
Browse files

support for +- overlap in ISM items, expect trimmed sentences, support for...

support for +- overlap in ISM items, expect trimmed sentences, support for low-level random noise addition
parent bab5d25f
Loading
Loading
Loading
Loading
Loading
+6 −1
Original line number Diff line number Diff line
@@ -91,7 +91,10 @@ def main(args):
            cfg.output_path,
            cfg.scenes,
            logger,
            fs=cfg.fs
            fs=cfg.fs,
            preamble=cfg.preamble,
            postamble=cfg.postamble,
            add_low_level_random_noise=cfg.add_low_level_random_noise,
        )
    elif cfg.format == "STEREO":
        # generate STEREO items according to scene description
@@ -105,6 +108,8 @@ def main(args):
            logger,
            fs=cfg.fs,
            IR_fs=cfg.IR_fs,
            preamble=cfg.preamble,
            postamble=cfg.postamble,
        )
        
    # copy configuration to output directory
+4 −0
Original line number Diff line number Diff line
@@ -29,6 +29,10 @@ output_path: "./items_ISM1"
### Target loudness in LKFS; default = null (no loudness normalization applied)
loudness: -26

### Pre-amble and Post-amble length in seconds (default = 0.0)
preamble: 0.5
postamble: 0.5


################################################
### Scene description
+79 −73
Original line number Diff line number Diff line
@@ -29,6 +29,12 @@ output_path: "./items_ISM2"
### Target loudness in LKFS; default = null (no loudness normalization applied)
loudness: -26

### Pre-amble and Post-amble length in seconds (default = 0.0)
preamble: 0.5
postamble: 0.5

### Flag for adding low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
add_low_level_random_noise: true

################################################
### Scene description
@@ -37,7 +43,7 @@ loudness: -26
### Each scene must start with the sceneN tag
### Specify the mono source filename (the program will search for it in the input_path folder)
### Specify azimuth and elevation for each input source
### Specify the delay in seconds for each input source
### Specify the overlap length in seconds between consecutive input sources as a single value per scene (negative value creates a gap)
### Note 1: use [val1, val2, ...] for multiple sources in a scene
### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames

@@ -52,288 +58,288 @@ scenes:
    a1: 
        name: "G1S1.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [0, 50]
        elevation: [0, 0]
        delay: [0, 0]
        overlap: -0.5
        
    a2: 
        name: "G6S2.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [50, 350]
        elevation: [0, 0]
        delay: [0, 0]
        overlap: -0.5
        
    a3: 
        name: "G5S3.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [40, 290]
        elevation: [0, 0]
        delay: [0, 0]
        overlap: -0.5

    a4: 
        name: "G4S4.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [30, 230]
        elevation: [15, 15]
        delay: [0, 0]
        overlap: -0.5

    a5: 
        name: "G3S5.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [20, 170]
        elevation: [15, 15]
        delay: [0, 0]
        overlap: -0.5

    a6: 
        name: "G2S6.wav"
        description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [10, 110]
        elevation: [15, 15]
        delay: [0, 0]
        overlap: -0.5

    b1: 
        name: "G2S1.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [20, 170]
        elevation: [30, 30]
        delay: [0, 1.5]
        overlap: 0.5
 
    b2: 
        name: "G1S2.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [10, 110]
        elevation: [30, 30]
        delay: [0, 1.5]
        overlap: 0.5
 
    b3: 
        name: "G6S3.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [0, 50]
        elevation: [30, 30]
        delay: [0, 1.5]
        overlap: 0.5
 
    b4: 
        name: "G5S4.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [50, 350]
        elevation: [60, 60]
        delay: [0, 1.5] 
        overlap: 0.5 

    b5: 
        name: "G4S5.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [40, 290]
        elevation: [60, 60]
        delay: [0, 1.5] 
        overlap: 0.5 

    b6: 
        name: "G3S6.wav"
        description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [30, 230]
        elevation: [60, 60]
        delay: [0, 1.5] 
        overlap: 0.5 

    c1: 
        name: "G3S1.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [40, 290]
        elevation: [0, 60]
        delay: [0, 0] 
        overlap: -0.5 

    c2: 
        name: "G2S2.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [30, 230]
        elevation: [0, 60]
        delay: [0, 0] 
        overlap: -0.5 
  
    c3: 
        name: "G1S3.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [20, 170]
        elevation: [0, 60]
        delay: [0, 0]   
        overlap: -0.5   
  
    c4: 
        name: "G6S4.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [10, 110]
        elevation: [0, 60]
        delay: [0, 1]     
        shift: [0, 1]     
  
    c5: 
        name: "G5S5.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [0, 50]
        elevation: [0, 60]
        delay: [0, 0]     
        overlap: -0.5     
  
    c6: 
        name: "G4S6.wav"
        description: "one talker sitting at a table, second talker standing beside the table,  non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [50, 350]
        elevation: [0, 60]
        delay: [0, 0]      
        overlap: -0.5      
 
    d1: 
        name: "G4S1.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [50, "180:1:120 + 360"]
        elevation: [0, 60]
        delay: [0, 1.5]   
        overlap: 0.5   
        
    d2: 
        name: "G3S2.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [300, "-70:-1:-10 - 360"]
        elevation: [0, 60]
        delay: [0, 1.5]   
        overlap: 0.5   
        
    d3: 
        name: "G2S3.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [250, "-20:-1:-320"]
        elevation: [0, 60]
        delay: [0, 1.5]          
        overlap: 0.5          
 
    d4: 
        name: "G1S4.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [200, "30:-1:-270"]
        elevation: [0, 60]
        delay: [0, 1.5]  
        overlap: 0.5  
 
    d5: 
        name: "G6S5.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [150, "80:1:20 + 360"]
        elevation: [0, 60]
        delay: [0, 1.5]   
        overlap: 0.5   
 
    d6: 
        name: "G5S6.wav"
        description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: [100, "130:1:70 + 360"]
        elevation: [0, 60]
        delay: [0, 1.5]   
        overlap: 0.5   
 
    e1: 
        name: "G5S1.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["80:1:20 + 360", "80:1:20 + 360"]
        elevation: [10, 60]
        delay: [0, 1.5]
        overlap: 0.5
 
    e2: 
        name: "G4S2.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["130:1:70 + 360", "130:1:70 + 360"]
        elevation: [10, 60]
        delay: [0, 1.5]    
        overlap: 0.5    
        
    e3: 
        name: "G3S3.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["180:1:120 + 360", "180:1:120 + 360"]
        elevation: [10, 60]
        delay: [0, 1.5]            
        overlap: 0.5            
  
    e4: 
        name: "G2S4.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["-70:-1:-10 - 360", "-70:-1:-10 - 360"]
        elevation: [10, 60]
        delay: [0, 1.5]    
        overlap: 0.5    
  
    e5: 
        name: "G1S5.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["-20:-1:-320", "-20:-1:-320"]
        elevation: [10, 60]
        delay: [0, 1.5]   
        overlap: 0.5   
  
    e6: 
        name: "G6S6.wav"
        description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["30:-1:-270", "30:-1:-270"]
        elevation: [10, 60]
        delay: [0, 1.5]     
        overlap: 0.5     
 
    f1: 
        name: "G6S1.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["60:1:0 + 360", "60:-1:120 - 360"]
        elevation: [20, 50]
        delay: [0, 0]    
        overlap: -0.5    
 
    f2: 
        name: "G5S2.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["0:1:300", "0:-1:60 - 360"]
        elevation: [20, 50]
        delay: [0, 0]   
        overlap: -0.5   
  
    f3: 
        name: "G4S3.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["300:1:240 + 360", "300:-1:0"]
        elevation: [20, 50]
        delay: [0, 0]     
        overlap: -0.5     
  
    f4: 
        name: "G3S4.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["240:1:180 + 360", "240:-1:-60"]
        elevation: [20, 50]
        delay: [0, 0]  
        overlap: -0.5  
  
    f5: 
        name: "G2S5.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["180:1:120 + 360", "180:-1:-120"]
        elevation: [20, 50]
        delay: [0, 0]    
        overlap: -0.5    
  
    f6: 
        name: "G1S6.wav"
        description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
        source: ["test_double.wav", "test_double.wav"]
        source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
        azimuth: ["120:1:60 + 360", "120:-1:180 - 360"]
        elevation: [20, 50]
        delay: [0, 0]      
        overlap: -0.5      
  
 No newline at end of file
+81 −11
Original line number Diff line number Diff line
@@ -41,6 +41,7 @@ from math import floor
from item_generation_scripts.audiotools import audio, audiofile
from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness

SEED_RANDOM_NOISE = 0

# function for converting nd numpy array to strings with 2 decimal digits
def csv_formatdata(data):
@@ -56,6 +57,9 @@ def generate_ism_items(
    scenes: dict,
    logger: logging.Logger,
    fs: Optional[int] = 48000,
    preamble: Optional[float] = 0.0,
    postamble: Optional[float] = 0.0,
    add_low_level_random_noise: Optional[bool] = False,
):
    """Generate ISM items with metadata from mono items based on scene description"""

@@ -75,6 +79,12 @@ def generate_ism_items(
            y = audio.ChannelBasedAudio("MONO")
        y_meta = None
        
        # read the overlap length
        if 'overlap' in scene.keys():
            source_overlap = float(scene["overlap"])
        else:
            source_overlap = 0.0
        
        # repeat for all source files
        for i in range(N_sources):
        
@@ -82,10 +92,6 @@ def generate_ism_items(
            source_file = np.atleast_1d(scene["source"])[i]
            source_azi = np.atleast_1d(scene["azimuth"])[i]
            source_ele = np.atleast_1d(scene["elevation"])[i]
            if 'delay' in scene.keys():
                source_delay = np.atleast_1d(scene["delay"])[i]
            else:
                source_delay = np.array([0])
            
            logger.info(
                f"Encoding {source_file} at position(s) {source_azi},{source_ele}"
@@ -94,12 +100,15 @@ def generate_ism_items(
            # read source file
            x = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs)
            
            ############### DEBUG ############33
            # x.audio = x.audio[:-10]

            # get the number of frames (multiple of 20ms)
            N_frames = int(len(x.audio) / x.fs * 50)
            
            # trim the source signal to align to 20ms boundary
            N_trim = int(N_frames * x.fs / 50)
            x.audio = x.audio[:N_trim]
            # N_trim = int(N_frames * x.fs / 50)
            # x.audio = x.audio[:N_trim]

            # adjust the level of the source file
            _, scale_factor = get_loudness(x, target_level, "MONO")
@@ -172,10 +181,16 @@ def generate_ism_items(
            # arrange all metadata fields column-wise into a matrix
            x_meta = np.column_stack((azi, ele, dist, spread, gain))
            
            # delay the source file
            if source_delay > 0:
            # shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap)
            if i > 0 and source_overlap != 0.0:
                # get the length of the first source file
                N_delay = len(y.audio[:,0])
                
                # add the shift
                N_delay += int(source_overlap * x.fs)
            
                # ensure delay is a multiple of 20ms
                N_delay = int(floor(source_delay * 50) / 50 * x.fs)
                # N_delay = int(floor(source_shift * 50) / 50 * x.fs)
            
                # insert all-zero preamble
                pre = np.zeros((N_delay, x.audio.shape[1]))
@@ -187,12 +202,27 @@ def generate_ism_items(
                )   # !!!! TBD - check if we should insert  netrual position or the first position of the metadata
                x_meta = np.concatenate([pre, x_meta])
                
            # pad with zeros to ensure that the signal length is a multiple of 20ms  
            N_frame = x.fs / 50
            if len(x.audio) % N_frame != 0:
                N_pad = int(N_frame - len(x.audio) % N_frame)
                
                # insert all-zero preamble
                pre = np.zeros((N_pad, x.audio.shape[1]))
                x.audio = np.concatenate([pre, x.audio])

                # insert neutral position as a pre-amble
                pre = np.tile(
                    [0.00, 0.00, 1.00, 0.00, 1.00], (N_pad, 1)
                )   # !!!! TBD - check if we should insert  netrual position or the first position of the metadata
                x_meta = np.concatenate([pre, x_meta])

            # add source signal to the array of all source signals
            y.fs = x.fs
            if y.audio is None:
                y.audio = x.audio
            else:
                # append zeros to have equal length of all source signals
                # pad with zeros to have the same length of all source signals
                if x.audio.shape[0] > y.audio.shape[0]:
                    y.audio = np.vstack((y.audio, np.zeros((x.audio.shape[0] - y.audio.shape[0], y.audio.shape[1]))))
                elif y.audio.shape[0] > x.audio.shape[0]:
@@ -234,6 +264,46 @@ def generate_ism_items(

                y_meta = np.concatenate([y_meta, x_meta])

        # append pre-amble and post-amble to all sources
        if preamble != 0.0:
            # ensure that pre-amble is a multiple of 20ms
            N_pre = int(floor(preamble * 50) / 50 * y.fs)
            
            # insert all-zero preamble to all sources
            pre = np.zeros((N_pre, y.audio.shape[1]))
            y.audio = np.concatenate([pre, y.audio])

            # insert neutral position as a pre-amble to all sources
            pre = np.tile(
                [0.00, 0.00, 1.00, 0.00, 1.00], (y_meta.shape[0], N_pre, 1)
            )   # !!!! TBD - check if we should insert netrual position or the first position of the metadata
            y_meta = np.concatenate([pre, y_meta], axis=1)
        
        if postamble != 0.0:
            # ensure that post-amble is a multiple of 20ms
            N_post = int(floor(postamble * 50) / 50 * y.fs)
            
            # append all-zero postamble to all sources
            post = np.zeros((N_post, y.audio.shape[1]))
            y.audio = np.concatenate([y.audio, post])

            # append neutral position as a post-amble to all sources
            post = np.tile(
                [0.00, 0.00, 1.00, 0.00, 1.00], (y_meta.shape[0], N_post, 1)
            )   # !!!! TBD - check if we should insert netrual position or the last position of the metadata
            y_meta = np.concatenate([y_meta, post], axis=1)
            
        # add random noise
        if add_low_level_random_noise:
            # create uniformly distributed noise between -4 and 4
            np.random.seed(SEED_RANDOM_NOISE)
            noise = np.random.randint(
                low=-4, high=5, size=y.audio.shape
            ).astype("float")
            
            # superimpose
            y.audio += noise

        # write individual ISM audio streams to the output file in an interleaved format
        output_filename = scene["name"]
        audiofile.write(