Commit c88f0b6d authored by Vladimir Malenovsky's avatar Vladimir Malenovsky
Browse files

refactor to use source shifts

parent 661bc864
Loading
Loading
Loading
Loading
+83 −36
Original line number Diff line number Diff line
@@ -33,7 +33,6 @@
import logging
import os
from itertools import groupby, repeat
from math import floor
from pathlib import Path

import numpy as np
@@ -202,22 +201,37 @@ def generate_stereo_scene(
    # extract the number of audio sources
    N_inputs = len(np.atleast_1d(scene["input"]))

    # initialize output dirs
    # get the output filename
    output_filename = Path(scene["output"]).parent / (
        cfg.use_output_prefix + Path(scene["output"]).name
    )

    # initialize output dirs
    dir_path = output_filename.parent
    if dir_path and not dir_path.exists():
        dir_path.mkdir(parents=True, exist_ok=True)

    # initialize output audio object
    # initialize output STEREO object
    y = audio.ChannelBasedAudio(cfg.format)
    y.fs = cfg.fs

    # set the frame length
    frame_len = int(cfg.fs / 50)

    # repeat for all source files
    offset = 0
    for i in range(N_inputs):
        # parse parameters from the scene description
        source_file = np.atleast_1d(scene["input"])[i]
        IR_file = np.atleast_1d(scene["IR"])[i]
        source_file = (
            scene["input"][i] if isinstance(scene["input"], list) else scene["input"]
        )
        IR_file = (
            scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"]
        )

        # get input filename and IR filename
        input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name)
        IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)

        # read the overlap length
        if "shift" in scene.keys():
@@ -229,6 +243,13 @@ def generate_stereo_scene(
        else:
            source_shift = 0.0

        # convert overlap to samples and ensure it is a multiple of 20ms
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
        else:
            source_shift = int(np.ceil(source_shift / frame_len) * frame_len)

        # read the level
        if "level" in scene.keys():
            level = (
@@ -241,62 +262,88 @@ def generate_stereo_scene(

        logger.info(f"Convolving {source_file} with {IR_file}")

        # get input filename and IR filename
        input_filename = Path(source_file).parent / (
            cfg.use_input_prefix + Path(source_file).name
        )
        IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)

        # read source file
        x = audio.fromfile("MONO", input_filename, fs=cfg.fs)
        x = audio.fromfile("MONO", input_filename)

        # resample to the target fs if necessary
        if x.fs != cfg.fs:
            logger.warning(
                f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!"
            )
            resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs)
            x.audio = resampled_audio
            x.fs = cfg.fs

        # read the IR file (!must be in STEREO format!)
        IR = audio.fromfile("STEREO", IR_filename, fs=cfg.IR_fs)
        IR = audio.fromfile("STEREO", IR_filename)

        # convolve mono source signal with stereo IR
        # convolve MONO source audio with STEREO IR -> results in STEREO audio object
        x = reverb_stereo(x, IR)

        # adjust the level of the stereo signal
        x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")

        # shift the source signal (positive shift creates overlap, negative shift creates a gap)
        if int(floor(-source_shift)) != 0:
            x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0])

        # get the number of frames (multiple of 20ms)
        frame_len = int(x.fs / 50)

        # ensure the length of the audio source signal is a multiple of 20ms
        if len(x.audio) % frame_len != 0:
            # pad with zeros to ensure that the signal length is a multiple of 20ms
            if len(x.audio) % frame_len != 0:
                N_pad = int(frame_len - len(x.audio) % frame_len)
                x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)

        # add the convolved STEREO audio source signal to the output signal
        if y.audio is None:
            # add source signal to the array of all source signals
            y.audio = x.audio.copy()
            y.fs = x.fs

            # if source_shift < 0:
            #     # insert zeros to the new audio source signal to shift it right
            #     metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
            offset = source_shift
        else:
            # pad the signal with zeros to have the same length as the previous signal(s)
            N_pad = y.audio.shape[0] - x.audio.shape[0]
            if N_pad != 0:
                x.audio = audioarray.trim(
                    x.audio, x.fs, limits=[0, -N_pad], samples=True
                )
            # shift the beginning of the audio source signal
            delta_offset = source_shift - offset
            if delta_offset > 0:
                # insert zeros to the existing output signal to shift it right
                y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_offset], samples=True)
                offset = source_shift
            else:
                # insert zeros to the new audio source signal to shift it right
                x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_offset], samples=True)

            # adjust the length of the audio source signal
            delta_length = len(x.audio) - len(y.audio)
            if delta_length > 0:
                # pad zeros to the existing output signal
                y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_length], samples=True)
            else:
                # pad zeros to the new audio source signal
                x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_length], samples=True)

            # superimpose
            y.audio += x.audio

    # append pre-amble and post-amble to all sources
    y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble])
    # append pre-amble and post-amble
    preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True)

    # add random noise
    if cfg.add_low_level_random_noise:
        # create uniformly distributed noise between -4 and 4
        np.random.seed(SEED_RANDOM_NOISE)
        noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")

        # superimpose
        y.audio += noise

    # write the output STEREO audio signal into output file
    # adjust the length of the output signal
    if "duration" in cfg.__dict__:
        # trim the output signal such that the total duration is X seconds
        duration = int(cfg.duration * cfg.fs)  # convert to samples
    else:
        # do not change the length of the audio signal
        duration = len(y.audio)
    duration = int(np.floor(duration / frame_len) * frame_len)  # ensure multiple of 20ms
    if len(y.audio) != duration:
        y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True)

    # write the STEREO audio signal into output file
    audiofile.write(output_filename, y.audio, y.fs)