Commit b323333d authored by Vladimir Malenovsky
Browse files

adjust pre-amble and post-amble to 20ms boundary

parent c88f0b6d
Loading
Loading
Loading
Loading
Loading
+83 −37
(diff table header: original line number | diff line number | diff line)
@@ -32,7 +32,6 @@

import logging
from itertools import groupby, repeat
from math import floor
from pathlib import Path

import numpy as np
@@ -196,22 +195,37 @@ def generate_ambi_scene(
    # extract the number of audio sources
    N_inputs = len(np.atleast_1d(scene["input"]))

    # initialize output dirs
    # get the output filename
    output_filename = Path(scene["output"]).parent / (
        cfg.use_output_prefix + Path(scene["output"]).name
    )

    # initialize output dirs
    dir_path = output_filename.parent
    if dir_path and not dir_path.exists():
        dir_path.mkdir(parents=True, exist_ok=True)

    # initialize output audio object
    # initialize output SBA object
    y = audio.SceneBasedAudio(cfg.format)
    y.fs = cfg.fs

    # set the frame length
    frame_len = int(cfg.fs / 50)

    # repeat for all source files
    offset = 0
    for i in range(N_inputs):
        # parse parameters from the scene description
        source_file = np.atleast_1d(scene["input"])[i]
        IR_file = np.atleast_1d(scene["IR"])[i]
        source_file = (
            scene["input"][i] if isinstance(scene["input"], list) else scene["input"]
        )
        IR_file = (
            scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"]
        )

        # get input filename and IR filename
        input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name)
        IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)

        # read the overlap length
        if "shift" in scene.keys():
@@ -223,6 +237,13 @@ def generate_ambi_scene(
        else:
            source_shift = 0.0

        # convert overlap to samples and ensure it is a multiple of 20ms
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
        else:
            source_shift = int(np.ceil(source_shift / frame_len) * frame_len)

        # read the level
        if "level" in scene.keys():
            level = (
@@ -235,19 +256,22 @@ def generate_ambi_scene(

        logger.info(f"Convolving {source_file} with {IR_file}")

        # get input filename and IR filename
        input_filename = Path(source_file).parent / (
            cfg.use_input_prefix + Path(source_file).name
        )
        IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)

        # read source file
        x = audio.fromfile("MONO", input_filename, fs=cfg.fs)
        x = audio.fromfile("MONO", input_filename)

        # resample to the target fs if necessary
        if x.fs != cfg.fs:
            logger.warning(
                f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!"
            )
            resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs)
            x.audio = resampled_audio
            x.fs = cfg.fs

        # read the IR file (!must be in target format!)
        IR = audio.fromfile(cfg.format, IR_filename, fs=cfg.IR_fs)
        IR = audio.fromfile(cfg.format, IR_filename)

        # convolve with the FOA/HOA2/HOA3 IR
        # convolve MONO source audio with FOA/HOA2/HOA3 IR -> results in FOA/HOA2/HOA3 audio object
        if cfg.format == "FOA":
            x = reverb_foa(x, IR)
        elif cfg.format == "HOA2":
@@ -258,46 +282,69 @@ def generate_ambi_scene(
        # adjust the level of the target signal
        x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")

        # shift the source signal (positive shift creates overlap, negative shift creates a gap)
        if int(floor(-source_shift)) != 0:
            x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0])

        # get the number of frames (multiple of 20ms)
        frame_len = int(x.fs / 50)

        # ensure the length of the audio source signal is a multiple of 20ms
        if len(x.audio) % frame_len != 0:
            # pad with zeros to ensure that the signal length is a multiple of 20ms
            if len(x.audio) % frame_len != 0:
                N_pad = int(frame_len - len(x.audio) % frame_len)
                x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)

        # add the convolved FOA/HOA2/HOA3 audio source signal to the output signal
        if y.audio is None:
            # add source signal to the array of all source signals
            y.audio = x.audio.copy()
            y.fs = x.fs

            # if source_shift < 0:
            #     # insert zeros to the new audio source signal to shift it right
            #     metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
            offset = source_shift
        else:
            # adjust the signal length (trim from the end or pad with zeros) to align its length with the previous signal(s)
            N_pad = y.audio.shape[0] - x.audio.shape[0]
            if N_pad != 0:
                x.audio = audioarray.trim(
                    x.audio, x.fs, limits=[0, -N_pad], samples=True
                )
            # shift the beginning of the audio source signal
            delta_offset = source_shift - offset
            if delta_offset > 0:
                # insert zeros to the existing output signal to shift it right
                y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_offset], samples=True)
                offset = source_shift
            else:
                # insert zeros to the new audio source signal to shift it right
                x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_offset], samples=True)

            # adjust the length of the audio source signal
            delta_length = len(x.audio) - len(y.audio)
            if delta_length > 0:
                # pad zeros to the existing output signal
                y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_length], samples=True)
            else:
                # pad zeros to the new audio source signal
                x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_length], samples=True)

            # superimpose
            y.audio += x.audio

    # append pre-amble and post-amble to all sources
    y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble])
    # append pre-amble and post-amble
    preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True)

    # add random noise
    if cfg.add_low_level_random_noise:
        # create uniformly distributed noise between -4 and 4
        np.random.seed(SEED_RANDOM_NOISE)
        noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")

        # superimpose
        y.audio += noise

    # write the FOA/HOA2/HOA3 audio into output file
    # adjust the length of the output signal
    if "duration" in cfg.__dict__:
        # trim the output signal such that the total duration is X seconds
        duration = int(cfg.duration * cfg.fs)  # convert to samples
    else:
        # do not change the length of the audio signal
        duration = len(y.audio)
    duration = int(np.floor(duration / frame_len) * frame_len)  # ensure multiple of 20ms
    if len(y.audio) != duration:
        y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True)

    # write the FOA/HOA2/HOA3 audio signal into output file
    audiofile.write(output_filename, y.audio, y.fs)

    # convert to BINAURAL, if option was chosen
@@ -314,4 +361,3 @@ def generate_ambi_scene(
            binaudio.fs,
        )
        logger.info(f"Written BINAURAL output to: {binaural_output_filename}")
    return
+3 −1
(diff table header: original line number | diff line number | diff line)
@@ -388,7 +388,9 @@ def generate_ismN_scene(
        y.metadata_files.insert(i, str(output_filename.with_suffix(f".{i}.csv")))

    # append pre-amble and post-amble
    metadata.trim_meta(y, limits=[-cfg.preamble * 1000, -cfg.postamble * 1000])
    preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True)

    # add random noise
    if cfg.add_low_level_random_noise:
+5 −3
(diff table header: original line number | diff line number | diff line)
@@ -277,7 +277,7 @@ def generate_OMASA_scene(
            sys.exit(-1)

        # read source file
        x = audio.fromfile(fmt, input_filename, fs=cfg.fs)
        x = audio.fromfile(fmt, input_filename)

        # resample to the target fs if necessary
        if x.fs != cfg.fs:
@@ -290,7 +290,7 @@ def generate_OMASA_scene(

        # adjust the level of the source file
        if fmt in ["FOA", "HOA2", "HOA3"]:
            x.audio, _ = loudness_norm(x, level, loudness_format="STEREO", rms=True)
            x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")
        else:
            x.audio, _ = loudness_norm(x, level, loudness_format="MONO")

@@ -436,7 +436,9 @@ def generate_OMASA_scene(
            y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")))

    # append pre-amble and post-amble
    metadata.trim_meta(y, limits=[-cfg.preamble * 1000, -cfg.postamble * 1000])
    preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True)

    # add random noise
    if cfg.add_low_level_random_noise:
+8 −14
(diff table header: original line number | diff line number | diff line)
@@ -33,7 +33,6 @@
import logging
import sys
from itertools import groupby, repeat
from math import floor
from pathlib import Path

import numpy as np
@@ -182,7 +181,7 @@ def generate_OSBA_scene(
    N_inputs = len(np.atleast_1d(scene["input"]))
    N_ISMs = N_inputs - 1

    # get input and output filenames
    # get OSBA format and output filename
    osba_format = f"ISM{N_ISMs}SBA{cfg.sba_order}"
    output_filename = Path(scene["output"]).parent / (
        cfg.use_output_prefix + Path(scene["output"]).name
@@ -209,9 +208,7 @@ def generate_OSBA_scene(
        )

        # get input filename
        input_filename = Path(source_file).parent / (
            cfg.use_input_prefix + Path(source_file).name
        )
        input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name)

        # read azimuth and elevation information
        source_azi = (
@@ -254,11 +251,6 @@ def generate_OSBA_scene(

        logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}")

        # get input filename
        input_filename = Path(source_file).parent / (
            cfg.use_input_prefix + Path(source_file).name
        )

        # get the number of channels from the .wav file header
        wav_header = audiofile.parse_wave_header(input_filename)
        N_channels = wav_header["channels"]
@@ -280,7 +272,7 @@ def generate_OSBA_scene(
            sys.exit(-1)

        # read source file
        x = audio.fromfile(fmt, input_filename, fs=cfg.fs)
        x = audio.fromfile(fmt, input_filename)

        # resample to the target fs if necessary
        if x.fs != cfg.fs:
@@ -293,7 +285,7 @@ def generate_OSBA_scene(

        # adjust the level of the source file
        if fmt in ["FOA", "HOA2", "HOA3"]:
            x.audio, _ = loudness_norm(x, level, loudness_format="STEREO", rms=True)
            x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")
        else:
            x.audio, _ = loudness_norm(x, level, loudness_format="MONO")

@@ -307,7 +299,7 @@ def generate_OSBA_scene(
        # get the number of frames (multiple of 20ms)
        N_frames = int(len(x.audio) / frame_len)

        # convert the input audio source signal to ISM
        # convert the input MONO audio source signal to ISM1 object
        if fmt == "MONO":
            # convert MONO to ISM1
            x_ism = audio.ObjectBasedAudio("ISM1")  # ISM with 1 channel
@@ -426,7 +418,9 @@ def generate_OSBA_scene(
            y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")))

    # append pre-amble and post-amble
    metadata.trim_meta(y, limits=[-cfg.preamble * 1000, -cfg.postamble * 1000])
    preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True)

    # add random noise
    if cfg.add_low_level_random_noise: