Commit 5a00b481 authored by PLAINSI's avatar PLAINSI
Browse files

HOA2 processing

parent 9f73b183
Loading
Loading
Loading
Loading
Loading
+58 −0
Original line number Diff line number Diff line
---
################################################
# General configuration
################################################

### Output format
format: "HOA2"

### Output sampling rate in Hz needed for headerless audio files; default = 48000
fs: 48000

### IR sampling rate in Hz needed for headerless audio files; default = 48000
IR_fs: 48000

### Any relative paths will be interpreted relative to the working directory the script is called from!
### Usage of absolute paths is recommended.
### Do not use file names with dots "." in them! This is not supported, use "_" instead
### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions

### Input path to mono files
input_path: "./items_mono"

### Input path to stereo impulse response files, default = './ivas_processing_scripts/generation/IR'
IR_path: "./IRs"

### Output path for generated test items and metadata files
output_path: "./items_HOA2"

### Target loudness in LKFS; if not specified, -26 is used
loudness: -26

### Pre-amble and Post-amble length in seconds (default = 0.0)
preamble: 0.5
postamble: 1.0

### Flag for adding low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
add_low_level_random_noise: true


################################################
### Scene description
################################################

### Each scene must start with the sceneN tag
### Specify the mono source filename (the program will search for it in the input_path folder)
### Specify the IR source filename (the program will search for it in the IR_path folder)
### Specify the overlap length in seconds for each input source (negative value creates a gap)
### Note 1: use [val1, val2, ...] for multiple sources in a scene
### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames

scenes:
    a1: 
        name: "out.wav"
        description: ""
        source: ["fa1.wav", "ma1.wav"]
        IR: ["IR_HOA2_env1/FreefieldFloor_TalkPos1_EigenHoA2_SinSweep_9chn.wav", "IR_HOA2_env1/FreefieldFloor_TalkPos2_EigenHoA2_SinSweep_9chn.wav"]
        overlap: -0.2
        
+65 −1
Original line number Diff line number Diff line
@@ -238,7 +238,7 @@ def reverb_foa(
        H = fft(foa_IR.audio, axis=0)
        align = 1.0 / np.max(np.abs(H))

    # convolve mono input with left and right IR
    # convolve mono input with FOA IR
    y_w = reverb(input, IR_w, align=align)
    y_x = reverb(input, IR_x, align=align)
    y_y = reverb(input, IR_y, align=align)
@@ -251,3 +251,67 @@ def reverb_foa(
    y.audio = np.column_stack([y_w.audio, y_x.audio, y_y.audio, y_z.audio])

    return y


def reverb_hoa2(
    input: Audio,
    hoa2_IR: Audio,
    align: Optional[float] = None,
) -> Audio:
    """
    Wrapper for the ITU-T reverb binary to convolve a mono audio signal with an HOA2 impulse response

    Parameters
    ----------
    input: Audio
        Input mono audio signal
    hoa2_IR: Audio
        HOA2 (9-channel) impulse response
    align: Optional[float]
        Multiplicative factor to apply to the reverberated sound in order to align
        its energy level; if None, it is derived so that the maximum gain of the
        IR filter across all frequencies is 0 dB

    Returns
    -------
    y: Audio
        Input signal convolved with the HOA2 IR (9 output channels)
    """

    # convert to float32 (modifies hoa2_IR in place, mirroring the other reverb wrappers)
    hoa2_IR.audio = np.float32(hoa2_IR.audio)

    numchannels = 9  # HOA2 by definition: (2+1)^2 ambisonic channels

    # calculate the scaling (multiplicative) factor such that the maximum gain
    # of the IR filter across all frequencies is 0 dB
    if align is None:
        H = fft(hoa2_IR.audio, axis=0)
        align = 1.0 / np.max(np.abs(H))

    # convolve the mono input with each IR channel separately, reusing one
    # single-channel Audio container as the per-channel IR
    IR = copy(hoa2_IR)
    IR.name = "MONO"
    IR.num_channels = 1
    ych = []
    for i in range(numchannels):
        # separate IR into each channel (column vector shape expected by reverb)
        IR.audio = np.reshape(hoa2_IR.audio[:, i], (-1, 1))
        # convolve mono input with channel IR
        ych.append(reverb(input, IR, align=align))

    # combine the per-channel results into one HOA2 output signal
    y = copy(input)
    y.name = "HOA2"
    y.num_channels = numchannels
    y.audio = np.column_stack([c.audio for c in ych])

    return y
+218 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3

#
#  (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
#  Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
#  Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
#  Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
#  contributors to this repository. All Rights Reserved.
#
#  This software is protected by copyright law and by international treaties.
#  The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
#  Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
#  Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
#  Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
#  contributors to this repository retain full ownership rights in their respective contributions in
#  the software. This notice grants no license of any kind, including but not limited to patent
#  license, nor is any license granted by implication, estoppel or otherwise.
#
#  Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
#  contributions.
#
#  This software is provided "AS IS", without any express or implied warranties. The software is in the
#  development stage. It is intended exclusively for experts who have experience with such software and
#  solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
#  and fitness for a particular purpose are hereby disclaimed and excluded.
#
#  Any dispute, controversy or claim arising under or in relation to providing this software shall be
#  submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
#  accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
#  the United Nations Convention on Contracts on the International Sales of Goods.
#

import logging
import os
from math import floor

import numpy as np

from ivas_processing_scripts.audiotools import audio, audiofile
from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness
from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_hoa2
from ivas_processing_scripts.generation import config

SEED_RANDOM_NOISE = 0


# helper: format rows of numbers as rows of strings with 2 decimal digits
def csv_formatdata(data):
    """Yield each row of *data* with every value rendered to two decimal places."""
    for record in data:
        yield [f"{value:.2f}" for value in record]


def generate_hoa2_items(
    cfg: config.TestConfig,
    logger: logging.Logger,
):
    """Generate HOA2 items from mono items based on scene description

    For every scene in cfg.scenes: convolve each mono source with its HOA2
    impulse response, normalize loudness, offset/overlap the sources, pad to
    20ms frame boundaries, mix the sources, optionally add pre-/post-amble and
    low-level noise, then write the 9-channel result to cfg.output_path.

    Parameters
    ----------
    cfg: config.TestConfig
        Test configuration parsed from the YAML scene description
    logger: logging.Logger
        Logger for progress messages
    """

    # get the number of scenes
    N_scenes = len(cfg.scenes)

    # set the target level in LKFS; default = -26
    if "loudness" not in cfg.__dict__:
        cfg.loudness = -26

    # set the output sampling rate in Hz; default = 48000
    if "fs" not in cfg.__dict__:
        cfg.fs = 48000

    # set the IR sampling rate in Hz; default = 48000
    if "IR_fs" not in cfg.__dict__:
        cfg.IR_fs = 48000

    # set the pre-amble and post-amble lengths in seconds; default = 0.0 (none)
    if "preamble" not in cfg.__dict__:
        cfg.preamble = 0.0

    if "postamble" not in cfg.__dict__:
        cfg.postamble = 0.0

    # set the IR path; default = the "IR" directory next to this script
    if "IR_path" not in cfg.__dict__:
        cfg.IR_path = os.path.join(os.path.dirname(__file__), "IR")

    # set the background-noise flag; default = false (pad with silence)
    if "add_low_level_random_noise" not in cfg.__dict__:
        cfg.add_low_level_random_noise = False

    # repeat for all source files
    for scene_name, scene in cfg.scenes.items():
        logger.info(
            f"Processing scene: {scene_name} out of {N_scenes} scenes, name: {scene['name']}"
        )

        # extract the number of audio sources (scene["source"] may be a scalar or a list)
        N_sources = len(np.atleast_1d(scene["source"]))

        # read the overlap length in seconds (negative value creates a gap)
        if "overlap" in scene.keys():
            source_overlap = float(scene["overlap"])
        else:
            source_overlap = 0.0

        # accumulator for the mixed HOA2 scene signal (y.audio starts as None)
        y = audio.SceneBasedAudio("HOA2")
        for i in range(N_sources):
            # parse parameters from the scene description
            source_file = np.atleast_1d(scene["source"])[i]
            IR_file = np.atleast_1d(scene["IR"])[i]

            logger.info(f"Convolving {source_file} with {IR_file}")

            # read source file
            x = audio.fromfile(
                "MONO", os.path.join(cfg.input_path, source_file), fs=cfg.fs
            )

            # read the IR file (9-channel HOA2 impulse response)
            IR = audio.fromfile(
                "HOA2", os.path.join(cfg.IR_path, IR_file), fs=cfg.IR_fs
            )

            # convolve with HOA2 IR
            x = reverb_hoa2(x, IR)

            # adjust the level of the HOA2 signal
            # NOTE(review): loudness is measured in "BINAURAL" mode although the
            # signal is HOA2 — confirm this is the intended measurement format
            _, scale_factor, _ = get_loudness(x, cfg.loudness, "BINAURAL")
            x.audio *= scale_factor

            # shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap)
            if i > 0 and source_overlap != 0.0:
                # get the length of the previously accumulated signal in samples
                N_delay = len(y.audio[:, 0])

                # add the shift (overlap reduces the delay, a gap increases it)
                N_delay += int(-source_overlap * x.fs)

                # insert all-zero preamble so this source starts after the delay
                pre = np.zeros((N_delay, x.audio.shape[1]))
                x.audio = np.concatenate([pre, x.audio])

            # pad with zeros to ensure that the signal length is a multiple of 20ms
            N_frame = x.fs / 50
            if len(x.audio) % N_frame != 0:
                N_pad = int(N_frame - len(x.audio) % N_frame)

                # insert all-zero preamble (padding is prepended, not appended)
                pre = np.zeros((N_pad, x.audio.shape[1]))
                x.audio = np.concatenate([pre, x.audio])

            # add source signal to the array of source signals
            y.fs = x.fs
            if y.audio is None:
                # first source: initialize the mix
                y.audio = x.audio
            else:
                # pad with zeros to have equal length of all source signals
                if x.audio.shape[0] > y.audio.shape[0]:
                    y.audio = np.vstack(
                        (
                            y.audio,
                            np.zeros(
                                (
                                    x.audio.shape[0] - y.audio.shape[0],
                                    y.audio.shape[1],
                                )
                            ),
                        )
                    )
                elif y.audio.shape[0] > x.audio.shape[0]:
                    x.audio = np.vstack(
                        (
                            x.audio,
                            np.zeros(
                                (
                                    y.audio.shape[0] - x.audio.shape[0],
                                    x.audio.shape[1],
                                )
                            ),
                        )
                    )

                # superimpose
                y.audio += x.audio

        # append pre-amble and post-amble to all sources
        if cfg.preamble != 0.0:
            # ensure that pre-amble is a multiple of 20ms (floor to whole frames)
            N_pre = int(floor(cfg.preamble * 50) / 50 * y.fs)

            # insert all-zero preamble to all sources
            pre = np.zeros((N_pre, y.audio.shape[1]))
            y.audio = np.concatenate([pre, y.audio])

        if cfg.postamble != 0.0:
            # ensure that post-amble is a multiple of 20ms (floor to whole frames)
            N_post = int(floor(cfg.postamble * 50) / 50 * y.fs)

            # append all-zero postamble to all sources
            post = np.zeros((N_post, y.audio.shape[1]))
            y.audio = np.concatenate([y.audio, post])

        # add random noise
        if cfg.add_low_level_random_noise:
            # create uniformly distributed integer noise between -4 and 4,
            # seeded for reproducibility
            np.random.seed(SEED_RANDOM_NOISE)
            noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype(
                "float"
            )

            # superimpose
            y.audio += noise

        # write the reverberated audio into output file
        output_filename = scene["name"]
        audiofile.write(
            os.path.join(cfg.output_path, output_filename), y.audio, y.fs
        )  # !!!! TBD: replace all os.path.xxx operations with the Path object

    return