Commit 93d4dbfe authored by Vladimir Malenovsky's avatar Vladimir Malenovsky
Browse files

support for X(i_ref) notation to allow specifying overlap between items

parent 73a29eb9
Loading
Loading
Loading
Loading
Loading
+34 −12
Original line number Diff line number Diff line
@@ -30,6 +30,7 @@
#  the United Nations Convention on Contracts for the International Sale of Goods.
#
import logging
import re
import sys
from itertools import groupby, repeat
from pathlib import Path
@@ -196,6 +197,7 @@ def generate_ismN_scene(

    # repeat for all source files
    offset = 0
    end_position = []
    for i in range(N_inputs):
        # read input filename
        source_file = (
@@ -232,16 +234,33 @@ def generate_ismN_scene(
                if isinstance(scene["shift"], list)
                else scene["shift"]
            )

            # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index
            # of the reference signal (0-based index)
            if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]:
                # extract X and i_ref
                match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i])

                if match:
                    overlap = float(match.group(1))
                    overlap_ref = int(match.group(2))
                else:
                    scene_shift_str = scene["shift"][i]
                    logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!")
                    sys.exit(-1)

                # calculate absolute shift of the source signal in seconds
                source_shift = end_position[overlap_ref] + overlap
        else:
            source_shift = 0.0

        # convert shift from seconds to samples and ensure it is a multiple of 20ms
        source_shift_in_seconds = source_shift
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
        else:
            source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
        source_shift_in_seconds = source_shift / cfg.fs

        # read the level
        if "level" in scene.keys():
@@ -278,6 +297,9 @@ def generate_ismN_scene(
        x = audio.fromtype("ISM1")
        x.audio, x.fs = audiofile.read(input_filename)

        # record the total duration of the source signal, taking into account the shift of the starting position
        end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds)

        # resample to the target fs if necessary
        if x.fs != cfg.fs:
            logger.warning(
@@ -288,12 +310,12 @@ def generate_ismN_scene(
            x.fs = cfg.fs

        # adjust the level of the audio source file (need to convert to MONO first)
        if level is None:
            # do not change the level of the audio source signal
            logger.info("-- Level of the audio source signal is not changed")
        elif np.isinf(level):
        if np.isinf(level):
            # set all channels to zero
            x.audio = np.zeros_like(x.audio)
        elif level is None:
            # do not change the level of the audio source signal
            logger.info("-- Level of the audio source signal is not changed")
        else:
            x_temp = audio.ChannelBasedAudio(
                "MONO"
@@ -391,21 +413,21 @@ def generate_ismN_scene(
            y.object_pos = x.object_pos.copy()
            y.fs = x.fs

            if source_shift < 0:
            if source_shift > 0:
                # insert zeros to the new audio source signal to shift it right
                metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
                metadata.trim_meta(y, limits=[-source_shift, 0], samples=True)
            else:
                offset = source_shift
        else:
            # shift the beginning of the audio source signal
            delta_offset = source_shift - offset
            if delta_offset > 0:
            if delta_offset < 0:
                # insert zeros to the previous ISM signal(s) to shift them right
                metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True)
                metadata.trim_meta(y, limits=[delta_offset, 0], samples=True)
                offset = source_shift
            else:
                # insert zeros to the new audio source signal to shift it right
                metadata.trim_meta(x, limits=[delta_offset, 0], samples=True)
                metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True)

            # adjust the length of the audio source signal
            delta_length = len(x.audio) - len(y.audio)
@@ -443,14 +465,14 @@ def generate_ismN_scene(
        noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")
        y.audio += noise

    # trim the output signal such if the total duration exceeds X seconds
    # trim the output signal if the total duration exceeds X seconds
    if "duration" in cfg.__dict__:
        # convert from seconds to samples (ensure multiple of 20ms)
        duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len)

        # check if the current length of the output signal exceeds the duration
        if len(y.audio) > duration:
            metadata.trim_meta(y, limits=[0, duration], samples=True)
            metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)

    # adjust the loudness of the output signal
    if "loudness" in cfg.__dict__:
+47 −10
Original line number Diff line number Diff line
@@ -31,6 +31,7 @@
#

import logging
import re
import sys
from itertools import groupby, repeat
from pathlib import Path
@@ -209,6 +210,7 @@ def generate_MASA_scene(
    # repeat for all source files
    offset = 0
    y_int = None
    end_position = []
    for i in range(N_inputs):
        # parse parameters from the scene description
        source_file = (
@@ -232,13 +234,44 @@ def generate_MASA_scene(
        else:
            source_shift = 0.0

        # read the source shift length (in seconds)
        if "shift" in scene.keys():
            source_shift = (
                scene["shift"][i]
                if isinstance(scene["shift"], list)
                else scene["shift"]
            )

            # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index
            # of the reference signal (0-based index)
            if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]:
                # extract X and i_ref
                match = re.match(
                    r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]
                )

                if match:
                    overlap = float(match.group(1))
                    overlap_ref = int(match.group(2))
                else:
                    scene_shift_str = scene["shift"][i]
                    logger.error(
                        f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!"
                    )
                    sys.exit(-1)

                # calculate absolute shift of the source signal in seconds
                source_shift = end_position[overlap_ref] + overlap
        else:
            source_shift = 0.0

        # convert shift from seconds to samples and ensure it is a multiple of 20ms
        source_shift_in_seconds = source_shift
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
        else:
            source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
        source_shift_in_seconds = source_shift / cfg.fs

        # read the level
        if "level" in scene.keys():
@@ -295,6 +328,9 @@ def generate_MASA_scene(
        # read source file
        x = audio.fromfile("MONO", input_filename)

        # record the total duration of the source signal, taking into account the shift of the starting position
        end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds)

        # resample to the target fs if necessary
        if x.fs != cfg.fs:
            logger.warning(
@@ -339,26 +375,26 @@ def generate_MASA_scene(
            # this is the first SBA source signal
            y_int.audio = x.audio.copy()

            if source_shift < 0:
            if source_shift > 0:
                # insert zeros to the first SBA source signal to shift it right
                y_int.audio = audioarray.trim(
                    y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True
                    y_int.audio, y_int.fs, limits=[-source_shift, 0], samples=True
                )
            else:
                offset = source_shift
        else:
            # shift the beginning of the audio source signal
            delta_offset = source_shift - offset
            if delta_offset > 0:
            if delta_offset < 0:
                # insert zeros to the output SBA signal to shift it right
                y_int.audio = audioarray.trim(
                    y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True
                    y_int.audio, y_int.fs, limits=[delta_offset, 0], samples=True
                )
                offset = source_shift
            else:
                # insert zeros to the new SBA source signal to shift it right
                x.audio = audioarray.trim(
                    x.audio, x.fs, limits=[delta_offset, 0], samples=True
                    x.audio, x.fs, limits=[-delta_offset, 0], samples=True
                )

            # adjust the length of the audio source signal
@@ -396,14 +432,15 @@ def generate_MASA_scene(
    # trim the output signal if the total duration exceeds X seconds
    if "duration" in cfg.__dict__:
        # convert from seconds to samples (ensure multiple of 20ms)
        duration = int(
            np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len
        )
        duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len)

        # check if the current length of the output signal exceeds the duration
        if len(y_int.audio) > duration:
            y_int.audio = audioarray.trim(
                y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True
                y_int.audio,
                y_int.fs,
                limits=[0, len(y_int.audio) - duration],
                samples=True,
            )

    # adjust the loudness of the output signal
+29 −7
Original line number Diff line number Diff line
@@ -31,6 +31,7 @@
#

import logging
import re
import sys
from itertools import groupby, repeat
from pathlib import Path
@@ -209,6 +210,7 @@ def generate_MC_scene(
    # repeat for all source files
    offset = 0
    y_int = None
    end_position = []
    for i in range(N_inputs):
        # parse parameters from the scene description
        source_file = (
@@ -222,23 +224,40 @@ def generate_MC_scene(
        )
        IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)

        # read the shift time in seconds
        # read the source shift length (in seconds)
        if "shift" in scene.keys():
            source_shift = (
                scene["shift"][i]
                if isinstance(scene["shift"], list)
                else scene["shift"]
            )

            # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index
            # of the reference signal (0-based index)
            if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]:
                # extract X and i_ref
                match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i])

                if match:
                    overlap = float(match.group(1))
                    overlap_ref = int(match.group(2))
                else:
                    scene_shift_str = scene["shift"][i]
                    logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!")
                    sys.exit(-1)

                # calculate absolute shift of the source signal in seconds
                source_shift = end_position[overlap_ref] + overlap
        else:
            source_shift = 0.0

        # convert shift from seconds to samples and ensure it is a multiple of 20ms
        source_shift_in_seconds = source_shift
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
        else:
            source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
        source_shift_in_seconds = source_shift / cfg.fs

        # read the level
        if "level" in scene.keys():
@@ -295,6 +314,9 @@ def generate_MC_scene(
        # read source file
        x = audio.fromfile("MONO", input_filename)

        # record the total duration of the source signal, taking into account the shift of the starting position
        end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds)

        # resample to the target fs if necessary
        if x.fs != cfg.fs:
            logger.warning(
@@ -339,26 +361,26 @@ def generate_MC_scene(
            # this is the first SBA source signal
            y_int.audio = x.audio.copy()

            if source_shift < 0:
            if source_shift > 0:
                # insert zeros to the first SBA source signal to shift it right
                y_int.audio = audioarray.trim(
                    y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True
                    y_int.audio, y_int.fs, limits=[-source_shift, 0], samples=True
                )
            else:
                offset = source_shift
        else:
            # shift the beginning of the audio source signal
            delta_offset = source_shift - offset
            if delta_offset > 0:
            if delta_offset < 0:
                # insert zeros to the output SBA signal to shift it right
                y_int.audio = audioarray.trim(
                    y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True
                    y_int.audio, y_int.fs, limits=[delta_offset, 0], samples=True
                )
                offset = source_shift
            else:
                # insert zeros to the new SBA source signal to shift it right
                x.audio = audioarray.trim(
                    x.audio, x.fs, limits=[delta_offset, 0], samples=True
                    x.audio, x.fs, limits=[-delta_offset, 0], samples=True
                )

            # adjust the length of the audio source signal
+31 −22
Original line number Diff line number Diff line
@@ -31,6 +31,7 @@
#

import logging
import re
import sys
from itertools import groupby, repeat
from pathlib import Path
@@ -183,6 +184,7 @@ def generate_OMASA_scene(

    # repeat for all source files
    offset = 0
    end_position = []
    for i in range(N_inputs):
        # parse parameters from the scene description
        source_file = (
@@ -220,16 +222,33 @@ def generate_OMASA_scene(
                if isinstance(scene["shift"], list)
                else scene["shift"]
            )

            # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index
            # of the reference signal (0-based index)
            if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]:
                # extract X and i_ref
                match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i])

                if match:
                    overlap = float(match.group(1))
                    overlap_ref = int(match.group(2))
                else:
                    scene_shift_str = scene["shift"][i]
                    logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!")
                    sys.exit(-1)

                # calculate absolute shift of the source signal in seconds
                source_shift = end_position[overlap_ref] + overlap
        else:
            source_shift = 0.0

        # convert shift from seconds to samples and ensure it is a multiple of 20ms
        source_shift_in_seconds = source_shift
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
        else:
            source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
        source_shift_in_seconds = source_shift / cfg.fs

        # read the level
        if "level" in scene.keys():
@@ -300,6 +319,9 @@ def generate_OMASA_scene(
        # read source file
        x = audio.fromfile(fmt, input_filename)

        # record the total duration of the source signal, taking into account the shift of the starting position
        end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds)

        # resample to the target fs if necessary
        if x.fs != cfg.fs:
            logger.warning(
@@ -417,21 +439,21 @@ def generate_OMASA_scene(
            # add the first audio source signal (should be FOA/HOA2/HOA3) to the array of all source signals
            y_int.audio = x.audio.copy()

            if source_shift < 0:
            if source_shift > 0:
                # insert zeros to the new audio source signal to shift it right
                metadata.trim_meta(y_int, limits=[source_shift, 0], samples=True)
                metadata.trim_meta(y_int, limits=[-source_shift, 0], samples=True)
            else:
                offset = source_shift
        else:
            # shift the beginning of the audio source signal
            delta_offset = source_shift - offset
            if delta_offset > 0:
            if delta_offset < 0:
                # insert zeros to the existing intermediate OSBA object to shift it right
                metadata.trim_meta(y_int, limits=[-delta_offset, 0], samples=True)
                metadata.trim_meta(y_int, limits=[delta_offset, 0], samples=True)
                offset = source_shift
            else:
                # insert zeros to the new audio source signal to shift it right
                metadata.trim_meta(x, limits=[delta_offset, 0], samples=True)
                metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True)

            # adjust the length of the audio source signal
            delta_length = len(x.audio) - len(y_int.audio)
@@ -472,29 +494,16 @@ def generate_OMASA_scene(
        noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")
        y_int.audio += noise

    # trim the output signal such if the total duration exceeds X seconds
    # trim the output signal if the total duration exceeds X seconds
    if "duration" in cfg.__dict__:
        # convert from seconds to samples (ensure multiple of 20ms)
        duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len)

        # check if the current length of the output signal exceeds the duration
        if len(y_int.audio) > duration:
            metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True)

    # adjust the loudness of the output signal
    if "loudness" in cfg.__dict__:
+29 −7
Original line number Diff line number Diff line
@@ -31,6 +31,7 @@
#

import logging
import re
import sys
from itertools import groupby, repeat
from pathlib import Path
@@ -187,6 +188,7 @@ def generate_OSBA_scene(

    # repeat for all source files
    offset = 0
    end_position = []
    for i in range(N_inputs):
        # parse parameters from the scene description
        source_file = (
@@ -210,23 +212,40 @@ def generate_OSBA_scene(
            else scene["elevation"]
        )

        # read the shift time in seconds
        # read the source shift length (in seconds)
        if "shift" in scene.keys():
            source_shift = (
                scene["shift"][i]
                if isinstance(scene["shift"], list)
                else scene["shift"]
            )

            # check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index
            # of the reference signal (0-based index)
            if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]:
                # extract X and i_ref
                match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i])

                if match:
                    overlap = float(match.group(1))
                    overlap_ref = int(match.group(2))
                else:
                    scene_shift_str = scene["shift"][i]
                    logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!")
                    sys.exit(-1)

                # calculate absolute shift of the source signal in seconds
                source_shift = end_position[overlap_ref] + overlap
        else:
            source_shift = 0.0

        # convert shift from seconds to samples and ensure it is a multiple of 20ms
        source_shift_in_seconds = source_shift
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
        else:
            source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
        source_shift_in_seconds = source_shift / cfg.fs

        # read the level
        if "level" in scene.keys():
@@ -282,6 +301,9 @@ def generate_OSBA_scene(
        # read source file
        x = audio.fromfile(fmt, input_filename)

        # record the total duration of the source signal, taking into account the shift of the starting position
        end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds)

        # resample to the target fs if necessary
        if x.fs != cfg.fs:
            logger.warning(
@@ -403,21 +425,21 @@ def generate_OSBA_scene(
                # if ISM, append object position to the OSBA object
                y.object_pos = x.object_pos.copy()

            if source_shift < 0:
            if source_shift > 0:
                # insert zeros to the new audio source signal to shift it right
                metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
                metadata.trim_meta(y, limits=[-source_shift, 0], samples=True)
            else:
                offset = source_shift
        else:
            # shift the beginning of the audio source signal
            delta_offset = source_shift - offset
            if delta_offset > 0:
            if delta_offset < 0:
                # insert zeros to the previous ISM signal(s) to shift them right
                metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True)
                metadata.trim_meta(y, limits=[delta_offset, 0], samples=True)
                offset = source_shift
            else:
                # insert zeros to the new audio source signal to shift it right
                metadata.trim_meta(x, limits=[delta_offset, 0], samples=True)
                metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True)

            # adjust the length of the audio source signal
            delta_length = len(x.audio) - len(y.audio)
Loading