Commit e0fbcf7a authored by Vladimir Malenovsky's avatar Vladimir Malenovsky
Browse files

fix 20ms frame alignment

parent 9db60d12
Loading
Loading
Loading
Loading
+51 −32
Original line number Diff line number Diff line
@@ -36,8 +36,8 @@ import logging
import os
from pathlib import Path
from typing import Optional

import numpy as np
from math import floor

from item_generation_scripts.audiotools import audio, audiofile
from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness
@@ -69,8 +69,14 @@ def generate_ism_items(
        # extract the number of audio sources
        N_sources = len(np.atleast_1d(scene["source"]))

        y = None
        # initialize output variables
        if format == "ISM2":
            y = audio.ChannelBasedAudio("STEREO")
        else:
            y = audio.ChannelBasedAudio("MONO")
        y_meta = None
        
        # repeat for all source files
        for i in range(N_sources):
        
            # parse parameters from the scene description
@@ -87,16 +93,18 @@ def generate_ism_items(
            )

            # read source file
            audio_object = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs)
            x = audio_object.audio
            fs = audio_object.fs
            x = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs)

            # find the number of frames
            N_frames = int(len(x) / fs * 50 + 1)
            # get the number of frames (multiple of 20ms)
            N_frames = int(len(x.audio) / x.fs * 50)
            
            # trim the source signal to align to 20ms boundary
            len = int(N_frames * x.fs / 50)
            x.audio = x.audio[:len]

            # adjust the level of the source file
            _, scale_factor = get_loudness(audio_object, target_level, "MONO")
            x *= scale_factor
            _, scale_factor = get_loudness(x, target_level, "MONO")
            x.audio *= scale_factor

            # read azimuth information and create array
            if isinstance(source_azi, str):
@@ -167,59 +175,70 @@ def generate_ism_items(

            # delay the source file
            if source_delay > 0:
                pre = np.zeros((int(source_delay * fs), x.shape[1]))
                x = np.concatenate([pre, x])
                # ensure delay is a multiple of 20ms
                N_delay = int(floor(source_delay * 50) / 50 * x.fs)
            
                # insert all-zero preamble
                pre = np.zeros((N_delay, x.audio.shape[1]))
                x.audio = np.concatenate([pre, x.audio])

                # apply delay to metadata as well
                # insert neutral position as a preamble
                pre = np.tile(
                    [0.00, 0.00, 1.00, 0.00, 1.00], (int(source_delay * 50), 1)
                )
                # pre = np.zeros((int(source_delay * 50), x_meta.shape[1]))
                    [0.00, 0.00, 1.00, 0.00, 1.00], (N_delay, 1)
                )   # !!!! TBD - check if we should insert neutral position or the first position of the metadata
                x_meta = np.concatenate([pre, x_meta])

            # add source signal to the array of source signals
            if y is None:
                y = x
            # add source signal to the array of all source signals
            y.fs = x.fs
            if y.audio is None:
                y.audio = x.audio
            else:
                # append zeros to have equal length of all source signals
                if x.shape[0] > y.shape[0]:
                    y = np.vstack((y, np.zeros((x.shape[0] - y.shape[0], y.shape[1]))))
                elif y.shape[0] > x.shape[0]:
                    x = np.vstack((x, np.zeros((y.shape[0] - x.shape[0], x.shape[1]))))
                y = np.hstack((y, x))
                if x.audio.shape[0] > y.audio.shape[0]:
                    y.audio = np.vstack((y.audio, np.zeros((x.audio.shape[0] - y.audio.shape[0], y.audio.shape[1]))))
                elif y.audio.shape[0] > x.audio.shape[0]:
                    x.audio = np.vstack((x.audio, np.zeros((y.audio.shape[0] - x.audio.shape[0], x.audio.shape[1]))))
                y.audio = np.hstack((y.audio, x.audio))

            # add metadata to the array of all metadata
            x_meta = x_meta[np.newaxis, :]  # make sure x_meta is a 3d array
            # make sure x_meta is a 3d array
            x_meta = x_meta[np.newaxis, :]  
            if y_meta is None:
                y_meta = x_meta
            else:
                N_srcs = y_meta.shape[0]
                N_meta_features = y_meta.shape[2]

                # append postamble (create by repeating the last row of metadata) to have equal length of all metadata
                # append the last position of the metadata to have equal length of all metadata
                if x_meta.shape[1] > y_meta.shape[1]:
                    N_delta = x_meta.shape[1] - y_meta.shape[1]
                    y_meta = y_meta.reshape(y_meta.shape[1], -1)  # reshape to 2d array
                    # reshape to 2d array
                    y_meta = y_meta.reshape(y_meta.shape[1], -1)  
                    # repeat last row N_delta times and append to the array
                    y_meta = np.vstack(
                        (y_meta, np.tile(y_meta[-1, :], (N_delta, 1)))
                    )  # repeat last row N_delta times and append to the array
                    )  
                    # reshape back to 3d array
                    y_meta = y_meta.reshape(
                        N_srcs, -1, N_meta_features
                    )  # reshape back to 3d array
                    )  
                elif y_meta.shape[1] > x_meta.shape[1]:
                    N_delta = y_meta.shape[1] - x_meta.shape[1]
                    x_meta = x_meta.reshape(x_meta.shape[1], -1)  # reshape to 2d array
                    # reshape to 2d array
                    x_meta = x_meta.reshape(x_meta.shape[1], -1)  
                    # repeat last row N_delta times and append to the array
                    x_meta = np.vstack(
                        (x_meta, np.tile(x_meta[-1, :], (N_delta, 1)))
                    )  # repeat last row N_delta times and append to the array
                    x_meta = np.expand_dims(x_meta, axis=0)  # reshape back to 3d array
                    )  
                    # reshape back to 3d array
                    x_meta = np.expand_dims(x_meta, axis=0)  

                y_meta = np.concatenate([y_meta, x_meta])

        # write individual ISM audio streams to the output file in an interleaved format
        output_filename = scene["name"]
        audiofile.write(
            os.path.join(output_path, output_filename), y, fs
            os.path.join(output_path, output_filename), y.audio, y.fs
        )  # !!!! TBD: replace all os.path.xxx operations with the Path object

        # write individual ISM metadata to output files in .csv format