fix 20ms frame alignment (e0fbcf7a) · Commits · IVAS Codec Public Collaboration / IVAS Processing Scripts

item_generation_scripts/processing/process_ism_items.py

+51 −32

Original line number	Diff line number	Diff line
		@@ -36,8 +36,8 @@ import logging
		import os
		from pathlib import Path
		from typing import Optional

		import numpy as np
		from math import floor

		from item_generation_scripts.audiotools import audio, audiofile
		from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness
		@@ -69,8 +69,14 @@ def generate_ism_items(
		# extract the number of audio sources
		N_sources = len(np.atleast_1d(scene["source"]))

		y = None
		# initialize output variables
		if format == "ISM2":
		y = audio.ChannelBasedAudio("STEREO")
		else:
		y = audio.ChannelBasedAudio("MONO")
		y_meta = None

		# repeat for all source files
		for i in range(N_sources):

		# parse parameters from the scene description
		@@ -87,16 +93,18 @@ def generate_ism_items(
		)

		# read source file
		audio_object = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs)
		x = audio_object.audio
		fs = audio_object.fs
		x = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs)

		# find the number of frames
		N_frames = int(len(x) / fs * 50 + 1)
		# get the number of frames (multiple of 20ms)
		N_frames = int(len(x.audio) / x.fs * 50)

		# trim the source signal to align to 20ms boundary
		len = int(N_frames * x.fs / 50)
		x.audio = x.audio[:len]

		# adjust the level of the source file
		_, scale_factor = get_loudness(audio_object, target_level, "MONO")
		x *= scale_factor
		_, scale_factor = get_loudness(x, target_level, "MONO")
		x.audio *= scale_factor

		# read azimuth information and create array
		if isinstance(source_azi, str):
		@@ -167,59 +175,70 @@ def generate_ism_items(

		# delay the source file
		if source_delay > 0:
		pre = np.zeros((int(source_delay * fs), x.shape[1]))
		x = np.concatenate([pre, x])
		# ensure delay is a multiple of 20ms
		N_delay = int(floor(source_delay * 50) / 50 * x.fs)

		# insert all-zero preamble
		pre = np.zeros((N_delay, x.audio.shape[1]))
		x.audio = np.concatenate([pre, x.audio])

		# apply delay to metadata as well
		# insert neutral position as a pre-amble
		pre = np.tile(
		[0.00, 0.00, 1.00, 0.00, 1.00], (int(source_delay * 50), 1)
		)
		# pre = np.zeros((int(source_delay * 50), x_meta.shape[1]))
		[0.00, 0.00, 1.00, 0.00, 1.00], (N_delay, 1)
		) # !!!! TBD - check if we should insert neutral position or the first position of the metadata
		x_meta = np.concatenate([pre, x_meta])

		# add source signal to the array of source signals
		if y is None:
		y = x
		# add source signal to the array of all source signals
		y.fs = x.fs
		if y.audio is None:
		y.audio = x.audio
		else:
		# append zeros to have equal length of all source signals
		if x.shape[0] > y.shape[0]:
		y = np.vstack((y, np.zeros((x.shape[0] - y.shape[0], y.shape[1]))))
		elif y.shape[0] > x.shape[0]:
		x = np.vstack((x, np.zeros((y.shape[0] - x.shape[0], x.shape[1]))))
		y = np.hstack((y, x))
		if x.audio.shape[0] > y.audio.shape[0]:
		y.audio = np.vstack((y.audio, np.zeros((x.audio.shape[0] - y.audio.shape[0], y.audio.shape[1]))))
		elif y.audio.shape[0] > x.audio.shape[0]:
		x.audio = np.vstack((x.audio, np.zeros((y.audio.shape[0] - x.audio.shape[0], x.audio.shape[1]))))
		y.audio = np.hstack((y.audio, x.audio))

		# add metadata to the array of all metadata
		x_meta = x_meta[np.newaxis, :] # make sure x_meta is a 3d array
		# make sure x_meta is a 3d array
		x_meta = x_meta[np.newaxis, :]
		if y_meta is None:
		y_meta = x_meta
		else:
		N_srcs = y_meta.shape[0]
		N_meta_features = y_meta.shape[2]

		# append postamble (created by repeating the last row of metadata) to have equal length of all metadata
		# append the last position of the metadata to have equal length of all metadata
		if x_meta.shape[1] > y_meta.shape[1]:
		N_delta = x_meta.shape[1] - y_meta.shape[1]
		y_meta = y_meta.reshape(y_meta.shape[1], -1) # reshape to 2d array
		# reshape to 2d array
		y_meta = y_meta.reshape(y_meta.shape[1], -1)
		# repeat last row N_delta times and append to the array
		y_meta = np.vstack(
		(y_meta, np.tile(y_meta[-1, :], (N_delta, 1)))
		) # repeat last row N_delta times and append to the array
		)
		# reshape back to 3d array
		y_meta = y_meta.reshape(
		N_srcs, -1, N_meta_features
		) # reshape back to 3d array
		)
		elif y_meta.shape[1] > x_meta.shape[1]:
		N_delta = y_meta.shape[1] - x_meta.shape[1]
		x_meta = x_meta.reshape(x_meta.shape[1], -1) # reshape to 2d array
		# reshape to 2d array
		x_meta = x_meta.reshape(x_meta.shape[1], -1)
		# repeat last row N_delta times and append to the array
		x_meta = np.vstack(
		(x_meta, np.tile(x_meta[-1, :], (N_delta, 1)))
		) # repeat last row N_delta times and append to the array
		x_meta = np.expand_dims(x_meta, axis=0) # reshape back to 3d array
		)
		# reshape back to 3d array
		x_meta = np.expand_dims(x_meta, axis=0)

		y_meta = np.concatenate([y_meta, x_meta])

		# write individual ISM audio streams to the output file in an interleaved format
		output_filename = scene["name"]
		audiofile.write(
		os.path.join(output_path, output_filename), y, fs
		os.path.join(output_path, output_filename), y.audio, y.fs
		) # !!!! TBD: replace all os.path.xxx operations with the Path object

		# write individual ISM metadata to output files in .csv format