From 323e4b86207e0a22f6edc8665e9766b03596a208 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Tue, 2 May 2023 15:21:36 +0200 Subject: [PATCH 01/27] Skeleton framework for item generation scripts --- item_generation_scripts/__init__.py | 101 ++ item_generation_scripts/__main__.py | 50 + item_generation_scripts/audiotools/EFAP.py | 922 ++++++++++++++++++ .../audiotools/__init__.py | 286 ++++++ .../audiotools/__main__.py | 36 + item_generation_scripts/audiotools/audio.py | 428 ++++++++ .../audiotools/audioarray.py | 690 +++++++++++++ .../audiotools/audiofile.py | 433 ++++++++ .../BRIR_IISofficialMPEG222UC_FULL.mat | 3 + .../BRIR_IISofficialMPEG222UC_LS.mat | 3 + .../HRIR_ORANGE53_Dolby_SBA1.mat | 3 + .../HRIR_ORANGE53_Dolby_SBA2.mat | 3 + .../HRIR_ORANGE53_Dolby_SBA3.mat | 3 + .../binaural_datasets/HRIR_ORANGE53_FULL.mat | 3 + .../binaural_datasets/HRIR_ORANGE53_LS.mat | 3 + .../audiotools/binaural_datasets/README.txt | 34 + .../audiotools/binaural_datasets/__init__.py | 31 + .../binaural_datasets/binaural_dataset.py | 288 ++++++ .../audiotools/binauralobjectrenderer.py | 652 +++++++++++++ .../audiotools/constants.py | 704 +++++++++++++ .../audiotools/convert/__init__.py | 323 ++++++ .../audiotools/convert/binaural.py | 108 ++ .../audiotools/convert/channelbased.py | 390 ++++++++ .../audiotools/convert/masa.py | 165 ++++ .../audiotools/convert/objectbased.py | 352 +++++++ .../audiotools/convert/scenebased.py | 429 ++++++++ .../audiotools/metadata.py | 571 +++++++++++ .../audiotools/rotation.py | 379 +++++++ item_generation_scripts/audiotools/utils.py | 71 ++ .../audiotools/wrappers/__init__.py | 31 + .../audiotools/wrappers/bs1770.py | 291 ++++++ .../audiotools/wrappers/eid_xor.py | 193 ++++ .../audiotools/wrappers/esdru.py | 130 +++ .../audiotools/wrappers/filter.py | 366 +++++++ .../audiotools/wrappers/gen_patt.py | 171 ++++ .../audiotools/wrappers/masaRenderer.py | 117 +++ .../audiotools/wrappers/networkSimulator.py | 224 +++++ .../audiotools/wrappers/p50fbmnru.py | 110 +++ .../audiotools/wrappers/random_seed.py | 92 ++ item_generation_scripts/binary_paths.yml | 30 + .../config/ISM1_CONFIG.yml | 338 +++++++ .../config/ISM2_CONFIG.yml | 338 +++++++ item_generation_scripts/constants.py | 80 ++ .../processing/__init__.py | 31 + item_generation_scripts/processing/config.py | 130 +++ .../processing/preprocessing_2.py | 155 +++ .../processing/process_ism_items.py | 221 +++++ .../processing/processing.py | 455 +++++++++ item_generation_scripts/utils.py | 297 ++++++ 49 files changed, 11264 insertions(+) create mode 100644 item_generation_scripts/__init__.py create mode 100644 item_generation_scripts/__main__.py create mode 100644 item_generation_scripts/audiotools/EFAP.py create mode 100644 item_generation_scripts/audiotools/__init__.py create mode 100644 item_generation_scripts/audiotools/__main__.py create mode 100644 item_generation_scripts/audiotools/audio.py create mode 100644 item_generation_scripts/audiotools/audioarray.py create mode 100644 item_generation_scripts/audiotools/audiofile.py create mode 100644 item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_FULL.mat create mode 100644 item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_LS.mat create mode 100644 item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA1.mat create mode 100644 item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA2.mat create mode 100644 
item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA3.mat create mode 100644 item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_FULL.mat create mode 100644 item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_LS.mat create mode 100644 item_generation_scripts/audiotools/binaural_datasets/README.txt create mode 100644 item_generation_scripts/audiotools/binaural_datasets/__init__.py create mode 100644 item_generation_scripts/audiotools/binaural_datasets/binaural_dataset.py create mode 100644 item_generation_scripts/audiotools/binauralobjectrenderer.py create mode 100644 item_generation_scripts/audiotools/constants.py create mode 100644 item_generation_scripts/audiotools/convert/__init__.py create mode 100644 item_generation_scripts/audiotools/convert/binaural.py create mode 100644 item_generation_scripts/audiotools/convert/channelbased.py create mode 100644 item_generation_scripts/audiotools/convert/masa.py create mode 100644 item_generation_scripts/audiotools/convert/objectbased.py create mode 100644 item_generation_scripts/audiotools/convert/scenebased.py create mode 100644 item_generation_scripts/audiotools/metadata.py create mode 100644 item_generation_scripts/audiotools/rotation.py create mode 100644 item_generation_scripts/audiotools/utils.py create mode 100644 item_generation_scripts/audiotools/wrappers/__init__.py create mode 100644 item_generation_scripts/audiotools/wrappers/bs1770.py create mode 100644 item_generation_scripts/audiotools/wrappers/eid_xor.py create mode 100644 item_generation_scripts/audiotools/wrappers/esdru.py create mode 100644 item_generation_scripts/audiotools/wrappers/filter.py create mode 100644 item_generation_scripts/audiotools/wrappers/gen_patt.py create mode 100644 item_generation_scripts/audiotools/wrappers/masaRenderer.py create mode 100644 item_generation_scripts/audiotools/wrappers/networkSimulator.py create mode 100644 item_generation_scripts/audiotools/wrappers/p50fbmnru.py create mode 100644 item_generation_scripts/audiotools/wrappers/random_seed.py create mode 100644 item_generation_scripts/binary_paths.yml create mode 100644 item_generation_scripts/config/ISM1_CONFIG.yml create mode 100644 item_generation_scripts/config/ISM2_CONFIG.yml create mode 100644 item_generation_scripts/constants.py create mode 100644 item_generation_scripts/processing/__init__.py create mode 100644 item_generation_scripts/processing/config.py create mode 100644 item_generation_scripts/processing/preprocessing_2.py create mode 100644 item_generation_scripts/processing/process_ism_items.py create mode 100644 item_generation_scripts/processing/processing.py create mode 100644 item_generation_scripts/utils.py diff --git a/item_generation_scripts/__init__.py b/item_generation_scripts/__init__.py new file mode 100644 index 00000000..989d61a6 --- /dev/null +++ b/item_generation_scripts/__init__.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. 
+# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository retain full ownership rights in their respective contributions in
+# the software. This notice grants no license of any kind, including but not limited to patent
+# license, nor is any license granted by implication, estoppel or otherwise.
+#
+# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
+# contributions.
+#
+# This software is provided "AS IS", without any express or implied warranties. The software is in the
+# development stage. It is intended exclusively for experts who have experience with such software and
+# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+# and fitness for a particular purpose are hereby disclaimed and excluded.
+#
+# Any dispute, controversy or claim arising under or in relation to providing this software shall be
+# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+# the United Nations Convention on Contracts on the International Sales of Goods.
+#
+
+import logging
+import os
+
+import yaml
+
+from item_generation_scripts.constants import (
+    LOGGER_DATEFMT,
+    LOGGER_FORMAT,
+    LOGGER_SUFFIX,
+)
+from item_generation_scripts.processing import config, process_ism_items
+from item_generation_scripts.utils import create_dir
+
+
+def logging_init(args, cfg):
+    """set up logging for a test file"""
+    logger = logging.getLogger("__main__")
+    logger.setLevel(logging.DEBUG)
+
+    # console handler
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(logging.Formatter("%(message)s"))
+    console_handler.setLevel(logging.DEBUG if args.debug else logging.INFO)
+    logger.addHandler(console_handler)
+
+    # main log file
+    file_handler = logging.FileHandler(
+        cfg.output_path.joinpath(f"{cfg.format}{LOGGER_SUFFIX}"), mode="w"
+    )
+    file_handler.setFormatter(logging.Formatter(LOGGER_FORMAT, datefmt=LOGGER_DATEFMT))
+    file_handler.setLevel(logging.DEBUG if args.debug else logging.INFO)
+    logger.addHandler(file_handler)
+
+    logger.info(f"Processing test configuration file {args.config}")
+    logger.info(f"Input path: {cfg.input_path.absolute()}")
+    logger.info(f"Output path: {cfg.output_path.absolute()}")
+
+    return logger
+
+
+def main(args):
+
+    # parse configuration
+    cfg = config.TestConfig(args.config)
+
+    # create output directory, if not existing
+    if not os.path.exists(cfg.output_path):
+        create_dir(cfg.output_path)
+
+    # set up logging
+    logger = logging_init(args, cfg)
+
+    # generate input items
+    if cfg.format.startswith("ISM"):
+        # generate ISM items according to scene description
+        process_ism_items.generate_ism_items(
+            cfg.format,
+            cfg.loudness,
+            cfg.input_path,
+            cfg.output_path,
+            cfg.scenes,
+            logger
+        )
+
+    # copy configuration to output directory
+    with open(cfg.output_path.joinpath(f"{cfg.format}.yml"), "w") as f:
+        yaml.safe_dump(cfg._yaml_dump, f)
diff --git
a/item_generation_scripts/__main__.py b/item_generation_scripts/__main__.py new file mode 100644 index 00000000..b49109d3 --- /dev/null +++ b/item_generation_scripts/__main__.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# + +import argparse + +from item_generation_scripts import main + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="IVAS item generation scripts for listening tests. Please refer to README.md for usage." + ) + parser.add_argument( + "config", + help="YAML configuration file", + ) + parser.add_argument( + "--debug", help="Set logging level to debug", action="store_true", default=False + ) + args = parser.parse_args() + + main(args) diff --git a/item_generation_scripts/audiotools/EFAP.py b/item_generation_scripts/audiotools/EFAP.py new file mode 100644 index 00000000..b83d57e6 --- /dev/null +++ b/item_generation_scripts/audiotools/EFAP.py @@ -0,0 +1,922 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. 
+# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# + +import argparse +from enum import Enum +from itertools import combinations +from pathlib import Path +from typing import Optional, Tuple, Union + +import numpy as np + + +def wrap_angles( + azi: float, + ele: float, + clip_ele: Optional[bool] = False, +) -> Tuple[float, float]: + """ + Wrap angles to (-180, 180] azimuth and [-90, 90] elevation + Takes into account hemisphere flips from large elevation changes unless clip_ele is specified + """ + if clip_ele: + ele = min(max(ele, -90), 90) + + if ele % 90 == 0 and ele % 180 != 0: + # if elevation is a multiple of 90, azimuth is irrelevant since we are at a pole + azi = 0 + while np.abs(ele) > 90: + ele -= 360 + else: + # wrap elevation value + while np.abs(ele) > 90: + # flip azimuth to other hemisphere + azi += 180 + + # compensate elevation accordingly + if ele > 90: + ele = 180 - ele + elif ele < -90: + ele = -180 - ele + + # wrap azimuth value + while azi > 180: + azi -= 360 + while azi <= -180: + azi += 360 + + return azi, ele + + +class EfapDmxType(Enum): + NONE = 0 + AMPLITUDE = 1 + INTENSITY = 2 + + +class EfapVertex: + """ + Vertex data structure for EFAP + + Initialises a vertex from the given spherical coordinate pair, + with a flag specifying if it is a ghost loudspeaker + + Parameters + ---------- + azi : float + Azimuth of vertex + ele : float + Elevation of vertex + is_ghost : bool + Whether the vertex is a ghost, default is False + dmx_type : EfapDmxType + Downmix type for ghost vertices + """ + + def __init__( + self, + azi: float, + ele: float, + is_ghost: Optional[bool] = False, + dmx_type: Optional[EfapDmxType] = EfapDmxType.INTENSITY, + ): + self.azi, self.ele = wrap_angles(azi, ele) + self.pos = np.array( + [ + np.cos(np.deg2rad(azi)) * np.cos(np.deg2rad(ele)), + np.sin(np.deg2rad(azi)) * np.cos(np.deg2rad(ele)), + np.sin(np.deg2rad(ele)), + ] + ) + + idx_azi = np.round(np.abs(90 - 
np.abs(self.azi)))
+        idx_ele = 90 - np.round(np.abs(self.ele))
+        self.index = (
+            idx_azi + 181 * idx_ele
+        )  # vertices on the median plane have lowest index
+
+        self.is_ghost = is_ghost
+        self.dmx_type = dmx_type
+
+    def __str__(self):
+        str_ = f"a{self.azi}e{self.ele}"
+        if self.is_ghost:
+            str_ += "*"
+        return str_
+
+    def __lt__(self, other):
+        return self.index < other.index
+
+
+class EFAP:
+    """
+    EFAP data structure
+
+    Initialise EFAP data for computing panning gains
+
+    Parameters
+    ----------
+    azimuths : np.ndarray
+        Azimuth positions of the loudspeaker array
+    elevations : np.ndarray
+        Elevation positions of the loudspeaker array
+    intensity_panning : bool
+        Whether intensity panning is enabled or not
+
+    Examples
+    --------
+    >>> from EFAP import EFAP
+    >>> panner = EFAP([30, -30, 0, 110, -110], [0, 0, 0, 0, 0], False)
+    >>> panner.pan(15, 45)
+    array([0.66742381, 0.19069252, 0.66742381, 0.19069252, 0.19069252])
+    """
+
+    _EFAP_HULL_TOL = 1e-4  # tolerance for a point to be added to the convex hull
+    _EFAP_MAX_AZI_GAP = 160  # maximum allowed angular gap in the middle layer
+    _EFAP_POLAR_ELE = 90  # elevation of north / south poles (zenith / nadir)
+    _EFAP_THRESH_COPLANAR = 1e-3  # tolerance for points to be considered coplanar
+    _EFAP_THRESH_MID_LAYER = 45  # elevation threshold for loudspeakers to be considered part of the middle layer
+    _EFAP_THRESH_POLES = 1e-6  # tolerance for a vertex to be considered polar
+    _EFAP_THRESH_TRI = 1e-10  # tolerance for a point to be inside a triangle
+
+    def __init__(
+        self,
+        azimuths: Union[list, np.ndarray],
+        elevations: Union[list, np.ndarray],
+        intensity_panning: Optional[bool] = False,
+    ):
+        # validation
+        azimuths = np.array(azimuths)
+        elevations = np.array(elevations)
+        if np.squeeze(azimuths).ndim > 1:
+            raise ValueError("Too many dimensions for loudspeaker azimuth array")
+        if np.squeeze(elevations).ndim > 1:
+            raise ValueError("Too many dimensions for loudspeaker elevations array")
+        if azimuths.shape != elevations.shape:
+            raise ValueError("Mismatch between loudspeaker azimuths and elevations")
+
+        # set EFIP flag
+        self.intensity_panning = intensity_panning
+
+        # initialise vertices and add ghost loudspeakers if needed
+        self.verts = np.array(
+            [EfapVertex(azi, ele) for azi, ele in zip(azimuths, elevations)]
+        )
+        self._add_ghost_speakers()
+
+        # formulate initial tetrahedron for the convex hull
+        self._init_simplex()
+
+        # add the remaining vertices to the convex hull in order of their index
+        for i in np.argsort(self.verts):
+            if self.verts[i] not in self.verts[self.tris]:
+                self._add_vertex_to_hull(i)
+
+        # compute downmix matrix with remapped ghost speakers
+        self._remap_ghost_speakers()
+
+        # set vertices near poles to have NaN azimuth
+        for v in self.verts:
+            if (
+                v.ele > self._EFAP_POLAR_ELE - self._EFAP_THRESH_POLES
+                or v.ele < self._EFAP_THRESH_POLES - self._EFAP_POLAR_ELE
+            ):
+                v.azi = np.nan
+
+        # combine triangles into polygons
+        self._tri2poly()
+
+    def _add_ghost_speakers(self) -> None:
+        """
+        Add ghost loudspeakers at the poles, or to fill large horizontal gaps
+        """
+        ele = [v.ele for v in self.verts]
+
+        dmx_type = EfapDmxType.INTENSITY
+
+        # add ghost loudspeakers at the poles if necessary
+        if max(ele) < self._EFAP_POLAR_ELE:
+            if self.intensity_panning:
+                if max(ele) > self._EFAP_THRESH_MID_LAYER:
+                    dmx_type = EfapDmxType.NONE
+                else:
+                    dmx_type = EfapDmxType.AMPLITUDE
+
+            self.verts = np.append(self.verts, EfapVertex(0, 90, True, dmx_type))
+
+        if min(ele) > -self._EFAP_POLAR_ELE:
+            if self.intensity_panning:
+                if min(ele) < -self._EFAP_THRESH_MID_LAYER:
+                    dmx_type = EfapDmxType.NONE
+                else:
+                    dmx_type = EfapDmxType.AMPLITUDE
+
+            self.verts = np.append(self.verts, EfapVertex(0, -90, True, dmx_type))
+
+        # check for large gaps in the middle horizontal layer
+        mid_spkrs = [
+            v.azi for v in self.verts if np.abs(v.ele) < self._EFAP_THRESH_MID_LAYER
+        ]
+
+        # no speakers in middle layer; add a triangle of ghost speakers
+        if not mid_spkrs:
+            self.verts = np.append(
+                self.verts,
+                [
+                    EfapVertex(0, 0, True),
+                    EfapVertex(180, 0, True),
+                    EfapVertex(240, 0, True),
+                ],
+            )
+        # only one speaker in the middle layer; add two ghost speakers to form a triangle
+        elif len(mid_spkrs) == 1:
+            self.verts = np.append(
+                self.verts,
+                [
+                    EfapVertex(mid_spkrs[0] + 120, 0, True),
+                    EfapVertex(mid_spkrs[0] + 240, 0, True),
+                ],
+            )
+        # search for and fill gaps greater than MAX_AZI_GAP
+        else:
+            mid_spkrs = np.sort(mid_spkrs)
+            angle_diff = np.diff(np.concatenate([mid_spkrs, [mid_spkrs[0] + 360]]))
+            sectors = np.ceil(angle_diff / self._EFAP_MAX_AZI_GAP)
+
+            for i, s in enumerate(sectors):
+                if s > 1:
+                    new_diff = angle_diff[i] / s
+                    num_new = s - 1
+                    for k in range(int(num_new)):
+                        new_azi = mid_spkrs[i] + (k + 1) * new_diff
+                        self.verts = np.append(self.verts, EfapVertex(new_azi, 0, True))
+
+    def _init_simplex(self) -> None:
+        """
+        Create an initial tetrahedron / simplex for the convex hull from 4 vertices
+        """
+        # take the first vertex as seed
+        t = [0]
+
+        # attempt to form an edge with non-zero length
+        for i, v in enumerate(self.verts):
+            if (
+                v.azi != self.verts[t[0]].azi or v.ele != self.verts[t[0]].ele
+            ) and i not in t:
+                t.append(i)
+                break
+        else:
+            raise ValueError("Vertices are coincident!")
+
+        # attempt to form a triangle with non-zero area
+        for i, v in enumerate(self.verts):
+            if (
+                np.linalg.norm(
+                    np.cross(
+                        self.verts[t[1]].pos - self.verts[t[0]].pos,
+                        v.pos - self.verts[t[0]].pos,
+                    ),
+                    2,
+                )
+                > self._EFAP_HULL_TOL
+                and i not in t
+            ):
+                t.append(i)
+                break
+        else:
+            raise ValueError("Vertices are collinear!")
+
+        # attempt to form a tetrahedron with non-zero volume
+        for i, v in enumerate(self.verts):
+            if (
+                np.abs(
+                    np.dot(
+                        np.cross(
+                            self.verts[t[1]].pos - self.verts[t[0]].pos,
+                            self.verts[t[2]].pos - self.verts[t[0]].pos,
+                        ),
+                        v.pos - self.verts[t[0]].pos,
+                    )
+                )
+            ) > self._EFAP_HULL_TOL and i not in t:
+                t.append(i)
+                break
+        else:
+            raise ValueError("Vertices are coplanar!")
+
+        # create a list of the triangles of the initial simplex / tetrahedron
+        t = np.array(t)
+        self.tris = np.array([t[[0, 1, 2]], t[[0, 1, 3]], t[[0, 2, 3]], t[[1, 2, 3]]])
+
+        # orient the triangle surface planes outwards from the centroid
+        self.centroid = np.mean([self.verts[i].pos for i in t], axis=0)
+        for i, tri in enumerate(self.tris):
+            self.tris[i, :] = self._flip_plane(tri)
+
+    def _add_vertex_to_hull(self, idx_new_vert: int) -> None:
+        """
+        Add a vertex to the convex hull and update the list of triangles in the hull
+        """
+        # compute the centroid of the current convex hull
+        self.centroid = np.mean(
+            [self.verts[i].pos for i in np.unique(self.tris)], axis=0
+        )
+
+        tris_new = []
+        visible = []
+
+        # find which hull surfaces are visible from the new vertex
+        for i, tri in enumerate(self.tris):
+            if self._vertex_dist(tri, idx_new_vert) > -1e-6:
+                visible.append(i)
+            else:
+                tris_new.append(tri)
+
+        tris_new = np.array(tris_new)
+        visible = np.array(visible, dtype=int)
+
+        # find edges of the visible hull surfaces
+        max_vert = np.amax(self.tris[visible]) + 1
+
counter = np.zeros([max_vert, max_vert]) + for i, tri in enumerate(self.tris[visible]): + surface = np.append(tri, tri[0]) + for n in range(3): + a = surface[n] + b = surface[n + 1] + counter[a, b] = counter[a, b] + 1 + + counter += counter.T + + edges = [] + for a in range(max_vert - 1): + for b in range(a + 1, max_vert): + if counter[a, b] == 1: + edges.append([a, b]) + edges = np.vstack(edges) + + # break the edges visible from the new vertex and add the new triangle + for e in edges: + tris_new = np.vstack( + [tris_new, self._flip_plane(np.append(e, idx_new_vert))] + ) + + # update the list of triangles in the convex hull + self.tris = tris_new + + def _remap_ghost_speakers(self) -> None: + """ + Remove unused ghost speakers and compute a downmix matrix for the rest + """ + # find ghosts that are not part of the convex hull + ghosts = [i for i, v in enumerate(self.verts) if v.is_ghost] + unused_ghosts = np.compress( + np.isin(ghosts, np.unique(self.tris), invert=True), ghosts + ) + + if unused_ghosts.size > 0: + # remove the unused ghosts from the triangle array and also adjust indices + self.tris[self.tris > unused_ghosts.min()] -= unused_ghosts.size + # delete them from the vertex array + self.verts = np.delete(self.verts, unused_ghosts) + + # generate initial sound energy distribution matrix + n_vtx = len(self.verts) + n_ghost = len(ghosts) - len(unused_ghosts) + + M = np.eye(n_vtx) + for i, v in enumerate(self.verts): + if v.is_ghost: + neighbours = self._get_neighbours(i) + M[:, i] = np.zeros(n_vtx) + M[neighbours, i] = np.ones(len(neighbours)) / len(neighbours) + + # re-distribute sound energy from ghosts + M2 = M.copy() + for i, v in enumerate(self.verts): + if v.is_ghost: + vec = M[:, i] + while np.sum(vec[-n_ghost:]) > 1e-4: + vec = M @ vec + M2[:, i] = vec + + self.dmx_mat = M2[:-n_ghost, :] + + # amplitude downmix for real loudspeakers + self.dmx_mat[:, :-n_ghost] = np.sqrt(self.dmx_mat[:, :-n_ghost]) + + # distribute ghosts according to downmix type + for i, v in enumerate(self.verts): + if v.is_ghost: + if v.dmx_type == EfapDmxType.NONE: + self.dmx_mat[:, i] = 0 + elif v.dmx_type == EfapDmxType.AMPLITUDE: + pass + else: + self.dmx_mat[:, i] = np.sqrt(self.dmx_mat[:, i]) + + def _tri2poly(self) -> None: + """ + Merge hull triangles into polygons if they are coplanar + """ + polys = [] + + for tri in self.tris: + # find all vertices coplanar with this triangle (including those already in the triangle) + new_poly = np.array( + [ + i + for i, _ in enumerate(self.verts) + if np.abs(self._vertex_dist(tri, i)) < self._EFAP_THRESH_COPLANAR + ] + ) + + # check if we already found this polygon as a complete subset + is_subset = [ + i for i, poly in enumerate(polys) if np.all(np.isin(new_poly, poly)) + ] + is_superset = [ + i for i, poly in enumerate(polys) if np.all(np.isin(poly, new_poly)) + ] + + if is_subset: + continue + elif is_superset: + # remove the other polygon since it will be replaced by the superset polygon + polys_new = [p for i, p in enumerate(polys) if i not in is_superset] + polys = polys_new + + # orient the polygon plane in the same direction as the triangle + P1 = self.verts[tri[0]].pos + P2 = self.verts[tri[1]].pos + P3 = self.verts[tri[2]].pos + + # first base vector + U = P2 - P1 + U = U / np.linalg.norm(U) + + # second base vector + V = P3 - P2 + V = V - np.dot(U, V) * U + V = V / np.linalg.norm(V) + + # center of the first triangle + M = np.mean([P1, P2, P3], axis=0) + + # sort vertices + azi = np.zeros_like(new_poly, dtype=float) + for i, idx_v in 
enumerate(new_poly): + P = self.verts[idx_v].pos - M + X = np.dot(P, U) + Y = np.dot(P, V) + azi[i] = np.arctan2(Y, X) + + idx = np.argsort(azi) + new_poly = new_poly[idx] + + # add the polygon to the main list + polys.append(new_poly) + + self.polys = polys + + def _pan_EFAP_poly( + self, azimuth: float, elevation: float, poly: np.ndarray, mod: int + ) -> np.ndarray: + """ + Compute panning gains for each vertex in the given polygon + + Parameters + ---------- + azimuth : float + Azimuth of requested panning position + elevation : float + Elevation of requested panning position + poly : np.ndarray + Array of vertices defining the polygon + + Returns + ------- + poly_gain: np.ndarray + Gains for each vertex in the polygon + """ + poly_gain = np.zeros_like(poly, dtype=float) + + P = np.array([azimuth, elevation]) + # search for the triangle of the polygon in which P belongs + for i in range(1, poly.size + 1): + A = np.array([self.verts[poly[i - 1]].azi, self.verts[poly[i - 1]].ele]) + for j in range(i, poly.size - 2 + i): + idx1 = 1 + (j % poly.size) + idx2 = 1 + (idx1 % poly.size) + B = np.array( + [self.verts[poly[idx1 - 1]].azi, self.verts[poly[idx1 - 1]].ele] + ) + C = np.array( + [self.verts[poly[idx2 - 1]].azi, self.verts[poly[idx2 - 1]].ele] + ) + + if mod: + if not np.isnan(A[0]): + A[0] %= mod + if not np.isnan(B[0]): + B[0] %= mod + if not np.isnan(C[0]): + C[0] %= mod + + if self._in_triangle(P, A, B, C): + N = np.transpose([B[1] - C[1], C[0] - B[0]]) + N = N / np.dot(N, B - A) + poly_gain[i - 1] = 1 - np.dot(P - A, N) + + """ DEBUGGING / TODO """ + # set gains <= -60dB to 0 + poly_gain[np.abs(poly_gain) < 1e-6] = 0 + + return poly_gain + + """ geometric / math helper functions """ + + def _get_neighbours(self, idx_vert: int) -> np.ndarray: + """ + Find triangles containing the given vertex index (neighbouring vertices) + """ + n = self.tris[np.any(np.isin(self.tris, idx_vert), axis=1)] + return np.unique(n[n != idx_vert]) + + def _get_azi_ele(self, idx_vert: int) -> Tuple[float, float]: + """ + Return a tuple of (azi, ele) for a vertex at the given index + """ + return self.verts[idx_vert].azi, self.verts[idx_vert].ele + + def _in_polygon( + self, azimuth: float, elevation: float, poly: np.ndarray + ) -> Tuple[bool, int]: + """ + Determine whether the panning position lies within the given polygon + by iteratively checking its triangles + + Parameters + ---------- + azimuth : float + Azimuth of requested panning position + elevation : float + Elevation of requested panning position + poly : np.ndarray + Array of vertices defining the polygon + + Returns + ------- + in_polygon, mod: Tuple[bool, int] + Flag indicating whether the point is inside the given polygon + Value of wrapping required if used + """ + azi = [self.verts[v].azi for v in poly] + + P = np.array([azimuth, elevation]) + + for tri in combinations(poly, 3): + A = np.array(self._get_azi_ele(tri[0])) + B = np.array(self._get_azi_ele(tri[1])) + C = np.array(self._get_azi_ele(tri[2])) + if self._in_triangle(P, A, B, C): + return True, None + + # if the azimuth difference is large, perform the 2D check again with azimuths wrapped to (-360, 0] and [0, 360) + if np.nanmax(azi) - np.nanmin(azi) > 180: + for tri in combinations(poly, 3): + A = np.array(self._get_azi_ele(tri[0])) + B = np.array(self._get_azi_ele(tri[1])) + C = np.array(self._get_azi_ele(tri[2])) + if not np.isnan(A[0]): + A[0] %= 360 + if not np.isnan(B[0]): + B[0] %= 360 + if not np.isnan(C[0]): + C[0] %= 360 + if self._in_triangle(P, A, B, C): + return 
True, 360 + + for tri in combinations(poly, 3): + A = np.array(self._get_azi_ele(tri[0])) + B = np.array(self._get_azi_ele(tri[1])) + C = np.array(self._get_azi_ele(tri[2])) + if not np.isnan(A[0]): + A[0] %= -360 + if not np.isnan(B[0]): + B[0] %= -360 + if not np.isnan(C[0]): + C[0] %= -360 + if self._in_triangle(P, A, B, C): + return True, -360 + + return False, None + + def _in_triangle( + self, P: np.ndarray, A: np.ndarray, B: np.ndarray, C: np.ndarray + ) -> bool: + """ + Determine whether the panning position lies within the given triangle + + Parameters + ---------- + P : float + Point under test + A : float + First vertex of the triangle + B : float + Second vertex of the triangle + C : float + Third vertex of the triangle + + Returns + ------- + bool + Flag indicating whether the point is inside the given triangle + """ + if np.isnan(A[0]): + A[0] = P[0] + + if np.isnan(B[0]): + B[0] = P[0] + + if np.isnan(C[0]): + C[0] = P[0] + + tmpMat = np.transpose([B - A, C - A]) + if (1 / np.linalg.cond(tmpMat)) < self._EFAP_THRESH_TRI: + return False + + Minv = np.linalg.inv(tmpMat) + S = Minv @ (P - A) + + if ( + S[0] < -self._EFAP_THRESH_TRI + or S[1] < -self._EFAP_THRESH_TRI + or S[0] + S[1] > 1 + self._EFAP_THRESH_TRI + ): + return False + + return True + + def _vertex_dist(self, surface: np.ndarray, idx_vert: int) -> float: + """ + Compute the distance of a vertex from a given plane + + Parameters + ---------- + surface : np.ndarray + Array of 3 ordered vertices defining the plane and its orientation + idx_vert: int + Index of the vertex to compute the distance for + + Returns + ------- + float + Distance of the vertex from the given plane + """ + return self._point_plane_dist( + self.verts[surface[0]].pos, + self.verts[surface[1]].pos, + self.verts[surface[2]].pos, + self.verts[idx_vert].pos, + ) + + def _point_plane_dist( + self, P1: np.ndarray, P2: np.ndarray, P3: np.ndarray, X: np.ndarray + ) -> float: + """ + Compute the distance of a vertex from a plane defined by three points + + Parameters + ---------- + P1 : np.ndarray + Cartesian coordinates of the first point + P2 : np.ndarray + Cartesian coordinates of the second point + P3 : np.ndarray + Cartesian coordinates of the third point + X: np.ndarray + Cartesian coordinates of the vertex + + Returns + ------- + float + Distance of the vertex from the given plane + """ + + if np.all(X == P1) or np.all(X == P2) or np.all(X == P3): + return 0 + else: + N = np.cross(P1 - P2, P1 - P3) + eps = np.finfo(float).eps + return np.dot(X - P1, N / (np.linalg.norm(N) + eps)) + + def _flip_plane(self, surface: np.ndarray) -> np.ndarray: + """ + Flip the orientation of a plane (invert normal vector) + + Parameters + ---------- + surface : np.ndarray + Array of 3 ordered vertices defining the plane and its orientation + + Returns + ------- + surface : np.ndarray + Reordered vertices with plane normal pointing outwards from the hull centroid + """ + if ( + self._point_plane_dist( + self.verts[surface[0]].pos, + self.verts[surface[1]].pos, + self.verts[surface[2]].pos, + self.centroid, + ) + > 0 + ): + surface = np.flip(surface.copy()) + + return surface + + def _compute_gains_point(self, azimuth: float, elevation: float) -> np.ndarray: + """ + Compute gains for the requested panning position + + Parameters + ---------- + azimuth : float + Azimuth of requested panning position + elevation : float + Elevation of requested panning position + + Returns + ------- + gains: np.ndarray + Panning gains for the loudspeaker layout + """ + if 
np.isnan(azimuth) or np.isnan(elevation): + raise ValueError(f"Angles cannot be NaNs : ({azimuth}, {elevation})") + + azimuth, elevation = wrap_angles(azimuth, elevation) + point_pos = [ + np.cos(np.deg2rad(azimuth)) * np.cos(np.deg2rad(elevation)), + np.sin(np.deg2rad(azimuth)) * np.cos(np.deg2rad(elevation)), + np.sin(np.deg2rad(elevation)), + ] + + # filter the polygon list with a quick 2d check + found_polys = [] + for poly in self.polys: + in_poly, mod = self._in_polygon(azimuth, elevation, poly) + if in_poly: + found_polys.append((poly, mod)) + + if not found_polys: + raise AssertionError("Unexpected error during panning") + + # find a visible polygon with the smallest distance + dist = [] + + for poly, mod in found_polys: + surface = self.verts[poly] + d = self._point_plane_dist( + surface[0].pos, + surface[1].pos, + surface[2].pos, + point_pos, + ) + if d >= 0: + dist.append(d) + else: + dist.append(np.inf) + + found_poly, mod = found_polys[np.argmin(dist)] + + # compute gains for the polygon vertices + poly_gain = self._pan_EFAP_poly(azimuth, elevation, found_poly, mod) + + # downmix ghost loudspeakers + gains = np.zeros(self.verts.size) + gains[found_poly] = poly_gain / np.linalg.norm(poly_gain) + gains = gains @ self.dmx_mat.T + gains = gains / np.linalg.norm(gains) + + if self.intensity_panning: + gains = np.sqrt(gains / np.sum(gains)) + + return gains + + """ public functions """ + + def pan( + self, + azimuths: float, + elevations: float, + intensity_panning: Optional[bool] = False, + ) -> np.ndarray: + """ + Compute gains for the requested panning position + + Parameters + ---------- + azimuths : float + Azimuth of requested panning position + elevations : float + Elevation of requested panning position + intensity_panning : bool + Flag whether to use intensity panning (Default is False == amplitude panning) + + Returns + ------- + gains: np.ndarray + Panning gains for the loudspeaker layout + """ + azimuths = np.array(azimuths) + elevations = np.array(elevations) + if azimuths.size == 1 and elevations.size == 1: + return self._compute_gains_point(azimuths, elevations) + elif np.squeeze(azimuths).ndim == 1 and np.squeeze(elevations).ndim == 1: + gains = [] + for a, e in zip(azimuths, elevations): + gains.append(self._compute_gains_point(a, e)) + return np.vstack(gains) + else: + raise ValueError( + "Azimuth and Elevation arrays cannot have more than one dimension and must be of equal size" + ) + + +def main(args): + """ + Parses a speaker layout text file and prints the panning gains + for the requested position + + Parameters + ---------- + args : Namespace + Command line arguments + """ + + speaker_positions = np.loadtxt(Path(args.input), delimiter=",", max_rows=2) + panner = EFAP(speaker_positions[0, :], speaker_positions[1, :], args.efip) + print(panner.pan(args.azimuth, args.elevation)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Edge-Fading Amplitude Panning") + parser.add_argument( + "-i", + "--input", + metavar="layout_file", + required=True, + type=str, + help="IVAS compatible loudspeaker layout file (Loudspeaker azimuths in first line, elevations in second, subsequent lines are ignored)", + ) + parser.add_argument( + "-efip", + "-intensity_panning", + default=False, + action="store_true", + help="Intensity panning mode (EFIP)", + ) + parser.add_argument( + "azimuth", + type=float, + help="Azimuth of direction to compute panning gains for (positive-left)", + ) + parser.add_argument( + "elevation", + type=float, + help="Elevation 
of direction to compute panning gains for (positive-up)", + ) + args = parser.parse_args() + main(args) diff --git a/item_generation_scripts/audiotools/__init__.py b/item_generation_scripts/audiotools/__init__.py new file mode 100644 index 00000000..effc5a25 --- /dev/null +++ b/item_generation_scripts/audiotools/__init__.py @@ -0,0 +1,286 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. 
+#
+
+import argparse
+from itertools import repeat
+from pathlib import Path
+
+from item_generation_scripts.audiotools.constants import AUDIO_FORMATS
+from item_generation_scripts.audiotools.convert import convert_file
+from item_generation_scripts.utils import apply_func_parallel
+
+
+def add_processing_args(group, input=True):
+    # set up prefixes to avoid argument collision
+    if input:
+        p = "in"
+        ps = "i"
+    else:
+        p = "out"
+        ps = "o"
+
+    group.add_argument(
+        f"-{ps}",
+        f"--{p}",
+        dest=f"{p}put",
+        required=True,
+        type=Path,
+        help="Path to *.{wav, pcm, raw} file or directory",
+    )
+    group.add_argument(
+        f"-{ps}f",
+        f"--{p}_fmt",
+        required=input,
+        type=str,
+        help="Audio format (use -l, --list for a list / -L, --long for a detailed list)",
+        default=None,
+    )
+    group.add_argument(
+        f"-{ps}s",
+        f"--{p}_fs",
+        type=int,
+        help="Sampling rate (Hz) (deduced for .wav input, same as input if output not specified, default = %(default)s)",
+        default=48000,
+    )
+    group.add_argument(
+        f"-{ps}fc",
+        f"--{p}_cutoff",
+        type=int,
+        help="Cut-off frequency for low-pass filtering (default = %(default)s)",
+        default=None,
+    )
+    group.add_argument(
+        f"-{ps}hp",
+        f"--{p}_hp50",
+        help="Apply 50 Hz high-pass filtering (default = %(default)s)",
+        action="store_true",
+    )
+    group.add_argument(
+        f"-{ps}w",
+        f"--{p}_window",
+        type=float,
+        help="Window the start/end of the signal by this amount in milliseconds (default = %(default)s)",
+        default=None,
+    )
+    group.add_argument(
+        f"-{ps}t",
+        f"--{p}_trim",
+        type=float,
+        nargs=2,
+        metavar=("PRE_TRIM", "POST_TRIM"),
+        help="Pre-/post-trim the signal by this amount in milliseconds (negative values pad silence, default = %(default)s)",
+    )
+    group.add_argument(
+        f"-{ps}pn",
+        f"--{p}_pad_noise",
+        help="Flag for padding with noise instead of zeros",
+        action="store_true",
+    )
+    group.add_argument(
+        f"-{ps}d",
+        f"--{p}_delay",
+        type=float,
+        help="Delay the signal by this amount in milliseconds (negative values advance, default = %(default)s)",
+        default=None,
+    )
+    group.add_argument(
+        f"-{ps}l",
+        f"--{p}_loudness",
+        type=float,
+        help="Normalize to given loudness with BS 1770-4 (default = %(default)s)",
+        default=None,
+    )
+    group.add_argument(
+        f"-{ps}nf",
+        f"--{p}_loudness_fmt",
+        type=str,
+        help=f"Format used for loudness computation (only valid with -{ps}l/--{p}_loudness, default = {p.upper()}_FMT)",
+        default=None,
+    )
+
+
+def get_args():
+    parser = argparse.ArgumentParser(
+        description="Audiotools: Convert/Manipulate spatial audio files."
+    )
+
+    """ Input file arguments """
+    input_parser = parser.add_argument_group("Input (pre-) processing options")
+
+    # add common arguments
+    add_processing_args(input_parser)
+
+    # input only arguments
+    input_parser.add_argument(
+        "-im",
+        "--in_meta",
+        type=str,
+        nargs="+",
+        help="list of input metadata files (only relevant for ISM and MASA input)",
+        default=None,
+    )
+
+    """ Output file arguments """
+    output_parser = parser.add_argument_group("Output (post-) processing options")
+
+    # add common arguments
+    add_processing_args(output_parser, False)
+
+    # output only arguments
+    output_parser.add_argument(
+        "-lm",
+        "--limit",
+        help="Apply limiting to output (default = %(default)s)",
+        action="store_true",
+    )
+    output_parser.add_argument(
+        "-t",
+        "--trajectory",
+        type=str,
+        help="Head-tracking trajectory file for binaural output (default = %(default)s)",
+        default=None,
+    )
+    output_parser.add_argument(
+        "-bd",
+        "--bin_dataset",
+        type=str,
+        help="Use a custom binaural dataset (see README.md and audiotools/binaural_datasets/README.txt for further information)",
+        default=None,
+    )
+    output_parser.add_argument(
+        "-bl",
+        "--bin_lfe_gain",
+        type=float,
+        help="Render LFE to binaural output with the specified gain (only valid for channel-based input, default = %(default)s)",
+        default=None,
+    )
+    output_parser.add_argument(
+        "-mnru",
+        "--mnru_q",
+        type=float,
+        help="Apply MNRU processing with the given Q value (default = %(default)s)",
+        default=None,
+    )
+    output_parser.add_argument(
+        "-esdru",
+        "--esdru_alpha",
+        type=float,
+        help="Apply ESDRU processing with the given alpha value (default = %(default)s)",
+        default=None,
+    )
+
+    """ Miscellaneous or meta arguments """
+    misc_parser = parser.add_argument_group("General options")
+
+    misc_parser.add_argument(
+        "-l",
+        "--list",
+        help="list all supported audio formats and exit",
+        action="store_true",
+    )
+    misc_parser.add_argument(
+        "-L",
+        "--long",
+        help="list all supported audio formats with long description and exit",
+        action="store_true",
+    )
+    misc_parser.add_argument(
+        "-mp",
+        "--multiprocessing",
+        help="Enable multiprocessing (default = %(default)s)",
+        action="store_true",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+
+    if args.list or args.long:
+        for fmt in AUDIO_FORMATS:
+            if args.long:
+                for f, d in fmt.items():
+                    print(f)
+                    [print(f"\t{k}: {v}") for k, v in d.items()]
+            else:
+                print(", ".join(fmt.keys()))
+        exit()
+
+    elif args.input is not None:
+        if not args.out_fs:
+            args.out_fs = args.in_fs
+
+        if not args.out_fmt:
+            args.out_fmt = args.in_fmt
+
+        if not args.out_loudness_fmt:
+            args.out_loudness_fmt = args.out_fmt
+
+        # List input files
+        args.input = Path(args.input)
+        in_files = []
+        if args.input.exists():
+            if args.input.is_dir():
+                in_files.extend(args.input.glob("*.wav"))
+                in_files.extend(args.input.glob("*.pcm"))
+                in_files.extend(args.input.glob("*.raw"))
+            else:
+                in_files = [args.input]
+        else:
+            raise ValueError(f"Input path {args.input} does not exist!")
+
+        if len(in_files) == 0:
+            raise ValueError(f"Input directory {args.input} empty!")
+
+        # Create output directory
+        args.output = Path(args.output)
+
+        if len(in_files) == 1 and args.input.is_file():
+            out_files = [args.output]
+        else:
+            args.output.mkdir(exist_ok=True)
+            out_files = [args.output.joinpath(i.name) for i in in_files]
+
+        # Multiprocessing
+        enable_multiprocessing = args.multiprocessing
+
+        # Remove unneeded keys to avoid passing to convert_file()
+        for k in ["list", "long", "multiprocessing", "input", "output"]:
+            args.__dict__.pop(k)
+
+
apply_func_parallel( + convert_file, + zip(in_files, out_files), + repeat(args.__dict__), + "mp" if enable_multiprocessing else None, + ) diff --git a/item_generation_scripts/audiotools/__main__.py b/item_generation_scripts/audiotools/__main__.py new file mode 100644 index 00000000..9bdf64cd --- /dev/null +++ b/item_generation_scripts/audiotools/__main__.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# + +from item_generation_scripts.audiotools import main + +if __name__ == "__main__": + main() diff --git a/item_generation_scripts/audiotools/audio.py b/item_generation_scripts/audiotools/audio.py new file mode 100644 index 00000000..1804f5dd --- /dev/null +++ b/item_generation_scripts/audiotools/audio.py @@ -0,0 +1,428 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. 
+# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# + +import warnings +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Optional, Union + +import numpy as np + +from item_generation_scripts.audiotools.audiofile import read +from item_generation_scripts.audiotools.constants import ( + BINAURAL_AUDIO_FORMATS, + CHANNEL_BASED_AUDIO_ALTNAMES, + CHANNEL_BASED_AUDIO_FORMATS, + IVAS_FRAME_LEN_MS, + METADATA_ASSISTED_SPATIAL_AUDIO_FORMATS, + OBJECT_BASED_AUDIO_FORMATS, + SCENE_BASED_AUDIO_FORMATS, +) + +from .EFAP import wrap_angles + + +class Audio(ABC): + """Base class for audio data""" + + def __init__(self, name: str): + self.name = name.upper() + self.audio = None + self.fs = None + self.num_channels = None + # self.logger = None # TODO needed? + + def __repr__(self): + return f"{self.__class__} : {self.__dict__}" + + @classmethod + @abstractmethod + def _from_file(cls, name: str, filename: Path, fs: Optional[int] = None) -> "Audio": + """Create an Audio object from a file""" + out_audio = cls(name) + + filename = Path(filename) + if filename.suffix in [".pcm", ".raw"]: + if fs is None: + raise ValueError( + "Sampling rate must be specified for headerless files!" + ) + out_audio.audio, out_audio.fs = read(filename, out_audio.num_channels, fs) + elif filename.suffix == ".wav": + out_audio.audio, out_audio.fs = read(filename) + else: + raise NotImplementedError(f"Filetype {filename.suffix} is unsupported!") + + return out_audio + + @classmethod + @abstractmethod + def _from_filelist( + cls, name, files: list[Path], fs: Optional[int] = None + ) -> "Audio": + """Create an Audio object from a list of files with channels""" + out_audio = cls(name) + + for f in files: + f = Path(f) + + if f.suffix in [".pcm", ".raw"]: + if fs is None: + raise ValueError( + "Sampling rate must be specified for headerless files!" 
+ ) + channel, fs = read(f, out_audio.num_channels, fs) + elif f.suffix == ".wav": + channel, fs = read(f) + else: + raise NotImplementedError(f"Filetype {f.suffix} is unsupported!") + + if out_audio.audio is None: + out_audio.audio = channel + out_audio.fs = fs + else: + if fs != out_audio.fs: + raise ValueError( + f"Sampling rate mismatch between input audio files, expected {out_audio.fs}, encountered {fs} for {f}!" + ) + + if channel.shape[0] > out_audio.audio.shape[0]: + channel = channel[: out_audio.audio.shape[0], :] + elif channel.shape[0] < out_audio.audio.shape[0]: + out_audio.audio = out_audio.audio[: channel.shape[0], :] + out_audio.audio = np.column_stack([out_audio.audio, channel]) + + return out_audio + + def apply(self, func, **kwargs) -> None: + """Apply a function to the audio array""" + self.audio = func(self.audio, self.fs, **kwargs) + + +class BinauralAudio(Audio): + """Sub-class for binaural audio""" + + def __init__(self, name: str): + super().__init__(name) + try: + self.__dict__.update(BINAURAL_AUDIO_FORMATS[name.upper()]) + except KeyError: + raise ValueError(f"Unsupported binaural audio format {name}") + + @classmethod + def _from_file( + cls, name: str, filename: Path, fs: Optional[int] = None + ) -> "BinauralAudio": + return super()._from_file(name, filename, fs) + + @classmethod + def _from_filelist( + cls, name: str, filename: Path, fs: Optional[int] = None + ) -> "BinauralAudio": + return super()._from_filelist(name, filename, fs) + + +class ChannelBasedAudio(Audio): + """Sub-class for channel-based audio""" + + def __init__(self, name: str): + if Path(name).exists() and Path(name).suffix == ".txt": + self.parse_custom_layout(name) + else: + # remap configuration name to internal naming + if name.upper() in CHANNEL_BASED_AUDIO_ALTNAMES.keys(): + name = CHANNEL_BASED_AUDIO_ALTNAMES[name.upper()] + + super().__init__(name) + try: + self.__dict__.update(CHANNEL_BASED_AUDIO_FORMATS[name.upper()]) + except KeyError: + raise ValueError(f"Unsupported channel-based audio format {name}") + + self.is_planar = np.all([e == 0 for e in self.ls_ele]) + + def parse_custom_layout(self, layout_file: Union[Path, str]): + layout_file = Path(layout_file) + with open(layout_file) as f_ls: + self.ls_azi = [float(x.strip()) for x in f_ls.readline().strip().split(",")] + self.ls_ele = [float(x.strip()) for x in f_ls.readline().strip().split(",")] + try: + self.lfe_index = [ + int(x.strip()) for x in f_ls.readline().strip().split(",") + ] + except Exception: + self.lfe_index = [] + + if self.lfe_index: + [self.ls_azi.insert(i, 0.0) for i in self.lfe_index] + [self.ls_ele.insert(i, 0.0) for i in self.lfe_index] + + self.name = layout_file.stem + self.num_channels = len(self.ls_azi) + self.layout_file = layout_file + + @classmethod + def _from_file( + cls, name: str, filename: Path, fs: Optional[int] = None + ) -> "ChannelBasedAudio": + return super()._from_file(name, filename, fs) + + @classmethod + def _from_filelist( + cls, name: str, filename: Path, fs: Optional[int] = None + ) -> "ChannelBasedAudio": + return super()._from_filelist(name, filename, fs) + + +class MetadataAssistedSpatialAudio(Audio): + """Sub-class for metadata-assisted spatial audio""" + + def __init__(self, name: str): + super().__init__(name) + try: + self.__dict__.update(METADATA_ASSISTED_SPATIAL_AUDIO_FORMATS[name.upper()]) + except KeyError: + raise ValueError( + f"Unsupported metadata assisted spatial audio format {name}" + ) + self.metadata_files = [] + + @classmethod + def _from_file( + cls, + name: str, + 
filename: Path,
+        metadata_files: list[str],
+        fs: Optional[int] = None,
+    ) -> "MetadataAssistedSpatialAudio":
+        obj = super()._from_file(name, filename, fs)
+        obj.metadata_file = Path(metadata_files[0])
+        return obj
+
+    @classmethod
+    def _from_filelist(
+        cls,
+        name: str,
+        filename: Path,
+        metadata_files: list[str],
+        fs: Optional[int] = None,
+    ) -> "MetadataAssistedSpatialAudio":
+        obj = super()._from_filelist(name, filename, fs)
+        obj.metadata_file = Path(metadata_files[0])
+        return obj
+
+
+class ObjectBasedAudio(Audio):
+    """Sub-class for object-based audio"""
+
+    def __init__(self, name: str):
+        super().__init__(name)
+        try:
+            self.__dict__.update(OBJECT_BASED_AUDIO_FORMATS[name.upper()])
+        except KeyError:
+            raise ValueError(f"Unsupported object-based audio format {name}")
+        self.object_pos = []
+        self.metadata_files = []
+
+    @classmethod
+    def _from_file(
+        cls,
+        name: str,
+        filename: Union[str, Path],
+        metadata_files: list[Union[str, Path]],
+        fs: Optional[int] = None,
+    ) -> "ObjectBasedAudio":
+        obj = super()._from_file(name, filename, fs)
+        filename = Path(filename)
+        if metadata_files is not None:
+            obj.metadata_files = [Path(f) for f in metadata_files]
+        else:
+            # search for metadata with naming scheme: name.(wav, pcm).(0-3).csv
+            for obj_idx in range(obj.num_channels):
+                file_name_meta = filename.with_suffix(
+                    f"{filename.suffix}.{obj_idx}.csv"
+                )
+                if file_name_meta.is_file():
+                    obj.metadata_files.append(file_name_meta)
+                else:
+                    raise ValueError(f"Metadata file {file_name_meta} not found.")
+            warnings.warn(
+                f"No metadata files specified: The following files were found and used: \n {*obj.metadata_files,}"
+            )
+
+        obj.init_metadata()
+        return obj
+
+    @classmethod
+    def _from_filelist(
+        cls,
+        name: str,
+        filename: Path,
+        metadata_files: list[Union[str, Path]],
+        fs: Optional[int] = None,
+    ) -> "ObjectBasedAudio":
+        obj = super()._from_filelist(name, filename, fs)
+        obj.metadata_files = [Path(f) for f in metadata_files]
+        obj.init_metadata()
+        return obj
+
+    def init_metadata(self):
+        if self.audio.shape[1] != len(self.metadata_files):
+            raise ValueError(
+                f"Mismatch between number of channels in file [{self.audio.shape[1]}], and metadata [{len(self.metadata_files)}]"
+            )
+
+        self.object_pos = []
+        for f in self.metadata_files:
+            pos = np.genfromtxt(f, delimiter=",")
+
+            # check if metadata has the right number of columns
+            if pos.shape[1] < 5:
+                raise ValueError("Metadata incomplete. Columns are missing.")
+            elif pos.shape[1] > 5:
+                if pos.shape[1] == 7:
+                    pos = pos[:, :5]
+                else:
+                    raise ValueError(
+                        "Too many columns in metadata (possibly old version with frame index used)"
+                    )
+
+            # check if metadata is longer than file -> cut off
+            num_frames = int(
+                np.ceil(self.audio.shape[0] / (self.fs * IVAS_FRAME_LEN_MS / 1000))
+            )
+            if num_frames < pos.shape[0]:
+                pos = pos[:num_frames]
+            # check if metadata is shorter than file -> loop
+            elif num_frames > pos.shape[0]:
+                pos_loop = np.zeros((num_frames, pos.shape[1]))
+                pos_loop[: pos.shape[0]] = pos
+                for idx in range(pos.shape[0], num_frames):
+                    pos_loop[idx, :2] = pos[idx % pos.shape[0], :2]
+                pos = pos_loop
+
+            # wrap metadata to target value range
+            for j in range(num_frames):
+                pos[j, 0], pos[j, 1] = wrap_angles(pos[j, 0], pos[j, 1], clip_ele=True)
+
+            self.object_pos.append(pos)
+
+
+class SceneBasedAudio(Audio):
+    """Sub-class for scene-based audio"""
+
+    def __init__(self, name: str):
+        if name == "SBA1":
+            name = "FOA"
+        elif name == "SBA2":
+            name = "HOA2"
+        elif name == "SBA3":
+            name = "HOA3"
+
+        super().__init__(name)
+        try:
+            self.__dict__.update(SCENE_BASED_AUDIO_FORMATS[name.upper()])
+        except KeyError:
+            raise ValueError(f"Unsupported scene-based audio format {name}")
+
+        # self.ambi_order = ambi_order_from_nchan(self.num_channels)
+        self.ambi_order = int(np.sqrt(self.num_channels) - 1)
+
+    @classmethod
+    def _from_file(
+        cls, name: str, filename: Path, fs: Optional[int] = None
+    ) -> "SceneBasedAudio":
+        return super()._from_file(name, filename, fs)
+
+    @classmethod
+    def _from_filelist(
+        cls, name: str, filename: Path, fs: Optional[int] = None
+    ) -> "SceneBasedAudio":
+        return super()._from_filelist(name, filename, fs)
+
+
+def _get_audio_class(fmt) -> Audio:
+    """Return a child audio class corresponding to the specified format"""
+    if fmt in BINAURAL_AUDIO_FORMATS.keys():
+        return BinauralAudio
+    elif fmt in METADATA_ASSISTED_SPATIAL_AUDIO_FORMATS.keys():
+        return MetadataAssistedSpatialAudio
+    elif fmt in OBJECT_BASED_AUDIO_FORMATS.keys():
+        return ObjectBasedAudio
+    elif fmt in SCENE_BASED_AUDIO_FORMATS.keys():
+        return SceneBasedAudio
+    elif (
+        fmt in CHANNEL_BASED_AUDIO_FORMATS.keys()
+        or fmt in CHANNEL_BASED_AUDIO_ALTNAMES.keys()
+    ):
+        return ChannelBasedAudio
+    elif Path(fmt).suffix == ".txt":
+        return ChannelBasedAudio
+    else:
+        raise ValueError(f"Unknown audio format {fmt}!")
+
+
+def fromtype(fmt: str) -> Audio:
+    return _get_audio_class(fmt)(fmt)
+
+
+def fromarray(fmt: str, x: np.ndarray, fs: int) -> Audio:
+    """Wrap the given array into an audio format"""
+    if x is None or not fs:
+        raise ValueError("Both array and sampling rate must be specified!")
+
+    output = _get_audio_class(fmt)(fmt)
+
+    output.audio = x
+    output.fs = fs
+
+    return output
+
+
+def fromfile(
+    fmt: str,
+    filename: Union[str, Path],
+    fs: Optional[int] = None,
+    in_meta: Optional[list[Union[str, Path]]] = None,
+) -> Audio:
+    """Create an Audio object of the specified format from the given file"""
+    filename = Path(filename)
+    fmt_cls = _get_audio_class(fmt)
+    if fmt_cls is ObjectBasedAudio or fmt_cls is MetadataAssistedSpatialAudio:
+        return fmt_cls._from_file(fmt, filename, in_meta, fs)
+    else:
+        return fmt_cls._from_file(fmt, filename, fs)
+
+
+def fromfilelist(
+    fmt: str, files: list[Union[str, Path]], fs: Optional[int] = None
+) -> Audio:
+    """Create an Audio object of the specified format from the given list of files"""
+    return _get_audio_class(fmt)._from_filelist(fmt, files, fs)
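
# --- Editorial usage sketch (not part of the patch). It shows how the factory
# --- helpers above are meant to be combined; the format names "STEREO" and
# --- "ISM2" and the file names are illustrative assumptions.
import numpy as np

from item_generation_scripts.audiotools import audio

# wrap an in-memory array into a typed Audio object
noise = np.random.randn(48000, 2).astype(np.float32)
stereo = audio.fromarray("STEREO", noise, fs=48000)

# load an object-based item together with its per-object metadata CSVs
ism = audio.fromfile(
    "ISM2", "item.wav", in_meta=["item.wav.0.csv", "item.wav.1.csv"]
)
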
diff --git
a/item_generation_scripts/audiotools/audioarray.py b/item_generation_scripts/audiotools/audioarray.py new file mode 100644 index 00000000..c0909c4c --- /dev/null +++ b/item_generation_scripts/audiotools/audioarray.py @@ -0,0 +1,690 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. 
+# + +import logging +import warnings +from typing import Iterator, Optional, Tuple, Union + +import numpy as np +import scipy.signal as sig + +from .constants import DELAY_COMPENSATION_FOR_FILTERING, SEED_PADDING + +logger = logging.getLogger("__main__") +logger.setLevel(logging.DEBUG) + + +"""Functions used in this module""" + + +def trim( + x: np.ndarray, + fs: Optional[int] = 48000, + limits: Optional[Tuple[int, int]] = None, + pad_noise: Optional[bool] = False, + samples: Optional[bool] = False, +) -> np.ndarray: + """ + Trim an audio array + + Parameters + ---------- + x: np.ndarray + Input array + fs: Optional[int] + Input sampling rate in Hz, default = 48000 + limits: Optional[Tuple[int, int]] + Pre- and post-trim duration in milliseconds (negative values pad) + pad_noise: Optional[bool] + If true noise will be padded otherwise zeros will be padded + samples: Optional[bool] + If true limits are interpreted as samples, otherwise as ms + + Returns + ------- + y : np.ndarray + Output trimmed array + """ + + if not limits: + return x + + if not samples: + pre_trim = int(limits[0] * fs // 1000) + post_trim = int(limits[1] * fs // 1000) + else: + pre_trim = limits[0] + post_trim = limits[1] + + if pre_trim < 0: + if pad_noise: + # pad with uniformly distributed noise between -4 and 4 + np.random.seed(SEED_PADDING) + noise = np.random.randint( + low=-4, high=5, size=(np.abs(pre_trim), np.shape(x)[1]) + ).astype("float") + x = np.concatenate((noise, x), axis=0) + else: + x = np.pad(x, [[np.abs(pre_trim), 0], [0, 0]]) + elif pre_trim > 0: + x = x[pre_trim:, :] + + if post_trim < 0: + if pad_noise: + # pad with uniformly distributed noise between -4 and 4 + np.random.seed(SEED_PADDING) + noise = np.random.randint( + low=-4, high=5, size=(np.abs(post_trim), np.shape(x)[1]) + ).astype("float") + x = np.concatenate((x, noise), axis=0) + else: + x = np.pad(x, [[0, np.abs(post_trim)], [0, 0]]) + elif post_trim > 0: + x = x[:-post_trim, :] + + return x + + +def window( + x: np.ndarray, + fs: Optional[int] = 48000, + len_ms: Optional[float] = 100, +) -> np.ndarray: + """ + Apply windowing to the start and end + of an audio array + + + Parameters + ---------- + x: np.ndarray + Input audio array + fs: Optional[int] + Input sampling rate in Hz, default = 48000 + len_ms: Optional[float] + Window length used at start and end of array in milliseconds, default = 100 ms + + Returns + ------- + y: np.ndarray + Output windowed array + """ + + wlen_smp = int(len_ms * fs // 1000) + + # if requested window length is larger than the signal, simply window the signal + if wlen_smp > x.shape[0]: + wlen_smp = x.shape[0] // 2 + + window = sig.windows.hann(2 * wlen_smp) + + # we only need half of the window + window = window[:wlen_smp, np.newaxis] + + x[:wlen_smp, :] *= window + x[-wlen_smp:, :] *= window[::-1, :] + + return x + + +def delay_compensation( + x: np.ndarray, + flt_type: str, + fs: Optional[int] = 48000, + up: Optional[bool] = False, + down: Optional[bool] = False, +) -> np.ndarray: + """ + Compensation for a delayed signal + + Parameters + ---------- + x: np.ndarray + Input array + flt_type: str + Name of filter type used for filtering + fs: Optional[int] + Input sampling rate + up: Optional[bool] + Flag for up-sampling + down: Optional[bool] + Flag for down-sampling + + Returns + ------- + x: np.ndarray + Delay compensated test array + """ + + # Get the delay in number of samples + if flt_type == "SHQ2" and up: + d_samples = DELAY_COMPENSATION_FOR_FILTERING["SHQ2"]["up"] + elif flt_type == "SHQ2" and 
down:
+        d_samples = DELAY_COMPENSATION_FOR_FILTERING["SHQ2"]["down"]
+    elif flt_type == "SHQ3" and up:
+        d_samples = DELAY_COMPENSATION_FOR_FILTERING["SHQ3"]["up"]
+    elif flt_type == "SHQ3" and down:
+        d_samples = DELAY_COMPENSATION_FOR_FILTERING["SHQ3"]["down"]
+    else:
+        d_samples = DELAY_COMPENSATION_FOR_FILTERING[flt_type]
+    # Delay compensation
+    x = delay(x, fs, -d_samples, samples=True)
+
+    return x
+
+
+def delay(
+    x: np.ndarray,
+    fs: Optional[int] = 48000,
+    delay: Optional[float] = 0,
+    samples: Optional[bool] = False,
+) -> np.ndarray:
+    """
+    Delay a signal by a specified duration (ms) or number of samples
+
+    Parameters
+    ----------
+    x: np.ndarray
+        Input array
+    fs: Optional[int]
+        Sampling rate
+    delay: Optional[float]
+        Delay in milliseconds or samples (negative values advance the signal)
+    samples: Optional[bool]
+        If true delay is interpreted as samples, if false as milliseconds
+
+    Returns
+    -------
+    x: np.ndarray
+        Delayed audio signal
+    """
+
+    if not samples:
+        delay = int(delay * fs / 1000)
+
+    delay_abs = np.abs(delay)
+
+    x = np.roll(x, delay, axis=0)
+
+    if delay < 0:
+        x[-delay_abs:, :] = 0
+    elif delay > 0:
+        x[:delay_abs, :] = 0
+
+    return x
+
+
+def limiter(
+    x: np.ndarray,
+    fs: int,
+) -> np.ndarray:
+    """
+    Apply limiting to an audio signal
+
+    Parameters
+    ----------
+    x: np.ndarray
+        Input array
+    fs: int
+        Input sampling frequency
+
+    Returns
+    -------
+    x: np.ndarray
+        Limited audio signal
+    """
+
+    limiter_threshold = 32729  # -0.01dB FS
+    limiter_attack_seconds = 0.005
+    attack_constant = 0.01 ** (1.0 / (limiter_attack_seconds * fs))
+    release_heuristics_mem = 0.0
+    gain = 1.0
+    strong_saturation_cnt = 0
+    limited = False
+
+    # promote 1-D mono input to a 2-D column so the per-frame slicing below works
+    if x.ndim == 1:
+        x = x[:, np.newaxis]
+    n_samples_x, n_chan_x = x.shape
+
+    # framing
+    framesize = fs // 50
+    nframes = n_samples_x // framesize
+    for fr in range(nframes):
+        apply_limiting = True
+        fr_sig = x[fr * framesize : ((fr + 1) * framesize), :]
+        sig_max = np.amax(np.absolute(fr_sig))
+        release_heuristic = release_heuristics_mem
+        if sig_max > limiter_threshold:
+            frame_gain = limiter_threshold / sig_max
+            release_heuristic = min(1.0, release_heuristic + (4.0 * framesize / fs))
+        else:
+            release_heuristic = max(0.0, release_heuristic - (framesize / fs))
+            if gain >= 1.0 - 1e-10:
+                apply_limiting = False
+
+            frame_gain = 1.0
+
+        if sig_max > 3 * limiter_threshold and strong_saturation_cnt > 0:
+            apply_strong_limiting = True
+        elif sig_max > 10 * limiter_threshold:
+            strong_saturation_cnt += 20
+            apply_strong_limiting = True
+        else:
+            strong_saturation_cnt -= 1
+            if strong_saturation_cnt < 0:
+                strong_saturation_cnt = 0
+            apply_strong_limiting = False
+
+        if apply_strong_limiting is True:
+            if frame_gain < 0.3:
+                frame_gain /= 3.0
+            else:
+                apply_strong_limiting = False
+
+        if frame_gain < 0.1 and apply_strong_limiting is False:
+            frame_gain = 0.1
+
+        if apply_limiting is True:
+            if frame_gain < gain:
+                fac = attack_constant ** (np.arange(1, framesize + 1, dtype=np.float32))
+            else:
+                release_constant = 0.01 ** (
+                    1.0 / (0.005 * (200.0**release_heuristic) * fs)
+                )
+                fac = release_constant ** (
+                    np.arange(1, framesize + 1, dtype=np.float32)
+                )
+
+            fr_gain = np.tile(gain * fac + frame_gain * (1.0 - fac), (n_chan_x, 1)).T
+            fr_sig *= fr_gain
+            gain = fr_gain[-1, 0]
+            limited = True
+        else:
+            gain = 1.0
+
+        release_heuristics_mem = release_heuristic
+        # hard limiting for everything that still sticks out
+        if (fr_sig > 32767).any() or (fr_sig < -32768).any():
+            limited = True
+            idx_max = np.where(fr_sig > 32767)
+            fr_sig[idx_max] = 32767
+            idx_min = np.where(fr_sig < -32768)
+            fr_sig[idx_min] = -32768
+
+    if limited:
+        warnings.warn("Limiting had to be applied")
+    return x
+
+
+def get_framewise(
+    x: np.ndarray,
+    chunk_size: int,
+    zero_pad: Optional[bool] = False,
+) -> Iterator:
+    """
+    Generator to yield a signal frame by frame
+    If the array size is not a multiple of chunk_size, the last frame contains the remainder
+
+    Parameters
+    ----------
+    x: np.ndarray
+        Input array
+    chunk_size: int
+        Size of frames to yield
+    zero_pad: Optional[bool]
+        Whether to zero pad the last chunk if there are not enough samples
+
+    Yields
+    ------
+    frame: np.ndarray
+        One frame of the input audio signal
+    """
+
+    n_frames = x.shape[0] // chunk_size
+    for i in range(n_frames):
+        yield x[i * chunk_size : (i + 1) * chunk_size, :]
+    if x.shape[0] % chunk_size:
+        last_chunk = x[n_frames * chunk_size :, :]
+        if zero_pad:
+            yield np.pad(
+                last_chunk, [[0, chunk_size - (x.shape[0] % chunk_size)], [0, 0]]
+            )
+        else:
+            yield last_chunk
+
+
+def framewise_io(
+    i: np.ndarray, o: np.ndarray, chunk_size: int, zero_pad: Optional[bool] = False
+) -> Iterator:
+    """
+    Return an iterator over frame_index, input_frame and output_frame
+
+    Parameters
+    ----------
+    i: np.ndarray
+        Input array
+    o: np.ndarray
+        Output array
+    chunk_size: int
+        Size of frames to yield
+    zero_pad: Optional[bool]
+        Whether to zero pad the last chunk if there are not enough samples
+
+    Yields
+    ------
+    frame: Iterator
+        Frame index, one frame of the input and output audio signal
+    """
+
+    return enumerate(
+        zip(
+            get_framewise(i, chunk_size, zero_pad),
+            get_framewise(o, chunk_size, zero_pad),
+        )
+    )
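
# --- Editorial usage sketch (not part of the patch): iterating a signal in
# --- 20 ms frames with the generator above. The module path is assumed from
# --- this patch set.
import numpy as np

from item_generation_scripts.audiotools.audioarray import get_framewise

fs = 48000
x = np.zeros((fs, 2))  # one second of stereo silence
for frame in get_framewise(x, chunk_size=fs // 50, zero_pad=True):
    assert frame.shape == (fs // 50, 2)  # 960-sample frames at 48 kHz
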
+"""Deprecated functions (partly replaced by ITU binaries)"""
+
+
+def resample(
+    x: np.ndarray,
+    in_freq: int,
+    out_freq: int,
+) -> np.ndarray:
+    """
+    Resample a multi-channel audio array
+
+    Parameters
+    ----------
+    x: np.ndarray
+        Input array
+    in_freq: int
+        Input sampling rate
+    out_freq: int
+        Output sampling rate
+
+    Returns
+    -------
+    y: np.ndarray
+        Output resampled array
+    """
+
+    if in_freq == out_freq or out_freq is None:
+        y = x
+    else:
+        datatype = x.dtype
+        if datatype.name.startswith("int"):
+            # cast necessary due to bug in resample_poly() with input of type int
+            x = x.astype("float")
+
+        y = sig.resample_poly(x, out_freq, in_freq)
+
+        if datatype.name.startswith("int"):
+            # cast the resampled signal (not the input) back to the original dtype
+            y = y.astype(datatype)
+
+    return y
+
+
+def lpfilter(
+    x: np.ndarray,
+    fc: int,
+    fs: int,
+) -> np.ndarray:
+    """
+    Low-pass filter a multi-channel audio array
+
+    Parameters
+    ----------
+    x: np.ndarray
+        Input array
+    fc: int
+        Cut-off frequency in Hz
+    fs: int
+        Sampling rate in Hz
+
+    Returns
+    -------
+    y: np.ndarray
+        Output low-pass filtered array
+    """
+
+    if (fc + 500) < (fs / 2.0):
+        # Design a Chebyshev type II filter, band_pass-band_stop = 500 Hz
+        N, Wn = sig.cheb2ord(fc / (fs / 2), (fc + 500) / (fs / 2), 3, 60)
+        b, a = sig.cheby2(N, 60, Wn, "low")
+
+        # Apply the Chebyshev filter to each channel, across the time axis
+        # y = sig.lfilter(b, a, x, axis=0)  # non zero-phase filter
+        y = sig.filtfilt(b, a, x, axis=0)  # zero-phase filter, batch processing
+    else:
+        y = x
+
+    return y
+
+
+def cut(
+    x: np.ndarray,
+    limits: Optional[Tuple[int, int]],
+) -> np.ndarray:
+    """
+    Cut an audio array
+
+    Parameters
+    ----------
+    x: np.ndarray
+        Input array
+    limits: Tuple[int, int]
+        First and last samples to extract
+
+    Returns
+    -------
+    y: np.ndarray
+        Output cut array
+    """
+
+    in_samples, in_channels = x.shape
+ first_sample = limits[0] + last_sample = limits[1] + + if first_sample == 0 and (last_sample == -1 or last_sample == in_samples): + y = x + else: + if last_sample == -1: + last_sample = in_samples + + signal_start = first_sample + signal_end = last_sample + insert_start = 0 + insert_end = last_sample - first_sample + total_samples = last_sample - first_sample + if first_sample < 0: + samples_to_pad_begin = -first_sample + insert_start = samples_to_pad_begin + insert_end += samples_to_pad_begin + if last_sample > in_samples: + signal_end = in_samples + insert_end = insert_end - last_sample + in_samples + y = np.zeros([total_samples, in_channels], dtype=x.dtype) + y[insert_start:insert_end, :] = x[signal_start:signal_end, :] + + return y + + +def compare( + ref: np.ndarray, + test: np.ndarray, + fs: int, + per_frame: bool = False, +) -> dict: + """ + Compare two audio arrays + + Parameters + ---------- + ref: np.ndarray + Input reference array + test: np.ndarray + Input test array + fs: int + Input sampling rate in Hz + + Returns + ------- + result: dict + Comparison results + """ + + framesize = fs // 50 + diff = abs(test - ref) + max_diff = int(diff.max()) + result = { + "bitexact": True, + "max_abs_diff": 0, + "max_abs_diff_pos_sample": 0, + "max_abs_diff_pos_channel": 0, + "nsamples_diff": 0, + "nsamples_diff_percentage": 0.0, + "first_diff_pos_sample": -1, + "first_diff_pos_channel": -1, + "first_diff_pos_frame": -1, + } + if per_frame: + result["max_abs_diff_pos_frame"] = 0 + result["nframes_diff"] = 0 + result["nframes_diff_percentage"] = 0.0 + + if max_diff != 0: + if diff.ndim == 1: + nsamples_total = diff.shape + nchannels = 1 + else: + nsamples_total, nchannels = diff.shape + max_diff_pos = np.nonzero(diff == max_diff) + max_diff_pos = [ + max_diff_pos[0][0], + max_diff_pos[0][0] // framesize, + max_diff_pos[1][0], + ] + + first_diff_pos = np.nonzero(diff) + first_diff_pos = [ + first_diff_pos[0][0], + first_diff_pos[0][0] // framesize, + first_diff_pos[1][0], + ] + + nsamples_diff = np.nonzero(diff)[0].size + nsamples_diff_percentage = nsamples_diff / (nsamples_total * nchannels) * 100.0 + nframes = nsamples_total // framesize + nframes_diff = 0 + + result = { + "bitexact": False, + "max_abs_diff": max_diff, + "max_abs_diff_pos_sample": max_diff_pos[0], + "max_abs_diff_pos_channel": max_diff_pos[2], + "nsamples_diff": nsamples_diff, + "nsamples_diff_percentage": nsamples_diff_percentage, + "first_diff_pos_sample": first_diff_pos[0], + "first_diff_pos_channel": first_diff_pos[2], + "first_diff_pos_frame": first_diff_pos[1], + } + + if per_frame: + for fr in range(nframes): + diff_fr = diff[fr * framesize : ((fr + 1) * framesize), :] + nframes_diff += 1 if diff_fr.nonzero()[0].size > 0 else 0 + nframes_diff_percentage = nframes_diff / nframes * 100.0 + result["max_abs_diff_pos_frame"] = max_diff_pos[1] + result["nframes_diff"] = nframes_diff + result["nframes_diff_percentage"] = nframes_diff_percentage + + return result + + +def getdelay( + x: np.ndarray, + y: np.ndarray, +) -> int: + """ + Get the delay between two audio signals + + Parameters + ---------- + x: np.ndarray + Input reference array + y: np.ndarray + Input test array + + Returns + ------- + result: int + Delay of y in samples with respect to x (median of individual channel delays) + """ + + if x.ndim == 1: + n_samples_x = x.shape + n_chan_x = 1 + else: + n_samples_x, n_chan_x = x.shape + if y.ndim == 1: + n_samples_y = y.shape + n_chan_y = 1 + else: + n_samples_y, n_chan_y = y.shape + if n_chan_x != n_chan_y: + 
raise ValueError("Channel counts of x and y must match!")
+    lags = np.arange(-n_samples_x + 1, n_samples_y)
+    lag = np.zeros([n_chan_x, 1], dtype=int)
+    for chan in range(n_chan_x):
+        correlation = sig.correlate(y[:, chan], x[:, chan], mode="full")
+        lag[chan] = lags[np.argmax(correlation)]
+    return int(np.median(lag))
+
+
+def mono_downmix(x: np.ndarray) -> np.ndarray:
+    """
+    Create a passive mono downmix of a multi-channel audio signal
+    """
+    return np.sum(x, axis=1)
+
+
+def mute_channels(
+    x: np.ndarray, mute: Optional[Union[list, np.ndarray]] = None
+) -> np.ndarray:
+    """
+    Mute the given audio channels in the signal (no-op if mute is None or empty)
+    """
+    if mute is not None and len(mute) > 0:
+        x[:, mute] = 0
+    return x
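
# --- Editorial usage sketch (not part of the patch): estimating the lag of a
# --- delayed copy with getdelay(). Module path assumed from this patch set.
import numpy as np

from item_generation_scripts.audiotools.audioarray import delay, getdelay

fs = 48000
x = np.random.randn(fs, 1)  # reference
y = delay(x.copy(), fs, 120, samples=True)  # delay by 120 samples
print(getdelay(x, y))  # expected: 120
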
diff --git a/item_generation_scripts/audiotools/audiofile.py b/item_generation_scripts/audiotools/audiofile.py
new file mode 100644
index 00000000..954c91f8
--- /dev/null
+++ b/item_generation_scripts/audiotools/audiofile.py
@@ -0,0 +1,433 @@
+#!/usr/bin/env python3
+
+#
+# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository. All Rights Reserved.
+#
+# This software is protected by copyright law and by international treaties.
+# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository retain full ownership rights in their respective contributions in
+# the software. This notice grants no license of any kind, including but not limited to patent
+# license, nor is any license granted by implication, estoppel or otherwise.
+#
+# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
+# contributions.
+#
+# This software is provided "AS IS", without any express or implied warranties. The software is in the
+# development stage. It is intended exclusively for experts who have experience with such software and
+# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+# and fitness for a particular purpose are hereby disclaimed and excluded.
+#
+# Any dispute, controversy or claim arising under or in relation to providing this software shall be
+# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+# the United Nations Convention on Contracts on the International Sales of Goods.
+#
+
+import logging
+import struct
+from pathlib import Path
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import scipy.io.wavfile as wav
+
+from .audioarray import trim, window
+
+logger = logging.getLogger("__main__")
+logger.setLevel(logging.DEBUG)
+
+
+def read(
+    filename: Union[str, Path],
+    nchannels: Optional[int] = 1,
+    fs: Optional[int] = 48000,
+    outdtype: Optional[str] = "float",
+) -> Tuple[np.ndarray, int]:
+    """
+    Read audio file (.pcm, .wav or .raw)
+
+    Parameters
+    ----------
+    filename: Union[str, Path]
+        Input file path
+    nchannels: Optional[int]
+        Number of input channels, required for .pcm, otherwise default = 1
+    fs: Optional[int]
+        Input sampling rate, required for .pcm input files, otherwise default = 48000 (Hz)
+    outdtype: Optional[str]
+        Data type of output array, python builtin or np.dtype
+
+    Returns
+    -------
+    x: np.ndarray
+        Audio signal array
+    fs: int
+        Signal sampling frequency
+    """
+
+    file_extension = Path(filename).suffix
+
+    if file_extension == ".wav":
+        fs, data = wav.read(filename)
+        if data.dtype == np.int32:
+            data = np.interp(
+                data,
+                (np.iinfo(np.int32).min, np.iinfo(np.int32).max),
+                (np.iinfo(np.int16).min, np.iinfo(np.int16).max),
+            )
+        elif data.dtype == np.float32:
+            data = np.interp(
+                data,
+                (-1, 1),
+                (np.iinfo(np.int16).min, np.iinfo(np.int16).max),
+            )
+        x = np.array(data, dtype=outdtype)
+        file_len = x.shape[0]
+        if x.ndim == 1:
+            # force to be a mtx
+            x = np.reshape(x, (file_len, 1))
+    elif file_extension in [".pcm", ".raw"]:
+        x = np.fromfile(filename, dtype=np.int16).astype(outdtype)
+        signal_len = len(x) // nchannels
+        try:
+            x = x.reshape(signal_len, nchannels)
+        except ValueError:
+            raise ValueError("Wrong number of channels")
+    else:
+        raise ValueError("Wrong input format. Use wav, pcm or raw")
+
+    return x, fs
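
# --- Editorial usage sketch (not part of the patch): round-tripping headerless
# --- PCM through the read()/write() pair defined in this file.
import numpy as np

from item_generation_scripts.audiotools.audiofile import read, write

x = (np.random.randn(48000, 2) * 1000).astype("float")
write("tmp_item.pcm", x, fs=48000)  # interleaved 16-bit PCM
y, fs = read("tmp_item.pcm", nchannels=2, fs=48000)
assert y.shape == (48000, 2) and fs == 48000
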
Use wav, pcm or raw") + + +def concat( + in_filenames: list, + out_file: str, + silence_pre: Optional[int] = 0, + silence_post: Optional[int] = 0, + in_fs: Optional[int] = 48000, + num_channels: Optional[int] = None, + pad_noise: Optional[bool] = False, + preamble: Optional[int] = None, + pad_noise_preamble: Optional[bool] = False, +) -> list: + """ + Horizontally concatenates audio files into one long file + + Parameters + __________ + in_filenames: list + Input list of filenmames (.pcm, .raw or .wav) + out_file: str + Output multi-channel audio file name (.pcm, .raw or .wav) + silence_pre: int + Padded zeros before signal in samples + silence_post: int + Padded zeros after signal in samples + in_fs: Optional[int] + Input sampling rate, default 48000 Hz + pad_noise: Optional[bool] + If true noise will be padded otherwise zeros will be padded + + Returns + ------- + splits + List of sample indices to split the resulting file at + """ + + y = None + fs_compare = 0 + + # create a list of splits + splits = [0] + + # Read input files + for in_file in in_filenames: + x, fs = read(in_file, fs=in_fs, nchannels=num_channels) + if fs_compare and fs_compare != fs: + raise ValueError("Sampling rates of files to concatenate don't match") + else: + fs_compare = fs + + # pad with very low amplitude noise + x = trim( + x, in_fs, (-silence_pre, -silence_post), samples=True, pad_noise=pad_noise + ) + + # add the length to our splits list + splits.append(splits[-1] + x.shape[0]) + + # concatenate + y = np.concatenate([y, x]) if y is not None else x + + # add preamble + if preamble: + y = trim(y, in_fs, (-preamble, 0), pad_noise_preamble) + + write(out_file, y, fs=in_fs) + + return splits[1:] + + +def split( + in_filename: Union[str, Path], + out_folder: Union[str, Path], + split_filenames: list[Union[str, Path]], + splits: list[int], + in_fs: Optional[int] = 48000, + preamble: Optional[int] = 0, + loudness: Optional[float] = None, +) -> list[Union[str, Path]]: + """ + Horizontally splits audio files into multiple shorter files and applies windowing and scaling + + Parameters + __________ + in_filename: Union[str, Path] + Input filenmame (.pcm, .raw or .wav) + out_folder: Union[str, Path] + Output folder where to put the splits + split_filenames: list[Union[str, Path]] + List of names for the split files + splits: list[int] + List of sample indices where to cut the signal + in_fs: Optional[int] + Input sampling rate, default 48000 Hz + loudness: Optional[float] + Desired loudness of individual files + """ + + # create a list of output files + out_paths = [] + + # Read input file + x, fs = read(in_filename, fs=in_fs) + + # remove preamble + if preamble: + x = trim(x, fs, (preamble, 0)) + + split_old = 0 + for idx, split in enumerate(splits): + out_file = Path(out_folder) / Path(split_filenames[idx]).with_suffix( + in_filename.suffix + ) + + # add the path to our list + out_paths.append(out_file) + + # split + y = x[split_old:split, :] + + # windowing + y = window(y) + + # write file + write(out_file, y, fs=in_fs) + + split_old = split + + return out_paths + + +def combine( + in_filenames: list, + out_file: str, + in_fs: Optional[int] = 48000, +) -> None: + """ + Combines audio files into one multi-channel file + + Parameters + ---------- + in_filenames: list + Input list of filenmames (.pcm, .raw or .wav) + out_file: str + Output multi-channel audio file name (.pcm, .raw or .wav) + in_fs: Optional[int] + Input sampling rate, required for .pcm and .raw input file, default 48000 Hz + + Returns + ------- + None 
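
# --- Editorial usage sketch (not part of the patch): concatenating two items
# --- with padding, then recovering the pieces from the returned split indices.
# --- The file names are illustrative assumptions.
from item_generation_scripts.audiotools.audiofile import concat, split

splits = concat(["a.wav", "b.wav"], "long.wav", silence_pre=4800, silence_post=4800)
split("long.wav", "out_dir", ["a_cut", "b_cut"], splits)
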
+ """ + + y = None + fs_compare = 0 + + # Read input files + for in_file in in_filenames: + # assign correct channel + x, fs = read(in_file, fs=in_fs) + if fs_compare and fs_compare != in_fs: + raise ValueError("Sampling rates of files to combine don't match") + else: + fs_compare = fs + if y is None: + y = x + else: + if x.shape[0] > y.shape[0]: + x = x[: y.shape[0], :] + elif y.shape[0] > x.shape[0]: + y = y[: x.shape[0], :] + y = np.column_stack([y, x]) + + write(out_file, y, fs=in_fs) + + +def split_channels( + in_file: str, + out_filenames: list, + in_nchans: int, + in_fs: Optional[int] = 48000, +) -> None: + """ + Split multi-channel audio files into individual mono files + + Parameters + ---------- + in_file: str + Input file name (.pcm, .raw or .wav) + out_filenames: list + List of output file names (.pcm, .raw or .wav) + in_nchans: int + Input number of channels + in_fs: Optional[int] = 48000 + Input sampling rate, default 48000 Hz + + Returns + ------- + None + """ + + # validation + if in_nchans is None: + raise ValueError("Number of channels to split must be specified!") + if in_nchans != len(out_filenames): + print( + "Split: Mismatch between number of channels and output filenames length. Truncating output filenames list." + ) + out_filenames = out_filenames[:in_nchans] + + x, in_fs = read(in_file, nchannels=in_nchans, fs=in_fs) + + # Write output files + for idx, out_file in enumerate(out_filenames): + # extract correct channel + y = x[:, idx] + + write(out_file, y, fs=in_fs) + + +def parse_wave_header( + filename: str, +) -> dict: + """ + Get the format information from a WAV file. + Return a dictionary with the format information + + Parameters + ---------- + filename : string or open file handle + Input WAV file. + + Returns + ------- + Dictionary + """ + + with open(filename, "rb") as fid: + riff = fid.read(4) + + if riff == b"RIFF": + binary_format = "<" + elif riff == b"RIFX": + binary_format = ">" + else: + raise IOError("No RIFF chunk found!") + + wav_size = struct.unpack(f"{binary_format}I", fid.read(4))[0] + + wav_identifier = fid.read(4) + if wav_identifier != b"WAVE": + raise IOError("No WAVE chunk found!") + + fmt_chunk_id = fid.read(4) + + if fmt_chunk_id == b"fmt ": + fmt_size = struct.unpack(f"{binary_format}I", fid.read(4))[0] + wav_format = struct.unpack(f"{binary_format}H", fid.read(2))[0] + channels = struct.unpack(f"{binary_format}H", fid.read(2))[0] + fs = struct.unpack(f"{binary_format}I", fid.read(4))[0] + bytes_per_second = struct.unpack(f"{binary_format}I", fid.read(4))[0] + block_align = struct.unpack(f"{binary_format}H", fid.read(2))[0] + bit_depth = struct.unpack(f"{binary_format}H", fid.read(2))[0] + rem_bytes = fmt_size - 16 + ext_param_size = 0 + ext_param = None + if rem_bytes: + ext_param_size = struct.unpack(f"{binary_format}H", fid.read(2))[0] + + if ext_param_size: + ext_param = fid.read(ext_param_size) + else: + raise IOError("Missing or corrupt fmt chunk!") + + return { + "size": wav_size, + "format_tag": wav_format, + "channels": channels, + "fs": fs, + "bytes_per_second": bytes_per_second, + "block_align": block_align, + "bit_depth": bit_depth, + "ext_param_size": ext_param_size, + "ext_param": ext_param, + } diff --git a/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_FULL.mat b/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_FULL.mat new file mode 100644 index 00000000..42e702db --- /dev/null +++ 
b/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_FULL.mat @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3ddecef64dfcf8887904b5cc370c0d9723bd8fd1637e32232205cdcd739b80d +size 12623190 diff --git a/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_LS.mat b/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_LS.mat new file mode 100644 index 00000000..1d590edb --- /dev/null +++ b/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_LS.mat @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2c964b96d802532c0ecf1076092c7d246a54293a3a0c4c72995953c66bfec71 +size 6348499 diff --git a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA1.mat b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA1.mat new file mode 100644 index 00000000..4f59a8a9 --- /dev/null +++ b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA1.mat @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a9ad5d8d874ac2fb851f5d2b0b303494f1d115612e9f6cab40e5eb33591b05c +size 4630 diff --git a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA2.mat b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA2.mat new file mode 100644 index 00000000..1ad2162a --- /dev/null +++ b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA2.mat @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fc2a15579b80493597a8096bd815e8b847fe1880bdba760d4405122878b0b0a +size 10323 diff --git a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA3.mat b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA3.mat new file mode 100644 index 00000000..0e7c3ef4 --- /dev/null +++ b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA3.mat @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83822cfa090c345a6ece14d1ec1a92023626f467e2f8d982cf099c071dfc1080 +size 18229 diff --git a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_FULL.mat b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_FULL.mat new file mode 100644 index 00000000..a2ab24e5 --- /dev/null +++ b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_FULL.mat @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf86a03f0b13932c5c138af22584f864b75c5733df1b01ac3fdf7750a1bdbe5f +size 14335913 diff --git a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_LS.mat b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_LS.mat new file mode 100644 index 00000000..65c2684c --- /dev/null +++ b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_LS.mat @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e25ef101e9e72c5d70a55bc1451a07d041d29f96a803d7d3f968f20fe403316 +size 20190 diff --git a/item_generation_scripts/audiotools/binaural_datasets/README.txt b/item_generation_scripts/audiotools/binaural_datasets/README.txt new file mode 100644 index 00000000..9fd37c96 --- /dev/null +++ b/item_generation_scripts/audiotools/binaural_datasets/README.txt @@ -0,0 +1,34 @@ +Files in this directory should contain impulse responses for use in rendering in Matlab .mat format +Samplingrate of 48kHz is assumed + +Files should adhere to the following naming scheme: + 
+{HRIR|BRIR}_{DATASETNAME}_{FULL|LS|SBA(1-3)}.mat
+
+- HRIR or BRIR
+    specifies the type of impulse response which will be used
+    for either BINAURAL or BINAURAL_ROOM output respectively
+- DATASETNAME
+    specifies the name used with the binaural_dataset commandline argument
+    or YAML key to enable selection of this dataset
+- FULL or LS or SBA(1-3)
+    specifies the subset of impulse responses in the file:
+    FULL: all available measurements on the sphere
+    LS: superset of supported loudspeaker layouts
+        (see audiotools.constants.CHANNEL_BASED_AUDIO_FORMATS["LS"])
+    SBA(1-3): impulse responses transformed to Ambisonics by external conversion
+        If available, SBA1 is used for FOA, SBA2 for HOA2 and SBA3 for HOA3;
+        if not available, SBA3 is used and truncated for all Ambisonic formats
+
+Each Matlab file should contain the following variables:
+- IR
+    Impulse responses with dimensions [ir_length x n_ears x n_channels]
+- SourcePosition
+    Array of {azimuth, elevation, radius} with dimensions [n_channels x 3]
+    required for FULL, optional otherwise
+- latency_s
+    Latency of the dataset in samples
+    optional, will be estimated if not provided
+
+LICENSES:
+Please see HRIR.txt and BRIR.txt for license info
\ No newline at end of file
diff --git a/item_generation_scripts/audiotools/binaural_datasets/__init__.py b/item_generation_scripts/audiotools/binaural_datasets/__init__.py
new file mode 100644
index 00000000..aea270d8
--- /dev/null
+++ b/item_generation_scripts/audiotools/binaural_datasets/__init__.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+
+#
+# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository. All Rights Reserved.
+#
+# This software is protected by copyright law and by international treaties.
+# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository retain full ownership rights in their respective contributions in
+# the software. This notice grants no license of any kind, including but not limited to patent
+# license, nor is any license granted by implication, estoppel or otherwise.
+#
+# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
+# contributions.
+#
+# This software is provided "AS IS", without any express or implied warranties. The software is in the
+# development stage. It is intended exclusively for experts who have experience with such software and
+# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+# and fitness for a particular purpose are hereby disclaimed and excluded.
+# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# diff --git a/item_generation_scripts/audiotools/binaural_datasets/binaural_dataset.py b/item_generation_scripts/audiotools/binaural_datasets/binaural_dataset.py new file mode 100644 index 00000000..e6c4dbe7 --- /dev/null +++ b/item_generation_scripts/audiotools/binaural_datasets/binaural_dataset.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. 
+# + +import warnings +from pathlib import Path +from typing import Optional, Tuple, Union + +import numpy as np +from scipy.io import loadmat + +from item_generation_scripts.audiotools.audio import fromtype +from item_generation_scripts.audiotools.constants import ( + CHANNEL_BASED_AUDIO_FORMATS, + OBJECT_BASED_AUDIO_FORMATS, + SCENE_BASED_AUDIO_FORMATS, +) +from item_generation_scripts.audiotools.EFAP import wrap_angles + + +def load_hrtf( + filename: Union[str, Path], +) -> Tuple[np.ndarray, np.ndarray, int]: + """ + Read HRTFs from Matlab dictionary file mat + + Parameters + ---------- + filename: str + HRTFs file name (.mat) + + Returns + ------- + IR: np.ndarray + Array of impulse responses + SourcePosition: np.ndarray + Array of source positions corresponding to the impulse responses + latency_s: int + Latency in samples + """ + + if not filename.exists(): + raise FileNotFoundError( + f"File {filename.name} was not found in dataset folder!" + ) + + mat_contents = loadmat(filename) + + try: + IR = mat_contents["IR"] + except KeyError: + raise KeyError(f"Key 'IR' not found in .mat file: {filename} !") + + SourcePosition = mat_contents.get("SourcePosition") + latency_s = mat_contents.get("latency_s") + if latency_s is not None: + latency_s = latency_s.astype(np.int32)[0, 0] + + return IR, SourcePosition, latency_s + + +def load_ir( + in_fmt: str, + out_fmt: str, + dataset: Optional[str] = None, +) -> Tuple[np.ndarray, np.ndarray, int]: + """ + Load IRs for a specified rendering format + + Parameters + ---------- + in_fmt: str + Input format + out_fmt: str + Output format + dataset: Optional[str] + Name of desired dataset without prefix and suffix + + Returns + ------- + IR: np.ndarray + Array of impulse responses + SourcePosition: np.ndarray + Array of source positions corresponding to the impulse responses + latency_smp: int + Latency in samples + """ + + dataset_prefix = None + dataset_suffix = None + + if out_fmt.startswith("BINAURAL") and "ROOM" in out_fmt: + dataset_prefix = "BRIR" + if dataset is None: + dataset = "IISofficialMPEG222UC" + + if in_fmt.startswith("MOZART"): + dataset_suffix = "FULL" + elif in_fmt in CHANNEL_BASED_AUDIO_FORMATS.keys(): + dataset_suffix = "LS" + + elif out_fmt.startswith("BINAURAL"): + dataset_prefix = "HRIR" + if dataset is None: + dataset = "ORANGE53" + + if in_fmt in OBJECT_BASED_AUDIO_FORMATS.keys() or in_fmt.startswith( + "CUSTOM_LS" + ): + dataset_suffix = "FULL" + elif in_fmt in CHANNEL_BASED_AUDIO_FORMATS.keys() and in_fmt != "MONO": + dataset_suffix = "LS" + elif in_fmt in SCENE_BASED_AUDIO_FORMATS.keys(): + dataset = "ORANGE53_Dolby" + if in_fmt == "SBA1" or in_fmt == "FOA": + dataset_suffix = "SBA1" + # Use truncated SBA3 dataset if no SBA1 or 2 dataset exists + if not ( + Path(__file__).parent.joinpath( + f"{dataset_prefix}_{dataset}_{dataset_suffix}.mat" + ) + ).is_file(): + dataset_suffix = "SBA3" + warnings.warn("No SBA1 dataset found -> use truncated SBA3 dataset") + elif in_fmt.endswith("2"): + dataset_suffix = "SBA2" + # Use truncated SBA3 dataset if no SBA1 or 2 dataset exists + if not ( + Path(__file__).parent.joinpath( + f"{dataset_prefix}_{dataset}_{dataset_suffix}.mat" + ) + ).is_file(): + dataset_suffix = "SBA3" + warnings.warn("No SBA2 dataset found -> use truncated SBA3 dataset") + else: + dataset_suffix = "SBA3" + + path_dataset = Path(__file__).parent.joinpath( + f"{dataset_prefix}_{dataset}_{dataset_suffix}.mat" + ) + IR, SourcePosition, latency_s = load_hrtf(path_dataset) + + if latency_s is not None: + latency_smp = 
latency_s + else: + latency_smp = int(np.min(np.argmax(np.sum(np.abs(IR), axis=1), axis=0))) + warnings.warn( + f"No latency of HRTF dataset specified in {path_dataset} file -> computed latency: {latency_smp} sample(s)" + ) + + if in_fmt.startswith("STEREO"): + IR = IR[:, :, :2] # use L and R channels. + elif ( + in_fmt in CHANNEL_BASED_AUDIO_FORMATS.keys() + and not in_fmt.startswith("CUSTOM_LS") + and not in_fmt.startswith("MOZART") + ): + # extract positions from the loudspeaker file + in_fmt = fromtype(in_fmt) + tmp_fmt = fromtype("LS") + + IR_tmp = IR.copy() + IR = np.zeros([IR_tmp.shape[0], IR_tmp.shape[1], in_fmt.num_channels]) + + ir_index = 0 + for i in range(tmp_fmt.num_channels): + for j in range(in_fmt.num_channels): + if ( + tmp_fmt.ls_azi[i] == in_fmt.ls_azi[j] + and tmp_fmt.ls_ele[i] == in_fmt.ls_ele[j] + ): + if j != in_fmt.lfe_index[0]: + IR[:, :, ir_index] = IR_tmp[:, :, i] + ir_index += 1 + + return IR, SourcePosition, latency_smp + + +def find_ir( + SourcePosition: np.ndarray, + azi: float, + ele: float, + num_filter: Optional[int] = None, +) -> Tuple[np.ndarray, np.ndarray]: + """ + Find HRTF measurement closest to the selected direction + + Parameters + ---------- + SourcePosition: np.ndarray + Source IR positions + azi: float + Desired response azimuth + ele: float + Desired response elevation + num_filter: Optional[int] + Number of filters to return, if None return all + + Returns + ------- + i_dir: np.ndarray + Indices of nearest SourcePositions + dist_sort: np.ndarray + Distances corresponding to the indices + """ + + dist = dist_on_sphere(SourcePosition, azi, ele) + + if num_filter is None: + i_dir = np.argsort(dist) + dist_sort = np.sort(dist) + else: + i_dir = np.argsort(dist)[:num_filter] + dist_sort = np.sort(dist)[:num_filter] + + return i_dir, dist_sort + + +def dist_on_sphere( + positions: np.ndarray, + azi: float, + ele: float, +) -> np.ndarray: + """ + Compute great-circle distance + + Parameters + ---------- + positions: np.ndarray + Source IR positions + azi: float + Desired response azimuth + ele: float + Desired response elevation + + Returns + ------- + dist: np.ndarray + Distances from desired point + """ + + azi, ele = wrap_angles(azi, ele) + + delta_azi = np.deg2rad(np.abs(azi - positions[:, 0])) + + # compute great circle distance + a = np.sin(np.deg2rad(positions[:, 1])) * np.sin(np.deg2rad(ele)) + np.cos( + np.deg2rad(positions[:, 1]) + ) * np.cos(np.deg2rad(ele)) * np.cos(delta_azi) + if np.max(a) > 1.001 or np.min(a) < -1.001: + raise ValueError( + f"Absolute distance value larger than one! Min: {np.min(a)}, Max: {np.max(a)}" + ) + + # limiting to prevent errors in arccos due to numerical inaccuracies + a[a > 1] = 1 + a[a < -1] = -1 + dist = np.arccos(a) + + return dist diff --git a/item_generation_scripts/audiotools/binauralobjectrenderer.py b/item_generation_scripts/audiotools/binauralobjectrenderer.py new file mode 100644 index 00000000..548c4921 --- /dev/null +++ b/item_generation_scripts/audiotools/binauralobjectrenderer.py @@ -0,0 +1,652 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. 
+#
+# This software is protected by copyright law and by international treaties.
+# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository retain full ownership rights in their respective contributions in
+# the software. This notice grants no license of any kind, including but not limited to patent
+# license, nor is any license granted by implication, estoppel or otherwise.
+#
+# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
+# contributions.
+#
+# This software is provided "AS IS", without any express or implied warranties. The software is in the
+# development stage. It is intended exclusively for experts who have experience with such software and
+# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+# and fitness for a particular purpose are hereby disclaimed and excluded.
+#
+# Any dispute, controversy or claim arising under or in relation to providing this software shall be
+# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+# the United Nations Convention on Contracts on the International Sales of Goods.
+#
+
+import itertools
+from itertools import repeat
+from typing import Optional, Tuple
+
+import numpy as np
+from scipy.signal import convolve
+
+from item_generation_scripts.audiotools.binaural_datasets.binaural_dataset import (
+    find_ir,
+)
+from item_generation_scripts.audiotools.constants import IVAS_FRAME_LEN_MS
+from item_generation_scripts.audiotools.EFAP import wrap_angles
+from item_generation_scripts.utils import apply_func_parallel
+
+
+def barycentric_weights(
+    azi_deg: np.ndarray,
+    ele_deg: np.ndarray,
+    pos_in: np.ndarray,
+    interp_1d: Optional[bool] = False,
+) -> Tuple[float, float, float]:
+    """
+    Computation of spherical barycentric weights
+    Implementation based on the paper "Spherical Barycentric Coordinates"
+    by T. Langer, A. Belyaev and H. Seidel
+
+    Parameters
+    ----------
+    azi_deg: np.ndarray
+        Azimuthal coordinates of three points that form a triangle in degrees
+    ele_deg: np.ndarray
+        Elevation coordinates of three points that form a triangle in degrees
+    pos_in: np.ndarray
+        Azimuthal and elevation coordinates in degrees of the point to compute weights for
+    interp_1d: bool
+        1d interpolation between two points
+
+    Returns
+    -------
+    W_1, W_2, W_3: scalar values
+        Barycentric weights for the corresponding vertices
+    """
+
+    # check if point is equal to vertex
+    for k in range(3):
+        if azi_deg[k] == pos_in[0] and ele_deg[k] == pos_in[1]:
+            output = np.zeros(3)
+            output[k] = 1
+            return tuple(output)
+
+    pos = np.copy(pos_in)
+
+    pos[0], pos[1] = wrap_angles(pos[0], pos[1])
+
+    # convert to rad
+    ele = (
+        -np.deg2rad(ele_deg, dtype="float64") + np.pi / 2
+    )  # different definition of elevation in metadata
+    azi = np.deg2rad(azi_deg, dtype="float64")
+    pos[0] = np.deg2rad(pos[0])
+    pos[1] = -np.deg2rad(pos[1]) + np.pi / 2
+
+    """ spherical barycentric coordinates """
+
+    # convert to cartesian coordinates
+    x = np.sin(ele) * np.cos(azi)
+    y = np.sin(ele) * np.sin(azi)
+    z = np.cos(ele)
+    pos_x = np.sin(pos[1]) * np.cos(pos[0])
+    pos_y = np.sin(pos[1]) * np.sin(pos[0])
+    pos_z = np.cos(pos[1])
+
+    pos_cart = np.array([pos_x, pos_y, pos_z])
+    v_1 = np.array([x[0], y[0], z[0]])
+    v_2 = np.array([x[1], y[1], z[1]])
+    v_3 = np.array([x[2], y[2], z[2]])
+
+    # rotate coordinate system
+    unit = np.array([0, 0, 1])
+    a = np.cross(pos_cart, unit)
+    b = np.dot(pos_cart, unit)
+    a_matrix = np.array([[0, -a[2], a[1]], [a[2], 0, -a[0]], [-a[1], a[0], 0]])
+    if b == -1:
+        rot_matrix = np.eye(3, 3)  # a and b point in opposite directions
+    else:
+        rot_matrix = np.eye(3, 3) + a_matrix + np.dot(a_matrix, a_matrix) / (1 + b)
+
+    v_1 = rot_matrix @ v_1
+    v_2 = rot_matrix @ v_2
+    v_3 = rot_matrix @ v_3
+    # test_vec = rot_matrix @ pos_cart  # should be [0, 0, 1]
+
+    # scale vertices to the tangent plane
+    v_1_plane = v_1 / v_1[2]
+    v_2_plane = v_2 / v_2[2]
+    v_3_plane = v_3 / v_3[2]
+    eps = 10**-10
+
+    # compute planar barycentric coordinates
+    denom = (v_2_plane[1] - v_3_plane[1]) * (v_1_plane[0] - v_3_plane[0]) + (
+        v_3_plane[0] - v_2_plane[0]
+    ) * (v_1_plane[1] - v_3_plane[1])
+    # denom is proportional to the area of the triangle -> when the area is zero, use linear 1d interpolation
+    if abs(denom) <= 10**-15:
+        interp_1d = True
+
+    if not interp_1d:
+        W_1_plane = (
+            (v_2_plane[1] - v_3_plane[1]) * (0 - v_3_plane[0])
+            + (v_3_plane[0] - v_2_plane[0]) * (0 - v_3_plane[1])
+        ) / (denom + eps)
+        W_2_plane = (
+            (v_3_plane[1] - v_1_plane[1]) * (0 - v_3_plane[0])
+            + (v_1_plane[0] - v_3_plane[0]) * (0 - v_3_plane[1])
+        ) / (denom + eps)
+        W_3_plane = 1 - W_1_plane - W_2_plane
+    else:
+        v_diff = np.array(
+            [v_1_plane[:-1], v_2_plane[:-1], v_3_plane[:-1]]
+        )  # z entry always one
+        dist_all = np.linalg.norm(v_diff, axis=1)
+        v_diff_norm = np.divide(v_diff, dist_all[:, None])
+        dot_v_ind = np.array(
+            [[0, 1], [1, 2], [2, 0]]
+        )  # the three possible combinations of points
+        # compute the dot product between all vertices to find pairs that lie in opposite directions w.r.t. the point
+        # in this case the dot product is -1 (due to normalization)
+        dot = np.empty(3)
+        k = 0
+        for ind_i, ind_j in dot_v_ind:
+            dot[k] = np.dot(v_diff_norm[ind_i], v_diff_norm[ind_j])
+            k += 1
+
+        margin = 10**-5
+        indices_minus_one = np.array(np.abs(dot + 1) < margin)
+        if indices_minus_one.any():  # test if one entry is -1
+            v_ind = dot_v_ind[indices_minus_one]
+            # use the vertex pair with the smallest distance from the origin (current position)
+            if np.shape(v_ind)[0] >= 2:
+                used_vertices = v_ind[
+                    np.argmin(
+                        np.array([sum(dist_all[v_ind[0]]), sum(dist_all[v_ind[1]])])
+                    )
+                ]
+            else:
+                used_vertices = v_ind[0]
+            dist = dist_all[used_vertices[0]] / sum(dist_all[used_vertices])
+            if 0 in used_vertices and 1 in used_vertices:
+                W_1_plane = 1 - dist
+                W_2_plane = dist
+                W_3_plane = 0
+            elif 1 in used_vertices and 2 in used_vertices:
+                W_1_plane = 0
+                W_2_plane = 1 - dist
+                W_3_plane = dist
+            elif 2 in used_vertices and 0 in used_vertices:
+                W_1_plane = dist
+                W_2_plane = 0
+                W_3_plane = 1 - dist
+            else:
+                raise ValueError("problem in 1d interpolation")
+        else:
+            # point does not lie on a line spanned by two of the points
+            W_1_plane = -1
+            W_2_plane = -1
+            W_3_plane = -1
+
+    # compute spherical weights from planar weights
+    W_1 = W_1_plane * np.dot(v_1, v_1_plane)
+    W_2 = W_2_plane * np.dot(v_2, v_2_plane)
+    W_3 = W_3_plane * np.dot(v_3, v_3_plane)
+
+    # avoid rejection of triangles due to numerical errors when the point lies on an edge of the triangle
+    threshold_error = -1 * 10**-8
+    if threshold_error < W_1 < 0:
+        W_1 = 0
+    if threshold_error < W_2 < 0:
+        W_2 = 0
+    if threshold_error < W_3 < 0:
+        W_3 = 0
+
+    return W_1, W_2, W_3
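
# --- Editorial usage sketch (not part of the patch): weights for a direction
# --- inside the spherical triangle spanned by three measurement points.
import numpy as np

from item_generation_scripts.audiotools.binauralobjectrenderer import (
    barycentric_weights,
)

azi = np.array([0.0, 30.0, 0.0])
ele = np.array([0.0, 0.0, 30.0])
w1, w2, w3 = barycentric_weights(azi, ele, np.array([10.0, 10.0]))
assert w1 >= 0 and w2 >= 0 and w3 >= 0  # point lies inside the triangle
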
the point
+        # in this case the dot product is -1 (due to normalization)
+        dot = np.empty(3)
+        k = 0
+        for ind_i, ind_j in dot_v_ind:
+            dot[k] = np.dot(v_diff_norm[ind_i], v_diff_norm[ind_j])
+            k += 1
+
+        margin = 10**-5
+        indices_minus_one = np.array(np.abs(dot + 1) < margin)
+        if indices_minus_one.any():  # test if one entry is -1
+            v_ind = dot_v_ind[indices_minus_one]
+            # use the vertex pair with the smallest distance from the origin (current position)
+            if np.shape(v_ind)[0] >= 2:
+                used_vertices = v_ind[
+                    np.argmin(
+                        np.array([sum(dist_all[v_ind[0]]), sum(dist_all[v_ind[1]])])
+                    )
+                ]
+            else:
+                used_vertices = v_ind[0]
+            dist = dist_all[used_vertices[0]] / sum(dist_all[used_vertices])
+            if 0 in used_vertices and 1 in used_vertices:
+                W_1_plane = 1 - dist
+                W_2_plane = dist
+                W_3_plane = 0
+            elif 1 in used_vertices and 2 in used_vertices:
+                W_1_plane = 0
+                W_2_plane = 1 - dist
+                W_3_plane = dist
+            elif 2 in used_vertices and 0 in used_vertices:
+                W_1_plane = dist
+                W_2_plane = 0
+                W_3_plane = 1 - dist
+            else:
+                raise ValueError("problem in 1d interpolation")
+        else:
+            # point does not lie on a line spanned by two of the points
+            W_1_plane = -1
+            W_2_plane = -1
+            W_3_plane = -1
+
+    # compute spherical weights from planar weights
+    W_1 = W_1_plane * np.dot(v_1, v_1_plane)
+    W_2 = W_2_plane * np.dot(v_2, v_2_plane)
+    W_3 = W_3_plane * np.dot(v_3, v_3_plane)
+
+    # avoid rejection of triangles due to numerical errors when the point lies on an edge of the triangle
+    threshold_error = -1 * 10**-8
+    if threshold_error < W_1 < 0:
+        W_1 = 0
+    if threshold_error < W_2 < 0:
+        W_2 = 0
+    if threshold_error < W_3 < 0:
+        W_3 = 0
+
+    return W_1, W_2, W_3
+
+
+def get_tri_weights(
+    pos: np.ndarray,
+    SourcePosition: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Finds a suitable triangle of data points on the surface in which the given point lies
+
+    Parameters
+    ----------
+    pos: np.ndarray
+        Point of interest given as [azimuthal, elevation]
+    SourcePosition: np.ndarray
+        Positions of the source in the measurements in IR
+
+    Returns
+    -------
+    combination_vertices: np.ndarray
+        Indices of the three vertices in SourcePosition
+    W: np.ndarray
+        Barycentric weights of point in triangle;
+        if negative, no suitable triangle was found
+    """
+
+    W_1, W_2, W_3 = -1, -1, -1
+    index_triangle = 3
+    # get indices of source positions sorted by distance on the plane from pos
+    index_vertices, _ = find_ir(SourcePosition, pos[0], pos[1])
+    pos = np.array(wrap_angles(pos[0], pos[1]))
+    combination_vertices = None
+    while W_1 < 0 or W_2 < 0 or W_3 < 0:
+        if (
+            SourcePosition[index_vertices[0], 0] == pos[0]
+            and SourcePosition[index_vertices[0], 1] == pos[1]
+        ):
+            # if the position coincides with a position in the data set, take the first triangle that includes the point
+            combination_vertices = index_vertices[:3]
+            W_1, W_2, W_3 = (1, 0, 0)
+            break
+        index_HRIR = index_vertices[:index_triangle]  # get nearest positions
+        y_ele_all = SourcePosition[index_HRIR, 1]
+        if pos[1] > np.max(y_ele_all) or pos[1] < np.min(y_ele_all):
+            # no need to compute weights since all possible triangles lie completely above or below point
+            # attention: this can be problematic if no point is available at [0, +-90]
+            pass
+        else:
+            # test all triangle combinations with new point
+            for combination_vertices_tmp in itertools.combinations(index_HRIR[:-1], 2):
+                combination_vertices = np.concatenate(
+                    (index_HRIR[-1, None], combination_vertices_tmp), axis=0
+                )
+
+                x_azi = SourcePosition[combination_vertices, 0]
+                y_ele = SourcePosition[combination_vertices, 1]
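+                # A candidate triangle is accepted only if all three spherical
+                # barycentric weights come back non-negative, i.e. the query
+                # point lies inside the triangle or on one of its edges.
+                W_1, W_2, W_3 = 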
barycentric_weights(x_azi, y_ele, pos) + if W_1 >= 0 and W_2 >= 0 and W_3 >= 0: + # found suitable triangle + break + index_triangle += 1 + if index_triangle > 30: + # stop after too many iterations + return np.array(combination_vertices), np.array([-1, -1, -1]) + + W = np.array([W_1, W_2, W_3]) + return np.array(combination_vertices), W + + +def interpolate_2d( + azi_in: np.ndarray, + ele_in: np.ndarray, + values: np.ndarray, + pos: np.ndarray, + interp_1d: Optional[bool] = False, + weights: Optional[np.ndarray] = None, + ghost: Optional[list[bool]] = None, + SourcePosition: Optional[np.ndarray] = None, + IR: Optional[np.ndarray] = None, + phase: Optional[bool] = False, +) -> np.ndarray: + """ + Compute HRIR for point on surface spanned by three points via barycentric coordinates + + Parameters + ---------- + azi_in: np.ndarray + Azimuthal coordinates of three points that form a triangle in degrees + ele_in: np.ndarray + Elevation coordinates of three points that form a triangle in degrees + values: np.ndarray + Values to interpolate, here either HRIRs or magnitude or phase of HRTFs + pos: np.ndarray + Position of desired interpolation value + interp_1d: bool + 1d interpolation between two points + weights: tuple + If barycentric weights are already known these values are used + ghost: list of bool + If north and/or south pole is ghost source + SourcePosition: np.ndarray + Only necessary if at least one element in ghost is true + IR: np.ndarray + Only necessary if at least one element in ghost is true + phase: bool + If interpolated values are phases and should be wrapped + + Returns + ------- + HRIR: np.ndarray + Interpolated value at point pos + """ + + if ghost is None: + ghost = [False, False] + + if weights is None: + W_1, W_2, W_3 = barycentric_weights( + azi_in, ele_in, pos, interp_1d + ) # compute barycentric weights + else: + (W_1, W_2, W_3) = weights + + if ( + W_1 + W_2 + W_3 > 1.5 + ): # on sphere sum of weights is not necessarily equal to one! + raise ValueError( + f"Sum of positive barycentric weights larger than expected: {W_1 +W_2 +W_3}" + ) + + threshold_error = -1 * 10**-10 + if W_1 < threshold_error or W_2 < threshold_error or W_3 < threshold_error: + raise ValueError("Point lies outside of triangle! 
No interpolation possible") + + # do some phase unwrapping + if phase: + values = np.unwrap(values, axis=1) + + # treat potential ghost sources at the north and south pole + if (ghost[0] and 90 in ele_in) or (ghost[1] and -90 in ele_in): + if SourcePosition is None or IR is None: + raise ValueError( + "Source positions and IRs are required in interpolation if ghost source is used" + ) + ele_ghost = [] + additional_term = 0 + weights_copy = np.copy(weights) + if ghost[0] and 90 in ele_in: + ele_ghost.append(90) + if ghost[1] and -90 in ele_in: + ele_ghost.append(-90) + for ele_g in ele_ghost: + ind_dist, dist = find_ir(SourcePosition[: -len(ele_ghost)], 0, ele_g) + ind_dist = ind_dist[dist == dist[0]] + weight_spread = weights_copy[ele_in == ele_g] / len(ind_dist) + weights_copy[ele_in == ele_g] = 0 + additional_term += np.sum(IR[:, ind_dist], axis=1) * weight_spread + + HRIR = ( + values[:, 0] * W_1 + + values[:, 1] * W_2 + + values[:, 2] * W_3 + + additional_term + ) + + else: + HRIR = ( + values[:, 0] * W_1 + values[:, 1] * W_2 + values[:, 2] * W_3 + ) # apply weights + + return HRIR + + +def add_ghost_speaker_bary( + SourcePosition: np.ndarray, + IR: np.ndarray, +) -> Tuple[list[bool], np.ndarray, np.ndarray]: + """ + Adds a ghost speaker at the poles if necessary and indicates result by bool values + + Parameters + ---------- + SourcePosition: np.ndarray + All source positions + IR: np.ndarray + IRs at corresponding source positions + + Returns + ------- + ghost_pos: list of bool + If entry is True a ghost speaker is introduced at the north or south pole, respectively + SourcePosition: np.ndarray + All source positions plus poles if ghost_pos is True + IR: np.ndarray + IRs at corresponding source positions + """ + + ghost_pos = [False, False] + if 90 not in SourcePosition[:, 1]: + # if north pole is not in dataset add it + ghost_pos[0] = True + pole = np.array([0, 90, 1]) + SourcePosition = np.concatenate((SourcePosition, pole[None, :]), axis=0) + IR = np.concatenate((IR, np.zeros((*np.shape(IR)[:2], 1))), axis=2) + if -90 not in SourcePosition[:, 1]: + # if south pole is not in dataset add it + ghost_pos[1] = True + pole = np.array([0, -90, 1]) + SourcePosition = np.concatenate((SourcePosition, pole[None, :]), axis=0) + IR = np.concatenate((IR, np.zeros((*np.shape(IR)[:2], 1))), axis=2) + + return ghost_pos, SourcePosition, IR + + +def binaural_fftconv_framewise( + x: np.ndarray, + IR: np.ndarray, + SourcePosition: np.ndarray, + azi: Optional[np.ndarray] = None, + ele: Optional[np.ndarray] = None, + frame_len: Optional[int] = (IVAS_FRAME_LEN_MS // 4) * 48, +) -> np.ndarray: + """ + Binauralization using fft convolution with frame-wise processing + supports rotation on trajectories with interpolation between measured Source + positions, reimplemented roughly along the lines of ConvBinauralRenderer.m + + Parameters + ---------- + x: np.ndarray + Input multi-channel array + IR: np.ndarray + HRIRs array + SourcePosition: np.ndarray + Positions of the source in the measurements in IR + azi: np.ndarray + Azimuth angles for all frames + ele: np.ndarray + Elevation angles for all frames + frame_len: int + Frame length, optional, default = (IVAS_FRAME_LEN_MS // 4) * 48000 + + Returns + ------- + y: np.ndarray + Output binaural signal array + """ + + sig_len = x.shape[0] + N_frames = int( + sig_len / frame_len + ) # TODO add ceil function for non-integer frame length multiples + num_points_interp = 3 # interpolation in triangle + + N_HRIR_taps = IR.shape[0] + + if azi is None or ele is None: + 
azi = np.repeat([0.0], N_frames) + ele = np.repeat([0.0], N_frames) + elif len(azi) < N_frames or len(ele) < N_frames: + azi = np.concatenate( + [np.repeat(azi, N_frames // len(azi)), azi[: N_frames % len(azi)]] + ) + ele = np.concatenate( + [np.repeat(ele, N_frames // len(ele)), ele[: N_frames % len(ele)]] + ) + + indices_HRIR = np.empty([N_frames, num_points_interp], dtype=int) + IR_2d = np.empty((N_frames, N_HRIR_taps, 2, num_points_interp)) + Bary_weights = np.empty((N_frames, 3)) + + # find three points to form a triangle for interpolation + # test if point lies within triangle spanned by these points by checking the signas of barycentric coordinates + # if all weights are >= 0 the point lies within the triangle + for index in range(np.shape(SourcePosition)[0]): + SourcePosition[index, 0:2] = np.array( + wrap_angles(SourcePosition[index, 0], SourcePosition[index, 1]) + ) + + # add ghost speaker to poles if necessary + ghost_pos, SourcePosition, IR = add_ghost_speaker_bary(SourcePosition, IR) + for i_frame in range(N_frames): + if ( + i_frame + and azi[i_frame] == azi[i_frame - 1] + and ele[i_frame] == ele[i_frame - 1] + ): + IR_2d[i_frame] = IR_2d[i_frame - 1] + indices_HRIR[i_frame] = indices_HRIR[i_frame - 1] + Bary_weights[i_frame] = Bary_weights[i_frame - 1] + continue + pos = np.array([azi[i_frame], ele[i_frame]]) + combination_vertices, W = get_tri_weights(pos, SourcePosition) + if (W < 0).all(): + raise ValueError("No suitable triangle found in frame " + str(i_frame)) + IR_2d[i_frame] = IR[:, :, np.array(combination_vertices)] + indices_HRIR[i_frame] = combination_vertices + Bary_weights[i_frame] = W + + T_rev = frame_len + N_HRIR_taps - 1 + N_rev = int(np.ceil(T_rev / frame_len)) + + fade_in = np.arange(frame_len) / (frame_len - 1) + fade_in = fade_in[:, np.newaxis] + fade_out = 1.0 - fade_in + + # compute both ears in parallel + i_ear = list(range(2)) + result = apply_func_parallel( + render_ear, + zip( + i_ear, + repeat(frame_len), + repeat(N_frames), + repeat(N_rev), + repeat(T_rev), + repeat(fade_in), + repeat(fade_out), + repeat(x), + repeat(sig_len), + repeat(N_HRIR_taps), + repeat(azi), + repeat(ele), + repeat(SourcePosition), + repeat(IR_2d), + repeat(Bary_weights), + repeat(ghost_pos), + repeat(IR), + repeat(indices_HRIR), + ), + None, + "mp", + False, + ) + + y = np.stack(result, axis=1) + + return y[0:sig_len] + + +def render_ear( + i_ear, + frame_len, + N_frames, + N_rev, + T_rev, + fade_in, + fade_out, + x, + sig_len, + N_HRIR_taps, + azi, + ele, + SourcePosition, + IR_2d, + Bary_weights, + ghost_pos, + IR, + indices_HRIR, +) -> np.ndarray: + # function to process one ear used in multiprocessing + G = np.empty((N_frames, N_HRIR_taps)) + + for frame in range(N_frames): + pos = np.array([azi[frame], ele[frame]]) + # Interpolation of time-domain signals + G[frame] = interpolate_2d( + SourcePosition[indices_HRIR[frame], 0], + SourcePosition[indices_HRIR[frame], 1], + IR_2d[frame, :, i_ear], + pos, + weights=Bary_weights[frame], + ghost=ghost_pos, + SourcePosition=SourcePosition, + IR=IR[:, i_ear], + ) + + # frame wise parallel computation slow (many frames, small computational load per frame) + i_frame = list(range(N_frames)) + result = apply_func_parallel( + convolve_frame, + zip( + i_frame, + repeat(frame_len), + repeat(N_frames), + repeat(N_rev), + repeat(T_rev), + repeat(i_ear), + repeat(fade_in), + repeat(fade_out), + repeat(G), + repeat(x), + repeat(sig_len), + repeat(N_HRIR_taps), + ), + None, + "mt", + False, + ) + + return np.hstack(result) + + +def 
convolve_frame( + i_frame, + frame_len, + N_frames, + N_rev, + T_rev, + i_ear, + fade_in, + fade_out, + G, + x, + sig_len, + N_HRIR_taps, +) -> np.ndarray: + # function to process one frame used in multiprocessing + i1 = i_frame * frame_len + i2 = (i_frame + 1) * frame_len + + y0 = np.zeros([2, sig_len + N_HRIR_taps - 1, 2]) + + G0 = G[i_frame] + G1 = G[min(i_frame + 1, N_frames - 1)] + + for j_frame in range(max(0, i_frame - N_rev), min(i_frame + 1, N_frames)): + j1 = j_frame * frame_len + j2 = (j_frame + 1) * frame_len + j2p = j1 + T_rev + + y0[0, j1:j2p, i_ear] += convolve(np.squeeze(x[j1:j2]), G0) + y0[1, j1:j2p, i_ear] += convolve(np.squeeze(x[j1:j2]), G1) + + y_frame = ( + np.squeeze(fade_out) * y0[0, i1:i2, i_ear] + + np.squeeze(fade_in) * y0[1, i1:i2, i_ear] + ) + return y_frame diff --git a/item_generation_scripts/audiotools/constants.py b/item_generation_scripts/audiotools/constants.py new file mode 100644 index 00000000..c3af9d29 --- /dev/null +++ b/item_generation_scripts/audiotools/constants.py @@ -0,0 +1,704 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. 
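
For reference, a minimal sketch of the crossfaded overlap-add scheme behind binaural_fftconv_framewise / convolve_frame above. The 3-tap IRs and the white-noise input are invented for illustration and are not part of the patch:

import numpy as np
from scipy.signal import convolve

frame_len = (20 // 4) * 48                 # 5 ms subframe at 48 kHz
rng = np.random.default_rng(0)
x = rng.standard_normal(4 * frame_len)     # toy single-channel input
h_old = np.array([1.0, 0.5, 0.25])         # hypothetical previous-frame IR
h_new = np.array([0.8, 0.4, 0.2])          # hypothetical current-frame IR

fade_in = np.arange(frame_len) / (frame_len - 1)
fade_out = 1.0 - fade_in
tail = len(h_old) - 1

y_old = np.zeros(len(x) + tail)            # rendering with the old IR
y_new = np.zeros(len(x) + tail)            # rendering with the new IR
y = np.zeros(len(x))
for i in range(len(x) // frame_len):
    i1, i2 = i * frame_len, (i + 1) * frame_len
    # overlap-add: each frame convolution spills a tail into later frames
    y_old[i1 : i2 + tail] += convolve(x[i1:i2], h_old)
    y_new[i1 : i2 + tail] += convolve(x[i1:i2], h_new)
    # crossfade between the two renderings inside the frame window
    y[i1:i2] = fade_out * y_old[i1:i2] + fade_in * y_new[i1:i2]

In the actual renderer the IR pair changes every frame, so the tails of past frames are re-rendered with the current interpolated HRIR before the crossfade is applied.
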
+# + +import numpy as np + +BINAURAL_AUDIO_FORMATS = { + "BINAURAL": { + "num_channels": 2, + }, + "BINAURAL_ROOM": { + "num_channels": 2, + }, +} + +BINAURAL_LFE_GAIN = 10 ** (5.5 / 20) + +LFE_INDEX_DEFAULT = 3 + +LS_AZI_MONO = [0] +LS_ELE_MONO = [0] + +LS_AZI_STEREO = [30, -30] +LS_ELE_STEREO = [0, 0] + +LS_AZI_CICP6 = [30, -30, 0, 0, 110, -110] +LS_ELE_CICP6 = [0, 0, 0, 0, 0, 0] + +LS_AZI_CICP12 = [30, -30, 0, 0, 110, -110, 135, -135] +LS_ELE_CICP12 = [0, 0, 0, 0, 0, 0, 0, 0] + +LS_AZI_CICP14 = [30, -30, 0, 0, 110, -110, 30, -30] +LS_ELE_CICP14 = [0, 0, 0, 0, 0, 0, 35, 35] + +LS_AZI_CICP16 = [30, -30, 0, 0, 110, -110, 30, -30, 110, -110] +LS_ELE_CICP16 = [0, 0, 0, 0, 0, 0, 35, 35, 35, 35] + +LS_AZI_CICP19 = [30, -30, 0, 0, 135, -135, 90, -90, 30, -30, 135, -135] +LS_ELE_CICP19 = [0, 0, 0, 0, 0, 0, 0, 0, 35, 35, 35, 35] + + +CHANNEL_BASED_AUDIO_FORMATS = { + "MONO": { + "num_channels": 1, + "ls_azi": LS_AZI_MONO, + "ls_ele": LS_ELE_MONO, + "lfe_index": [], + }, + "STEREO": { + "num_channels": 2, + "ls_azi": LS_AZI_STEREO, + "ls_ele": LS_ELE_STEREO, + "lfe_index": [], + }, + "5_1": { + "num_channels": 6, + "ls_azi": LS_AZI_CICP6, + "ls_ele": LS_ELE_CICP6, + "lfe_index": [LFE_INDEX_DEFAULT], + }, + "5_1_2": { + "num_channels": 8, + "ls_azi": LS_AZI_CICP14, + "ls_ele": LS_ELE_CICP14, + "lfe_index": [LFE_INDEX_DEFAULT], + }, + "5_1_4": { + "num_channels": 10, + "ls_azi": LS_AZI_CICP16, + "ls_ele": LS_ELE_CICP16, + "lfe_index": [LFE_INDEX_DEFAULT], + }, + "7_1": { + "num_channels": 8, + "ls_azi": LS_AZI_CICP12, + "ls_ele": LS_ELE_CICP12, + "lfe_index": [LFE_INDEX_DEFAULT], + }, + "7_1_4": { + "num_channels": 12, + "ls_azi": LS_AZI_CICP19, + "ls_ele": LS_ELE_CICP19, + "lfe_index": [LFE_INDEX_DEFAULT], + }, + "LS": { + "num_channels": 15, + "ls_azi": [ + 30, + -30, + 0, + 135, + -135, + 110, + -110, + 90, + -90, + 30, + -30, + 110, + -110, + 135, + -135, + ], + "ls_ele": [0, 0, 0, 0, 0, 0, 0, 0, 0, 35, 35, 35, 35, 35, 35], + "lfe_index": [], + }, + "MOZART": { + "num_channels": 30, + "ls_azi": [ + 0, + 0, + 135, + -135, + 30, + -30, + 180, + 0, + 90, + -90, + 45, + -45, + 0, + 0, + 135, + -135, + 90, + -90, + 180, + 0, + 45, + -45, + 60, + -60, + 110, + -110, + 30, + -30, + 110, + -110, + ], + "ls_ele": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 35, + 35, + 35, + 90, + 35, + 35, + 35, + 35, + 35, + -15, + -15, + -15, + 0, + 0, + 0, + 0, + 35, + 35, + 35, + 35, + ], + "lfe_index": [1, 7], + }, + "CUSTOM_LS": { + "num_channels": -1, + "ls_azi": None, + "ls_ele": None, + "lfe_index": None, + }, +} + +# Support a variety of names for multichannel configs +CHANNEL_BASED_AUDIO_ALTNAMES = { + # 5_1 + 51: "5_1", # YAML by default will interpret underscore delimited numbers as integers, similar to python + "5d1": "5_1", + "5.1": "5_1", + "CICP6": "5_1", + # 7_1 + 71: "7_1", + "7d1": "7_1", + "7.1": "7_1", + "CICP12": "7_1", + # 5_1_2 + 512: "5_1_2", + "5d1p2": "5_1_2", + "5.1+2": "5_1_2", + "5.1.2": "5_1_2", + "CICP14": "5_1_2", + # 5_1_4 + 514: "5_1_4", + "5d1p4": "5_1_4", + "5.1+4": "5_1_4", + "5.1.4": "5_1_4", + "CICP16": "5_1_4", + # 7_1_4 + 714: "7_1_4", + "7d1p4": "7_1_4", + "7.1+4": "7_1_4", + "7.1.4": "7_1_4", + "CICP19": "7_1_4", +} + +METADATA_ASSISTED_SPATIAL_AUDIO_FORMATS = { + "MASA1": { + "num_channels": 1, + }, + "MASA2": { + "num_channels": 2, + }, +} +OBJECT_BASED_AUDIO_FORMATS = { + "ISM1": { + "num_channels": 1, + }, + "ISM2": { + "num_channels": 2, + }, + "ISM3": { + "num_channels": 3, + }, + "ISM4": { + "num_channels": 4, + }, +} + + +SCENE_BASED_AUDIO_FORMATS = { + 
"FOA": { + "num_channels": 4, + "is_planar": False, + }, + "HOA2": { + "num_channels": 9, + "is_planar": False, + }, + "HOA3": { + "num_channels": 16, + "is_planar": False, + }, + "PLANARFOA": { + "num_channels": 4, + "is_planar": True, + }, + "PLANARHOA2": { + "num_channels": 9, + "is_planar": True, + }, + "PLANARHOA3": { + "num_channels": 16, + "is_planar": True, + }, + "SBA1": { + "num_channels": 4, + "is_planar": False, + }, + "SBA2": { + "num_channels": 9, + "is_planar": False, + }, + "SBA3": { + "num_channels": 16, + "is_planar": False, + }, +} + +SCENE_METADATA_FORMATS = {"META"} + +AUDIO_FORMATS = [ + BINAURAL_AUDIO_FORMATS, + CHANNEL_BASED_AUDIO_FORMATS, + METADATA_ASSISTED_SPATIAL_AUDIO_FORMATS, + OBJECT_BASED_AUDIO_FORMATS, + SCENE_BASED_AUDIO_FORMATS, +] + + +IVAS_FRAME_LEN_MS = 20 + +IVAS_CICPX_TO_MONO = np.array( + [ + [ + 1, + 1, + 1, + 1, + 0.79999995, + 0.79999995, + 0.79999995, + 0.79999995, + 0.849999964, + 0.849999964, + 0.849999964, + 0.849999964, + ] + ] +).T + +IVAS_CICPX_TO_STEREO = np.array( + [ + [1, 0], + [0, 1], + [np.sqrt(0.5), np.sqrt(0.5)], + [np.sqrt(0.5), np.sqrt(0.5)], + [0.79999995, 0], + [0, 0.79999995], + [0.79999995, 0], + [0, 0.79999995], + [0.849999964, 0], + [0, 0.849999964], + [0.849999964, 0], + [0, 0.849999964], + ] +) + +# downmix matrices +IVAS_CICP12_TO_6 = np.zeros(8 * 6) +IVAS_CICP12_TO_6[[0, 7, 14, 21, 28, 35, 40, 47]] = 1 +IVAS_CICP12_TO_6 = IVAS_CICP12_TO_6.reshape(8, 6) + +IVAS_CICP14_TO_6 = np.zeros(8 * 6) +IVAS_CICP14_TO_6[[0, 7, 14, 21, 28, 35]] = 1 +IVAS_CICP14_TO_6[[36, 43]] = 0.849999964 +IVAS_CICP14_TO_6 = IVAS_CICP14_TO_6.reshape(8, 6) + +IVAS_CICP16_TO_6 = np.zeros(10 * 6) +IVAS_CICP16_TO_6[[0, 7, 14, 21, 28, 35]] = 1 +IVAS_CICP16_TO_6[[36, 43, 52, 59]] = 0.849999964 +IVAS_CICP16_TO_6 = IVAS_CICP16_TO_6.reshape(10, 6) + +IVAS_CICP16_TO_12 = np.zeros(10 * 8) +IVAS_CICP16_TO_12[[0, 9, 18, 27, 36, 45]] = 1 +IVAS_CICP16_TO_12[[48, 57, 68, 77]] = 0.849999964 +IVAS_CICP16_TO_12 = IVAS_CICP16_TO_12.reshape(10, 8) + +IVAS_CICP16_TO_14 = np.zeros(10 * 8) +IVAS_CICP16_TO_14[[0, 9, 18, 27, 36, 45, 54, 63]] = 1 +IVAS_CICP16_TO_14[[68, 77]] = 0.849999964 +IVAS_CICP16_TO_14 = IVAS_CICP16_TO_14.reshape(10, 8) + +IVAS_CICP19_TO_6 = np.zeros(12 * 6) +IVAS_CICP19_TO_6[[0, 7, 14, 21, 28, 35]] = 1 +IVAS_CICP19_TO_6[[36, 43]] = 0.367322683 +IVAS_CICP19_TO_6[[48, 55, 64, 71]] = 0.849999964 +IVAS_CICP19_TO_6[[40, 47]] = 0.930093586 +IVAS_CICP19_TO_6 = IVAS_CICP19_TO_6.reshape(12, 6) + +IVAS_CICP19_TO_12 = np.zeros(12 * 8) +IVAS_CICP19_TO_12[[0, 9, 18, 27, 38, 47]] = 1 +IVAS_CICP19_TO_12[[48, 57]] = 0.367322683 +IVAS_CICP19_TO_12[[64, 73, 84, 93]] = 0.849999964 +IVAS_CICP19_TO_12[[52, 61]] = 0.930093586 +IVAS_CICP19_TO_12 = IVAS_CICP19_TO_12.reshape(12, 8) + +IVAS_CICP19_TO_14 = np.zeros(12 * 8) +IVAS_CICP19_TO_14[[0, 9, 18, 27, 36, 45, 70, 79]] = 1 +IVAS_CICP19_TO_14[[48, 57]] = 0.367322683 +IVAS_CICP19_TO_14[[84, 93]] = 0.849999964 +IVAS_CICP19_TO_14[[52, 61]] = 0.930093586 +IVAS_CICP19_TO_14 = IVAS_CICP19_TO_14.reshape(12, 8) + +IVAS_CICP19_TO_16 = np.zeros(12 * 10) +IVAS_CICP19_TO_16[[0, 11, 22, 33, 44, 55, 86, 97, 108, 119]] = 1 +IVAS_CICP19_TO_16[[60, 71]] = 0.367322683 +IVAS_CICP19_TO_16[[64, 75]] = 0.930093586 +IVAS_CICP19_TO_16 = IVAS_CICP19_TO_16.reshape(12, 10) + +# upmix matrices +IVAS_MONO_TO_CICPX = np.zeros([1, 12]) +IVAS_MONO_TO_CICPX[0, 2] = 1 + +IVAS_STEREO_TO_CICPX = np.zeros([2, 12]) +IVAS_STEREO_TO_CICPX[0, 0] = 1 +IVAS_STEREO_TO_CICPX[1, 1] = 1 + +IVAS_CICP12_TO_14 = np.zeros(8 * 8) +IVAS_CICP12_TO_14[[0, 9, 18, 27, 36, 45, 52, 
61]] = 1 +IVAS_CICP12_TO_14 = IVAS_CICP12_TO_14.reshape(8, 8) + +IVAS_CICP12_TO_16 = np.zeros(8 * 10) +IVAS_CICP12_TO_16[[0, 11, 22, 33, 44, 55, 64, 75]] = 1 +IVAS_CICP12_TO_16 = IVAS_CICP12_TO_16.reshape(8, 10) + +IVAS_CICP12_TO_19 = np.zeros(8 * 12) +IVAS_CICP12_TO_19[[0, 13, 26, 39, 54, 67, 76, 89]] = 1 +IVAS_CICP12_TO_19 = IVAS_CICP12_TO_19.reshape(8, 12) + +IVAS_CICP14_TO_19 = np.zeros(8 * 12) +IVAS_CICP14_TO_19[[0, 13, 26, 39, 52, 65, 80, 93]] = 1 +IVAS_CICP14_TO_19 = IVAS_CICP14_TO_19.reshape(8, 12) + +IVAS_CICP16_TO_19 = np.zeros(10 * 12) +IVAS_CICP16_TO_19[[0, 13, 26, 39, 52, 65, 80, 93, 106, 119]] = 1 +IVAS_CICP16_TO_19 = IVAS_CICP16_TO_19.reshape(10, 12) + +# mapping dict +IVAS_MC_CONVERSION = { + "MONO": { + # upmix + "5_1": IVAS_MONO_TO_CICPX[:, :6], + "7_1": IVAS_MONO_TO_CICPX[:, :8], + "5_1_2": IVAS_MONO_TO_CICPX[:, :8], + "5_1_4": IVAS_MONO_TO_CICPX[:, :10], + "7_1_4": IVAS_MONO_TO_CICPX[:, :12], + }, + "STEREO": { + # upmix + "5_1": IVAS_STEREO_TO_CICPX[:, :6], + "7_1": IVAS_STEREO_TO_CICPX[:, :8], + "5_1_2": IVAS_STEREO_TO_CICPX[:, :8], + "5_1_4": IVAS_STEREO_TO_CICPX[:, :10], + "7_1_4": IVAS_STEREO_TO_CICPX[:, :12], + }, + "5_1": { + # downmix + "MONO": IVAS_CICPX_TO_MONO[:6, :], + "STEREO": IVAS_CICPX_TO_STEREO[:6, :], + # upmix + "7_1": np.pad(np.eye(6), [[0, 0], [0, 2]]), + "5_1_2": np.pad(np.eye(6), [[0, 0], [0, 2]]), + "5_1_4": np.pad(np.eye(6), [[0, 0], [0, 4]]), + "7_1_4": np.pad(np.eye(6), [[0, 0], [0, 6]]), + }, + "7_1": { + # downmix + "MONO": IVAS_CICPX_TO_MONO[:8, :], + "STEREO": IVAS_CICPX_TO_STEREO[:8, :], + "5_1": IVAS_CICP12_TO_6, + # upmix + "5_1_2": IVAS_CICP12_TO_14, + "5_1_4": IVAS_CICP12_TO_16, + "7_1_4": IVAS_CICP12_TO_19, + }, + "5_1_2": { + # downmix + "MONO": np.vstack([IVAS_CICPX_TO_MONO[:6, :], IVAS_CICPX_TO_MONO[-2:, :]]), + "STEREO": np.vstack( + [IVAS_CICPX_TO_STEREO[:6, :], IVAS_CICPX_TO_STEREO[-2:, :]] + ), + "5_1": IVAS_CICP14_TO_6, + "7_1": np.pad(IVAS_CICP14_TO_6, [[0, 0], [0, 2]]), + # upmix + "5_1_4": np.pad(np.eye(8), [[0, 0], [0, 2]]), + "7_1_4": IVAS_CICP14_TO_19, + }, + "5_1_4": { + # downmix + "MONO": np.vstack([IVAS_CICPX_TO_MONO[:6, :], IVAS_CICPX_TO_MONO[-4:, :]]), + "STEREO": np.vstack( + [IVAS_CICPX_TO_STEREO[:6, :], IVAS_CICPX_TO_STEREO[-4:, :]] + ), + "5_1": IVAS_CICP16_TO_6, + "7_1": IVAS_CICP16_TO_12, + "5_1_2": IVAS_CICP16_TO_14, + # upmix + "7_1_4": IVAS_CICP16_TO_19, + }, + "7_1_4": { + # downmix + "MONO": IVAS_CICPX_TO_MONO, + "STEREO": IVAS_CICPX_TO_STEREO, + "5_1": IVAS_CICP19_TO_6, + "7_1": IVAS_CICP19_TO_12, + "5_1_2": IVAS_CICP19_TO_14, + "5_1_4": IVAS_CICP19_TO_16, + }, +} + +# LFE 120 Hz LPF filter coefficients +IVAS_LPF_4_BUTTER_48K_SOS = np.array( + [ + [ + 5.12617881476274e-09, + 1.02523584294987e-08, + 5.12617879059970e-09, + 1, + -1.96875982668433, + 0.969044914826862, + ], + [ + 1, + 1.99999984394358, + 1.00000000471366, + 1, + -1.98677297369091, + 0.987060670205863, + ], + ] +) + +T_DESIGN_11_AZI = np.array( + [ + 132.927291884332, + -83.9349499672527, + 8.47410038634525, + -113.340833834572, + -103.265909909537, + -33.2370360923825, + 21.8564347471830, + -156.539486489880, + -64.2647531387317, + 165.779530068738, + -25.2028339893249, + -97.0037973959711, + 27.8546391256925, + 153.214218975132, + -155.061608694663, + -11.8421354925543, + 80.5387312016125, + -42.0561606270165, + -31.2233262205060, + 38.8379041944063, + 93.7606877469492, + -84.7560200078398, + 7.75536818082863, + -122.276883381108, + 46.8012705252113, + -24.7686335284573, + 99.8904719062334, + -134.783996960185, + -83.0880230164493, + 
60.1281736000420, + 152.644656278084, + 29.7576658909417, + 40.7793187974476, + 110.183927562412, + 165.652065916454, + -12.9926632105736, + 79.7359893585681, + -50.5245271190884, + 118.923930267733, + 47.2202861862577, + 171.925276523721, + -62.5145800558502, + -11.1156697680531, + 132.018041099963, + -135.355486412425, + 102.370921576708, + 112.739282398012, + -178.304963670831, + -122.319932198534, + 59.0763464570905, + 151.704200334501, + 21.3763364190503, + -169.005476417779, + 118.980811786769, + -116.089295979010, + 9.64767870353308, + 60.8933243657771, + -156.021526862757, + -63.4602993325163, + 174.929787427393, + -175.288768596346, + -105.951907934032, + -50.1928304519800, + 131.358266702971, + -136.296815007542, + 93.5644603506407, + -97.0840116473627, + -169.158278888619, + -44.1323835471345, + 81.4795403841382, + ] +) + +T_DESIGN_11_ELE = np.array( + [ + 7.69254738757899, + -23.7300652200871, + 23.5127556185301, + 70.4225940747938, + -9.89694439538752, + -70.7513316063095, + -26.4618527647561, + 47.7764936689044, + -7.72047049524459, + 44.5343602375216, + 26.3897904767450, + -44.6578850137166, + 9.76703456924600, + -47.7053318175498, + 7.45302934155972, + -23.5901209534773, + 23.7194484034707, + 70.4382693912270, + -9.83541588740259, + -70.4980825105727, + -26.2949218109204, + 47.6148028805222, + -7.51718499746626, + 44.2862347125773, + 26.6442619674660, + -44.5693707254340, + 9.91271928508000, + -47.9599550372574, + 7.29679922953795, + -23.3445981426306, + 23.6415261666079, + 70.6843143997832, + -9.58140351749889, + -70.3934534122902, + -26.4258159091605, + 47.7510668062369, + -7.30853603036844, + 44.2632768570349, + 26.7140614474957, + -44.3149733480527, + 9.75899721561506, + -48.0361913333593, + 7.43965099805872, + -23.3326075548841, + 23.3868959687598, + 70.8219078016791, + -9.48596399169388, + -70.5801867828491, + -26.6740262349265, + 47.9978414043199, + -7.38276167631068, + 44.4970603752708, + 26.5024990214418, + -44.2461913308458, + 9.51845076548334, + -47.8281351088411, + 7.68427447425834, + -23.5706842106942, + 23.3074499244045, + 70.6586472132300, + -9.68088860263008, + -70.8026785673948, + -26.6963451935976, + 48.0136296461397, + -7.63734823159200, + 44.6651234222196, + 26.3023490002159, + -44.4576351865647, + 9.52341455917443, + -47.6242211091394, + ] +) +PLANAR_HOA_CHANNELS_ACN = np.array([0, 1, 3, 4, 8, 9, 15]) +VERT_HOA_CHANNELS_ACN = np.array([2, 5, 6, 7, 10, 11, 12, 13, 14]) + +SEED_PADDING = 0 + +# delay in number of samples +DELAY_COMPENSATION_FOR_FILTERING = { + "SHQ2": { + "up": 436, + "down": 218, + }, + "SHQ3": { + "up": 436, + "down": 145, + }, + "MSIN": 92, + "LP1p5": 322, + "LP35": 232, + "LP7": 117, + "LP10": 82, + "LP12": 164, + "LP14": 234, + "LP20": 161, + "HP50_32KHZ": 559, + "HP50_48KHZ": 839, +} diff --git a/item_generation_scripts/audiotools/convert/__init__.py b/item_generation_scripts/audiotools/convert/__init__.py new file mode 100644 index 00000000..4ec23739 --- /dev/null +++ b/item_generation_scripts/audiotools/convert/__init__.py @@ -0,0 +1,323 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. 
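
As a usage note for the conversion matrices defined above: render_cba_to_cba (later in this patch) applies them as a plain matrix product on (samples x channels) arrays. A minimal sketch, with a silent one-frame buffer standing in for real audio:

import numpy as np

from item_generation_scripts.audiotools.constants import IVAS_MC_CONVERSION

x = np.zeros((960, 12))                    # hypothetical 20 ms 7.1.4 frame at 48 kHz
dmx = IVAS_MC_CONVERSION["7_1_4"]["5_1"]   # (12, 6) downmix matrix (IVAS_CICP19_TO_6)
y = x @ dmx                                # (960, 6) 5.1 downmix
assert y.shape == (960, 6)
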
+# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# + +import logging +from pathlib import Path, PurePath +from typing import Optional, Union + +from item_generation_scripts.audiotools import audio, audioarray, metadata +from item_generation_scripts.audiotools.audiofile import write +from item_generation_scripts.audiotools.convert.channelbased import convert_channelbased +from item_generation_scripts.audiotools.convert.masa import convert_masa +from item_generation_scripts.audiotools.convert.objectbased import convert_objectbased +from item_generation_scripts.audiotools.convert.scenebased import convert_scenebased +from item_generation_scripts.audiotools.wrappers.bs1770 import loudness_norm +from item_generation_scripts.audiotools.wrappers.esdru import esdru +from item_generation_scripts.audiotools.wrappers.filter import ( + hp50filter_itu, + lpfilter_itu, + resample_itu, +) +from item_generation_scripts.audiotools.wrappers.p50fbmnru import p50fbmnru + +from ..metadata import write_ISM_metadata_in_file + + +def convert_file( + in_file: Union[str, Path], + out_file: Union[str, Path], + in_fs: int, + in_fmt: Union[str, Path], + out_fmt: Optional[Union[str, Path]] = None, + out_fs: Optional[int] = None, + in_meta: Optional[list] = None, + logger: Optional[logging.Logger] = None, + **kwargs, +) -> None: + """Conversion function for one audio file""" + + if not in_fmt: + raise ValueError("Input audio format must be specified!") + + # get audio class object - can be either a regular single audio or scene description .txt + if not isinstance(in_fmt, PurePath) and in_fmt.startswith("META"): + input = metadata.Metadata(in_file) + else: + input = audio.fromfile(in_fmt, in_file, in_fs, in_meta) + + # try to set reasonable defaults if missing + if not in_fs: + in_fs = input.fs + if not out_fs: + out_fs = input.fs + + if not out_fmt: + if isinstance(input, metadata.Metadata): + raise ValueError( + "Output format must be specified 
for scene description files!" + ) + else: + out_fmt = input.name + + output = audio.fromtype(out_fmt) + if isinstance(output, audio.ObjectBasedAudio): + try: + output.object_pos = input.object_pos + output.metadata_files = input.metadata_files + except Exception: + raise ValueError( + "ISM is not supported as an output for rendering! Only usable as pass-through" + ) + + if isinstance(input, metadata.Metadata): + if logger: + logger.debug(f"Converting metadata to {out_fmt} : {in_file} -> {out_file}") + + # render each audio instance separately + for audio_in in input.audio: + output.fs = out_fs + tmp = audio.fromtype(out_fmt) + tmp.fs = in_fs # resampling not yet applied + convert(audio_in, tmp, in_fs=in_fs, out_fs=out_fs, logger=logger, **kwargs) + if output.audio is not None: + output.audio += tmp.audio + else: + output.audio = tmp.audio + else: + if logger: + logger.debug(f"Converting {in_fmt} to {out_fmt} : {in_file} -> {out_file}") + # run main conversion method + output.fs = in_fs # resampling not yet applied + convert(input, output, in_fs=in_fs, out_fs=out_fs, logger=logger, **kwargs) + + # write output + write(out_file, output.audio, output.fs) + if isinstance(output, audio.ObjectBasedAudio): + write_ISM_metadata_in_file(output.object_pos, [out_file], automatic_naming=True) + + +def convert( + input: audio.Audio, + output: audio.Audio, + in_trim: Optional[list] = None, + in_pad_noise: Optional[bool] = False, + in_delay: Optional[float] = None, + in_fs: Optional[int] = None, + in_cutoff: Optional[int] = None, + in_hp50: Optional[bool] = None, + in_window: Optional[list] = None, + in_loudness: Optional[float] = None, + in_loudness_fmt: Optional[str] = None, + out_trim: Optional[list] = None, + out_pad_noise: Optional[bool] = False, + out_delay: Optional[float] = None, + out_fs: Optional[int] = None, + out_cutoff: Optional[int] = None, + out_hp50: Optional[bool] = None, + out_window: Optional[list] = None, + out_loudness: Optional[float] = None, + out_loudness_fmt: Optional[str] = None, + limit: Optional[bool] = False, + mnru_q: Optional[float] = None, + esdru_alpha: Optional[float] = None, + logger: Optional[logging.Logger] = None, + **kwargs, +) -> None: + """Perform pre-processing, conversion and post-processing""" + + """pre-processing""" + process_audio( + x=input, + trim=in_trim, + pad_noise=in_pad_noise, + delay=in_delay, + fs=in_fs, + fc=in_cutoff, + hp50=in_hp50, + window=in_window, + loudness=in_loudness, + loudness_fmt=in_loudness_fmt, + logger=logger, + ) + + """format conversion""" + format_conversion(input, output, logger=logger, **kwargs) + + """post-processing""" + process_audio( + x=output, + trim=out_trim, + pad_noise=out_pad_noise, + delay=out_delay, + fs=out_fs, + fc=out_cutoff, + hp50=out_hp50, + window=out_window, + loudness=out_loudness, + loudness_fmt=out_loudness_fmt, + limit=limit, + mnru_q=mnru_q, + esdru_alpha=esdru_alpha, + logger=logger, + ) + + +def process_audio( + x: audio.Audio, + trim: Optional[list] = None, + pad_noise: Optional[bool] = False, + delay: Optional[float] = None, + fs: Optional[int] = None, + fc: Optional[int] = None, + hp50: Optional[bool] = False, + window: Optional[float] = None, + loudness: Optional[float] = None, + loudness_fmt: Optional[str] = None, + limit: Optional[bool] = False, + mnru_q: Optional[float] = None, + esdru_alpha: Optional[float] = None, + logger: Optional[logging.Logger] = None, +) -> None: + """Perform (pre-/pos-) processing of audio""" + + if fs is None: + fs = x.fs + + """delay audio""" + if delay is not None: + 
if logger: + logger.debug(f"Delaying audio by {delay} ms") + x.audio = audioarray.delay(x.audio, x.fs, delay) + + """trim or pad audio""" + if trim is not None: + if isinstance(x, audio.ObjectBasedAudio): + # metadata concatenation necessary for ISM + metadata.trim_meta(x, tuple(trim), pad_noise) + else: + x.audio = audioarray.trim(x.audio, x.fs, tuple(trim), pad_noise) + + """windowing""" + if window is not None: + if logger: + logger.debug(f"Windowing audio with {window} ms Hann window") + x.audio = audioarray.window(x.audio, x.fs, window) + + """high-pass (50 Hz) filtering""" + if hp50: + if logger: + logger.debug("Applying 50 Hz high-pass filter using ITU STL filter") + x.audio = hp50filter_itu(x) + + """resampling""" + if x.fs != fs: + if logger: + logger.debug(f"Resampling from {x.fs} to {fs} using ITU STL filter") + x.audio = resample_itu(x, fs) + x.fs = fs + + """loudness normalization""" + if loudness is not None: + if logger: + logger.debug( + f"Applying loudness adjustment to {loudness} LKFS for format {loudness_fmt} using ITU STL bs1770demo" + ) + x.audio = loudness_norm(x, loudness, loudness_fmt) + + """low-pass filtering""" + if fc is not None: + if logger: + logger.debug( + f"Applying low-pass filter with cutoff {fc} Hz using ITU STL filter" + ) + x.audio = lpfilter_itu(x, fc) + + """MNRU""" + if mnru_q is not None: + if logger: + logger.debug("Applying P.50 Fullband MNRU") + x.audio = p50fbmnru(x, mnru_q) + + """ESDRU""" + if esdru_alpha is not None: + if logger: + logger.debug("Applying ESDRU Recommendation ITU-T P.811") + x.audio = esdru(x, esdru_alpha) + + """limiting""" + if limit: + if logger: + logger.debug("Applying limiter") + audioarray.limiter(x.audio, x.fs) + + +def format_conversion( + input: audio.Audio, + output: audio.Audio, + logger: Optional[logging.Logger] = None, + **kwargs, +) -> None: + """Convert one audio format to another""" + + # validation + if isinstance(output, audio.MetadataAssistedSpatialAudio): + raise NotImplementedError("MASA is not supported as an output for rendering!") + + if isinstance(output, audio.ObjectBasedAudio) and input.name != output.name: + raise NotImplementedError( + "ISM is not supported as an output for rendering! Only usable as pass-through" + ) + + if logger: + logger.debug(f"Format conversion: {input.name} -> {output.name}") + + if input.name == output.name or ( + input.name.startswith("BINAURAL") and output.name.startswith("BINAURAL") + ): + output.audio = input.audio + else: + if isinstance(input, audio.BinauralAudio): + raise NotImplementedError( + f"{input.name} is not supported as an input for rendering!" 
+ ) + elif isinstance(input, audio.ChannelBasedAudio): + convert_channelbased(input, output, **kwargs) + elif isinstance(input, audio.MetadataAssistedSpatialAudio): + convert_masa(input, output, **kwargs) + elif isinstance(input, audio.ObjectBasedAudio): + convert_objectbased(input, output, **kwargs) + elif isinstance(input, audio.SceneBasedAudio): + convert_scenebased(input, output, **kwargs) + else: + raise NotImplementedError( + f"Unknown or unsupported audio format {input.name}" + ) diff --git a/item_generation_scripts/audiotools/convert/binaural.py b/item_generation_scripts/audiotools/convert/binaural.py new file mode 100644 index 00000000..b23e69ee --- /dev/null +++ b/item_generation_scripts/audiotools/convert/binaural.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. 
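
For orientation, a hypothetical end-to-end invocation of the conversion pipeline above. The file names and option values are invented; the extra keyword arguments are forwarded through convert() to the post-processing stage:

from item_generation_scripts.audiotools.convert import convert_file

convert_file(
    in_file="item_714.wav",        # hypothetical input item
    out_file="item_bin.wav",
    in_fs=48000,
    in_fmt="7_1_4",
    out_fmt="BINAURAL",
    out_loudness=-26,              # loudness target in LKFS after rendering
    out_loudness_fmt="BINAURAL",
    limit=True,                    # limiting is the last post-processing step
)
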
+# + +from typing import Optional + +import numpy as np +from scipy.signal import fftconvolve + + +def NS2SA( + fs: float, + x: float, +) -> int: + """ + Converts from nanoseconds to number of samples + + Parameters + ---------- + fs: float + Sampling rate + x: float + Duration in nano seconds + + Returns + ------- + Number of samples + """ + + return int(int(fs / 100) * (x / 100) / 100000) + + +def binaural_fftconv( + x: np.ndarray, + IR: np.ndarray, + nchannels: int, + lfe_index: Optional[list[int]] = None, +) -> np.ndarray: + """ + Binauralization using fft convolution + + Parameters + ---------- + x: np.ndarray + Input multi-channel array + IR: np.ndarray + HRIRs array + nchannels: int + Maximum number of channels to process + lfe_index: Optional[list[int]] + List of LFE channel indices + + Returns + ------- + y: np.ndarray + Output convolved signal array + """ + + if lfe_index is None: + lfe_index = [] + + y = np.zeros([x.shape[0], 2]) + for chan_idx in range(min(x.shape[1], nchannels)): + if chan_idx not in lfe_index: + y[:, 0] = np.add( + y[:, 0], + fftconvolve(x[:, chan_idx].astype(float), IR[:, 0, chan_idx]).astype( + float + )[: x.shape[0]], + ) + y[:, 1] = np.add( + y[:, 1], + fftconvolve(x[:, chan_idx].astype(float), IR[:, 1, chan_idx]).astype( + float + )[: x.shape[0]], + ) + else: + ... + + return y diff --git a/item_generation_scripts/audiotools/convert/channelbased.py b/item_generation_scripts/audiotools/convert/channelbased.py new file mode 100644 index 00000000..a8d941e2 --- /dev/null +++ b/item_generation_scripts/audiotools/convert/channelbased.py @@ -0,0 +1,390 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. 
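
A small sanity check for the two helpers above; the random HRIR set is a stand-in for a real dataset and all values are invented:

import numpy as np

from item_generation_scripts.audiotools.convert.binaural import (
    NS2SA,
    binaural_fftconv,
)

assert NS2SA(48000, 20_000_000) == 960     # 20 ms in ns -> samples at 48 kHz

rng = np.random.default_rng(0)
x = rng.standard_normal((48000, 6))        # 1 s of a 6-channel (5.1) signal
IR = rng.standard_normal((256, 2, 6))      # (taps, ears, channels) toy HRIRs
y = binaural_fftconv(x, IR, nchannels=6, lfe_index=[3])
assert y.shape == (48000, 2)               # LFE (index 3) is skipped
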
+# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# + +from pathlib import Path +from typing import Optional, Tuple, Union + +import numpy as np + +from item_generation_scripts.audiotools import audio +from item_generation_scripts.audiotools.audioarray import delay, framewise_io +from item_generation_scripts.audiotools.binaural_datasets.binaural_dataset import ( + load_ir, +) +from item_generation_scripts.audiotools.constants import ( + BINAURAL_LFE_GAIN, + IVAS_FRAME_LEN_MS, + IVAS_MC_CONVERSION, +) +from item_generation_scripts.audiotools.convert import scenebased +from item_generation_scripts.audiotools.convert.binaural import binaural_fftconv +from item_generation_scripts.audiotools.EFAP import EFAP +from item_generation_scripts.audiotools.rotation import Quat2RotMat, rotateAziEle +from item_generation_scripts.audiotools.wrappers.filter import resample_itu + +""" ChannelBasedAudio functions """ + + +def convert_channelbased( + cba: audio.ChannelBasedAudio, + out: audio.Audio, + **kwargs, +) -> audio.Audio: + """Convert channel-based audio to the requested output format""" + # CBA -> Binaural + if isinstance(out, audio.BinauralAudio): + render_cba_to_binaural(cba, out, **kwargs) + + # CBA -> CBA + elif isinstance(out, audio.ChannelBasedAudio): + render_cba_to_cba(cba, out) + + # CBA -> SBA + elif isinstance(out, audio.SceneBasedAudio): + render_cba_to_sba(cba, out) + + else: + raise NotImplementedError( + f"Conversion from {cba.name} to {out.name} is unsupported!" 
+        )
+
+    return out
+
+
+def render_cba_to_binaural(
+    cba: audio.ChannelBasedAudio,
+    bin: audio.BinauralAudio,
+    trajectory: Optional[Union[str, Path]] = None,
+    bin_dataset: Optional[str] = None,
+    bin_lfe_gain: Optional[float] = None,
+    **kwargs,
+) -> None:
+    """
+    Binauralization of channel-based audio
+
+    Parameters
+    ----------
+    cba: audio.ChannelBasedAudio
+        Channel-based input audio
+    bin: audio.BinauralAudio
+        Binaural output audio
+    trajectory: Optional[Union[str, Path]]
+        Head rotation trajectory path
+    bin_dataset: Optional[str]
+        Name of binaural dataset without prefix or suffix
+    bin_lfe_gain: Optional[float]
+        LFE gain for binaural rendering
+    """
+
+    if cba.name == "MONO":
+        # mono cannot be binauralized directly -> render to stereo and treat it as a binaural signal
+        cba_stereo = audio.fromtype("STEREO")
+        cba_stereo.fs = bin.fs
+        render_cba_to_cba(cba, cba_stereo)
+        bin.audio = cba_stereo.audio
+        return
+
+    cba.audio = resample_itu(cba, 48000)
+    old_fs = cba.fs
+    cba.fs = 48000
+    bin.fs = 48000
+
+    if trajectory is not None:
+        cba.audio = rotate_cba(cba, trajectory)
+
+    IR, _, latency_smp = load_ir(cba.name, bin.name, bin_dataset)
+
+    # render LFE
+    if bin_lfe_gain is not None:
+        bin_lfe, lfe_delay_ns = render_lfe_to_binaural(
+            cba.audio, cba.fs, cba.lfe_index, bin_lfe_gain
+        )
+
+    # render rest of the signal
+    bin.audio = binaural_fftconv(cba.audio, IR, cba.num_channels, cba.lfe_index)
+    # compensate delay from binaural dataset
+    bin.audio = delay(bin.audio, bin.fs, -latency_smp, samples=True)
+
+    # add LFE back to the rendered signal
+    if bin_lfe_gain is not None:
+        bin.audio += bin_lfe
+
+    bin.audio = resample_itu(bin, old_fs)
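
A hypothetical call of the renderer above; the item name and the dataset identifier are invented (the dataset name is assumed to follow the HRIR_*.mat files shipped with this patch):

from item_generation_scripts.audiotools import audio

cba = audio.fromfile("5_1", "item_51.wav", 48000)   # hypothetical 5.1 item
bin_out = audio.fromtype("BINAURAL")
bin_out.fs = 48000
render_cba_to_binaural(cba, bin_out, bin_dataset="ORANGE53")
# bin_out.audio now holds the two-channel binaural rendering

+
+
+def render_custom_ls_binaural(
+    custom_ls: audio.ChannelBasedAudio,
+    output: audio.BinauralAudio,
+    IR: np.ndarray,
+    SourcePosition: np.ndarray,
+    trajectory: str,
+):
+    # TODO rework impl. 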
(with EFAP) + # logger.info(" Processing channels on custom LS layout") + # azis = ", ".join([f"{a:7.2f}" for a in ls_azi_all]) + # eles = ", ".join([f"{e:7.2f}" for e in ls_ele_all]) + # logger.info(f" azi: {azis}") + # logger.info(f" ele: {eles}") + # logger.info(f" lfe_index: {lfe_index_all}") + + # if output.name == "BINAURAL_ROOM": + # tmp = get_audio_type("MOZART") + # convert_channel_based(custom_ls, tmp) + # logger.info(f" {custom_ls.name} -> {tmp.name} -> {output.name}") + # custom_ls.audio = tmp.audio + # else: + # tmp = custom_ls + # + # ls_azi_all = tmp.ls_azi + # ls_ele_all = tmp.ls_ele + # lfe_index_all = tmp.lfe_index + # + # frame_len = (IVAS_FRAME_LEN_MS // 4) * (fs // 1000) + # sig_len = custom_ls.audio.shape[0] + # N_frames = int(sig_len / frame_len) + # + # i_ls = 0 + # y = np.zeros([sig_len, 2]) + # for i_chan in range(custom_ls.audio.shape[1]): + # + # # skip LFE + # if i_chan in lfe_index_all: + # continue + # + # # skip silent (or very low volume) channels + # if np.allclose(custom_ls.audio[:, i_chan], 0.0, atol=32.0): + # continue + # + # ls_azi = np.repeat(ls_azi_all[i_ls], N_frames) + # ls_ele = np.repeat(ls_ele_all[i_ls], N_frames) + # + # azi, ele = rotateISM(ls_azi, ls_ele, trajectory=trajectory) + # + # y += binaural_fftconv_framewise( + # custom_ls.audio[:, i_chan], + # IR, + # SourcePosition, + # frame_len=frame_len, + # azi=azi, + # ele=ele, + # ) + # i_ls += 1 + # + # return y + return + + +def render_cba_to_cba( + cba_in: audio.ChannelBasedAudio, cba_out: audio.ChannelBasedAudio +) -> None: + """ + Rendering of channel-based input signal to channel-based output + + Parameters + ---------- + cba_in: audio.ObjectBasedAudio + Channel-based input audio + cba_out: audio.ChannelBasedAudio + Channel-based output audio + """ + + # Stereo to Mono + if cba_in.name == "STEREO" and cba_out.name == "MONO": + render_mtx = np.vstack([[0.5], [0.5]]) + else: + try: + render_mtx = IVAS_MC_CONVERSION[cba_in.name][cba_out.name] + except KeyError: + # Use EFAP panning if no matrix was found + panner = EFAP( + np.delete(cba_out.ls_azi, cba_out.lfe_index).astype(float), + np.delete(cba_out.ls_ele, cba_out.lfe_index).astype(float), + ) + + render_mtx = np.vstack( + [ + panner.pan(a, e).T + for i, (a, e) in enumerate(zip(cba_in.ls_azi, cba_in.ls_ele)) + if i not in cba_in.lfe_index + ] + ) + + # pass-through for LFE + for index in np.sort(cba_in.lfe_index): + render_mtx = np.insert(render_mtx, index, 0, axis=0) + render_mtx = np.insert(render_mtx, cba_out.lfe_index, 0, axis=1) + render_mtx[cba_in.lfe_index, cba_out.lfe_index] = 1 + + if cba_out.num_channels <= 2: + render_mtx[cba_in.lfe_index, :] = 0 + + cba_out.audio = cba_in.audio @ render_mtx + + +def render_cba_to_sba(cba: audio.ChannelBasedAudio, sba: audio.SceneBasedAudio) -> None: + """ + Rendering of channel-based input signal to SBA output + + Parameters + ---------- + cba: audio.ObjectBasedAudio + Channel-based input audio + sba: audio.ChannelBasedAudio + SBA output audio + """ + + if cba.name == "MONO": + raise ValueError(f"Rendering from MONO to {sba.name} is not supported.") + + # SH response for loudspeaker positions + render_mtx = np.hstack( + [ + scenebased.getRSH(np.array([a]), np.array([e]), sba.ambi_order) + for a, e in zip(cba.ls_azi, cba.ls_ele) + ] + ).T + render_mtx[cba.lfe_index] = 0 + + sba.audio = cba.audio @ render_mtx + # do not add LFE to output + if sba.is_planar: + scenebased.zero_vert_channels(sba) + + +def rotate_cba( + cba: audio.ChannelBasedAudio, + trajectory: str, +) -> np.ndarray: + """ + 
Rotate MC signal by applying a rotation matrix calculated from the current quaternion + in each subframe + + Parameters: + ---------- + x: np.ndarray + Input multichannel signal + trajectory: str + Path to trajectory file + + Returns: + ---------- + y: np.ndarray + Rotated multichannel signal + """ + + trj_data = np.genfromtxt(trajectory, delimiter=",") + trj_frames = trj_data.shape[0] + + sig_len = cba.audio.shape[0] + sig_dim = cba.audio.shape[1] + frame_len = (IVAS_FRAME_LEN_MS // 4) * 48 + + out = np.zeros([sig_len, sig_dim]) + + panner = EFAP(cba.ls_azi, cba.ls_ele) + + fade_in = np.arange(frame_len) / (frame_len - 1) + fade_in = fade_in[:, np.newaxis] + fade_out = 1.0 - fade_in + + R_old = np.eye(cba.num_channels) + + for i, (frame_in, frame_out) in framewise_io(cba.audio, out, frame_len): + # update the crossfade if we have a smaller last frame + if frame_out.shape[0] != frame_len: + frame_size = frame_out.shape[0] + fade_in = np.arange(frame_size) / (frame_size - 1) + fade_in = fade_in[:, np.newaxis] + fade_out = 1.0 - fade_in + + q = trj_data[i % trj_frames, :] + rotated_pos = np.array( + [rotateAziEle(a, e, Quat2RotMat(q)) for a, e in zip(cba.ls_azi, cba.ls_ele)] + ) + R = panner.pan(rotated_pos[:, 0], rotated_pos[:, 1]) + R[:, [cba.lfe_index]] = 0 + R[[cba.lfe_index], :] = 0 + R[cba.lfe_index, cba.lfe_index] = 1 + + frame_out[:, :] = (fade_in * frame_in @ R) + (fade_out * frame_in @ R_old) + + R_old = R.copy() + + return out + + +""" Helper functions """ + + +def render_lfe_to_binaural( + x: np.ndarray, + fs: Optional[int] = 48000, + lfe_index: Optional[list] = None, + LFE_gain: Optional[float] = BINAURAL_LFE_GAIN, +) -> Tuple[np.ndarray, int]: + """ + Extract LFE from the given input and render + it binaurally, accounting for delay + """ + + lfe = x[:, lfe_index].copy() + + # if there is more than one LFE sum them into one + if lfe.shape[1] > 1: + lfe = np.sum(lfe, axis=1) + + """ + # 120 Hz low-pass filtering for LFE using IVAS filter coefficients + if fs == 48000: + lfe = sig.sosfilt(IVAS_LPF_4_BUTTER_48K_SOS, lfe, axis=0) + else: + raise NotImplementedError("Only 48 kHz supported at the moment!") + + # 3.5ms LP filter delay from IVAS ROM + lfe_delay_ns = 0.0035 * 1e9 + lfe_delay_smp = round(lfe_delay_ns * fs / 1e9) + + # Delay LFE by the same amount as the HRTF delay + lfe = np.roll(lfe, round(latency_smp), axis=0) + lfe[0 : round(latency_smp), :] = 0 + """ + lfe_delay_ns = 0 + + # apply gain + lfe *= LFE_gain + + # duplicate for each binaural channel + if len(np.shape(lfe)) < 2: + lfe = lfe[:, np.newaxis] + lfe = np.hstack([lfe, lfe]) + + return lfe, lfe_delay_ns diff --git a/item_generation_scripts/audiotools/convert/masa.py b/item_generation_scripts/audiotools/convert/masa.py new file mode 100644 index 00000000..15f1c683 --- /dev/null +++ b/item_generation_scripts/audiotools/convert/masa.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. 
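
The per-frame rotation step of rotate_cba above can be exercised in isolation. A minimal sketch with the 5.1 layout from the constants and an identity quaternion (assuming the w-first convention; a real trajectory file supplies one quaternion per subframe):

import numpy as np

from item_generation_scripts.audiotools.EFAP import EFAP
from item_generation_scripts.audiotools.rotation import Quat2RotMat, rotateAziEle

ls_azi = [30, -30, 0, 0, 110, -110]       # 5.1 loudspeaker azimuths
ls_ele = [0, 0, 0, 0, 0, 0]
panner = EFAP(ls_azi, ls_ele)

q = np.array([1.0, 0.0, 0.0, 0.0])        # identity quaternion -> no rotation
rotated = np.array(
    [rotateAziEle(a, e, Quat2RotMat(q)) for a, e in zip(ls_azi, ls_ele)]
)
R = panner.pan(rotated[:, 0], rotated[:, 1])   # re-panning gain matrix
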
+# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# + +from pathlib import Path +from typing import Optional, Union +from warnings import warn + +from item_generation_scripts.audiotools import audio +from item_generation_scripts.audiotools.convert import channelbased +from item_generation_scripts.audiotools.wrappers.masaRenderer import masaRenderer + +""" MetadataAssistedSpatialAudio functions """ + + +def convert_masa( + masa: audio.MetadataAssistedSpatialAudio, + out: audio.Audio, + **kwargs, +) -> audio.Audio: + """Convert Metadata Assisted Spatial audio to the requested output format""" + + # MASA -> Binaural + if isinstance(out, audio.BinauralAudio): + render_masa_to_binaural(masa, out, **kwargs) + + # MASA -> CBA + elif isinstance(out, audio.ChannelBasedAudio): + render_masa_to_cba(masa, out) + + # MASA -> SBA + elif isinstance(out, audio.SceneBasedAudio): + render_masa_to_sba(masa, out) + + else: + raise NotImplementedError( + f"Conversion from {masa.name} to {out.name} is unsupported!" + ) + + return out + + +def render_masa_to_binaural( + masa: audio.MetadataAssistedSpatialAudio, + bin: audio.BinauralAudio, + trajectory: Optional[Union[str, Path]] = None, + bin_dataset: Optional[str] = None, + **kwargs, +) -> None: + """ + Binauralization of MASA audio + + Parameters + ---------- + masa: audio.MetadataAssistedSpatialAudio + MASA input audio + bin: audio.BinauralAudio + Output binaural audio + trajectory: Optional[Union[str, Path]] + Head rotation trajectory path + bin_dataset: Optional[str] + Name of binaural dataset without prefix or suffix + """ + + if "ROOM" in bin.name: + cba_tmp = audio.fromtype("7_1_4") + cba_tmp.fs = masa.fs + + render_masa_to_cba(masa, cba_tmp) + + channelbased.render_cba_to_binaural(cba_tmp, bin, trajectory) + else: + if trajectory is not None: + warn( + f"Head-rotation not supported by MasaRenderer! Trajectory {trajectory} will be ignored!" 
+ ) + if bin_dataset is not None: + warn( + "Binaural dataset selection not supported by MasaRenderer - please copy the required hrir.bin manually!" + ) + + bin.audio = masaRenderer(masa, "BINAURAL") + + +def render_masa_to_cba( + masa: audio.MetadataAssistedSpatialAudio, + cba: audio.ChannelBasedAudio, +) -> None: + """ + Rendering of MASA input signal to Channel-based format + + Parameters + ---------- + masa: audio.MetadataAssistedSpatialAudio + MASA input audio + cba: audio.ChannelBasedAudio + Channel-based output audio + """ + + if cba.name not in ["5_1", "7_1_4"]: + warn( + f"MasaRenderer does not support {cba.name} natively. Using 7_1_4 as an intermediate format." + ) + + cba_tmp = audio.fromtype("7_1_4") + cba_tmp.fs = masa.fs + cba_tmp.audio = masaRenderer(masa, cba_tmp.name) + + channelbased.render_cba_to_cba(cba_tmp, cba) + else: + cba.audio = masaRenderer(masa, cba.name) + + +def render_masa_to_sba( + masa: audio.MetadataAssistedSpatialAudio, + sba: audio.SceneBasedAudio, +) -> None: + """ + Rendering of MASA input signal to SBA format + + Parameters + ---------- + masa: audio.MetadataAssistedSpatialAudio + MASA input audio + sba: audio.SceneBasedAudio + SBA output audio + """ + + warn( + f"MasaRenderer does not support {sba.name} natively. Using 7_1_4 as an intermediate format." + ) + + cba_tmp = audio.fromtype("7_1_4") + cba_tmp.fs = masa.fs + cba_tmp.audio = masaRenderer(masa, cba_tmp.name) + + channelbased.render_cba_to_sba(cba_tmp, sba) diff --git a/item_generation_scripts/audiotools/convert/objectbased.py b/item_generation_scripts/audiotools/convert/objectbased.py new file mode 100644 index 00000000..9fb74ed1 --- /dev/null +++ b/item_generation_scripts/audiotools/convert/objectbased.py @@ -0,0 +1,352 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. 
+# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# + +from itertools import repeat +from pathlib import Path +from typing import Optional, Tuple, Union + +import numpy as np + +from item_generation_scripts.audiotools import audio +from item_generation_scripts.audiotools.audioarray import delay, framewise_io +from item_generation_scripts.audiotools.binaural_datasets.binaural_dataset import ( + load_ir, +) +from item_generation_scripts.audiotools.binauralobjectrenderer import ( + binaural_fftconv_framewise, +) +from item_generation_scripts.audiotools.constants import IVAS_FRAME_LEN_MS +from item_generation_scripts.audiotools.convert.channelbased import ( + render_cba_to_binaural, +) +from item_generation_scripts.audiotools.convert.scenebased import getRSH +from item_generation_scripts.audiotools.EFAP import EFAP, wrap_angles +from item_generation_scripts.audiotools.rotation import Quat2RotMat, rotateAziEle +from item_generation_scripts.audiotools.wrappers.filter import resample_itu +from item_generation_scripts.utils import apply_func_parallel + +""" ObjectBasedAudio functions """ + + +def convert_objectbased( + oba: audio.ObjectBasedAudio, + out: audio.Audio, + **kwargs, +) -> audio.Audio: + """Convert an ISM signal to the requested output format""" + + # OBA -> Binaural + if isinstance(out, audio.BinauralAudio): + render_oba_to_binaural(oba, out, **kwargs) + + # OBA -> CBA + elif isinstance(out, audio.ChannelBasedAudio): + render_oba_to_cba(oba, out) + + # OBA -> SBA + elif isinstance(out, audio.SceneBasedAudio): + render_oba_to_sba(oba, out) + else: + raise NotImplementedError( + f"Conversion from {oba.name} to {out.name} is unsupported!" 
+ ) + + return out + + +def render_oba_to_binaural( + oba: audio.ObjectBasedAudio, + bin: audio.BinauralAudio, + trajectory: Optional[Union[str, Path]] = None, + bin_dataset: Optional[str] = None, + **kwargs, +) -> None: + """ + Binauralization of ISM input signal + + Parameters + ---------- + oba: audio.ObjectBasedAudio + Object based input audio + bin: audio.BinauralAudio + Binaural output audio + trajectory: Optional[Union[str, Path]] + Head rotation trajectory + bin_dataset: Optional[str] + Name of binaural dataset, if None default dataset is used + """ + + # bin.audio = np.zeros([oba.audio.shape[0], bin.num_channels]) + + if "ROOM" in bin.name: + cba_tmp = audio.fromtype("7_1_4") + cba_tmp.fs = oba.fs + + render_oba_to_cba(oba, cba_tmp) + + render_cba_to_binaural(cba_tmp, bin, trajectory) + else: + IR, SourcePosition, latency_smp = load_ir(oba.name, bin.name, bin_dataset) + + oba.audio = resample_itu(oba, 48000) + fs_old = oba.fs + oba.fs = 48000 + + # apply processing for every object in parallel + obj_pos = oba.object_pos + obj_idx = list(range(oba.num_channels)) + result = apply_func_parallel( + render_object, + zip( + obj_idx, + obj_pos, + repeat(oba), + repeat(trajectory), + repeat(IR), + repeat(SourcePosition), + ), + None, + "mt", + False, + ) + + # sum results over all objects + bin.audio = np.sum(np.stack(result, axis=2), axis=2) + + # compensate delay from binaural dataset + bin.audio = delay(bin.audio, bin.fs, -latency_smp, samples=True) + + bin.audio = resample_itu(bin, fs_old) + bin.fs = fs_old + + +def render_oba_to_cba( + oba: audio.ObjectBasedAudio, + cba: audio.ChannelBasedAudio, +) -> None: + """ + Rendering of ISM input signal to channel-based format + + Parameters + ---------- + oba: audio.ObjectBasedAudio + Object based input audio + cba: audio.ChannelBasedAudio + Channel-based output audio + """ + + cba.audio = np.zeros([oba.audio.shape[0], cba.num_channels]) + + for obj_idx, obj_pos in enumerate(oba.object_pos): + obj_audio = oba.audio[:, [obj_idx]] + pos_frames = obj_pos.shape[0] + + frame_len = IVAS_FRAME_LEN_MS * (oba.fs // 1000) + + fade_in = np.arange(frame_len) / (frame_len - 1) + fade_in = fade_in[:, np.newaxis] + fade_out = 1.0 - fade_in + + # use EFAP for rendering + panner = EFAP( + np.delete(cba.ls_azi, cba.lfe_index), np.delete(cba.ls_ele, cba.lfe_index) + ) + gains_old = None + + for i, (frame_in, frame_out) in framewise_io(obj_audio, cba.audio, frame_len): + # update the crossfade if we have a smaller last frame + if frame_out.shape[0] != frame_len: + frame_size = frame_out.shape[0] + fade_in = np.arange(frame_size) / (frame_size - 1) + fade_in = fade_in[:, np.newaxis] + fade_out = 1.0 - fade_in + + azi, ele = wrap_angles(*obj_pos[i % pos_frames, :2], clip_ele=True) + gains = panner.pan(azi, ele) + for lfe in np.sort(cba.lfe_index): + gains = np.insert(gains, lfe, 0) + gains = gains[np.newaxis, :] + + if gains_old is None: + gains_old = gains.copy() + + frame_out[:] += (fade_in * frame_in @ gains) + ( + fade_out * frame_in @ gains_old + ) + + gains_old = gains.copy() + + +def render_oba_to_sba( + oba: audio.ObjectBasedAudio, + sba: audio.SceneBasedAudio, +) -> None: + """ + Rendering of ISM input signal to SBA format + + Parameters + ---------- + oba: audio.ObjectBasedAudio + Object based input audio + sba: audio.SceneBasedAudio + SBA output audio + """ + + sba.audio = np.zeros([oba.audio.shape[0], sba.num_channels]) + + for obj_idx, obj_pos in enumerate(oba.object_pos): + obj_audio = oba.audio[:, [obj_idx]] + pos_frames = obj_pos.shape[0] + + 
frame_len = IVAS_FRAME_LEN_MS * (oba.fs // 1000) + + fade_in = np.arange(frame_len) / (frame_len - 1) + fade_in = fade_in[:, np.newaxis] + fade_out = 1.0 - fade_in + + gains_old = None + + for i, (frame_in, frame_out) in framewise_io(obj_audio, sba.audio, frame_len): + # update the crossfade if we have a smaller last frame + if frame_out.shape[0] != frame_len: + frame_size = frame_out.shape[0] + fade_in = np.arange(frame_size) / (frame_size - 1) + fade_in = fade_in[:, np.newaxis] + fade_out = 1.0 - fade_in + + pos = obj_pos[i % pos_frames, :] + gains = getRSH(np.array([pos[0]]), np.array([pos[1]]), sba.ambi_order) + + if gains_old is None: + gains_old = gains.copy() + + frame_out[:] += (fade_in * frame_in @ gains.T) + ( + fade_out * frame_in @ gains_old.T + ) + + gains_old = gains.copy() + + +def rotate_oba( + azi: np.ndarray, + ele: np.ndarray, + trajectory: Optional[str] = None, +) -> Tuple[np.ndarray, np.ndarray]: + """ + Application of head tracking trajectory + + Parameters: + ---------- + azi: np.ndarray + Azimuth coordinates of objects + ele: np.ndarray + Elevation coordinates of objects + trajectory: str + Head-tracking trajectory path + + Returns: + ---------- + azi_rot: np.ndarray + Azimuth coordinates after application of trajectory + ele_rot: np.ndarray + Elevation coordinates after application of trajectory + """ + + if trajectory is None: + return azi, ele + + trj_data = np.genfromtxt(trajectory, delimiter=",") + trj_frames = trj_data.shape[0] + + N_frames = azi.shape[0] + if ele.shape[0] != azi.shape[0]: + raise ValueError("Inconsistent input in azi and ele") + + azi_rot = np.zeros([N_frames]) + ele_rot = np.zeros([N_frames]) + + for i_frame in range(N_frames): + q = trj_data[i_frame % trj_frames, :] + azi_rot[i_frame], ele_rot[i_frame] = rotateAziEle( + azi[i_frame], ele[i_frame], Quat2RotMat(q) + ) + + return azi_rot, ele_rot + + +def render_object( + obj_idx: int, + obj_pos: np.ndarray, + oba: audio.ObjectBasedAudio, + trajectory: str, + IR: np.ndarray, + SourcePosition: np.ndarray, +) -> np.ndarray: + """ + Binaural rendering for one ISM object + + Parameters: + ---------- + obj_idx: int + Index of object in list of all objects + obj_pos: np.ndarray + Position of object + oba: audio.ObjectBasedAudio + Input ISM audio object + trajectory: str + Head-tracking trajectory path + IR: np.ndarray + HRIRs for binauralization + SourcePosition: np.ndarray + Positions of HRIR measurements + + Returns: + ---------- + result_audio: np.ndarray + Binaurally rendered object + """ + + # repeat each value four times since head rotation data is on sub-frame basis + azi = np.repeat(obj_pos[:, 0], 4) + ele = np.repeat(obj_pos[:, 1], 4) + # apply head-rotation trajectory + obj_audio = oba.audio[:, [obj_idx]] + azi, ele = rotate_oba(azi, ele, trajectory) + # convolve signal with HRIRs + result_audio = binaural_fftconv_framewise( + obj_audio, + IR, + SourcePosition, + azi, + ele, + ) + return result_audio diff --git a/item_generation_scripts/audiotools/convert/scenebased.py b/item_generation_scripts/audiotools/convert/scenebased.py new file mode 100644 index 00000000..a7e89b4f --- /dev/null +++ b/item_generation_scripts/audiotools/convert/scenebased.py @@ -0,0 +1,429 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# + +from pathlib import Path +from typing import Optional, Union +from warnings import warn + +import numpy as np +from scipy.special import lpmv + +from item_generation_scripts.audiotools import audio +from item_generation_scripts.audiotools.audioarray import delay, framewise_io +from item_generation_scripts.audiotools.binaural_datasets.binaural_dataset import ( + load_ir, +) +from item_generation_scripts.audiotools.constants import ( + IVAS_FRAME_LEN_MS, + T_DESIGN_11_AZI, + T_DESIGN_11_ELE, + VERT_HOA_CHANNELS_ACN, +) +from item_generation_scripts.audiotools.convert import channelbased +from item_generation_scripts.audiotools.convert.binaural import binaural_fftconv +from item_generation_scripts.audiotools.EFAP import EFAP +from item_generation_scripts.audiotools.rotation import Quat2RotMat, SHrotmatgen +from item_generation_scripts.audiotools.wrappers.filter import resample_itu + +""" SceneBasedAudio functions """ + + +def convert_scenebased( + sba: audio.SceneBasedAudio, + out: audio.Audio, + **kwargs, +) -> audio.Audio: + """Convert scene-based audio to the requested output format""" + + # SBA -> Binaural + if isinstance(out, audio.BinauralAudio): + render_sba_to_binaural(sba, out, **kwargs) + + # SBA -> CBA + elif isinstance(out, audio.ChannelBasedAudio): + render_sba_to_cba(sba, out) + + # SBA -> SBA + elif isinstance(out, audio.SceneBasedAudio): + render_sba_to_sba(sba, out) + else: + raise NotImplementedError( + f"Conversion from {sba.name} to {out.name} is unsupported!" 
+        )
+
+    return out
+
+
+def render_sba_to_binaural(
+    sba: audio.SceneBasedAudio,
+    bin: audio.BinauralAudio,
+    trajectory: Optional[Union[str, Path]] = None,
+    bin_dataset: Optional[str] = None,
+    **kwargs,
+) -> None:
+    """
+    Binauralization of scene-based audio
+
+    Parameters
+    ----------
+    sba: audio.SceneBasedAudio
+        Input SBA audio
+    bin: audio.BinauralAudio
+        Output binaural audio
+    trajectory: Optional[Union[str, Path]]
+        Head rotation trajectory path
+    bin_dataset: Optional[str]
+        Name of binaural dataset without prefix or suffix
+    """
+
+    if trajectory is not None:
+        sba.audio = rotate_sba(sba, trajectory)
+
+    if "ROOM" in bin.name:
+        cba_tmp = audio.fromtype("7_1_4")
+        cba_tmp.fs = sba.fs
+
+        render_sba_to_cba(sba, cba_tmp)
+
+        channelbased.render_cba_to_binaural(cba_tmp, bin, trajectory)
+    else:
+        IR, _, latency_smp = load_ir(sba.name, bin.name, bin_dataset)
+
+        sba.audio = resample_itu(sba, 48000)
+        fs_old = sba.fs
+        sba.fs = 48000
+
+        bin.audio = binaural_fftconv(sba.audio, IR, sba.num_channels)
+
+        # compensate delay from binaural dataset
+        bin.audio = delay(bin.audio, bin.fs, -latency_smp, samples=True)
+
+        bin.audio = resample_itu(bin, fs_old)
+        bin.fs = fs_old
+
+
+def render_sba_to_cba(
+    sba: audio.SceneBasedAudio,
+    cba: audio.ChannelBasedAudio,
+) -> None:
+    """
+    Rendering of SBA input signal to channel-based format
+
+    Parameters
+    ----------
+    sba: audio.SceneBasedAudio
+        Scene-based input audio
+    cba: audio.ChannelBasedAudio
+        Channel-based output audio
+    """
+
+    render_mtx = get_allrad_mtx(sba.ambi_order, cba)
+    cba.audio = sba.audio @ render_mtx.T
+
+
+def render_sba_to_sba(
+    sba_in: audio.SceneBasedAudio,
+    sba_out: audio.SceneBasedAudio,
+) -> None:
+    """
+    Rendering of SBA input signal to SBA output format
+
+    Parameters
+    ----------
+    sba_in: audio.SceneBasedAudio
+        Scene-based input audio
+    sba_out: audio.SceneBasedAudio
+        Scene-based output audio
+    """
+
+    if sba_out.ambi_order > sba_in.ambi_order:
+        sba_out.audio = np.pad(
+            sba_in.audio, [[0, 0], [0, sba_out.num_channels - sba_in.num_channels]]
+        )
+    elif sba_out.ambi_order < sba_in.ambi_order:
+        sba_out.audio = sba_in.audio[:, : sba_out.num_channels]
+    else:
+        # same ambisonics order: pass the signal through unchanged
+        sba_out.audio = sba_in.audio.copy()
+
+    if sba_out.is_planar:
+        zero_vert_channels(sba_out)
+
+
+def rotate_sba(
+    sba: audio.SceneBasedAudio,
+    trajectory: str,
+) -> np.ndarray:
+    """
+    Rotate HOA signal by applying a rotation matrix calculated from the current quaternion
+    in each subframe
+
+    Parameters:
+    ----------
+    sba: audio.SceneBasedAudio
+        Input SBA audio, up to HOA3
+    trajectory: str
+        Path to trajectory file
+
+    Returns:
+    ----------
+    out: np.ndarray
+        Rotated HOA signal
+    """
+
+    trj_data = np.genfromtxt(trajectory, delimiter=",")
+    trj_frames = trj_data.shape[0]
+
+    sig_len = sba.audio.shape[0]
+    sig_dim = sba.audio.shape[1]
+    frame_len = (IVAS_FRAME_LEN_MS // 4) * 48
+
+    if sig_dim not in [4, 9, 16]:
+        raise ValueError("rotate_sba can only handle FOA, HOA2 or HOA3 signals!")
+
+    out = np.zeros([sig_len, sig_dim])
+
+    fade_in = np.arange(frame_len) / (frame_len - 1)
+    fade_in = fade_in[:, np.newaxis]
+    fade_out = 1.0 - fade_in
+
+    R = np.eye(sig_dim)
+    R_old = np.eye(sig_dim)
+    for i, (frame_in, frame_out) in framewise_io(sba.audio, out, frame_len):
+        # update the crossfade if we have a smaller last frame
+        if frame_out.shape[0] != frame_len:
+            frame_size = frame_out.shape[0]
+            fade_in = np.arange(frame_size) / (frame_size - 1)
+            fade_in = fade_in[:, np.newaxis]
+            fade_out = 1.0 - fade_in
+
+        R_r = Quat2RotMat(trj_data[i % trj_frames, :])
+        R[:, :] = SHrotmatgen(R_r, 
order=ambi_order_from_nchan(sig_dim)) + + frame_out[:, :] = (fade_in * frame_in @ R.T) + (fade_out * frame_in @ R_old.T) + + R_old[:, :] = R.copy() + + return out + + +""" Helper functions """ + + +def zero_vert_channels(sba: audio.SceneBasedAudio) -> None: + """Remove all ambisonics parts with vertical components""" + sba.audio[:, VERT_HOA_CHANNELS_ACN[VERT_HOA_CHANNELS_ACN < sba.num_channels]] = 0 + + +def nchan_from_ambi_order(ambi_order: int) -> int: + """Compute number of channels based on ambisonics order""" + return (ambi_order + 1) ** 2 + + +def ambi_order_from_nchan(nchan: int) -> int: + """Compute ambisonics order based on number of channels""" + return int(np.sqrt(nchan) - 1) + + +def rE_weight(order: int) -> np.ndarray: + """Compute max-rE weighting matrix""" + return np.array( + [ + lpmv(0, l, np.cos(np.deg2rad(137.9) / (order + 1.51))) + for l in range(order + 1) + for _ in range(-l, l + 1) + ] + ).T + + +def n2sn(order: int) -> np.ndarray: + """Compute conversion matrix for N3D to SN3D normalization""" + return np.array( + [1.0 / np.sqrt(2 * l + 1) for l in range(order + 1) for _ in range(-l, l + 1)] + ) + + +def sn2n(order: int) -> np.ndarray: + """Compute conversion matrix for SN3D to N3D normalization""" + return np.array( + [np.sqrt(2 * l + 1) for l in range(order + 1) for _ in range(-l, l + 1)] + ) + + +def getRSH( + azi: np.ndarray, + ele: np.ndarray, + ambi_order: int, + norm: Optional[str] = "sn3d", + degrees: Optional[bool] = True, +) -> np.ndarray: + """ + Returns real spherical harmonic response for the given position(s) + + Parameters: + ---------- + azi: np.ndarray + Azimuth angles + ele: np.ndarray + Elevation angles + ambi_order: int + Ambisonics order + norm: Optional[str] + Normalization of ambisonic bases. + Possible values: "sn3d", "n3d", everything else is interpreted as orthogonal + degrees: Optional[bool] + If true azi and ele are interpreted as angles in degrees, otherwise as radians + + Returns: + ---------- + response: np.ndarray + Real spherical harmonic response + """ + + if degrees: + azi = np.deg2rad(azi) + ele = np.deg2rad(ele) + + azi = azi.astype("float64") + ele = ele.astype("float64") + + LM = np.array([(l, m) for l in range(ambi_order + 1) for m in range(-l, l + 1)]) + + response = np.zeros([LM.shape[0], azi.shape[0]]) + + # trig_term * legendre * uncondon + for i, (l, m) in enumerate(LM): + # N3D norm + response[i, :] = np.sqrt( + ((2 * l + 1) * float(np.math.factorial(l - np.abs(m)))) + / (4 * np.pi * float(np.math.factorial(l + np.abs(m)))) + ) + + # trig term + if m < 0: + response[i, :] *= np.sqrt(2) * np.sin(azi * np.abs(m)) + elif m == 0: + pass # response[i,:] *= 1 + else: + response[i, :] *= np.sqrt(2) * np.cos(azi * m) + + # legendre polynomial + a = lpmv(np.abs(m), l, np.sin(ele)) * ((-1) ** np.abs(m)) + if np.inf in a or -np.inf in a: + a[a == np.inf] = np.finfo(np.float64).max + a[a == -np.inf] = np.finfo(np.float64).min + warn( + "Warning: order too large -> leads to overflow. Inf values are discarded!" 
+            )
+        response[i, :] *= a
+
+    if norm == "sn3d":
+        response *= np.sqrt(4 * np.pi)
+        response[:] = np.diag(n2sn(ambi_order)) @ response
+    elif norm == "n3d":
+        response *= np.sqrt(4 * np.pi)
+    else:
+        pass  # ortho
+
+    return response
+
+
+def get_allrad_mtx(
+    ambi_order: int,
+    cba: audio.ChannelBasedAudio,
+    norm: Optional[str] = "sn3d",
+    rE_weight_bool: Optional[bool] = False,
+    intensity_panning: Optional[bool] = True,
+) -> np.ndarray:
+    """
+    Returns ALLRAD matrix
+
+    Parameters:
+    ----------
+    ambi_order: int
+        Ambisonics order
+    cba: audio.ChannelBasedAudio
+        Channel-based audio object
+    norm: Optional[str]
+        Normalization of ambisonic bases.
+        Possible values: "sn3d", "ortho", everything else is interpreted as n3d
+    rE_weight_bool: Optional[bool]
+        Flag for max-rE weighting
+    intensity_panning: Optional[bool]
+        Flag for intensity panning
+
+    Returns:
+    ----------
+    hoa_dec: np.ndarray
+        ALLRAD matrix
+    """
+
+    n_harm = nchan_from_ambi_order(ambi_order)
+
+    if cba.name == "MONO":
+        hoa_dec = np.zeros([1, n_harm])
+        hoa_dec[0, 0] = 1
+    elif cba.name == "STEREO":
+        hoa_dec = np.zeros([2, n_harm])
+        # Cardioids +/- 90 degrees
+        hoa_dec[0, 0] = 0.5
+        hoa_dec[0, 1] = 0.5
+        hoa_dec[1, 0] = 0.5
+        hoa_dec[1, 1] = -0.5
+    else:
+        Y_td = getRSH(
+            T_DESIGN_11_AZI,
+            T_DESIGN_11_ELE,
+            ambi_order,
+            norm="ortho",
+        )
+        Y_td *= np.sqrt(4 * np.pi)
+
+        n_ls_woLFE = cba.num_channels - len(cba.lfe_index)
+        ls_azi_woLFE = np.delete(cba.ls_azi, cba.lfe_index).astype(float)
+        ls_ele_woLFE = np.delete(cba.ls_ele, cba.lfe_index).astype(float)
+
+        panner = EFAP(ls_azi_woLFE, ls_ele_woLFE, intensity_panning)
+        G_td = panner.pan(T_DESIGN_11_AZI, T_DESIGN_11_ELE)
+
+        hoa_dec = (G_td.T @ Y_td.T) / T_DESIGN_11_AZI.size
+
+    if norm == "sn3d":
+        hoa_dec = hoa_dec @ np.diag(sn2n(ambi_order))
+    elif norm == "ortho":
+        hoa_dec *= np.sqrt(4 * np.pi)
+
+    if rE_weight_bool:
+        a_n = rE_weight(ambi_order)
+        nrg_pre = np.sqrt(n_ls_woLFE / np.sum(a_n**2))
+        hoa_dec = hoa_dec @ np.diag(a_n) * nrg_pre
+
+    hoa_dec = np.insert(hoa_dec, cba.lfe_index, np.zeros(n_harm), axis=0)
+
+    return hoa_dec
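+
+
+# NOTE (editorial sketch, not part of the original IVAS scripts): the ALLRAD
+# decoder above samples the sphere with a t-design, pans each sampling point
+# into the loudspeaker layout with EFAP, and projects the gains back onto the
+# spherical-harmonic basis. A hedged usage sketch, assuming a 5.1 target and
+# a first-order input signal foa_audio of shape (samples x 4):
+#
+#     cba = audio.fromtype("5_1")                # target layout
+#     dec = get_allrad_mtx(1, cba, norm="sn3d")  # (num_ls x 4) decoding matrix
+#     ls_audio = foa_audio @ dec.T               # (samples x num_ls)
diff --git a/item_generation_scripts/audiotools/metadata.py b/item_generation_scripts/audiotools/metadata.py
new file mode 100644
index 00000000..0a4631ae
--- /dev/null
+++ b/item_generation_scripts/audiotools/metadata.py
@@ -0,0 +1,571 @@
+#!/usr/bin/env python3
+
+#
+# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository. All Rights Reserved.
+#
+# This software is protected by copyright law and by international treaties.
+# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository retain full ownership rights in their respective contributions in
+# the software. 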
This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# + +import csv +from pathlib import Path +from typing import Optional, TextIO, Tuple, Union + +import numpy as np + +from item_generation_scripts.audiotools import audio +from item_generation_scripts.audiotools.audio import fromtype +from item_generation_scripts.audiotools.audioarray import trim +from item_generation_scripts.audiotools.audiofile import read +from item_generation_scripts.audiotools.constants import IVAS_FRAME_LEN_MS + + +class Metadata: + def __init__(self, meta_file: Union[str, Path]): + self.meta_file = Path(meta_file) + + if not self.meta_file.exists(): + raise FileNotFoundError( + f"Scene description file {self.meta_file} does not exist!" + ) + + with open(self.meta_file) as f: + audio_file = self.meta_file.parent.joinpath(f.readline().strip()).absolute() + + if audio_file.suffix != ".wav": + raise ValueError( + "Scene description files can only be used with WAVE input!" 
+ ) + + self.audio_array, self.fs = read(audio_file) + self.audio = [] + + num_audio = int(f.readline().strip()) + for _ in range(num_audio): + in_fmt = f.readline().strip().upper() + + if in_fmt == "ISM": + self.parse_ism_input(f) + elif in_fmt == "MASA": + self.parse_masa_input(f) + elif in_fmt == "MC": + self.parse_mc_input(f) + elif in_fmt == "SBA": + self.parse_sba_input(f) + else: + raise KeyError(f"Unknown input type in metadata file {in_fmt}") + + def parse_ism_input(self, f: TextIO): + start = int(f.readline().strip()) - 1 + + ism = fromtype("ISM1") + ism.audio = self.audio_array[:, start : start + 1] + ism.fs = self.fs + + line = f.readline().strip() + tmp_path = self.meta_file.parent.joinpath(line).absolute() + if tmp_path.exists(): + # csv metadata + ism.metadata_files = [tmp_path] + ism.init_metadata() + else: + # manually specified metadata + positions = [f.readline().strip() for _ in range(int(line))] + positions = np.genfromtxt( + positions, delimiter="," + ) # TODO can use ndmin = 2 with numpy > 1.23.0; check support + if positions.ndim == 1: + positions = positions[np.newaxis, :] + + obj_pos = [] + # repeat based on first column + for p in positions: + repeats = int(p[0]) + obj_pos.append(np.tile(p[1:], [repeats, 1])) + obj_pos = np.vstack(obj_pos) + + ism.object_pos = [obj_pos] + + self.audio.append(ism) + + def parse_masa_input(self, f: TextIO): + start = int(f.readline().strip()) - 1 + masa_tc = int(f.readline().strip()) + + masa = fromtype(f"MASA{masa_tc}") + masa.audio = self.audio_array[:, start : start + masa_tc] + masa.fs = self.fs + masa.metadata_files = [ + self.meta_file.parent.joinpath(f.readline().strip()).absolute() + ] + masa.init_metadata() + + self.audio.append(masa) + + def parse_mc_input(self, f: TextIO): + start = int(f.readline().strip()) - 1 + mc_fmt = f.readline().strip() + + mc = fromtype(mc_fmt) + mc.audio = self.audio_array[:, start : start + mc.num_channels] + mc.fs = self.fs + + self.audio.append(mc) + + def parse_sba_input(self, f: TextIO): + start = int(f.readline().strip()) - 1 + sba_order = int(f.readline().strip()) + + sba = fromtype(f"SBA{sba_order}") + sba.audio = self.audio_array[:, start : start + sba.num_channels] + sba.fs = self.fs + + self.audio.append(sba) + + def parse_optional_values(self, f: TextIO): + raise NotImplementedError( + "Additional configuration keys in metadata currently unsupported!" + ) + + # opts = {} + # original_pos = f.tell() + # key_value = f.readline().strip() + + # try to parse a key, otherwise reset read pointer + # for key in OPT_KEYS: + # if key_value.startswith(key): + # opts[key] = key_value.replace(key, "").replace(":", "") + # original_pos = f.tell() + # key_value = f.readline.strip() + # else: + # f.seek(original_pos) + # + + +def write_ISM_metadata_in_file( + metadata: list[np.ndarray], + file_name: list[Union[str, Path]], + automatic_naming: Optional[bool] = False, +) -> list[str, Path]: + """ + Write ISM metadata into csv file(s) + + Parameters + ---------- + metadata: list[np.ndarray] + List of metadata arrays + file_name: list[Union[str, Path]] + List of file names for csv files + automatic_naming: Optional[bool] + If true files are named automatically name.0.csv, name.1.csv, ... 
with name as the first entry of file_name
+
+    Returns
+    ----------
+    file_names: list[Union[str, Path]]
+        List of actually used file names
+    """
+
+    if not automatic_naming and len(metadata) != len(file_name):
+        raise ValueError("Number of metadata objects and file names has to match")
+    number_objects = len(metadata)
+
+    if automatic_naming:
+        file_names = []
+        for m_object in range(number_objects):
+            file_names.append(f"{file_name[0]}.{m_object}.csv")
+    else:
+        file_names = file_name
+
+    for i, csv_file in enumerate(file_names):
+        number_frames = metadata[i].shape[0]
+        with open(csv_file, "w", newline="") as file:
+            writer = csv.writer(file)
+            for k in range(number_frames):
+                row_list = [
+                    "%+07.2f" % np.round(metadata[i][k, 0], 2),
+                    "%+06.2f" % np.round(metadata[i][k, 1], 2),
+                    "01.00",
+                    "000.00",
+                    "1.00",
+                ]
+                writer.writerow(row_list)
+
+    return file_names
+
+
+def trim_meta(
+    x: audio.ObjectBasedAudio,
+    limits: Optional[Tuple[int, int]] = None,
+    pad_noise: Optional[bool] = False,
+    samples: Optional[bool] = False,
+) -> None:
+    """
+    Trim or pad ISM including metadata;
+    positive limits trim, negative limits pad
+
+    Parameters
+    ----------
+    x: audio.ObjectBasedAudio
+        ISM audio object
+    limits: Optional[Tuple[int, int]]
+        Number of samples to trim or pad at beginning and end
+    pad_noise: Optional[bool]
+        Flag for padding noise instead of silence
+    samples: Optional[bool]
+        Flag for interpreting limits as samples, otherwise milliseconds
+    """
+
+    if not limits:
+        return
+
+    frame_length = int(IVAS_FRAME_LEN_MS * x.fs // 1000)
+
+    # check if trim values are multiples of the frame length
+    if not samples:
+        pre_trim = int(limits[0] * x.fs // 1000)
+        post_trim = int(limits[1] * x.fs // 1000)
+    else:
+        pre_trim = limits[0]
+        post_trim = limits[1]
+
+    if pre_trim % frame_length != 0 or post_trim % frame_length != 0:
+        raise ValueError(
+            f"ISM metadata padding and trimming only possible if pad/trim length is multiple of frame length. "
+            f"Frame length: {IVAS_FRAME_LEN_MS}ms"
+        )
+
+    # check if audio is multiple of frame length
+    if np.shape(x.audio)[0] % frame_length != 0:
+        raise ValueError(
+            f"ISM metadata padding and trimming only possible if audio length is multiple of frame length. "
+            f"Frame length: {IVAS_FRAME_LEN_MS}ms"
+        )
+
+    # check if metadata length fits exactly to audio length
+    for meta in x.object_pos:
+        if np.shape(meta)[0] * frame_length != np.shape(x.audio)[0]:
+            raise ValueError(
+                f"ISM metadata padding and trimming only possible if audio length is multiple of frame "
+                f"length and audio and metadata length match. 
Frame length: {IVAS_FRAME_LEN_MS}ms" + ) + + # trim audio + x.audio = trim(x.audio, x.fs, limits, pad_noise, samples) + + # trim metadata + trim_frames_pre = int(pre_trim / frame_length) + trim_frames_post = int(post_trim / frame_length) + for i in range(len(x.object_pos)): + x.object_pos[i] = trim( + x.object_pos[i], + limits=(trim_frames_pre, trim_frames_post), + pad_noise=False, + samples=True, + ) + + # add radius 1 + if trim_frames_pre < 0: + x.object_pos[i][: abs(trim_frames_pre), 2] = 1 + if trim_frames_post < 0: + x.object_pos[i][abs(trim_frames_post) :, 2] = 1 + + return + + +def concat_meta_from_file( + audio_files: list[str], + meta_files: list[list[str]], + out_file: list[str], + input_fmt: str, + silence_pre: Optional[int] = 0, + silence_post: Optional[int] = 0, + preamble: Optional[int] = None, +) -> None: + """ + Concatenate ISM metadata from files + + Parameters + ---------- + audio_files: list[str] + List of audio file names + meta_files: list[list[str]] + List of corresponding metadata file names + out_file: list[str] + Name of concatenated output file + input_fmt: str + Input audio format + silence_pre: Optional[int] + Silence inserted before each item + silence_post: Optional[int] + Silence inserted after each item + preamble: Optional[int] + Length of preamble in milliseconds + """ + + # create audio objects + audio_objects = [] + fs = None + for i, audio_file in enumerate(audio_files): + # metadata is cut/looped to signal length in init of audio object + audio_object = audio.fromfile(input_fmt, audio_file, in_meta=meta_files[i]) + audio_objects.append(audio_object) + if fs: + if audio_object.fs != fs: + raise ValueError("Sampling rates of files to concatenate don't match") + else: + fs = audio_object.fs + + frame_length = int(IVAS_FRAME_LEN_MS * audio_objects[0].fs // 1000) + + # pad and concatenate + concat_meta_all_obj = [None] * audio_objects[0].num_channels + + for audio_item in audio_objects: + # check if audio is multiple of frame length + if np.shape(audio_item.audio)[0] % frame_length != 0: + raise ValueError( + f"ISM metadata padding and trimming only possible if audio length is multiple of frame length. " + f"Frame length: {IVAS_FRAME_LEN_MS}ms" + ) + + # check if metadata length fits exactly to audio length + for meta in audio_item.object_pos: + if np.shape(meta)[0] * frame_length != np.shape(audio_item.audio)[0]: + raise ValueError( + f"ISM metadata padding and trimming only possible if audio length is multiple of frame " + f"length and audio and metadata length match. 
Frame length: {IVAS_FRAME_LEN_MS}ms"
+                )
+
+        # pad
+        trim_meta(
+            audio_item, (-silence_pre, -silence_post)
+        )  # use negative value since we want to pad, not trim
+
+        # concatenate
+        for idx, obj_pos in enumerate(audio_item.object_pos):
+            concat_meta_all_obj[idx] = (
+                np.concatenate([concat_meta_all_obj[idx], obj_pos])
+                if concat_meta_all_obj[idx] is not None
+                else obj_pos
+            )
+
+    # add preamble
+    if preamble:
+        concat_meta_all_obj = add_remove_preamble(concat_meta_all_obj, preamble)
+
+    write_ISM_metadata_in_file(concat_meta_all_obj, out_file)
+
+    return
+
+
+def split_meta_in_file(
+    in_filename: Union[str, Path],
+    out_folder: Union[str, Path],
+    split_filenames: list[Union[str, Path]],
+    splits: list[int],
+    input_fmt: str,
+    meta_files: Optional[list[Union[str, Path]]] = None,
+    in_fs: Optional[int] = 48000,
+    preamble: Optional[int] = 0,
+):
+    """
+    Splits ISM metadata files into multiple shorter files
+
+    Parameters
+    ----------
+    in_filename: Union[str, Path]
+        Input filename (.pcm, .raw or .wav)
+    out_folder: Union[str, Path]
+        Output folder where to put the splits
+    split_filenames: list[Union[str, Path]]
+        List of names for the split files
+    splits: list[int]
+        List of sample indices where to cut the signal
+    input_fmt: str
+        Input audio format
+    meta_files: Optional[list[Union[str, Path]]]
+        List of corresponding metadata file names
+    in_fs: Optional[int]
+        Input sampling rate, default 48000 Hz
+    preamble: Optional[int]
+        Length of preamble in milliseconds
+    """
+
+    # create a list of output files
+    out_paths = []
+
+    # Read input file by creating ISM audio object
+    audio_object = audio.fromfile(input_fmt, in_filename, in_meta=meta_files, fs=in_fs)
+
+    split_old = 0
+    for idx, split in enumerate(splits):
+        out_paths_obj = []
+        for obj in range(audio_object.num_channels):
+            out_file = (
+                Path(out_folder)
+                / f"{Path(split_filenames[idx]).with_suffix(in_filename.suffix)}.{obj}.csv"
+            )
+
+            # add the path to our list
+            out_paths_obj.append(out_file)
+
+            # remove preamble
+            if preamble:
+                preamble_frames = int(preamble / IVAS_FRAME_LEN_MS)
+                y = trim(
+                    audio_object.object_pos[obj],
+                    audio_object.fs,
+                    (preamble_frames, 0),
+                    samples=True,
+                )
+            else:
+                y = audio_object.object_pos[obj]
+
+            # split
+            split_start = int(split_old / IVAS_FRAME_LEN_MS / audio_object.fs * 1000)
+            split_end = int(split / IVAS_FRAME_LEN_MS / audio_object.fs * 1000)
+            y = y[split_start:split_end, :]
+
+            # write file
+            write_ISM_metadata_in_file([y], [out_file])
+
+        out_paths.append(out_paths_obj)
+
+        split_old = split
+
+    return out_paths
+
+
+def check_ISM_metadata(
+    in_meta: dict,
+    num_objects: int,
+    num_items: int,
+    item_names: Optional[list] = None,
+) -> list:
+    """Find ISM metadata"""
+
+    list_meta = []
+    if in_meta is None:
+        for item in item_names:
+            list_item = metadata_search(Path(item).parent, [item], num_objects)
+            list_meta.append(list_item)
+    else:
+        if len(in_meta) == 1 and num_items != 1:
+            # automatic search for metadata files in folder for all items and objects
+            try:
+                path_meta = in_meta["all_items"]
+            except KeyError:
+                raise ValueError(
+                    'Only one metadata path is given but not with key "all_items".' 
+ ) + + list_meta = metadata_search(path_meta, item_names, num_objects) + + elif num_items == len(in_meta): + # search for every item individually + for item_idx in range(num_items): + # try to use item_names as keys + try: + if item_names: + current_item = in_meta[item_names[item_idx].name] + else: + raise KeyError + except KeyError: + current_item = in_meta[f"item{item_idx + 1}"] + + if len(current_item) == 1: + # automatic search in folder + list_item = metadata_search( + current_item[0], [item_names[item_idx]], num_objects + ) + + elif len(current_item) == num_objects: + # just read out + list_item = current_item + else: + raise ValueError("Number of objects and metadata does not match.") + list_meta.append(list_item) + else: + raise ValueError("Number of metadata inputs does not match number of items") + + # return list of lists of metadata files + return list_meta + + +def metadata_search( + in_meta_path: Union[str, Path], + item_names: list[Union[str, Path]], + num_objects: int, +) -> list[list[Union[Path, str]]]: + """Search for ISM metadata with structure item_name.{0-3}.csv in in_meta folder""" + + if not item_names: + raise ValueError("Item names not provided, can't search for metadata") + + list_meta = [] + for item in item_names: + list_item = [] + for obj_idx in range(num_objects): + file_name_meta = in_meta_path / Path(item.stem).with_suffix( + f"{item.suffix}.{obj_idx}.csv" + ) + # check if file exists and add to list + if file_name_meta.is_file(): + list_item.append(file_name_meta) + else: + raise ValueError(f"Metadata file {file_name_meta} not found.") + if len(item_names) == 1: + list_meta = list_item + else: + list_meta.append(list_item) + + return list_meta + + +def add_remove_preamble( + metadata, + preamble, + add: Optional[bool] = True, +): + preamble_frames = preamble / IVAS_FRAME_LEN_MS + if not preamble_frames.is_integer(): + raise ValueError( + f"Application of preamble for ISM metadata is only possible if preamble length is multiple of frame length. " + f"Frame length: {IVAS_FRAME_LEN_MS}ms" + ) + for obj_idx in range(len(metadata)): + if metadata is not None and metadata[obj_idx] is not None: + if add: + metadata[obj_idx] = trim( + metadata[obj_idx], + limits=(-int(preamble_frames), 0), + samples=True, + ) + + # add radius 1 + metadata[obj_idx][: int(preamble_frames), 2] = 1 + else: + metadata[obj_idx] = trim( + metadata[obj_idx], + limits=(int(preamble_frames), 0), + samples=True, + ) + + return metadata diff --git a/item_generation_scripts/audiotools/rotation.py b/item_generation_scripts/audiotools/rotation.py new file mode 100644 index 00000000..742548a8 --- /dev/null +++ b/item_generation_scripts/audiotools/rotation.py @@ -0,0 +1,379 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# + +from typing import Optional, Tuple + +import numpy as np + +""" +Helper functions used by Ruedenberg, +an implementation of the algorithm in +Ivanic, J. & Ruedenberg, K., J. Phys. Chem. 100, 6342 (1996) +translated from ivas_rotation.c +""" + + +def SHrot_p( + i: int, + l: int, + a: int, + b: int, + SHrotmat: np.ndarray, + R_lm1: np.ndarray, +) -> float: + """Helper function to calculate the ps""" + + ri1 = SHrotmat[i + 1 + 1][1 + 1 + 1] + rim1 = SHrotmat[i + 1 + 1][-1 + 1 + 1] + ri0 = SHrotmat[i + 1 + 1][0 + 1 + 1] + + if b == -l: + R_lm1_1 = R_lm1[a + l - 1][0] + R_lm1_2 = R_lm1[a + l - 1][2 * l - 2] + p = ri1 * R_lm1_1 + rim1 * R_lm1_2 + else: + if b == l: + R_lm1_1 = R_lm1[a + l - 1][2 * l - 2] + R_lm1_2 = R_lm1[a + l - 1][0] + p = ri1 * R_lm1_1 - rim1 * R_lm1_2 + else: + R_lm1_1 = R_lm1[a + l - 1][b + l - 1] + p = ri0 * R_lm1_1 + + return p + + +def SHrot_u( + l: int, + m: int, + n: int, + SHrotmat: np.ndarray, + R_lm1: np.ndarray, +) -> float: + """Helper function to calculate the us""" + + return SHrot_p(0, l, m, n, SHrotmat, R_lm1) + + +def SHrot_v( + l: int, + m: int, + n: int, + SHrotmat: np.ndarray, + R_lm1: np.ndarray, +) -> float: + """Helper function to calculate the vs""" + + if m == 0: + p0 = SHrot_p(1, l, 1, n, SHrotmat, R_lm1) + p1 = SHrot_p(-1, l, -1, n, SHrotmat, R_lm1) + return p0 + p1 + else: + if m > 0: + d = 1.0 if (m == 1) else 0.0 + p0 = SHrot_p(1, l, m - 1, n, SHrotmat, R_lm1) + p1 = SHrot_p(-1, l, -m + 1, n, SHrotmat, R_lm1) + return p0 * np.sqrt(1.0 + d) - p1 * (1.0 - d) + else: + d = 1.0 if (m == -1) else 0.0 + p0 = SHrot_p(1, l, m + 1, n, SHrotmat, R_lm1) + p1 = SHrot_p(-1, l, -m - 1, n, SHrotmat, R_lm1) + return p0 * (1.0 - d) + p1 * np.sqrt(1.0 + d) + + +def SHrot_w( + l: int, + m: int, + n: int, + SHrotmat: np.ndarray, + R_lm1: np.ndarray, +) -> float: + """Helper function to calculate the w""" + + if m == 0: + raise ValueError("ERROR should not be called\n") + else: + if m > 0: + p0 = SHrot_p(1, l, m + 1, n, SHrotmat, R_lm1) + p1 = SHrot_p(-1, l, -m - 1, n, SHrotmat, R_lm1) + return p0 + p1 + else: + p0 = SHrot_p(1, l, m - 1, n, SHrotmat, R_lm1) + p1 = SHrot_p(-1, l, -m + 1, n, SHrotmat, 
R_lm1) + return p0 - p1 + + +def SHrotmatgen( + R: np.ndarray, + order: Optional[int] = 3, +) -> np.ndarray: + """ + Calculate SHD rotation matrix from that in real space + translated from ivas_rotation.c + + Parameters: + ---------- + R: np.ndarray + real-space rotation matrix + order: Optional[int] + Ambisonics order, default = 3 + + Returns: + ---------- + SHrotmat: np.ndarray + SHD rotation matrix + """ + + dim = (order + 1) * (order + 1) + + SHrotmat = np.zeros([dim, dim]) + R_lm1 = np.zeros([dim, dim]) + R_l = np.zeros([dim, dim]) + + SHrotmat[0][0] = 1.0 + + SHrotmat[1][1] = R[1][1] + SHrotmat[1][2] = R[1][2] + SHrotmat[1][3] = R[1][0] + + SHrotmat[2][1] = R[2][1] + SHrotmat[2][2] = R[2][2] + SHrotmat[2][3] = R[2][0] + + SHrotmat[3][1] = R[0][1] + SHrotmat[3][2] = R[0][2] + SHrotmat[3][3] = R[0][0] + + for i in range(2 * 1 + 1): + for j in range(2 * 1 + 1): + R_lm1[i][j] = SHrotmat[i + 1][j + 1] + + band_idx = 4 + for l in range(2, order + 1): + R_l[:, :] = 0.0 + + for m in range(-l, l + 1): + d = 1 if (m == 0) else 0 + absm = abs(m) + sql2mm2 = np.sqrt((l * l - m * m)) + sqdabsm = np.sqrt(((1 + d) * (l + absm - 1) * (l + absm))) + sqlabsm = np.sqrt(((l - absm - 1) * (l - absm))) + + for n in range(-l, l + 1): + if abs(n) == l: + sqdenom = np.sqrt((2 * l) * (2 * l - 1)) + else: + sqdenom = np.sqrt(l * l - n * n) + + u = sql2mm2 / sqdenom + v = sqdabsm / sqdenom * (1 - 2 * d) * 0.5 + w = sqlabsm / sqdenom * (1 - d) * (-0.5) + + if u != 0: + u = u * SHrot_u(l, m, n, SHrotmat, R_lm1) + if v != 0: + v = v * SHrot_v(l, m, n, SHrotmat, R_lm1) + if w != 0: + w = w * SHrot_w(l, m, n, SHrotmat, R_lm1) + R_l[m + l][n + l] = u + v + w + + for i in range(2 * l + 1): + for j in range(2 * l + 1): + SHrotmat[band_idx + i][band_idx + j] = R_l[i][j] + + for i in range(2 * l + 1): + for j in range(2 * l + 1): + R_lm1[i][j] = R_l[i][j] + + band_idx += 2 * l + 1 + + return SHrotmat + + +def Quat2Euler( + quat: np.ndarray, + degrees: bool = True, +) -> np.ndarray: + """Convert Quaternion to Euler angles""" + + sinr = +2.0 * (quat[..., 0] * quat[..., 1] + quat[..., 2] * quat[..., 3]) + cosr = +1.0 - 2.0 * (quat[..., 1] * quat[..., 1] + quat[..., 2] * quat[..., 2]) + roll = np.arctan2(sinr, cosr) + + sinp = +2.0 * (quat[..., 0] * quat[..., 2] - quat[..., 3] * quat[..., 1]) + pitch = np.where(np.fabs(sinp) >= 1, np.copysign(np.pi / 2, sinp), np.arcsin(sinp)) + + siny = +2.0 * (quat[..., 0] * quat[..., 3] + quat[..., 1] * quat[..., 2]) + cosy = +1.0 - 2.0 * (quat[..., 2] * quat[..., 2] + quat[..., 3] * quat[..., 3]) + yaw = np.arctan2(siny, cosy) + + ypr = np.array([yaw, pitch, roll]).T + + if degrees: + ypr = np.rad2deg(ypr) + + return ypr + + +def Euler2Quat( + ypr: np.ndarray, + degrees: bool = True, +) -> np.ndarray: + """Convert Euler angles to Quaternion""" + + if degrees: + ypr = np.deg2rad(ypr) + + if len(ypr.shape) == 2: + N_quat = ypr.shape[0] + quat = np.zeros([N_quat, 4]) + yaw = ypr[:, 0] + pitch = ypr[:, 1] + roll = ypr[:, 2] + else: + quat = np.zeros([4]) + yaw = ypr[0] + pitch = ypr[1] + roll = ypr[2] + + c1 = np.cos(0.5 * yaw) + c2 = np.cos(0.5 * pitch) + c3 = np.cos(0.5 * roll) + + s1 = np.sin(0.5 * yaw) + s2 = np.sin(0.5 * pitch) + s3 = np.sin(0.5 * roll) + + quat[..., 0] = c3 * c2 * c1 + s3 * s2 * s1 + quat[..., 1] = s3 * c2 * c1 - c3 * s2 * s1 + quat[..., 2] = s3 * c2 * s1 + c3 * s2 * c1 + quat[..., 3] = c3 * c2 * s1 - s3 * s2 * c1 + + return quat + + +def Quat2RotMat( + quat: np.ndarray, +) -> np.ndarray: + """Convert quaternion to rotation matrix""" + + R = np.zeros([3, 3]) + + if 
quat[0] != -3: + # Quaternions + # formula taken from ivas_rotation.c + + R[0, 0] = ( + quat[0] * quat[0] + + quat[1] * quat[1] + - quat[2] * quat[2] + - quat[3] * quat[3] + ) + R[0, 1] = 2.0 * (quat[1] * quat[2] - quat[0] * quat[3]) + R[0, 2] = 2.0 * (quat[1] * quat[3] + quat[0] * quat[2]) + + R[1, 0] = 2.0 * (quat[1] * quat[2] + quat[0] * quat[3]) + R[1, 1] = ( + quat[0] * quat[0] + - quat[1] * quat[1] + + quat[2] * quat[2] + - quat[3] * quat[3] + ) + R[1, 2] = 2.0 * (quat[2] * quat[3] - quat[0] * quat[1]) + + R[2, 0] = 2.0 * (quat[1] * quat[3] - quat[0] * quat[2]) + R[2, 1] = 2.0 * (quat[2] * quat[3] + quat[0] * quat[1]) + R[2, 2] = ( + quat[0] * quat[0] + - quat[1] * quat[1] + - quat[2] * quat[2] + + quat[3] * quat[3] + ) + + else: + # Euler angles in R_X(roll)*R_Y(pitch)*R_Z(yaw) convention + # + # yaw: rotate scene counter-clockwise in the horizontal plane + # pitch: rotate scene in the median plane, increase elevation with positive values + # roll: rotate scene from the right ear to the top + # + # formula taken from ivas_rotation.c + + c1 = np.cos(quat[3] / 180.0 * np.pi) + c2 = np.cos(quat[2] / 180.0 * np.pi) + c3 = np.cos(quat[1] / 180.0 * np.pi) + + s1 = np.sin(quat[3] / 180.0 * np.pi) + s2 = np.sin(-quat[2] / 180.0 * np.pi) + s3 = np.sin(quat[1] / 180.0 * np.pi) + + R[0, 0] = c2 * c3 + R[0, 1] = -c2 * s3 + R[0, 2] = s2 + + R[1, 0] = c1 * s3 + c3 * s1 * s2 + R[1, 1] = c1 * c3 - s1 * s2 * s3 + R[1, 2] = -c2 * s1 + + R[2, 0] = s1 * s3 - c1 * c3 * s2 + R[2, 1] = c3 * s1 + c1 * s2 * s3 + R[2, 2] = c1 * c2 + + return R + + +def rotateAziEle( + azi: float, + ele: float, + R: np.ndarray, + is_planar: bool = False, +) -> Tuple[float, float]: + """Rotate azimuth and elevation angles with rotation matrix""" + + w = np.cos(np.deg2rad(ele)) + dv = np.array( + [ + w * np.cos(np.deg2rad(azi)), + w * np.sin(np.deg2rad(azi)), + np.sin(np.deg2rad(ele)), + ] + ) + + dv_rot = R @ dv + + azi = np.rad2deg(np.arctan2(dv_rot[1], dv_rot[0])) + if is_planar: + ele = 0 + else: + ele = np.rad2deg(np.arctan2(dv_rot[2], np.sqrt(np.sum(dv_rot[:2] ** 2)))) + + return azi, ele diff --git a/item_generation_scripts/audiotools/utils.py b/item_generation_scripts/audiotools/utils.py new file mode 100644 index 00000000..6aaf5fa9 --- /dev/null +++ b/item_generation_scripts/audiotools/utils.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. 
This notice grants no license of any kind, including but not limited to patent
+# license, nor is any license granted by implication, estoppel or otherwise.
+#
+# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
+# contributions.
+#
+# This software is provided "AS IS", without any express or implied warranties. The software is in the
+# development stage. It is intended exclusively for experts who have experience with such software and
+# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+# and fitness for a particular purpose are hereby disclaimed and excluded.
+#
+# Any dispute, controversy or claim arising under or in relation to providing this software shall be
+# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+# the United Nations Convention on Contracts on the International Sales of Goods.
+#
+
+from pathlib import Path
+
+import numpy as np
+
+from item_generation_scripts.audiotools.rotation import Euler2Quat, Quat2Euler
+
+
+def read_trajectory(trj_file: Path, return_quat=True):
+    trj = np.genfromtxt(trj_file, delimiter=",")
+
+    if np.all(trj[:, 0] == -3):
+        # Euler
+        if return_quat:
+            return Euler2Quat(trj[:, 1:])
+        else:
+            return trj[:, 1:]
+    else:
+        # Quat
+        if return_quat:
+            return trj
+        else:
+            return Quat2Euler(trj)
+
+
+def write_trajectory(trj, out_file, write_quat=True):
+    if trj.shape[1] == 3:
+        # Euler
+        if write_quat:
+            trj = Euler2Quat(trj)
+        else:
+            trj = np.insert(trj, 0, -3.0, axis=1)
+    elif not write_quat:
+        trj = Quat2Euler(trj)
+        trj = np.insert(trj, 0, -3.0, axis=1)
+
+    with open(out_file, "w") as f:
+        for pos in trj:
+            f.write(", ".join([f"{q:.6f}" for q in pos]))
+            f.write("\n")
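+
+
+# NOTE (editorial sketch, not part of the original IVAS scripts): trajectory
+# files are CSV, holding either one quaternion (w, x, y, z) per row, or Euler
+# angles flagged by -3 in the first column (yaw, pitch, roll follow). A hedged
+# round-trip example with hypothetical file names:
+#
+#     ypr = np.array([[10.0, 0.0, 0.0], [20.0, 0.0, 0.0]])  # yaw sweep
+#     write_trajectory(ypr, "traj.csv")          # stored as quaternions
+#     quat = read_trajectory(Path("traj.csv"))   # read back as quaternions
diff --git a/item_generation_scripts/audiotools/wrappers/__init__.py b/item_generation_scripts/audiotools/wrappers/__init__.py
new file mode 100644
index 00000000..aea270d8
--- /dev/null
+++ b/item_generation_scripts/audiotools/wrappers/__init__.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+
+#
+# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository. All Rights Reserved.
+#
+# This software is protected by copyright law and by international treaties.
+# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository retain full ownership rights in their respective contributions in
+# the software. This notice grants no license of any kind, including but not limited to patent
+# license, nor is any license granted by implication, estoppel or otherwise.
+#
+# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
+# contributions. 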
+# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# diff --git a/item_generation_scripts/audiotools/wrappers/bs1770.py b/item_generation_scripts/audiotools/wrappers/bs1770.py new file mode 100644 index 00000000..d238bec3 --- /dev/null +++ b/item_generation_scripts/audiotools/wrappers/bs1770.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. 
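+
+"""
+Usage sketch for the loudness helpers in this module (illustrative only; the
+file name, the "STEREO" format string and the 48 kHz rate are assumed example
+values):
+
+    from item_generation_scripts.audiotools import audio
+    from item_generation_scripts.audiotools.wrappers.bs1770 import (
+        get_loudness,
+        loudness_norm,
+    )
+
+    # measure loudness and the scale factor needed to reach -26 LKFS
+    item = audio.fromfile("STEREO", "item.wav", 48000)
+    measured, scale = get_loudness(item, target_loudness=-26)
+
+    # iteratively normalize the item to -26 LKFS in place
+    item.audio = loudness_norm(item, target_loudness=-26)
+"""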
+#
+
+import copy
+import logging
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import Optional, Tuple, Union
+from warnings import warn
+
+import numpy as np
+
+from item_generation_scripts.audiotools import audio, convert
+from item_generation_scripts.audiotools.audiofile import write
+from item_generation_scripts.audiotools.wrappers.filter import resample_itu
+from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES
+from item_generation_scripts.utils import find_binary, get_devnull, run
+
+logger = logging.getLogger("__main__")
+logger.setLevel(logging.DEBUG)
+
+
+def bs1770demo(
+    input: audio.Audio,
+    target_loudness: Optional[float] = -26,
+) -> Tuple[float, float]:
+    """
+    Wrapper for ITU-R BS.1770-4, requires bs1770demo binary
+
+    Parameters
+    ----------
+    input: Audio
+        Input audio
+    target_loudness: Optional[float]
+        Desired loudness in LKFS
+
+    Returns
+    -------
+    measured_loudness : float
+        Measured loudness of input
+    scale_factor: float
+        Scale factor to achieve desired loudness
+    """
+
+    null_file = get_devnull()
+
+    if "bs1770demo" in DEFAULT_CONFIG_BINARIES["binary_paths"]:
+        binary = find_binary(
+            DEFAULT_CONFIG_BINARIES["binary_paths"]["bs1770demo"].name,
+            binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["bs1770demo"].parent,
+        )
+    else:
+        binary = find_binary("bs1770demo")
+
+    if not isinstance(input, audio.BinauralAudio) and not isinstance(
+        input, audio.ChannelBasedAudio
+    ):
+        raise NotImplementedError(f"{input.name} is unsupported in ITU-R BS.1770-4.")
+
+    if input.fs != 48000:
+        warn(
+            "ITU-R BS.1770-4 only supports 48kHz sampling rate. Temporarily resampling signal for measurement."
+        )
+        tmp_sig = resample_itu(input, 48000)
+    else:
+        tmp_sig = input.audio
+
+    with TemporaryDirectory() as tmp_dir:
+        tmp_dir = Path(tmp_dir)
+        tmp_file = tmp_dir.joinpath("tmp_loudness.pcm")
+
+        """
+        ITU-R BS.1770-4
+        """
+
+        cmd = [
+            str(binary),
+            "-nchan",
+            str(input.num_channels),  # input nchan
+            "-lev",
+            str(target_loudness),  # level
+            "-conf",
+            "",  # config string
+            str(tmp_file),
+            null_file,
+        ]
+
+        if isinstance(input, audio.BinauralAudio):
+            cmd[6] = "00"  # -conf
+        elif isinstance(input, audio.ChannelBasedAudio):
+            # if a loudspeaker position fulfills the criteria, set the config string to 1 for that index
+            conf_str = [
+                str(int(abs(e) < 30 and (60 <= abs(a) <= 120)))
+                for a, e in zip(input.ls_azi, input.ls_ele)
+            ]
+            for lfe in input.lfe_index:
+                conf_str[lfe] = "L"
+
+            cmd[6] = "".join(conf_str)
+
+        # write temporary file
+        write(tmp_file, tmp_sig, 48000)
+
+        # run command
+        result = run(cmd, logger=logger)
+
+        # parse output
+        measured_loudness = float(result.stdout.splitlines()[3].split(":")[1])
+        scale_factor = float(result.stdout.splitlines()[-3].split(":")[1])
+
+    return measured_loudness, scale_factor
+
+
+def get_loudness(
+    input: audio.Audio,
+    target_loudness: Optional[float] = -26,
+    loudness_format: Optional[str] = None,
+) -> Tuple[float, float]:
+    """
+    Loudness measurement using ITU-R BS.1770-4
+
+    Parameters
+    ----------
+    input : Audio
+        Input audio
+    target_loudness: float
+        Desired loudness in LKFS
+    loudness_format: str
+        Loudness format to render to for loudness computation (default: input format if possible)
+
+    Returns
+    -------
+    measured_loudness : float
+        Measured loudness (after conversion to loudness_format if specified)
+    scale_factor: float
+        Scale factor to achieve desired loudness
+    """
+
+    if target_loudness > 0:
+        raise ValueError("Desired loudness is too high!")
+
+    if loudness_format is None:
+        # for some formats rendering is necessary prior to loudness measurement
+        if isinstance(input, audio.SceneBasedAudio) or isinstance(
+            input, audio.MetadataAssistedSpatialAudio
+        ):
+            loudness_format = "7_1_4"
+        elif isinstance(input, audio.ObjectBasedAudio):
+            loudness_format = "BINAURAL"
+        elif hasattr(input, "layout_file"):
+            loudness_format = input.layout_file
+        else:
+            # default: use input format
+            loudness_format = input.name
+
+    # configure intermediate format
+    tmp = audio.fromtype(loudness_format)
+    tmp.fs = input.fs
+
+    if input.name != loudness_format:
+        convert.format_conversion(input, tmp)
+    else:
+        tmp.audio = input.audio
+
+    return bs1770demo(tmp, target_loudness)
+
+
+def loudness_norm(
+    input: audio.Audio,
+    target_loudness: Optional[float] = -26,
+    loudness_format: Optional[str] = None,
+) -> np.ndarray:
+    """
+    Iterative loudness normalization using ITU-R BS.1770-4
+    The signal is iteratively scaled after rendering to the specified format
+    until its loudness converges to the target value
+
+    Parameters
+    ----------
+    input : Audio
+        Input audio
+    target_loudness: Optional[float]
+        Desired loudness level in LKFS
+    loudness_format: Optional[str]
+        Loudness format to render to for loudness computation (default: input format)
+
+    Returns
+    -------
+    norm : np.ndarray
+        Normalized audio array
+    """
+
+    # repeat until convergence of loudness
+    measured_loudness = np.inf
+    scale_factor = 1
+    num_iter = 1
+
+    while np.abs(measured_loudness - target_loudness) > 0.5 and num_iter < 10:
+        measured_loudness, scale_factor_new = get_loudness(
+            input, target_loudness, loudness_format
+        )
+
+        # scale input
+        input.audio *= scale_factor_new
+
+        # update scale factor
+        scale_factor *= scale_factor_new
+
+        num_iter += 1
+
+    # warn only if the iteration limit was hit without convergence
+    if num_iter >= 10 and np.abs(measured_loudness - target_loudness) > 0.5:
+        warn(
+            f"Loudness did not converge to desired value, stopping at: {measured_loudness:.2f}"
+        )
+
+    return input.audio
+
+
+def scale_files(
+    file_list: list[list[Union[Path, str]]],
+    fmt: str,
+    loudness: float,
+    fs: Optional[int] = 48000,
+    in_meta: Optional[list] = None,
+) -> None:
+    """
+    Scales audio files to the desired loudness
+
+    Parameters
+    ----------
+    file_list : list[list[Union[Path, str]]]
+        List of file paths in a list of the condition folders
+    fmt: str
+        Audio format of files in list
+    loudness: float
+        Desired loudness level in LKFS/dBov
+    in_meta: Optional[list]
+        Metadata for ISM; same structure as file_list, with one additional
+        nesting level holding the list of metadata files for each item
+    fs: Optional[int]
+        Sampling rate
+    """
+
+    if fmt.startswith("ISM") and in_meta:
+        meta_bool = True
+    else:
+        in_meta = copy.copy(file_list)
+        meta_bool = False
+
+    for folder, meta_folder in zip(file_list, in_meta):
+        for file, meta in zip(folder, meta_folder):
+            # create audio object
+            if meta_bool:
+                audio_obj = audio.fromfile(fmt, file, fs, meta)
+            else:
+                audio_obj = audio.fromfile(fmt, file, fs)
+
+            # adjust loudness
+            scaled_audio = loudness_norm(audio_obj, loudness)
+
+            # write into file
+            write(file, scaled_audio, audio_obj.fs)
diff --git a/item_generation_scripts/audiotools/wrappers/eid_xor.py b/item_generation_scripts/audiotools/wrappers/eid_xor.py
new file mode 100644
index 00000000..0b807d94
--- /dev/null
+++ b/item_generation_scripts/audiotools/wrappers/eid_xor.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+
+#
+# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. 
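+
+"""
+Usage sketch for the error-insertion helpers in this module (paths, frame
+count and the 3% frame error rate are assumed example values):
+
+    from pathlib import Path
+
+    from item_generation_scripts.audiotools.wrappers.eid_xor import (
+        create_and_apply_error_pattern,
+    )
+
+    # generate a 3% FER pattern for a 1500-frame item and XOR it into the bitstream
+    create_and_apply_error_pattern(
+        Path("item.192"),
+        Path("item_fer.192"),
+        len_sig=1500,
+        error_rate=3.0,
+    )
+"""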
+#
+
+import os.path
+from pathlib import Path
+from typing import Optional, Union
+
+from item_generation_scripts.audiotools.wrappers.gen_patt import create_error_pattern
+from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES
+from item_generation_scripts.utils import find_binary, run
+
+
+def eid_xor(
+    error_pattern: Union[str, Path],
+    in_bitstream: Union[str, Path],
+    out_bitstream: Union[str, Path],
+) -> None:
+    """
+    Wrapper for eid-xor binary to apply error patterns for the bitstream processing
+
+    Parameters
+    ----------
+    error_pattern: Union[str, Path]
+        Path to error pattern file
+    in_bitstream: Union[str, Path]
+        Path to input bitstream file
+    out_bitstream: Union[str, Path]
+        Output path for modified bitstream
+    """
+
+    # find binary
+    if "eid-xor" in DEFAULT_CONFIG_BINARIES["binary_paths"]:
+        binary = find_binary(
+            DEFAULT_CONFIG_BINARIES["binary_paths"]["eid-xor"].name,
+            binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["eid-xor"].parent,
+        )
+    else:
+        binary = find_binary("eid-xor")
+
+    # check for valid inputs
+    if not Path(in_bitstream).is_file():
+        raise ValueError(
+            f"Input bitstream file {in_bitstream} for bitstream processing does not exist"
+        )
+    elif not Path(error_pattern).is_file():
+        raise ValueError(
+            f"Error pattern file {error_pattern} for bitstream processing does not exist"
+        )
+
+    # set up command line
+    cmd = [
+        str(binary),
+        "-vbr",  # Enables variable bit rate operation
+        "-fer",  # Error pattern is a frame erasure pattern
+        in_bitstream,
+        error_pattern,
+        out_bitstream,
+    ]
+
+    # run command
+    run(cmd)
+
+    return
+
+
+def create_and_apply_error_pattern(
+    in_bitstream: Union[Path, str],
+    out_bitstream: Union[Path, str],
+    len_sig: int,
+    error_pattern: Optional[Union[Path, str]] = None,
+    error_rate: Optional[float] = None,
+    preamble: Optional[int] = 0,
+    master_seed: Optional[int] = 0,
+    prerun_seed: Optional[int] = 0,
+) -> None:
+    """
+    Function to create (or use an existing) frame error pattern for bitstream processing
+
+    Parameters
+    ----------
+    in_bitstream: Union[Path, str]
+        Path of input bitstream
+    out_bitstream: Union[Path, str]
+        Path of output bitstream
+    len_sig: int
+        Length of signal in frames
+    error_pattern: Optional[Union[Path, str]]
+        Path to existing error pattern
+    error_rate: Optional[float]
+        Error rate in percent
+    preamble: Optional[int]
+        Length of preamble in frames
+    master_seed: Optional[int]
+        Master seed for error pattern generation
+    prerun_seed: Optional[int]
+        Number of preruns in seed generation
+    """
+
+    if error_pattern is None:
+        # create error pattern
+        if error_rate is not None:
+            error_pattern = (
+                Path(in_bitstream).parent.joinpath("error_pattern").with_suffix(".192")
+            )
+            create_error_pattern(
+                len_sig, error_pattern, error_rate, preamble, master_seed, prerun_seed
+            )
+        else:
+            raise ValueError(
+                "Either error pattern or error rate has to be specified for bitstream processing"
+            )
+    elif error_rate is not None:
+        raise ValueError(
+            "Error pattern and error rate are specified for bitstream processing. Can't use both"
+        )
+
+    # apply error pattern
+    eid_xor(error_pattern, in_bitstream, out_bitstream)
+
+    return
+
+
+def validate_error_pattern_application(
+    error_pattern: Optional[Union[Path, str]] = None,
+    error_rate: Optional[int] = None,
+) -> None:
+    """
+    Validate settings for frame error pattern application
+
+    Parameters
+    ----------
+    error_pattern: Optional[Union[Path, str]]
+        Path to existing error pattern
+    error_rate: Optional[int]
+        Frame error rate
+    """
+
+    if find_binary("gen-patt") is None:
+        raise FileNotFoundError(
+            "The binary gen-patt for error pattern generation was not found! Please check the configuration."
+        )
+    if find_binary("eid-xor") is None:
+        raise FileNotFoundError(
+            "The binary eid-xor for error pattern application was not found! Please check the configuration."
+        )
+    if error_pattern is not None:
+        if not os.path.exists(os.path.realpath(error_pattern)):
+            raise FileNotFoundError(
+                f"The frame error profile file {error_pattern} was not found! Please check the configuration."
+            )
+        if error_rate is not None:
+            raise ValueError(
+                "Frame error pattern and error rate are specified for bitstream processing. Can't use both! Please check the configuration."
+            )
+    else:
+        if error_rate is None:
+            raise ValueError(
+                "Either error rate or error pattern has to be specified for FER bitstream processing."
+            )
+        elif error_rate < 0 or error_rate > 100:
+            raise ValueError(
+                f"Specified error rate of {error_rate}% is either too large or too small."
+            )
+    return
diff --git a/item_generation_scripts/audiotools/wrappers/esdru.py b/item_generation_scripts/audiotools/wrappers/esdru.py
new file mode 100644
index 00000000..7785a586
--- /dev/null
+++ b/item_generation_scripts/audiotools/wrappers/esdru.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+
+#
+# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository. All Rights Reserved.
+#
+# This software is protected by copyright law and by international treaties.
+# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository retain full ownership rights in their respective contributions in
+# the software. This notice grants no license of any kind, including but not limited to patent
+# license, nor is any license granted by implication, estoppel or otherwise.
+#
+# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
+# contributions.
+#
+# This software is provided "AS IS", without any express or implied warranties. The software is in the
+# development stage. It is intended exclusively for experts who have experience with such software and
+# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+# and fitness for a particular purpose are hereby disclaimed and excluded.
+# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# + +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Optional + +import numpy as np + +from item_generation_scripts.audiotools import audio +from item_generation_scripts.audiotools.audiofile import read, write +from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES +from item_generation_scripts.utils import find_binary, run + + +def esdru( + input: audio.Audio, + alpha: float, + sf: Optional[int] = 48000, + e_step: Optional[float] = 0.5, + seed: Optional[int] = 1, +) -> np.ndarray: + """ + Wrapper for ESDRU (Ericsson spatial distortion reference unit) Recommendation ITU-T P.811, requires esdru binary + + Parameters + ---------- + input : Audio + Input audio (16 bit Stereo PCM) + alpha: float + Alpha value [0.0 ... 1.0] + sf: Optional[int] + Sampling frequency FS Hz (Default: 48000 Hz) + e_step: Optional[float] + Max step S during high energy [0.0 ... 1.0] (Default: 0.5) + seed: Optional[int] + Set random seed I [unsigned int] (Default: 1) + + Returns + ------- + output: np.ndarray + Output array (16 bit Stereo PCM) + """ + if "esdru" in DEFAULT_CONFIG_BINARIES["binary_paths"]: + binary = find_binary( + DEFAULT_CONFIG_BINARIES["binary_paths"]["esdru"].name, + binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["esdru"].parent, + ) + else: + binary = find_binary("esdru") + + if not isinstance(input, audio.BinauralAudio) and not input.name == "STEREO": + raise Exception( + "ESDRU condition only available for STEREO or BINAURAL output format" + ) + + if alpha < 0.0 or alpha > 1.0: + raise Exception( + "Alpha value is out of bounds. Please choose a value between 0.0 and 1.0." + ) + + if e_step < 0.0 or e_step > 1.0: + raise Exception( + "Step value is out of bounds. Please choose a value between 0.0 and 1.0." + ) + + tmp_input_signal = input.audio + tmp_output_signal = np.ones((48000, 2)) + + with TemporaryDirectory() as tmp_dir: + tmp_dir = Path(tmp_dir) + tmp_input_file = tmp_dir.joinpath("tmp_input_signal.raw") + tmp_output_file = tmp_dir.joinpath("tmp_output_signal.raw") + + """ + ITU-T Recommendation P.811, ESDRU + """ + + cmd = [ + str(binary), + "-sf", + str(sf), + "-e_step", + str(e_step), + "-seed", + str(seed), + str(alpha), + str(tmp_input_file), + str(tmp_output_file), + ] + + # write temporary file + write(tmp_input_file, tmp_input_signal, sf) + write(tmp_output_file, tmp_output_signal, sf) + + # run command + run(cmd) + + tmp_output_signal, out_fs = read(tmp_output_file, 2, sf) + + return tmp_output_signal diff --git a/item_generation_scripts/audiotools/wrappers/filter.py b/item_generation_scripts/audiotools/wrappers/filter.py new file mode 100644 index 00000000..4c7b61b4 --- /dev/null +++ b/item_generation_scripts/audiotools/wrappers/filter.py @@ -0,0 +1,366 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. 
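+
+"""
+Usage sketch for the ITU-T STL filter wrappers in this module (file name and
+rates are assumed example values). Note that the wrappers return plain arrays
+and do not update the fs attribute of the input object:
+
+    from item_generation_scripts.audiotools import audio
+    from item_generation_scripts.audiotools.wrappers.filter import (
+        hp50filter_itu,
+        resample_itu,
+    )
+
+    item = audio.fromfile("STEREO", "item.wav", 48000)
+
+    # 50 Hz high-pass at 48 kHz, then downsample to 32 kHz
+    item.audio = hp50filter_itu(item)
+    item.audio = resample_itu(item, 32000)
+    item.fs = 32000
+"""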
+#
+
+import re
+from copy import copy
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import Optional
+from warnings import warn
+
+import numpy as np
+
+from item_generation_scripts.audiotools.audio import Audio, ChannelBasedAudio
+from item_generation_scripts.audiotools.audioarray import delay_compensation
+from item_generation_scripts.audiotools.audiofile import read, write
+from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES
+from item_generation_scripts.utils import find_binary, run
+
+FILTER_TYPES_REGEX = r"[\n][\s]{3}[A-Z0-9]\w+\s+"
+
+
+def filter_itu(
+    input: Audio,
+    flt_type: str,
+    block_size: Optional[int] = None,
+    mod: Optional[bool] = False,
+    up: Optional[bool] = False,
+    down: Optional[bool] = False,
+    is_async: Optional[bool] = False,
+    delay: Optional[int] = None,
+    skip_channel: Optional[list[int]] = None,
+) -> np.ndarray:
+    """
+    Filter a multi-channel audio array using the ITU-T STL filter binary
+
+    Parameters
+    ----------
+    input: Audio
+        Input audio
+    flt_type: str
+        Name of filter type used for filtering
+    block_size: Optional[int]
+        Processing block size in number of samples (default 256 samples)
+    mod: Optional[bool]
+        Flag for using the modified IRS characteristic
+    up: Optional[bool]
+        Flag for up-sampling
+    down: Optional[bool]
+        Flag for down-sampling
+    is_async: Optional[bool]
+        Flag for asynchronization operation
+    delay: Optional[int]
+        Delay in number of samples
+    skip_channel: Optional[list[int]]
+        List of channel indices which should not be filtered
+
+    Returns
+    -------
+    output: np.ndarray
+        Output filtered array
+    """
+
+    if "filter" in DEFAULT_CONFIG_BINARIES["binary_paths"]:
+        binary = find_binary(
+            DEFAULT_CONFIG_BINARIES["binary_paths"]["filter"].name,
+            binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["filter"].parent,
+        )
+    else:
+        binary = find_binary("filter")
+
+    # check if filter type is supported
+    tmp = run([binary], check=False)
+
+    FILTER_TYPES = [
+        f.group().strip() for f in re.finditer(FILTER_TYPES_REGEX, tmp.stdout)
+    ]
+
+    if flt_type not in FILTER_TYPES:
+        raise ValueError(
+            f"Filter type {flt_type} does not seem to be supported by the binary: {FILTER_TYPES}"
+        )
+
+    # create command line
+    cmd = [
+        binary,
+        "-q",
+    ]
+
+    if mod:
+        cmd.append("-mod")
+    if up and down:
+        raise ValueError("Up-sampling and down-sampling cannot be selected at the same time")
+    if up:
+        cmd.append("-up")
+    elif down:
+        cmd.append("-down")
+    if is_async:
+        cmd.append("-async")
+    if delay:
+        cmd.extend(["-delay", str(delay)])
+
+    cmd.append(str(flt_type))
+
+    # create output array of the according size
+    if up:
+        # upsampling -> size increases
+        if flt_type == "SHQ2":
+            output = np.zeros((np.shape(input.audio)[0] * 2, np.shape(input.audio)[1]))
+        elif flt_type == "SHQ3":
+            output = np.zeros((np.shape(input.audio)[0] * 3, np.shape(input.audio)[1]))
+        else:
+            raise ValueError(f"No upsampling with {flt_type} possible")
+    elif down:
+        # downsampling -> size decreases
+        if flt_type == "SHQ2":
+            output = np.zeros(
+                (int(np.ceil(np.shape(input.audio)[0] / 2)), np.shape(input.audio)[1])
+            )
+        elif flt_type == "SHQ3":
+            output = np.zeros(
+                (int(np.ceil(np.shape(input.audio)[0] / 3)), np.shape(input.audio)[1])
+            )
+        else:
+            raise ValueError(f"No downsampling with {flt_type} possible")
+    else:
+        # normal filtering -> size remains
+        output = np.zeros_like(input.audio)
+
+    with TemporaryDirectory() as tmp_dir:
+        tmp_dir = Path(tmp_dir)
+
+        # process channels separately
+        for channel in range(input.num_channels):
+            if skip_channel and channel in skip_channel:
+                continue
+
+            cmd_in_out = cmd.copy()
+
+            tmp_in = tmp_dir.joinpath(f"tmp_filterIn{channel}.pcm")
+            tmp_out = tmp_dir.joinpath(f"tmp_filterOut{channel}.pcm")
+
+            cmd_in_out.append(str(tmp_in))
+            cmd_in_out.append(str(tmp_out))
+
+            if block_size:
+                cmd_in_out.append(str(block_size))
+
+            write(tmp_in, input.audio[:, channel], input.fs)
+
+            run(cmd_in_out)
+
+            a, _ = read(tmp_out, nchannels=1, fs=input.fs)
+            output[:, channel][:, None] = a
+
+    return output
+
+
+def lpfilter_itu(
+    x: Audio,
+    fc: int,
+) -> np.ndarray:
+    """
+    Low-pass filter a multi-channel audio array
+
+    Parameters
+    ----------
+    x: Audio
+        Input audio
+    fc: int
+        Cut-off frequency in Hz
+
+    Returns
+    -------
+    y: np.ndarray
+        Output low-pass filtered array
+    """
+
+    # find the right filter type for the cut-off frequency
+    flt_types = ["LP1p5", "LP35", "LP7", "LP10", "LP12", "LP14", "LP20"]
+    flt_vals = [1500, 3500, 7000, 10000, 12000, 14000, 20000]
+    try:
+        flt_type = flt_types[flt_vals.index(fc)]
+    except Exception:
+        raise ValueError(f"LP cut-off frequency {fc}Hz not supported.")
+
+    # resample if the sampling rate is not supported
+    old_fs = None
+    tmp = copy(x)
+    if x.fs != 48000:
+        warn(
+            f"Filter type {flt_type} only supported for 48kHz sampling rate, not for {x.fs}Hz -> resampling"
+        )
+        old_fs = x.fs
+        tmp.audio = resample_itu(tmp, 48000)
+        tmp.fs = 48000
+
+    # apply filter
+    y = filter_itu(tmp, flt_type=flt_type, block_size=960)
+
+    # delay compensation
+    y = delay_compensation(y, flt_type=flt_type, fs=tmp.fs)
+
+    # reverse resampling
+    if old_fs:
+        tmp.audio = y
+        y = resample_itu(tmp, old_fs)
+
+    return y
+
+
+def hp50filter_itu(
+    x: Audio,
+) -> np.ndarray:
+    """
+    High-pass 50Hz filter a multi-channel audio array
+
+    Parameters
+    ----------
+    x: Audio
+        Input audio
+
+    Returns
+    -------
+    y: np.ndarray
+        Output high-pass filtered array
+    """
+
+    # set filter type and check if the sampling rate is supported
+    old_fs = None
+    tmp = copy(x)
+    if x.fs == 48000:
+        flt_type = "HP50_48KHZ"
+    elif x.fs == 32000:
+        flt_type = "HP50_32KHZ"
+    else:
+        # resample if the sampling rate is not supported
+        warn(
+            f"Filter type HP50 only supported for 48kHz and 32kHz sampling rate, not for {x.fs}Hz -> resampling"
+        )
+        flt_type = "HP50_48KHZ"
+        old_fs = x.fs
+        tmp.audio = resample_itu(tmp, 48000)
+        tmp.fs = 48000
+
+    # don't apply high-pass filtering to the LFE channel
+    if isinstance(x, ChannelBasedAudio):
+        skip_channel = x.lfe_index
+    else:
+        skip_channel = None
+
+    # apply filter
+    y = filter_itu(tmp, flt_type=flt_type, skip_channel=skip_channel)
+
+    # delay compensation
+    y = delay_compensation(y, flt_type=flt_type, fs=tmp.fs)
+
+    # reverse resampling
+    if old_fs:
+        tmp.audio = y
+        y = resample_itu(tmp, old_fs)
+
+    return y
+
+
+def resample_itu(
+    x: Audio,
+    fs_new: int,
+) -> np.ndarray:
+    """
+    Resampling of a multi-channel audio array
+
+    Parameters
+    ----------
+    x: Audio
+        Input audio
+    fs_new: int
+        Target sampling rate in Hz
+
+    Returns
+    -------
+    y: np.ndarray
+        Output resampled array
+    """
+
+    fs_old = x.fs
+
+    # if the sampling rate is the same, do nothing
+    if fs_new == fs_old:
+        return x.audio
+
+    ratio_fs = fs_new / fs_old
+    up = [False]
+    down = [False]
+
+    # select suitable processing to achieve the target sampling rate
+    if ratio_fs == 2:
+        flt_type = ["SHQ2"]
+        up = [True]
+    elif ratio_fs == 0.5:
+        flt_type = ["SHQ2"]
+        down = [True]
+    elif ratio_fs == 3:
+        flt_type = ["SHQ3"]
+        up = [True]
+    elif ratio_fs == 1 / 3:
+        flt_type = ["SHQ3"]
+        down = [True]
+    elif ratio_fs == 2 / 3:
+        flt_type = ["SHQ2", "SHQ3"]
+        up = [True, False]
+        down = [False, True]
+    elif ratio_fs == 3 / 2:
+        flt_type = ["SHQ3", "SHQ2"]
+        up = [True, False]
+        down = [False, True]
+    else:
+        raise ValueError("Ratio of input and output sampling frequency not supported")
+
+    # apply filter
+    y = copy(x)
+    for i, flt in enumerate(flt_type):
+        y.audio = filter_itu(y, flt_type=flt, up=up[i], down=down[i])
+        y.audio = delay_compensation(
+            y.audio, flt_type=flt, fs=y.fs, up=up[i], down=down[i]
+        )
+        # if up[i]:
+        #     if flt == "SHQ2":
+        #         y.fs = y.fs * 2
+        #     elif flt == "SHQ3":
+        #         y.fs = y.fs * 3
+        # elif down[i]:
+        #     if flt == "SHQ2":
+        #         y.fs = int(y.fs / 2)
+        #     elif flt == "SHQ3":
+        #         y.fs = int(y.fs / 3)
+
+    return y.audio
diff --git a/item_generation_scripts/audiotools/wrappers/gen_patt.py b/item_generation_scripts/audiotools/wrappers/gen_patt.py
new file mode 100644
index 00000000..a68706a7
--- /dev/null
+++ b/item_generation_scripts/audiotools/wrappers/gen_patt.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+
+#
+# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository. All Rights Reserved.
+#
+# This software is protected by copyright law and by international treaties.
+# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository retain full ownership rights in their respective contributions in
+# the software. This notice grants no license of any kind, including but not limited to patent
+# license, nor is any license granted by implication, estoppel or otherwise.
+#
+# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
+# contributions.
+#
+# This software is provided "AS IS", without any express or implied warranties. The software is in the
+# development stage. It is intended exclusively for experts who have experience with such software and
+# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+# and fitness for a particular purpose are hereby disclaimed and excluded.
+#
+# Any dispute, controversy or claim arising under or in relation to providing this software shall be
+# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+# the United Nations Convention on Contracts on the International Sales of Goods.
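+
+"""
+Usage sketch for the pattern-generation helpers in this module (output path,
+frame count, error rate and preamble length are assumed example values):
+
+    from pathlib import Path
+
+    from item_generation_scripts.audiotools.wrappers.gen_patt import (
+        create_error_pattern,
+    )
+
+    # 3% FER pattern for 1500 frames, leaving a 50-frame preamble error-free
+    create_error_pattern(
+        len_sig=1500,
+        path_pattern=Path("fer_3percent.192"),
+        frame_error_rate=3.0,
+        preamble=50,
+    )
+"""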
+#
+
+from os import getcwd
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import Optional, Union
+
+from item_generation_scripts.audiotools.wrappers.random_seed import random_seed
+from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES
+from item_generation_scripts.utils import find_binary, run
+
+ERROR_PATTERNS_DIR = Path(__file__).parent.parent.parent.joinpath("error_patterns")
+
+
+def gen_patt(
+    len_sig: int,
+    path_pattern: Union[Path, str],
+    error_rate: float,
+    start: Optional[int] = 0,
+    working_dir: Optional[Union[Path, str]] = None,
+) -> None:
+    """
+    Wrapper for gen-patt binary to create error patterns for the bitstream processing
+
+    Parameters
+    ----------
+    len_sig: int
+        Length of signal in frames
+    path_pattern: Union[Path, str]
+        Path of output pattern
+    error_rate: float
+        Error rate in percent
+    start: Optional[int]
+        Start frame of error pattern (length of the preamble)
+    working_dir: Optional[Union[Path, str]]
+        Directory in which the binary is called (the sta file has to be in this directory if used)
+    """
+
+    # find binary
+    if "gen-patt" in DEFAULT_CONFIG_BINARIES["binary_paths"]:
+        binary = find_binary(
+            DEFAULT_CONFIG_BINARIES["binary_paths"]["gen-patt"].name,
+            binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["gen-patt"].parent,
+        )
+    else:
+        binary = find_binary("gen-patt")
+
+    if working_dir is None:
+        working_dir = getcwd()
+
+    # set up command line
+    cmd = [
+        str(binary),
+        "-tailstat",  # Statistics performed on the tail
+        "-fer",  # Frame erasure mode using Gilbert model
+        "-g192",  # Save error pattern in 16-bit G.192 format
+        "-gamma",  # Correlation for BER|FER modes
+        str(0),
+        "-rate",
+        str(error_rate / 100),
+        "-tol",  # Max deviation of specified BER/FER/BFER
+        str(0.001),
+        "-reset",  # Reset EID state in between iterations
+        "-n",
+        str(int(len_sig)),
+        "-start",
+        str(int(start) + 1),
+        path_pattern,
+    ]
+
+    # run command
+    run(cmd, cwd=working_dir)
+
+    return
+
+
+def create_error_pattern(
+    len_sig: int,
+    path_pattern: Union[Path, str],
+    frame_error_rate: float,
+    preamble: Optional[int] = 0,
+    master_seed: Optional[int] = 0,
+    prerun_seed: Optional[int] = 0,
+) -> None:
+    """
+    Creates an error pattern with the desired frame error rate for bitstream processing
+
+    Parameters
+    ----------
+    len_sig: int
+        Length of signal in frames
+    path_pattern: Union[Path, str]
+        Path of output pattern
+    frame_error_rate: float
+        Error rate in percent
+    preamble: Optional[int]
+        Length of preamble in frames
+    master_seed: Optional[int]
+        Master seed for error pattern generation
+    prerun_seed: Optional[int]
+        Number of preruns in seed generation
+    """
+
+    with TemporaryDirectory() as tmp_dir:
+        tmp_dir = Path(tmp_dir)
+
+        sta_file = ERROR_PATTERNS_DIR.joinpath("sta_template")
+        tmp_sta_file = tmp_dir.joinpath("sta")
+
+        # compute seed
+        seed = random_seed((0, 99999999), master_seed, prerun_seed)
+
+        # open file and modify
+        lines = []
+        with open(sta_file, "r") as sta_file_txt:
+            lines.append(sta_file_txt.readline())  # not changed
+            lines.append(f"{sta_file_txt.readline()[:-2]}{frame_error_rate/100}\n")
+            lines.append(sta_file_txt.readline())  # not changed
+            lines.append(f"{sta_file_txt.readline()[:-2]}{seed}\n")
+            lines.append(sta_file_txt.readline())  # not changed
+            lines.append(
+                f"{sta_file_txt.readline()[:-2]}{1-(frame_error_rate/100*2)}\n"
+            )
+            lines.append(sta_file_txt.readline())  # not changed
+            lines.append(
+                f"{sta_file_txt.readline()[:-2]}{1-(frame_error_rate/100*2)}\n"
+            )
+            lines.append(sta_file_txt.readline()) 
# not changed + + with open(tmp_sta_file, "w") as tmp_sta_file_txt: + tmp_sta_file_txt.write("".join(lines)) + + gen_patt( + len_sig=len_sig, + error_rate=frame_error_rate, + path_pattern=path_pattern, + start=preamble, + working_dir=tmp_dir, + ) + + return diff --git a/item_generation_scripts/audiotools/wrappers/masaRenderer.py b/item_generation_scripts/audiotools/wrappers/masaRenderer.py new file mode 100644 index 00000000..a5987b1e --- /dev/null +++ b/item_generation_scripts/audiotools/wrappers/masaRenderer.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. 
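+
+"""
+Usage sketch for the MASA rendering wrapper in this module. It assumes that
+masa_item is an existing audio.MetadataAssistedSpatialAudio instance whose
+audio, fs and metadata_files attributes are already set; how the item was
+created is up to the caller:
+
+    from item_generation_scripts.audiotools.wrappers.masaRenderer import (
+        masaRenderer,
+    )
+
+    # render to binaural; the result is an (N, 2) array at masa_item.fs
+    binaural = masaRenderer(masa_item, "BINAURAL")
+"""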
+# + +from pathlib import Path +from tempfile import TemporaryDirectory + +import numpy as np + +from item_generation_scripts.audiotools import audio +from item_generation_scripts.audiotools.audiofile import read, write +from item_generation_scripts.audiotools.wrappers.filter import resample_itu +from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES +from item_generation_scripts.utils import find_binary, run + + +def masaRenderer( + masa: audio.MetadataAssistedSpatialAudio, + out_fmt: str, +) -> np.ndarray: + """ + Wrapper for masaRenderer (from MASA reference software) + + Parameters + ---------- + masa : MetadataAssistedSpatialAudio + Input MASA audio + out_fmt: str + Desired output format (only 5_1, 7_1_4 and BINAURAL supported) + + Returns + ------- + output : np.ndarray + MASA rendered to out_fmt + """ + + if "masaRenderer" in DEFAULT_CONFIG_BINARIES["binary_paths"]: + binary = find_binary( + DEFAULT_CONFIG_BINARIES["binary_paths"]["masaRenderer"].name, + binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["masaRenderer"].parent, + ) + else: + binary = find_binary("masaRenderer") + + if out_fmt not in ["5_1", "7_1_4", "BINAURAL"]: + raise ValueError(f"Output format {out_fmt} is not supported by MasaRenderer!") + + if out_fmt == "5_1": + output_mode = "-LS51" + num_channels = 6 + elif out_fmt == "7_1_4": + output_mode = "-LS714" + num_channels = 12 + else: + output_mode = "-BINAURAL" + num_channels = 2 + + cmd = [ + str(binary), + output_mode, + "", # 2 -> inputPcm + str(masa.metadata_files.resolve()), + "", # 4 -> outputPcm + ] + + with TemporaryDirectory() as tmp_dir: + tmp_dir = Path(tmp_dir) + tmp_in = tmp_dir.joinpath("tmp_masaRendIn.pcm") + tmp_out = tmp_dir.joinpath("tmp_masaRendOut.pcm") + + cmd[2] = str(tmp_in) + cmd[4] = str(tmp_out) + + tmp_audio = resample_itu(masa, 48000) + old_fs = masa.fs + + write(tmp_in, tmp_audio, 48000) + + # we need to run in the masaRenderer directory to use the .bin files it requires + run(cmd, cwd=binary.resolve().parent) + + output, _ = read(tmp_out, num_channels) + + output_audio = audio.fromtype(out_fmt) + output_audio.audio = output + output_audio.fs = 48000 + output = resample_itu(output_audio, old_fs) + + return output diff --git a/item_generation_scripts/audiotools/wrappers/networkSimulator.py b/item_generation_scripts/audiotools/wrappers/networkSimulator.py new file mode 100644 index 00000000..4e74c3ce --- /dev/null +++ b/item_generation_scripts/audiotools/wrappers/networkSimulator.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository retain full ownership rights in their respective contributions in
+# the software. This notice grants no license of any kind, including but not limited to patent
+# license, nor is any license granted by implication, estoppel or otherwise.
+#
+# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
+# contributions.
+#
+# This software is provided "AS IS", without any express or implied warranties. The software is in the
+# development stage. It is intended exclusively for experts who have experience with such software and
+# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+# and fitness for a particular purpose are hereby disclaimed and excluded.
+#
+# Any dispute, controversy or claim arising under or in relation to providing this software shall be
+# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+# the United Nations Convention on Contracts on the International Sales of Goods.
+#
+
+import logging
+import os.path
+from pathlib import Path
+from typing import Optional, Union
+
+from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES
+from item_generation_scripts.utils import find_binary, run
+
+LIST_JBM_PROFILES = range(12)
+ERROR_PATTERNS_DIR = Path(__file__).parent.parent.parent.joinpath("dly_error_profiles")
+
+
+def validate_network_simulator(
+    error_pattern: Optional[Union[Path, str]] = None,
+    error_profile: Optional[int] = None,
+    n_frames_per_packet: Optional[int] = None,
+) -> None:
+    """
+    Validate settings for the network simulator
+
+    Parameters
+    ----------
+    error_pattern: Optional[Union[Path, str]]
+        Path to existing error pattern
+    error_profile: Optional[int]
+        Index of existing error pattern
+    n_frames_per_packet: Optional[int]
+        Number of frames per packet
+    """
+
+    if "networkSimulator_g192" in DEFAULT_CONFIG_BINARIES["binary_paths"]:
+        binary = find_binary(
+            DEFAULT_CONFIG_BINARIES["binary_paths"]["networkSimulator_g192"].name,
+            binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"][
+                "networkSimulator_g192"
+            ].parent,
+        )
+    else:
+        binary = find_binary("networkSimulator_g192")
+
+    if binary is None:
+        raise FileNotFoundError(
+            "The network simulator binary was not found! Please check the configuration."
+        )
+    if error_pattern is not None:
+        if not os.path.exists(os.path.realpath(error_pattern)):
+            raise FileNotFoundError(
+                f"The network simulator error profile file {error_pattern} was not found! Please check the configuration."
+            )
+        if error_profile is not None:
+            raise ValueError(
+                "JBM pattern and JBM profile number are specified for bitstream processing. Can't use both! Please check the configuration."
+            )
+    elif error_profile is not None:
+        if error_profile not in LIST_JBM_PROFILES:
+            raise ValueError(
+                f"JBM profile number {error_profile} does not exist, should be between {LIST_JBM_PROFILES[0]} and {LIST_JBM_PROFILES[-1]}"
+            )
+    if n_frames_per_packet is not None and n_frames_per_packet not in [1, 2]:
+        raise ValueError(
+            f"n_frames_per_packet is {n_frames_per_packet}. Should be 1 or 2. Please check your configuration."
+        )
+
+    return
+
+
+def network_simulator(
+    error_pattern: Union[str, Path],
+    in_bitstream: Union[str, Path],
+    out_bitstream: Union[str, Path],
+    n_frames_per_packet: int,
+    offset: int,
+    logger: Optional[logging.Logger] = None,
+) -> None:
+    """
+    Wrapper for networkSimulator_g192 binary to apply error patterns for the bitstream processing
+
+    Parameters
+    ----------
+    error_pattern: Union[str, Path]
+        Path to error pattern file
+    in_bitstream: Union[str, Path]
+        Path to input bitstream file
+    out_bitstream: Union[str, Path]
+        Output path for modified bitstream
+    n_frames_per_packet: int
+        Number of frames per packet [1, 2]
+    offset: int
+        Delay offset
+    logger: Optional[logging.Logger]
+        Logger
+    """
+
+    # find binary
+    if "networkSimulator_g192" in DEFAULT_CONFIG_BINARIES["binary_paths"]:
+        binary = find_binary(
+            DEFAULT_CONFIG_BINARIES["binary_paths"]["networkSimulator_g192"].name,
+            binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"][
+                "networkSimulator_g192"
+            ].parent,
+        )
+    else:
+        binary = find_binary("networkSimulator_g192")
+
+    # check for valid inputs
+    if not Path(in_bitstream).is_file():
+        raise ValueError(
+            f"Input bitstream file {in_bitstream} for bitstream processing does not exist"
+        )
+    elif not Path(error_pattern).is_file():
+        raise ValueError(
+            f"Error pattern file {error_pattern} for bitstream processing does not exist"
+        )
+
+    # set up command line
+    cmd = [
+        str(binary),
+        error_pattern,
+        in_bitstream,
+        out_bitstream,
+        f"{out_bitstream}_tracefile_sim",
+        str(n_frames_per_packet),
+        str(offset),
+    ]
+
+    # run command
+    run(cmd, logger=logger)
+
+    return
+
+
+def apply_network_simulator(
+    in_bitstream: Union[Path, str],
+    out_bitstream: Union[Path, str],
+    error_pattern: Optional[Union[Path, str]] = None,
+    error_profile: Optional[int] = None,
+    n_frames_per_packet: Optional[int] = None,
+    offset: Optional[int] = 0,
+    logger: Optional[logging.Logger] = None,
+) -> None:
+    """
+    Function to apply a network simulator profile to a bitstream
+
+    Parameters
+    ----------
+    in_bitstream: Union[Path, str]
+        Path of input bitstream
+    out_bitstream: Union[Path, str]
+        Path of output bitstream
+    error_pattern: Optional[Union[Path, str]]
+        Path to existing error pattern
+    error_profile: Optional[int]
+        Index of existing error pattern
+    n_frames_per_packet: Optional[int]
+        Number of frames per packet
+    offset: Optional[int]
+        Delay offset
+    logger: Optional[logging.Logger]
+        Logger
+    """
+
+    if error_pattern is None:
+        # create error pattern
+        if error_profile is not None:
+            if error_profile in LIST_JBM_PROFILES:
+                error_pattern = ERROR_PATTERNS_DIR.joinpath(
+                    f"dly_error_profile_{error_profile}.dat"
+                )
+            else:
+                raise ValueError(
+                    f"JBM profile number {error_profile} does not exist, should be between {LIST_JBM_PROFILES[0]} and {LIST_JBM_PROFILES[-1]}"
+                )
+        else:
+            raise ValueError(
+                "Either error pattern or error profile number has to be specified for network simulator bitstream processing"
+            )
+    elif error_profile is not None:
+        raise ValueError(
+            "JBM pattern and JBM profile number are specified for bitstream processing. 
Can't use both" + ) + + if n_frames_per_packet is None: + n_frames_per_packet = 1 + if error_profile is not None and error_profile == 5: + n_frames_per_packet = 2 + + # apply error pattern + network_simulator( + error_pattern, in_bitstream, out_bitstream, n_frames_per_packet, offset, logger + ) + + return diff --git a/item_generation_scripts/audiotools/wrappers/p50fbmnru.py b/item_generation_scripts/audiotools/wrappers/p50fbmnru.py new file mode 100644 index 00000000..2f4c19ef --- /dev/null +++ b/item_generation_scripts/audiotools/wrappers/p50fbmnru.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. 
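+
+"""
+Usage sketch for the MNRU wrapper in this module (file name, format string
+and Q value are assumed example values):
+
+    from item_generation_scripts.audiotools import audio
+    from item_generation_scripts.audiotools.wrappers.p50fbmnru import p50fbmnru
+
+    # apply P.50 fullband MNRU with a speech-to-noise ratio of Q = 20 dB
+    item = audio.fromfile("STEREO", "item.wav", 48000)
+    item.audio = p50fbmnru(item, 20.0)
+"""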
+# + +from pathlib import Path +from tempfile import TemporaryDirectory +from warnings import warn + +import numpy as np + +from item_generation_scripts.audiotools import audio +from item_generation_scripts.audiotools.audiofile import read, write +from item_generation_scripts.audiotools.wrappers.filter import resample_itu +from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES +from item_generation_scripts.utils import find_binary, run + + +def p50fbmnru( + input: audio.Audio, + q_db: float, +) -> np.ndarray: + """ + Wrapper for P.50 Fullband MNRU (Modulated Noise Reference Unit), requires p50fbmnru binary + The mode is M (Modulated Noise) as specified in section 5.2.1 of S4-141392 - EVS-7c Processing functions for characterization phase v110.doc + + Parameters + ---------- + input : Audio + Input audio + q_db: float + The ratio, in dB, of speech power to modulated noise power + + Returns + ------- + output: np.ndarray + Output array + """ + + if "p50fbmnru" in DEFAULT_CONFIG_BINARIES["binary_paths"]: + binary = find_binary( + DEFAULT_CONFIG_BINARIES["binary_paths"]["p50fbmnru"].name, + binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["p50fbmnru"].parent, + ) + else: + binary = find_binary("p50fbmnru") + + if input.fs != 48000: + warn("P.50 Fullband MNRU requires a sampling rate of 48kHz.") + tmp_sig = resample_itu(input, 48000) + else: + tmp_sig = input.audio + + tmp_input_signal = tmp_sig + tmp_output_signal = np.ones((48000, input.num_channels)) + + with TemporaryDirectory() as tmp_dir: + tmp_dir = Path(tmp_dir) + tmp_input_file = tmp_dir.joinpath("tmp_input_signal.raw") + tmp_output_file = tmp_dir.joinpath("tmp_output_signal.raw") + + """ + P.50 Fullband MNRU + """ + + cmd = [ + str(binary), + str(tmp_input_file), + str(tmp_output_file), + str(q_db), + "M", + ] + + # write temporary file + write(tmp_input_file, tmp_input_signal) + write(tmp_output_file, tmp_output_signal) + + # run command + run(cmd) + + tmp_output_signal, out_fs = read(tmp_output_file, input.num_channels) + + return tmp_output_signal diff --git a/item_generation_scripts/audiotools/wrappers/random_seed.py b/item_generation_scripts/audiotools/wrappers/random_seed.py new file mode 100644 index 00000000..01cf0870 --- /dev/null +++ b/item_generation_scripts/audiotools/wrappers/random_seed.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. 
This notice grants no license of any kind, including but not limited to patent
+# license, nor is any license granted by implication, estoppel or otherwise.
+#
+# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
+# contributions.
+#
+# This software is provided "AS IS", without any express or implied warranties. The software is in the
+# development stage. It is intended exclusively for experts who have experience with such software and
+# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+# and fitness for a particular purpose are hereby disclaimed and excluded.
+#
+# Any dispute, controversy or claim arising under or in relation to providing this software shall be
+# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+# the United Nations Convention on Contracts on the International Sales of Goods.
+#
+
+from typing import Optional, Tuple, Union
+
+from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES
+from item_generation_scripts.utils import find_binary, run
+
+
+def random_seed(
+    range: Tuple[int, int],
+    master_seed: Optional[int] = 0,
+    prerun_seed: Optional[int] = 0,
+    hexa: Optional[bool] = True,
+) -> Union[int, str]:
+    """
+    Wrapper for the random binary to draw one random seed value
+
+    Parameters
+    ----------
+    range: Tuple[int, int]
+        Value range (min, max) for the generated seed
+    master_seed: Optional[int]
+        Master seed for error pattern generation
+    prerun_seed: Optional[int]
+        Number of preruns in seed generation
+    hexa: Optional[bool]
+        Flag if output should be in hexadecimal or decimal format
+
+    Returns
+    -------
+    result: Union[int, str]
+        One random value (a hexadecimal string if hexa is set, an int otherwise)
+    """
+
+    # find binary
+    if "random" in DEFAULT_CONFIG_BINARIES["binary_paths"]:
+        binary = find_binary(
+            DEFAULT_CONFIG_BINARIES["binary_paths"]["random"].name,
+            binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["random"].parent,
+        )
+    else:
+        binary = find_binary("random")
+
+    # set up command line
+    cmd = [
+        str(binary),
+        "-n",  # Number of items
+        str(1),
+        "-s",
+        str(master_seed),
+        "-d",
+        str(prerun_seed),
+        "-r",  # value range for results
+        str(range[0]),
+        str(range[1]),
+    ]
+
+    # run command
+    result = run(cmd)
+    result = int(result.stdout[:-1])
+
+    if hexa:
+        result = hex(result)
+
+    return result
diff --git a/item_generation_scripts/binary_paths.yml b/item_generation_scripts/binary_paths.yml
new file mode 100644
index 00000000..bafcacfc
--- /dev/null
+++ b/item_generation_scripts/binary_paths.yml
@@ -0,0 +1,30 @@
+---
+################################################
+# Binary paths
+################################################
+### Custom binary paths and names can be specified here.
+### If not defined here, the binaries in item_generation_scripts/bin will be used.
+### If binaries are neither specified here nor found in the bin folder, the scripts will look for them in $PATH.
+### DO NOT change the location of this file.
+### DO NOT USE relative paths. The paths have to be absolute.
+### DO NOT change the default keys. 
diff --git a/item_generation_scripts/binary_paths.yml b/item_generation_scripts/binary_paths.yml
new file mode 100644
index 00000000..bafcacfc
--- /dev/null
+++ b/item_generation_scripts/binary_paths.yml
@@ -0,0 +1,30 @@
+---
+################################################
+# Binary paths
+################################################
+### Custom binary paths and names can be specified here.
+### If not defined here, the binaries in item_generation_scripts/bin will be used
+### If binaries are neither specified here nor found in the bin folder, the scripts will look for them in $PATH
+### DO NOT change the location of this file.
+### DO NOT USE relative paths. The paths have to be absolute.
+### DO NOT change the default keys.
+### For example, if the user has renamed the 'filter' binary to 'foo' then use --> filter: path/to/binary/foo
+
+# ### Binary for resampling and filtering
+# filter: "path/to/binary/filter_new"
+# ### Binary for loudness adjustment
+# bs1770demo: "path/to/binary/bs1880"
+# ### Binary for MNRU
+# p50fbmnru: "path/to/binary/p50fbmnru"
+# ### Binary for ESDRU
+# esdru: "path/to/binary/esdru"
+# ### Binary for frame error pattern application
+# eid-xor: "path/to/binary/eid-xor"
+# ### Binary for error pattern generation
+# gen-patt: "path/to/binary/gen-patt"
+# ### Binary for random offset/seed generation
+# random: "path/to/binary/random"
+# ### Binary for JBM network simulator
+# networkSimulator_g192: "path/to/binary/networkSimulator_g192"
+# ### Binary for MASA rendering
+# masaRenderer: "path/to/binary/masaRenderer"
\ No newline at end of file
diff --git a/item_generation_scripts/config/ISM1_CONFIG.yml b/item_generation_scripts/config/ISM1_CONFIG.yml
new file mode 100644
index 00000000..f4e1ee31
--- /dev/null
+++ b/item_generation_scripts/config/ISM1_CONFIG.yml
@@ -0,0 +1,338 @@
+---
+################################################
+# General configuration
+################################################
+
+### Output format
+format: "ISM1"
+
+### Date; default = YYYYMMDD_HH.MM.SS
+# date: 2023.06.30
+
+### Deletion of temporary directories containing intermediate processing files, bitstreams etc.; default = false
+# delete_tmp: true
+
+### Output sampling rate in Hz needed for headerless audio files; default = 48000
+# fs: 32000
+
+### Any relative paths will be interpreted relative to the working directory the script is called from!
+### Usage of absolute paths is recommended.
+### Do not use file names with dots "." in them! This is not supported, use "_" instead
+### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions
+
+### Input path to mono files
+input_path: "/mnt/c/Work/IVAS/3gpp_forge_gitlab/ivas-processing-scripts/items_mono"
+
+### Output path for generated test items and metadata files
+output_path: "/mnt/c/Work/IVAS/3gpp_forge_gitlab/ivas-processing-scripts/output"
+
+### Target loudness in LKFS; default = null (no loudness normalization applied)
+loudness: -26
+
+
+################################################
+### Scene description
+################################################
+
+### Each scene must start with a unique scene tag (e.g. a1, a2, ...)
+### Specify the mono source filename (the program will search for it in the input_path folder)
+### Specify azimuth and elevation for each input source
+### Note 1: use [val1, val2, ...] for multiple sources in a scene
+### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames
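+### Example (illustrative): azimuth: "0:1:360" starts at 0 degrees and advances
+### by 1 degree per 20ms frame, i.e. one full revolution in 360 frames = 7.2 s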
+
+### Note 3: we're using a right-handed coordinate system with azi = 0 pointing from the nose to the screen
+### azimuth: float, [-180,180]; positive indicates left
+### elevation: float, [-90,90]; positive indicates up
+### distance: float, tbd; default: 1
+### spread: float, [0,360]; spread in angles from 0 ... 360˚
+### gain: float, [0,1]
+
+scenes:
+  a1:
+    name: "G1S1.wav"
+    description: "Talker sitting at a table"
+    source: "f2s5a_Talker1.wav"
+    azimuth: 0
+    elevation: 0
+    delay: 0
+
+  a2:
+    name: "G6S2.wav"
+    description: "Talker sitting at a table"
+    source: "f5s10a_Talker1.wav"
+    azimuth: 60
+    elevation: 0
+    delay: 0
+
+  a3:
+    name: "G5S3.wav"
+    description: "Talker sitting at a table"
+    source: "f2s5a_Talker1.wav"
+    azimuth: 120
+    elevation: 0
+    delay: 0
+
+  a4:
+    name: "G4S4.wav"
+    description: "Talker sitting at a table"
+    source: "m4s11b_Talker1.wav"
+    azimuth: 180
+    elevation: 0
+    delay: 0
+
+  a5:
+    name: "G3S5.wav"
+    description: "Talker sitting at a table"
+    source: "m1s4a_Talker1.wav"
+    azimuth: 240
+    elevation: 0
+    delay: 0
+
+  a6:
+    name: "G2S6.wav"
+    description: "Talker sitting at a table"
+    source: "f5s10a_Talker1.wav"
+    azimuth: 300
+    elevation: 0
+    delay: 0
+
+  b1:
+    name: "G2S1.wav"
+    description: "Standing talker."
+    source: "f5s10b_Talker1.wav"
+    azimuth: 120
+    elevation: 35
+    delay: 0
+
+  b2:
+    name: "G1S2.wav"
+    description: "Standing talker."
+    source: "f2s1a_Talker1.wav"
+    azimuth: 180
+    elevation: 35
+    delay: 0
+
+  b3:
+    name: "G6S3.wav"
+    description: "Standing talker."
+    source: "f5s10b_Talker1.wav"
+    azimuth: 240
+    elevation: 35
+    delay: 0
+
+  b4:
+    name: "G5S4.wav"
+    description: "Standing talker."
+    source: "f2s1a_Talker1.wav"
+    azimuth: 300
+    elevation: 35
+    delay: 0
+
+  b5:
+    name: "G4S5.wav"
+    description: "Standing talker."
+    source: "m4s11a_Talker1.wav"
+    azimuth: 0
+    elevation: 35
+    delay: 0
+
+  b6:
+    name: "G3S6.wav"
+    description: "Standing talker."
+    source: "m1s2b_Talker1.wav"
+    azimuth: 60
+    elevation: 35
+    delay: 0
+
+  c1:
+    name: "G3S1.wav"
+    description: "Smaller talker (child) walking around a table."
+    source: "m1s6b_Talker1.wav"
+    azimuth: "0:1:360"
+    elevation: 0
+    delay: 0
+
+  c2:
+    name: "G2S2.wav"
+    description: "Smaller talker (child) walking around a table."
+    source: "f5s14a_Talker1.wav"
+    azimuth: "60:1:60+360"
+    elevation: 0
+    delay: 0
+
+  c3:
+    name: "G1S3.wav"
+    description: "Smaller talker (child) walking around a table."
+    source: "f2s6a_Talker1.wav"
+    azimuth: "120:1:120+360"
+    elevation: 0
+    delay: 0
+
+  c4:
+    name: "G6S4.wav"
+    description: "Smaller talker (child) walking around a table."
+    source: "f5s14a_Talker1.wav"
+    azimuth: "180:1:180+360"
+    elevation: 0
+    delay: 0
+
+  c5:
+    name: "G5S5.wav"
+    description: "Smaller talker (child) walking around a table."
+    source: "f2s6a_Talker1.wav"
+    azimuth: "240:1:240+360"
+    elevation: 0
+    delay: 0
+
+  c6:
+    name: "G4S6.wav"
+    description: "Smaller talker (child) walking around a table."
+    source: "m4s13a_Talker1.wav"
+    azimuth: "300:1:300+360"
+    elevation: 0
+    delay: 0
+
+  d1:
+    name: "G4S1.wav"
+    description: "Talker walking around the table."
+    source: "m4s12b_Talker1.wav"
+    azimuth: "0:-1:-360"
+    elevation: 35
+    delay: 0
+
+  d2:
+    name: "G3S2.wav"
+    description: "Talker walking around the table."
+    source: "m1s12a_Talker1.wav"
+    azimuth: "60:-1:60-360"
+    elevation: 35
+    delay: 0
+
+  d3:
+    name: "G2S3.wav"
+    description: "Talker walking around the table."
+    source: "f5s15b_Talker1.wav"
+    azimuth: "120:-1:120-360"
+    elevation: 35
+    delay: 0
+
+  d4:
+    name: "G1S4.wav"
+    description: "Talker walking around the table."
+    source: "f2s3b_Talker1.wav"
+    azimuth: "180:-1:180-360"
+    elevation: 35
+    delay: 0
+
+  d5:
+    name: "G6S5.wav"
+    description: "Talker walking around the table."
+ source: "f5s15b_Talker1.wav" + azimuth: "240:-1:240-360" + elevation: 35 + delay: 0 + + d6: + name: "G5S6.wav" + description: "Talker walking around the table." + source: "f2s3b_Talker1.wav" + azimuth: "300:-1:300-360" + elevation: 35 + delay: 0 + + e1: + name: "G5S1.wav" + description: "Elevation displacement." + source: "f2s4a_Talker1.wav" + azimuth: 240 + elevation: "-90:0.5:90" + delay: 0 + + e2: + name: "G4S2.wav" + description: "Elevation displacement." + source: "m4s16a_Talker1.wav" + azimuth: 300 + elevation: 0 + delay: 0 + + e3: + name: "G3S3.wav" + description: "Elevation displacement." + source: "m1s16b_Talker1.wav" + azimuth: 0 + elevation: "-90:0.5:90" + delay: 0 + + e4: + name: "G2S4.wav" + description: "Elevation displacement." + source: "f5s19a_Talker1.wav" + azimuth: 60 + elevation: "-90:0.5:90" + delay: 0 + + e5: + name: "G1S5.wav" + description: "Elevation displacement." + source: "f2s4a_Talker1.wav" + azimuth: 120 + elevation: "-90:0.5:90" + delay: 0 + + e6: + name: "G6S6.wav" + description: "Elevation displacement." + source: "f5s19a_Talker1.wav" + azimuth: 180 + elevation: "-90:0.5:90" + delay: 0 + + f1: + name: "G6S1.wav" + description: "Azimuth and elevation displacement." + source: "f5s15a_Talker1.wav" + azimuth: "60:0.5:60+180" + elevation: "35:-0.2:-35" + delay: 0 + + f2: + name: "G5S2.wav" + description: "Azimuth and elevation displacement." + source: "f2s7b_Talker1.wav" + azimuth: "120:0.5:120+180" + elevation: "35:-0.2:-35" + delay: 0 + + f3: + name: "G4S3.wav" + description: "Azimuth and elevation displacement." + source: "m4s14a_Talker1.wav" + azimuth: "180:0.5:180+180" + elevation: "35:-0.2:-35" + delay: 0 + + f4: + name: "G3S4.wav" + description: "Azimuth and elevation displacement." + source: "m1s7a_Talker1.wav" + azimuth: "240:0.5:240+180" + elevation: "35:-0.2:-35" + delay: 0 + + f5: + name: "G2S5.wav" + description: "Azimuth and elevation displacement." + source: "f5s15a_Talker1.wav" + azimuth: "300:0.5:300+180" + elevation: "35:-0.2:-35" + delay: 0 + + f6: + name: "G1S6.wav" + description: "Azimuth and elevation displacement." + source: "f2s7b_Talker1.wav" + azimuth: "0:0.5:0+180" + elevation: "35:-0.2:-35" + delay: 0 + \ No newline at end of file diff --git a/item_generation_scripts/config/ISM2_CONFIG.yml b/item_generation_scripts/config/ISM2_CONFIG.yml new file mode 100644 index 00000000..c4a65c07 --- /dev/null +++ b/item_generation_scripts/config/ISM2_CONFIG.yml @@ -0,0 +1,338 @@ +--- +################################################ +# General configuration +################################################ + +### Output format +format: "ISM2" + +### Date; default = YYYYMMDD_HH.MM.SS +# date: 2023.06.30 + +### Deletion of temporary directories containing intermediate processing files, bitstreams etc.; default = false +# delete_tmp: true + +### Output sampling rate in Hz needed for headerless audio files; default = 48000 +# fs: 32000 + +### Any relative paths will be interpreted relative to the working directory the script is called from! +### Usage of absolute paths is recommended. +### Do not use file names with dots "." in them! 
This is not supported, use "_" instead
+### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions
+
+### Input path to mono files
+input_path: "/mnt/c/Work/IVAS/3gpp_forge_gitlab/ivas-processing-scripts/items_mono"
+
+### Output path for generated test items and metadata files
+output_path: "/mnt/c/Work/IVAS/3gpp_forge_gitlab/ivas-processing-scripts/output"
+
+### Target loudness in LKFS; default = null (no loudness normalization applied)
+loudness: -26
+
+
+################################################
+### Scene description
+################################################
+
+### Each scene must start with a unique scene tag (e.g. a1, a2, ...)
+### Specify the mono source filename (the program will search for it in the input_path folder)
+### Specify azimuth and elevation for each input source
+### Note 1: use [val1, val2, ...] for multiple sources in a scene
+### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames
+
+### Note 3: we're using a right-handed coordinate system with azi = 0 pointing from the nose to the screen
+### azimuth: float, [-180,180]; positive indicates left
+### elevation: float, [-90,90]; positive indicates up
+### distance: float, tbd; default: 1
+### spread: float, [0,360]; spread in angles from 0 ... 360˚
+### gain: float, [0,1]
+
+scenes:
+  a1:
+    name: "G1S1.wav"
+    description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
+    source: ["f2s5a_Talker1.wav", "m2s16b_Talker2.wav"]
+    azimuth: [0, 50]
+    elevation: [0, 0]
+    delay: [0, 0]
+
+  a2:
+    name: "G6S2.wav"
+    description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
+    source: ["f5s10a_Talker1.wav", "m3s2a_Talker2.wav"]
+    azimuth: [50, 350]
+    elevation: [0, 0]
+    delay: [0, 0]
+
+  a3:
+    name: "G5S3.wav"
+    description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
+    source: ["f2s5a_Talker1.wav", "m2s16b_Talker2.wav"]
+    azimuth: [40, 290]
+    elevation: [0, 0]
+    delay: [0, 0]
+
+  a4:
+    name: "G4S4.wav"
+    description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
+    source: ["m4s11b_Talker1.wav", "f1s4b_Talker2.wav"]
+    azimuth: [30, 230]
+    elevation: [15, 15]
+    delay: [0, 0]
+
+  a5:
+    name: "G3S5.wav"
+    description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
+    source: ["m1s4a_Talker1.wav", "f3s3a_Talker2.wav"]
+    azimuth: [20, 170]
+    elevation: [15, 15]
+    delay: [0, 0]
+
+  a6:
+    name: "G2S6.wav"
+    description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
+    source: ["f5s10a_Talker1.wav", "m3s2a_Talker2.wav"]
+    azimuth: [10, 110]
+    elevation: [15, 15]
+    delay: [0, 0]
+
+  b1:
+    name: "G2S1.wav"
+    description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
+    source: ["f5s10b_Talker1.wav", "m3s2b_Talker2.wav"]
+    azimuth: [20, 170]
+    elevation: [30, 30]
+    delay: [0, 0]
+
+  b2:
+    name: "G1S2.wav"
+    description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
+    source: ["f2s1a_Talker1.wav", "m2s10a_Talker2.wav"]
+    azimuth: [10, 110]
+    elevation: [30, 30]
+    delay: [0, 0]
+
+  b3:
+    name: "G6S3.wav"
+    description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
+    source: ["f5s10b_Talker1.wav", "m3s2b_Talker2.wav"]
+    azimuth: [0, 50]
+    elevation: [30, 30]
+    delay: [0, 0]
+
+  b4:
+    name: "G5S4.wav"
+    description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
+    source: ["f2s1a_Talker1.wav", "m2s10a_Talker2.wav"]
+    azimuth: [50, 350]
+    elevation: [60, 60]
+    delay: [0, 0]
+
+  b5:
+    name: "G4S5.wav"
+    description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
+    source: ["m4s11a_Talker1.wav", "f1s6a_Talker2.wav"]
+    azimuth: [40, 290]
+    elevation: [60, 60]
+    delay: [0, 0]
+
+  b6:
+    name: "G3S6.wav"
+    description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
+    source: ["m1s2b_Talker1.wav", "f3s5a_Talker2.wav"]
+    azimuth: [30, 230]
+    elevation: [60, 60]
+    delay: [0, 0]
+
+  c1:
+    name: "G3S1.wav"
+    description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
+    source: ["m1s6b_Talker1.wav", "f3s5b_Talker2.wav"]
+    azimuth: [40, 290]
+    elevation: [0, 60]
+    delay: [0, 0]
+
+  c2:
+    name: "G2S2.wav"
+    description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
+    source: ["f5s14a_Talker1.wav", "m3s8a_Talker2.wav"]
+    azimuth: [30, 230]
+    elevation: [0, 60]
+    delay: [0, 0]
+
+  c3:
+    name: "G1S3.wav"
+    description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
+    source: ["f2s6a_Talker1.wav", "m2s13a_Talker2.wav"]
+    azimuth: [20, 170]
+    elevation: [0, 60]
+    delay: [0, 0]
+
+  c4:
+    name: "G6S4.wav"
+    description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
+    source: ["f5s14a_Talker1.wav", "m3s8a_Talker2.wav"]
+    azimuth: [10, 110]
+    elevation: [0, 60]
+    delay: [0, 0]
+
+  c5:
+    name: "G5S5.wav"
+    description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
+    source: ["f2s6a_Talker1.wav", "m2s13a_Talker2.wav"]
+    azimuth: [0, 50]
+    elevation: [0, 60]
+    delay: [0, 0]
+
+  c6:
+    name: "G4S6.wav"
+    description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
+    source: ["m4s13a_Talker1.wav", "f1s20a_Talker2.wav"]
+    azimuth: [50, 350]
+    elevation: [0, 60]
+    delay: [0, 0]
+
+  d1:
+    name: "G4S1.wav"
+    description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
+    source: ["m4s12b_Talker1.wav", "f1s12b_Talker2.wav"]
+    azimuth: [50, "180:1:120 + 360"]
+    elevation: [0, 60]
+    delay: [0, 0]
+
+  d2:
+    name: "G3S2.wav"
+    description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
+    source: ["m1s12a_Talker1.wav", "f3s20a_Talker2.wav"]
+    azimuth: [300, "-70:-1:-10 - 360"]
+    elevation: [0, 60]
+    delay: [0, 0]
+
+  d3:
+    name: "G2S3.wav"
+    description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
+ source: ["f5s15b_Talker1.wav", "m3s1a_Talker2.wav"] + azimuth: [250, "-20:-1:-320"] + elevation: [0, 60] + delay: [0, 0] + + d4: + name: "G1S4.wav" + description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." + source: ["f2s3b_Talker1.wav", "m2s15a_Talker2.wav"] + azimuth: [200, "30:-1:-270"] + elevation: [0, 60] + delay: [0, 0] + + d5: + name: "G6S5.wav" + description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." + source: ["f5s15b_Talker1.wav", "m3s1a_Talker2.wav"] + azimuth: [150, "80:1:20 + 360"] + elevation: [0, 60] + delay: [0, 0] + + d6: + name: "G5S6.wav" + description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." + source: ["f2s3b_Talker1.wav", "m2s15a_Talker2.wav"] + azimuth: [100, "130:1:70 + 360"] + elevation: [0, 60] + delay: [0, 0] + + e1: + name: "G5S1.wav" + description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" + source: ["f2s4a_Talker1.wav", "m2s17b_Talker2.wav"] + azimuth: ["80:1:20 + 360", "80:1:20 + 360"] + elevation: [10, 60] + delay: [0, 0] + + e2: + name: "G4S2.wav" + description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" + source: ["m4s16a_Talker1.wav", "f1s16b_Talker2.wav"] + azimuth: ["130:1:70 + 360", "130:1:70 + 360"] + elevation: [10, 60] + delay: [0, 0] + + e3: + name: "G3S3.wav" + description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" + source: ["m1s16b_Talker1.wav", "f3s10b_Talker2.wav"] + azimuth: ["180:1:120 + 360", "180:1:120 + 360"] + elevation: [10, 60] + delay: [0, 0] + + e4: + name: "G2S4.wav" + description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" + source: ["f5s19a_Talker1.wav", "m3s1b_Talker2.wav"] + azimuth: ["-70:-1:-10 - 360", "-70:-1:-10 - 360"] + elevation: [10, 60] + delay: [0, 0] + + e5: + name: "G1S5.wav" + description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" + source: ["f2s4a_Talker1.wav", "m2s17b_Talker2.wav"] + azimuth: ["-20:-1:-320", "-20:-1:-320"] + elevation: [10, 60] + delay: [0, 0] + + e6: + name: "G6S6.wav" + description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" + source: ["f5s19a_Talker1.wav", "m3s1b_Talker2.wav"] + azimuth: ["30:-1:-270", "30:-1:-270"] + elevation: [10, 60] + delay: [0, 0] + + f1: + name: "G6S1.wav" + description: "two talkers walking around the table in opposite directions, non-overlapping utterances." + source: ["f5s15a_Talker1.wav", "m3s8b_Talker2.wav"] + azimuth: ["60:1:0 + 360", "60:-1:120 - 360"] + elevation: [20, 50] + delay: [0, 0] + + f2: + name: "G5S2.wav" + description: "two talkers walking around the table in opposite directions, non-overlapping utterances." + source: ["f2s7b_Talker1.wav", "m2s6b_Talker2.wav"] + azimuth: ["0:1:300", "0:-1:60 - 360"] + elevation: [20, 50] + delay: [0, 0] + + f3: + name: "G4S3.wav" + description: "two talkers walking around the table in opposite directions, non-overlapping utterances." + source: ["m4s14a_Talker1.wav", "f1s7a_Talker2.wav"] + azimuth: ["300:1:240 + 360", "300:-1:0"] + elevation: [20, 50] + delay: [0, 0] + + f4: + name: "G3S4.wav" + description: "two talkers walking around the table in opposite directions, non-overlapping utterances." 
+ source: ["m1s7a_Talker1.wav", "f3s7a_Talker2.wav"] + azimuth: ["240:1:180 + 360", "240:-1:-60"] + elevation: [20, 50] + delay: [0, 0] + + f5: + name: "G2S5.wav" + description: "two talkers walking around the table in opposite directions, non-overlapping utterances." + source: ["f5s15a_Talker1.wav", "m3s8b_Talker2.wav"] + azimuth: ["180:1:120 + 360", "180:-1:-120"] + elevation: [20, 50] + delay: [0, 0] + + f6: + name: "G1S6.wav" + description: "two talkers walking around the table in opposite directions, non-overlapping utterances." + source: ["f2s7b_Talker1.wav", "m2s6b_Talker2.wav"] + azimuth: ["120:1:60 + 360", "120:-1:180 - 360"] + elevation: [20, 50] + delay: [0, 0] + \ No newline at end of file diff --git a/item_generation_scripts/constants.py b/item_generation_scripts/constants.py new file mode 100644 index 00000000..3b554800 --- /dev/null +++ b/item_generation_scripts/constants.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. 
+# + +from datetime import datetime +from pathlib import Path + +from item_generation_scripts.utils import find_binary, get_binary_paths + +LOGGER_SUFFIX = ".log" +LOGGER_FORMAT = ( + "%(levelname)-8s:%(processName)-10s | %(name)s | %(asctime)s | %(message)s" +) +LOGGER_DATEFMT = "%m-%d %H:%M:%S" + +SUPPORTED_FORMATS = { + "ISM1", + "ISM2", + "ISM3", + "ISM4", +} + +DEFAULT_CONFIG = { + # general options + "date": f"{datetime.now().strftime('%Y%m%d_%H.%M.%S')}", + "delete_tmp": False, +} + +DEFAULT_CONFIG_ISM2 = { + "format": "ISM2", + "input_path" : "./input", + "output_path": "./output", + # "cod": { + # "bin": find_binary("IVAS_cod", raise_error=False), + # }, + # "dec": { + # "bin": find_binary("IVAS_dec", raise_error=False), + # }, +} + +DEFAULT_CONFIG_BINARIES = { + "binary_paths": get_binary_paths( + Path(__file__).parent.joinpath("binary_paths.yml") + ), +} + +REQUIRED_KEYS = [ + "format", + "input_path", + "output_path", + "scenes", +] diff --git a/item_generation_scripts/processing/__init__.py b/item_generation_scripts/processing/__init__.py new file mode 100644 index 00000000..aea270d8 --- /dev/null +++ b/item_generation_scripts/processing/__init__.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. 
+# diff --git a/item_generation_scripts/processing/config.py b/item_generation_scripts/processing/config.py new file mode 100644 index 00000000..926689c4 --- /dev/null +++ b/item_generation_scripts/processing/config.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# + +from copy import deepcopy +from pathlib import Path + +import yaml + +from item_generation_scripts.constants import ( + DEFAULT_CONFIG, + DEFAULT_CONFIG_ISM2, + REQUIRED_KEYS +) + + +def merge_dicts(base: dict, other: dict) -> None: + """ + updates base with new keys from other + overrides existing keys + """ + for k in other.keys(): + if k in base and isinstance(base[k], dict) and isinstance(other[k], dict): + merge_dicts(base[k], other[k]) + # explicitly check for None here; + # if the user accidentally specifies only the parent but no sub-keys we don't want to overwrite the default + # however we do want to set non-truthy values e.g. 
False
+        elif other[k] is not None:
+            base[k] = other[k]
+
+
+class TestConfig:
+    def __init__(self, filename: str):
+        """Parse a YAML or JSON configuration file"""
+        # init lists of conditions and associated dirs
+        self.out_dirs = []
+        self.tmp_dirs = []
+
+        # get a copy of the default config (avoid mutating the module-level dict)
+        cfg = deepcopy(DEFAULT_CONFIG)
+
+        # parse configuration file
+        file_cfg = self._parse_yaml(filename)
+
+        # validate configuration from file
+        self._validate(file_cfg)
+
+        # merge dictionaries, overriding from config file
+        merge_dicts(cfg, file_cfg)
+
+        # set attributes from merged dictionary
+        self.__dict__.update(cfg)
+
+        # store the merged config for writing to file later
+        self._yaml_dump = self._dump_yaml(cfg)
+
+        # convert to Path
+        self.input_path = Path(self.input_path)
+        self.output_path = Path(self.output_path)
+
+    def _parse_yaml(self, filename):
+        """parse configuration file"""
+        with open(filename) as fp:
+            return yaml.safe_load(fp)
+
+    def _dump_yaml(self, cfg: dict):
+        """convert objects to strings to avoid YAML dump as object"""
+        cfg = deepcopy(cfg)
+
+        def format(d: dict):
+            for k, v in d.items():
+                if isinstance(v, dict):
+                    format(v)
+                else:
+                    d[k] = str(v)
+
+        format(cfg)
+
+        return cfg
+
+    def _validate(self, cfg: dict):
+        """ensure configuration contains required keys"""
+        MISSING_KEYS = []
+        # check required keys
+        for r in REQUIRED_KEYS:
+            # if there was a tuple, we have a list of subkeys to check
+            if isinstance(r, tuple):
+                req_key, req_values = r
+                if not cfg.get(req_key):
+                    MISSING_KEYS.append(req_key)
+                else:
+                    # check all required values
+                    for v in req_values:
+                        if not cfg.get(req_key).get(v):
+                            MISSING_KEYS.append(f"{req_key} : {v}")
+            elif not cfg.get(r):
+                MISSING_KEYS.append(r)
+
+        # Report missing keys to the user
+        if MISSING_KEYS:
+            raise KeyError(f"The following key(s) must be specified : {MISSING_KEYS}")
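+
+# Illustrative usage (a sketch; the file name stands in for any valid config):
+#
+#   cfg = TestConfig("item_generation_scripts/config/ISM1_CONFIG.yml")
+#   print(cfg.format, cfg.input_path, cfg.output_path)
+#
+# A config missing one of the required keys ("format", "input_path",
+# "output_path", "scenes") raises a KeyError during construction.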
diff --git a/item_generation_scripts/processing/preprocessing_2.py b/item_generation_scripts/processing/preprocessing_2.py
new file mode 100644
index 00000000..1152ccc7
--- /dev/null
+++ b/item_generation_scripts/processing/preprocessing_2.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+
+#
+# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository. All Rights Reserved.
+#
+# This software is protected by copyright law and by international treaties.
+# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository retain full ownership rights in their respective contributions in
+# the software. This notice grants no license of any kind, including but not limited to patent
+# license, nor is any license granted by implication, estoppel or otherwise.
+#
+# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
+# contributions.
+#
+# This software is provided "AS IS", without any express or implied warranties. The software is in the
+# development stage. It is intended exclusively for experts who have experience with such software and
+# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+# and fitness for a particular purpose are hereby disclaimed and excluded.
+#
+# Any dispute, controversy or claim arising under or in relation to providing this software shall be
+# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+# the United Nations Convention on Contracts on the International Sales of Goods.
+#
+
+import logging
+from pathlib import Path
+from warnings import warn
+
+import numpy as np
+
+from item_generation_scripts.audiotools import audio
+from item_generation_scripts.audiotools.audioarray import delay, trim
+from item_generation_scripts.audiotools.audiofile import write
+from item_generation_scripts.audiotools.metadata import (
+    add_remove_preamble,
+    write_ISM_metadata_in_file,
+)
+from item_generation_scripts.audiotools.wrappers.bs1770 import (
+    get_loudness,
+    loudness_norm,
+)
+from item_generation_scripts.audiotools.wrappers.random_seed import random_seed
+from item_generation_scripts.processing.processing import Processing
+
+
+class Preprocessing2(Processing):
+    def __init__(self, attrs: dict):
+        super().__init__(attrs)
+        self.name = "pre_2"
+
+    def process(self, in_file: Path, out_file: Path, in_meta, logger: logging.Logger):
+        logger.debug(f"Preprocessing2 configuration : {self.__dict__}")
+        logger.debug(f"Preprocessing2 {in_file.absolute()} -> {out_file.absolute()}")
+
+        # load in file
+        audio_object = audio.fromfile(
+            self.in_fmt, in_file, fs=self.in_fs, in_meta=in_meta
+        )
+
+        # add preamble
+        if self.preamble:
+            # also apply preamble to ISM metadata
+            if self.in_fmt.startswith("ISM"):
+                # read out old
+                metadata = []
+                for meta in in_meta:
+                    metadata.append(np.genfromtxt(meta, delimiter=","))
+
+                # modify metadata
+                metadata = add_remove_preamble(metadata, self.preamble)
+                meta_files = write_ISM_metadata_in_file(metadata, [out_file], True)
+
+                # modify audio object
+                audio_object.metadata_files = meta_files
+                audio_object.object_pos = metadata
+
+            # add preamble to actual signal
+            audio_object.audio = trim(
+                audio_object.audio,
+                audio_object.fs,
+                (-self.preamble, 0),
+                self.pad_noise_preamble,
+            )
+
+        # add background noise
+        if self.background_noise:
+            audio_object.audio = self.add_background_noise(audio_object, in_meta)
+
+        # save file
+        write(out_file, audio_object.audio, fs=audio_object.fs)
+
+        return
+
+    def add_background_noise(self, audio_object: audio.Audio, in_meta) -> np.ndarray:
+        # range for random delay
+        range_delay = (1, 2400000)
+
+        # load background noise
+        noise_object = audio.fromfile(
+            self.in_fmt,
+            self.background_noise["background_noise_path"],
+            fs=self.in_fs,
+            in_meta=in_meta,
+        )
+
+        # if noise is too short raise error
+        if len(noise_object.audio) < len(audio_object.audio):
+            raise ValueError("Background noise too short for audio signal")
+        if len(noise_object.audio) - range_delay[1] < len(audio_object.audio):
+            warn(
+                "Background noise may be too short for audio signal when considering the random delay"
+            )
+
+        # measure loudness of audio signal based on output format
+        tmp_object = audio.fromtype(self.out_fmt)
+        if (
+            isinstance(tmp_object,
audio.ObjectBasedAudio) + or isinstance(tmp_object, audio.SceneBasedAudio) + or isinstance(tmp_object, audio.MetadataAssistedSpatialAudio) + ): + out_format = None + else: + out_format = self.out_fmt + + loudness_signal, _ = get_loudness(audio_object, loudness_format=out_format) + + # compute desired loudness of background noise + loudness_noise = loudness_signal - self.background_noise["snr"] + + # apply random delay and cut signal + rand_delay = random_seed( + range=range_delay, + master_seed=self.background_noise["master_seed"], + prerun_seed=self.background_noise["seed_delay"], + hexa=False, + ) + noise_object.audio = delay( + noise_object.audio, delay=-rand_delay, samples=True, fs=noise_object.fs + )[: len(audio_object.audio)] + + # scale background noise to desired loudness based on output format + noise_object.audio = loudness_norm(noise_object, loudness_noise, out_format) + + # add array to signal + audio_object.audio = noise_object.audio + audio_object.audio + + return audio_object.audio diff --git a/item_generation_scripts/processing/process_ism_items.py b/item_generation_scripts/processing/process_ism_items.py new file mode 100644 index 00000000..95bfb159 --- /dev/null +++ b/item_generation_scripts/processing/process_ism_items.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. 
+#
+
+
+import os
+import csv
+import logging
+from pathlib import Path
+
+import numpy as np
+
+from item_generation_scripts.audiotools import audio, audiofile
+from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness
+
+
+# helper generator that formats a 2D numpy array as strings with 2 decimal digits
+def csv_formatdata(data):
+    for row in data:
+        yield ["%0.2f" % v for v in row]
+
+
+def generate_ism_items(
+    format: str,
+    target_level: int,
+    input_path: Path,
+    output_path: Path,
+    scenes: dict,
+    logger: logging.Logger
+):
+
+    """Generate ISM items with metadata from mono items based on scene description"""
+
+    # get the number of scenes
+    N_scenes = len(scenes)
+
+    for scene_name, scene in scenes.items():
+        logger.info(f"Processing scene {scene_name} ({N_scenes} scenes in total)")
+
+        # extract the number of audio sources
+        N_sources = len(np.atleast_1d(scene['source']))
+
+        y = None
+        y_meta = None
+        for i in range(N_sources):
+
+            source_file = np.atleast_1d(scene['source'])[i]
+            source_azi = np.atleast_1d(scene['azimuth'])[i]
+            source_ele = np.atleast_1d(scene['elevation'])[i]
+            source_type = 'speech'  #### !!!! TBD - support generic audio + background noise and speech in the .yml file
+            source_delay = np.atleast_1d(scene['delay'])[i]
+
+            logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}")
+
+            # read source file
+            # x, fs = audiofile.read(os.path.join(input_path, source_file))  #### !!!! TBD - check the support for headerless .raw files
+            audio_object = audio.fromfile("MONO", os.path.join(input_path, source_file))
+
+            x = audio_object.audio
+            fs = audio_object.fs
+
+            # find the number of 20ms metadata frames (50 frames per second, plus one)
+            N_frames = int(len(x) / fs * 50 + 1)
+
+            # adjust the level of the source file
+            _, scale_factor = get_loudness(audio_object, target_level, "MONO")
+            x *= scale_factor
+
+            # read azimuth information and create array
+            # (the start/step/stop fields may contain simple arithmetic, e.g. "60+360", hence eval)
+            if isinstance(source_azi, str):
+                if ':' in source_azi:
+                    source_azi = source_azi.split(':')
+                    azi = np.arange(float(eval(source_azi[0])), float(eval(source_azi[2])), float(eval(source_azi[1])))
+                else:
+                    azi = np.array(float(eval(source_azi)), ndmin=1)[:N_frames]
+            else:
+                azi = np.array(source_azi, ndmin=1)[:N_frames]
+
+            # ensure that azimuth array has N_frames values
+            if len(azi) > N_frames:
+                # cut the array of azimuth values
+                azi = azi[:N_frames]
+            elif len(azi) < N_frames:
+                # replicate the last azimuth
+                azi = np.append(azi, np.full(N_frames - len(azi), azi[-1]))
+
+            # convert azimuth from 0 .. 360 to -180 .. +180
+            azi = (azi + 180) % 360 - 180
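+            # worked example (illustrative): azimuth "60:1:60+360" parses to
+            # np.arange(60.0, 420.0, 1.0), i.e. 360 values (one per 20ms frame),
+            # which the wrap-around above maps into [-180, +180)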
+
+            # check if azimuth is from -180 .. +180
+            if any(azi > 180) or any(azi < -180):
+                logger.error(f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}")
+
+            # read elevation information and create array
+            if isinstance(source_ele, str):
+                if ':' in source_ele:
+                    source_ele = source_ele.split(':')
+                    ele = np.arange(float(eval(source_ele[0])), float(eval(source_ele[2])), float(eval(source_ele[1])))
+                else:
+                    ele = np.array(float(eval(source_ele)), ndmin=1)[:N_frames]
+            else:
+                ele = np.array(source_ele, ndmin=1)[:N_frames]
+
+            # ensure that elevation array has N_frames values
+            if len(ele) > N_frames:
+                # cut the array of elevation values
+                ele = ele[:N_frames]
+            elif len(ele) < N_frames:
+                # replicate the last elevation
+                ele = np.append(ele, np.full(N_frames - len(ele), ele[-1]))
+
+            # check if elevation is from -90 .. +90
+            if any(ele > 90) or any(ele < -90):
+                logger.error(f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}")
+
+            # additional metadata
+            dist = np.ones(N_frames)  #### !!!! TBD - check what to do with these metadata
+            spread = np.zeros(N_frames)
+            gain = np.ones(N_frames)
+
+            # arrange all metadata fields column-wise into a matrix
+            x_meta = np.column_stack((azi, ele, dist, spread, gain))
+
+            # delay the source file
+            if source_delay > 0:
+                pre = np.zeros((int(source_delay * fs), x.shape[1]))
+                x = np.concatenate([pre, x])
+
+                # apply delay to metadata as well (neutral rows: azi 0, ele 0, dist 1, spread 0, gain 1)
+                pre = np.tile([0.00, 0.00, 1.00, 0.00, 1.00], (int(source_delay * 50), 1))
+                x_meta = np.concatenate([pre, x_meta])
+
+            # add source signal to the array of source signals
+            if y is None:
+                y = x
+            else:
+                # append zeros to have equal length of all source signals
+                if x.shape[0] > y.shape[0]:
+                    y = np.vstack((y, np.zeros((x.shape[0]-y.shape[0], y.shape[1]))))
+                elif y.shape[0] > x.shape[0]:
+                    x = np.vstack((x, np.zeros((y.shape[0]-x.shape[0], x.shape[1]))))
+                y = np.hstack((y, x))
+
+            # add metadata to the array of all metadata
+            x_meta = x_meta[np.newaxis, :]  # make sure x_meta is a 3d array
+            if y_meta is None:
+                y_meta = x_meta
+            else:
+                N_srcs = y_meta.shape[0]
+                N_meta_features = y_meta.shape[2]
+
+                # append postamble (created by repeating the last row of metadata) to have equal length of all metadata
+                if x_meta.shape[1] > y_meta.shape[1]:
+                    N_delta = x_meta.shape[1] - y_meta.shape[1]
+                    y_meta = y_meta.reshape(y_meta.shape[1], -1)  # reshape to 2d array
+                    y_meta = np.vstack((y_meta, np.tile(y_meta[-1, :], (N_delta, 1))))  # repeat last row N_delta times and append to the array
+                    y_meta = y_meta.reshape(N_srcs, -1, N_meta_features)  # reshape back to 3d array
+                elif y_meta.shape[1] > x_meta.shape[1]:
+                    N_delta = y_meta.shape[1] - x_meta.shape[1]
+                    x_meta = x_meta.reshape(x_meta.shape[1], -1)  # reshape to 2d array
+                    x_meta = np.vstack((x_meta, np.tile(x_meta[-1, :], (N_delta, 1))))  # repeat last row N_delta times and append to the array
+                    x_meta = np.expand_dims(x_meta, axis=0)  # reshape back to 3d array
+
+                y_meta = np.concatenate([y_meta, x_meta])
+
+        # write individual ISM audio streams to the output file in an interleaved format
+        output_filename = scene['name']
+        audiofile.write(os.path.join(output_path, output_filename), y, fs)  ### !!!! replace all os.path.xxx operations with the Path object
+
+        # write individual ISM metadata to output files in .csv format
+        for i in range(N_sources):
+            # generate .csv filename (should end with .0.csv, .1.csv, ...)
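+            # e.g. a scene named "G1S1.wav" with two objects yields
+            # "G1S1.wav.0.csv" and "G1S1.wav.1.csv" next to the audio file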
+            csv_filename = os.path.normpath(f"{output_filename}.{i}.csv")
+
+            # newline="" prevents the csv module from writing blank rows on Windows
+            with open(os.path.join(output_path, csv_filename), 'w', newline='') as f:
+                # create csv writer
+                writer = csv.writer(f)
+
+                # write all rows to the .csv file
+                writer.writerows(csv_formatdata(y_meta[i]))
diff --git a/item_generation_scripts/processing/processing.py b/item_generation_scripts/processing/processing.py
new file mode 100644
index 00000000..ad2cf272
--- /dev/null
+++ b/item_generation_scripts/processing/processing.py
@@ -0,0 +1,455 @@
+#!/usr/bin/env python3
+
+#
+# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository. All Rights Reserved.
+#
+# This software is protected by copyright law and by international treaties.
+# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
+# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
+# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
+# contributors to this repository retain full ownership rights in their respective contributions in
+# the software. This notice grants no license of any kind, including but not limited to patent
+# license, nor is any license granted by implication, estoppel or otherwise.
+#
+# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
+# contributions.
+#
+# This software is provided "AS IS", without any express or implied warranties. The software is in the
+# development stage. It is intended exclusively for experts who have experience with such software and
+# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+# and fitness for a particular purpose are hereby disclaimed and excluded.
+#
+# Any dispute, controversy or claim arising under or in relation to providing this software shall be
+# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+# the United Nations Convention on Contracts on the International Sales of Goods.
+# + +import logging +from abc import ABC, abstractmethod +from itertools import repeat +from pathlib import Path +from shutil import copyfile +from typing import Iterable, Union +from warnings import warn + +import numpy as np + +from item_generation_scripts.audiotools import audio +from item_generation_scripts.audiotools.audiofile import ( + concat, + read, + split, + trim, + write, +) +from item_generation_scripts.audiotools.metadata import ( + add_remove_preamble, + concat_meta_from_file, + metadata_search, + split_meta_in_file, + write_ISM_metadata_in_file, +) +from item_generation_scripts.audiotools.wrappers.bs1770 import scale_files +from item_generation_scripts.constants import LOGGER_DATEFMT, LOGGER_FORMAT +from item_generation_scripts.processing.config import TestConfig +from item_generation_scripts.utils import apply_func_parallel, list_audio, pairwise + + +class Processing(ABC): + def __init__(self, attrs: dict): + self.__dict__.update(attrs) + + @abstractmethod + def process( + self, in_file: Path, out_file: Path, in_meta, logger: logging.Logger + ) -> None: + pass + + +def reorder_items_list(items_list: list, concatenation_order: list) -> list: + name_to_full = {Path(full_file).name: full_file for full_file in items_list} + ordered_full_files = [ + name_to_full[name] for name in concatenation_order if name in name_to_full + ] + return ordered_full_files + + +def concat_setup(cfg: TestConfig, chain, logger: logging.Logger): + n_items_list = len(cfg.items_list) + cfg_pre2 = chain[0] + + # check for text files + if any([i for i in cfg.items_list if i.suffix == ".txt"]): + raise SystemExit("Concatenation for text files is unsupported") + + # apply concatenation order + if cfg_pre2.concatenation_order is not None: + n_concatenation_order = len(cfg_pre2.concatenation_order) + if n_concatenation_order != n_items_list: + warn( + f"Warning: Mismatch in specified concatenation order and number of items to process!\n" + f"Number of items specified in concatenation order: {n_concatenation_order}\n" + f"Number of items in the directory: {n_items_list}\n" + f"Concatenation will use the following order:\n{cfg_pre2.concatenation_order}" + ) + + logger.info(f"Concatenating input files in directory {cfg.input_path}") + + # concatenate ISM metadata + if cfg.input["fmt"].startswith("ISM"): + cfg.concat_meta = [] + for obj_idx in range(len(cfg.metadata_path[0])): + cfg.concat_meta.append( + cfg.tmp_dirs[0].joinpath( + f"{cfg.input_path.name}_concatenated.wav.{obj_idx}.csv" + ) + ) + concat_meta_from_file( + cfg.items_list, + cfg.metadata_path, + cfg.concat_meta, + cfg.input["fmt"], + ) + + # set input to the concatenated file we have just written to the output dir + cfg.metadata_path = [cfg.concat_meta] + + # concatenate audio + cfg.concat_file = cfg.tmp_dirs[0].joinpath( + f"{cfg.input_path.name}_concatenated.wav" + ) + + # determine number of channels for pcm and raw files + tmp_audio = audio.fromtype(cfg_pre2.in_fmt) + tmp_num_chans = tmp_audio.num_channels + + cfg.splits = concat( + cfg.items_list, + cfg.concat_file, + in_fs=cfg.input.get("fs", 48000), + num_channels=tmp_num_chans, + ) + + # save item naming for splits naming in the end + cfg.split_names = [] + for name in cfg.items_list: + cfg.split_names.append(Path(name).stem.split(".")[0]) + # set input to the concatenated file we have just written to the output dir + cfg.items_list = [cfg.concat_file] + + # write out splits + with open(cfg.concat_file.with_suffix(".splits.log"), "w") as f: + print(", ".join([str(s) for s in cfg.splits]), 
file=f) + print(", ".join([str(sn) for sn in cfg.split_names]), file=f) + print(", ".join([str(i.stem) for i in cfg.items_list]), file=f) + + logger.info(f"Splits written to file {cfg.concat_file.with_suffix('.splits.log')}") + + +def concat_teardown(cfg: TestConfig, logger: logging.Logger): + if not cfg.splits: + raise ValueError("Splitting not possible without split marker") + + output_format = cfg.postprocessing["fmt"] + + out_files = [] + out_meta = [] + + logger.info(f"Splitting output file in directory {cfg.output_path}") + + for odir in cfg.out_dirs: + path_input = odir / cfg.items_list[0].name + out_paths = split( + path_input, + odir, + cfg.split_names, + cfg.splits, + in_fs=cfg.postprocessing["fs"], + ) + + logger.debug( + f"Resulting split files condition {odir.name}: {', '.join([str(op) for op in out_paths])}" + ) + out_files.append(out_paths) + + # split ISM metadata + if output_format.startswith("ISM"): + for odir in cfg.out_dirs: + path_input = odir / cfg.items_list[0].name + out_meta_paths = split_meta_in_file( + path_input, + odir, + cfg.split_names, + cfg.splits, + output_format, + meta_files=cfg.metadata_path[0], + ) + out_meta.append(out_meta_paths) + + # remove concatenated file + if cfg.delete_tmp: + cfg.concat_file.unlink(missing_ok=True) + + return out_files, out_meta + + +def preprocess(cfg, logger): + preprocessing = cfg.proc_chains[0] + chain = preprocessing["processes"] + + logger.info(f" Generating condition: {preprocessing['name']}") + + # run preprocessing + apply_func_parallel( + process_item, + zip( + cfg.items_list, + repeat(cfg.tmp_dirs[0]), + repeat(cfg.out_dirs[0]), + repeat(chain), + repeat(logger), + cfg.metadata_path, + ), + None, + "mp" if cfg.multiprocessing else None, + ) + + # update the configuration to use preprocessing outputs as new inputs + cfg.items_list = list_audio( + cfg.out_dirs[0], absolute=False, select_list=getattr(cfg, "input_select", None) + ) + + # Re-ordering items based on concatenation order + if ( + hasattr(cfg, "preprocessing_2") + and cfg.preprocessing_2.get("concatenate_input", False) + and cfg.preprocessing_2.get("concatenation_order", None) is not None + ): + cfg.items_list = reorder_items_list(cfg.items_list, cfg.concatenation_order) + + if cfg.metadata_path[0] is not None: + for item_idx in range(len(cfg.metadata_path)): + for obj_idx in range(len(cfg.metadata_path[item_idx])): + if cfg.metadata_path[item_idx][obj_idx]: + cfg.metadata_path[item_idx][obj_idx] = cfg.out_dirs[0] / Path( + f"{cfg.items_list[item_idx].stem}.wav.{obj_idx}.csv" + ) + # remove already applied processing stage + cfg.proc_chains = cfg.proc_chains[1:] + cfg.tmp_dirs = cfg.tmp_dirs[1:] + cfg.out_dirs = cfg.out_dirs[1:] + + +def preprocess_2(cfg, logger): + preprocessing_2 = cfg.proc_chains[0] + chain = preprocessing_2["processes"] + + logger.info(f" Generating condition: {preprocessing_2['name']}") + + # concatenate items if required + if chain[0].concatenate_input: + concat_setup(cfg, chain, logger) + + # run preprocessing 2 + apply_func_parallel( + process_item, + zip( + cfg.items_list, + repeat(cfg.tmp_dirs[0]), + repeat(cfg.out_dirs[0]), + repeat(chain), + repeat(logger), + cfg.metadata_path, + ), + None, + "mp" if cfg.multiprocessing else None, + ) + + # update the configuration to use preprocessing 2 outputs as new inputs + cfg.items_list = list_audio( + cfg.out_dirs[0], absolute=False, select_list=getattr(cfg, "input_select", None) + ) + + # Re-ordering items based on concatenation order + if ( + hasattr(cfg, "preprocessing_2") + and 
cfg.preprocessing_2.get("concatenate_input", False) + and cfg.preprocessing_2.get("concatenation_order", None) is not None + ): + cfg.items_list = reorder_items_list(cfg.items_list, cfg.concatenation_order) + + if cfg.metadata_path[0] is not None: + for item_idx in range(len(cfg.metadata_path)): + for obj_idx in range(len(cfg.metadata_path[item_idx])): + if cfg.metadata_path[item_idx][obj_idx]: + cfg.metadata_path[item_idx][obj_idx] = cfg.out_dirs[0] / Path( + f"{cfg.items_list[item_idx].stem}.wav.{obj_idx}.csv" + ) + # remove already applied processing stage + cfg.proc_chains = cfg.proc_chains[1:] + cfg.tmp_dirs = cfg.tmp_dirs[1:] + cfg.out_dirs = cfg.out_dirs[1:] + + return + + +def reverse_process_2(cfg, logger): + # remove preamble + if cfg.pre2.preamble: + remove_preamble(cfg) + + # reverse concatenation + if cfg.pre2.concatenate_input: + # write out the splits, optionally remove file + out_paths_splits, out_meta_splits = concat_teardown(cfg, logger) + else: + # if no concatenation read files from folder + out_paths_splits = [] + for out_dir in cfg.out_dirs: + list_audio_dir = list_audio(out_dir, absolute=True) + out_paths_splits.append(list_audio_dir) + if cfg.postprocessing["fmt"].startswith("ISM"): + out_meta_splits = [] + for i, condition in enumerate(out_paths_splits): + meta_condition = metadata_search( + cfg.out_dirs[i], + condition, + num_objects=int(cfg.postprocessing["fmt"][-1]), + ) + out_meta_splits.append(meta_condition) + else: + out_meta_splits = None + + # scale individual files + if cfg.postprocessing.get("loudness", False): + scale_files( + out_paths_splits, + cfg.postprocessing["fmt"], + cfg.postprocessing["loudness"], + cfg.postprocessing["fs"], + out_meta_splits, + ) + return + + +def process_item( + in_file: Union[Path, str], + tmp_dir: Union[Path, str], + out_dir: Union[Path, str], + chain: Iterable, + logger: logging.Logger, + in_meta, +) -> None: + tmp_file = tmp_dir.joinpath(in_file.name) + tmp_file_meta = [] + if in_meta: + for im in in_meta: + tmp_file_meta.append(tmp_dir.joinpath(Path(im).name)) + + # assemble a list of files to be used during the processing chain + out_dir_wav = False + processing_paths = [in_file] + processing_paths_meta = [in_meta] + for p in chain: + if Path(in_file.name).suffix == ".txt" and p.out_fmt is not None: + processing_paths.append(tmp_file.with_suffix(f".{p.name}.wav")) + out_dir_wav = True + else: + processing_paths.append(tmp_file.with_suffix(f".{p.name}{tmp_file.suffix}")) + try: + out_format = p.out_fmt + except AttributeError: + # EVS has no attribute out_fmt + out_format = p.in_fmt + try: + bool_ism = out_format.startswith("ISM") + except Exception: + bool_ism = out_format.name.startswith("ISM") + + if bool_ism: + list_meta_step = [] + for idx, tfm in enumerate(tmp_file_meta): + list_meta_step.append( + tfm.parent + / f"{in_file.stem.split('.')[0]}.{p.name}.wav.{idx}.csv" + ) + processing_paths_meta.append(list_meta_step) + else: + processing_paths_meta.append(None) + # TODO: support txt file writing for META pass-through + + if out_dir_wav: + out_file = out_dir.joinpath(in_file.name).with_suffix(".wav") + else: + out_file = out_dir.joinpath(in_file.name) + + out_meta = [] + if in_meta: + for im in range(len(in_meta)): + out_meta.append(out_dir.joinpath(f"{Path(out_file).stem}.wav.{im}.csv")) + + # execute each process sequentially, feed output into input of next process + for p, (input, output), input_meta in zip( + chain, pairwise(processing_paths), processing_paths_meta[:-1] + ): + # setup logging for the output + 
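+        # every intermediate output gets its own DEBUG-level log file next to it
+        # (e.g. "item.pre_2.log" for the pre_2 stage)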
item_logger = logger.getChild(output.stem) + fh = logging.FileHandler(output.with_suffix(".log"), mode="w") + fh.setLevel(logging.DEBUG) + fh.setFormatter(logging.Formatter(LOGGER_FORMAT, datefmt=LOGGER_DATEFMT)) + item_logger.addHandler(fh) + + p.process(input, output, input_meta, item_logger) + + # copy output and metadata from final process to output file + copyfile(processing_paths[-1], out_file) + if processing_paths_meta[-1]: + for idx, ppm in enumerate(processing_paths_meta[-1]): + copyfile(ppm, out_meta[idx]) + + +def remove_preamble(cfg): + # get number of channels from output format + num_channels = audio.fromtype(cfg.postprocessing["fmt"]).num_channels + for odir in cfg.out_dirs: + for item in cfg.items_list: + path_input = odir / item.name + + # remove preamble for ISM metadata + if cfg.postprocessing["fmt"].startswith("ISM"): + # search for metadata + meta_item = metadata_search( + odir, [Path(item.name)], num_objects=num_channels + ) + metadata_array = [] + for meta_i in meta_item: + metadata_array.append(np.genfromtxt(meta_i, delimiter=",")) + + # remove preamble + metadata_array = add_remove_preamble( + metadata_array, cfg.pre2.preamble, add=False + ) + + # write csv files + write_ISM_metadata_in_file( + metadata_array, [path_input], automatic_naming=True + ) + + # read file + x, fs = read( + path_input, nchannels=num_channels, fs=cfg.postprocessing["fs"] + ) + + # remove preamble + x = trim(x, fs, (cfg.pre2.preamble, 0)) + + # write file + write(path_input, x, fs) + + return diff --git a/item_generation_scripts/utils.py b/item_generation_scripts/utils.py new file mode 100644 index 00000000..1e21b0db --- /dev/null +++ b/item_generation_scripts/utils.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. 
+# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# + +import logging +import shutil +import subprocess as sp +import sys +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor +from itertools import repeat, tee +from os import devnull +from pathlib import Path +from shutil import which +from typing import Callable, Iterable, Optional, Union + +import yaml + +ALLOWED_INPUT_EXT = (".wav", ".pcm", ".txt", ".raw") +BIN_DIR = Path(__file__).parent.joinpath("bin") + + +""" +Directory/path handling +""" + + +def create_dir(p: str) -> None: + p = Path(p) + p.mkdir(exist_ok=True, parents=True) + + +def delete_dir(p: str) -> None: + p = Path(p) + if p.exists() and p.is_dir(): + shutil.rmtree(p) + + +class DirManager: + """ + Context manager that creates directories if not already present and + automatically cleans up (i.e. deletes) all specified paths + """ + + def __init__( + self, create_paths: Union[str, list], delete_paths: Union[str, list] = list() + ): + self.create_paths = ( + create_paths if isinstance(create_paths, list) else [create_paths] + ) + self.delete_paths = ( + delete_paths if isinstance(delete_paths, list) else [delete_paths] + ) + + def __enter__(self): + for path in self.create_paths: + create_dir(path) + + def __exit__(self, exc_type, exc_value, exc_traceback): + for path in self.delete_paths: + if path in self.create_paths: + delete_dir(path) + else: + print( + f"Tmp dir '{path}' was not present in creation paths - skipping deletion." + ) + + +def list_audio(path: str, absolute: bool = False, select_list: list = None) -> list: + """ + Return a list of all files with ALLOWED_INPUT_EXT found under the given path. + + If path is a directory, all files in it are included; if it is a file, only that file + will be in the list. If a select list is provided, files are filtered accordingly.
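+ + Example (illustrative; assumes a directory ./items containing item1.wav and item2.pcm): + >>> list_audio("items") + [PosixPath('items/item1.wav'), PosixPath('items/item2.pcm')]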
+ """ + path = Path(path) + audio_list = [] + + if path.exists(): + if path.is_dir(): + if absolute: + [audio_list.extend(list(path.glob(ext))) for ext in ALLOWED_INPUT_EXT] + audio_list = [ + path.joinpath(f) + for f in path.iterdir() + if f.suffix in ALLOWED_INPUT_EXT + ] + else: + audio_list = [ + f for f in path.iterdir() if f.suffix in ALLOWED_INPUT_EXT + ] + else: + if not absolute: + path = path.name + ext = path.suffix + if ext in ALLOWED_INPUT_EXT: + audio_list.append(path) + + # filter according to select list + if select_list: + select_set = set([Path(i).stem for i in select_list]) + audio_list = [ + f for f in audio_list if any([pattern in f.stem for pattern in select_set]) + ] + + return audio_list + + +def get_nickname(p: Path) -> str: + return f"{p.parent.name}/{p.name}" + + +""" +System interaction +""" + + +def find_binary( + binary: str, + raise_error: Optional[bool] = True, + logger: Optional[logging.Logger] = None, + binary_path: Optional[str] = None, +) -> Union[Path, None]: + """Attempt to find and return the path to the given binary""" + # prioritise binaries placed in the directory over $PATH + if binary_path is not None: + bin = which(binary, path=binary_path) + else: + bin = which(binary, path=BIN_DIR) + if not bin: + bin = which(binary) + + if not bin and raise_error: + raise FileNotFoundError( + f"Binary {binary} was neither found in {binary_path.absolute()} nor in {BIN_DIR.absolute()} or in $PATH!" + ) + elif not bin: + if logger: + logger.debug(f"Couldn't find binary {binary}") + return None + else: + if logger: + logger.debug(f"Found binary {bin}") + return Path(bin) + + +def get_devnull(): + return devnull + + +def get_gitsha(): + try: + git_sha = sp.check_output( + ["git", "rev-parse", "HEAD"], stderr=sp.STDOUT, text=True + ).strip() + except sp.CalledProcessError: + git_sha = "git repository not found!" 
+ + return git_sha + + +def run(cmd, cwd=None, check=True, logger: Optional[logging.Logger] = None): + if logger: + logger.debug(f"Running command {' '.join([str(c) for c in cmd])}; cwd = {cwd}") + + try: + result = sp.run(cmd, check=check, capture_output=True, text=True, cwd=cwd) + except sp.CalledProcessError as e: + raise SystemError( + f"Command returned non-zero exit status ({e.returncode}): {' '.join([str(c) for c in e.cmd])}\n{e.stderr}\n{e.stdout}" + ) + + if logger: + logger.debug(result.stderr.strip()) + logger.debug(result.stdout.strip()) + + return result + + +""" +Utility functions +""" + + +def apply_func_parallel( + func: Callable, + args: Iterable, + kwargs: Optional[Iterable] = None, + type: Optional[str] = None, + show_progress: Optional[bool] = True, +) -> list: + """ + Apply a function iteratively to a list of arguments and keyword arguments + Optionally with multiprocessing or multithreading + + Parameters + ---------- + func : Callable + Function to use + args : Iterable + List of positional arguments + kwargs: Optional[Iterable] + List of keyword arguments + type: Optional[str] + Type of parallel processing to use, "mp" for multiprocessing or "mt" for threading, default = None (sequential processing) + show_progress: Optional[bool] + Flag whether to show progress bar + + Returns + ------- + List of function results + """ + + # if no kwargs are specified, repeat the empty dict to avoid issues with zipping and unpacking + if not kwargs: + kwargs = repeat({}) + + args_zip = zip(args, kwargs) + + if type == "mp": + executor = ProcessPoolExecutor + elif type == "mt": + executor = ThreadPoolExecutor + else: + return [ + func(*a, **k) + for a, k in (progressbar(list(args_zip)) if show_progress else args_zip) + ] + + with executor() as e: + results = [e.submit(func, *a, **k) for a, k in args_zip] + return [ + r.result() for r in (progressbar(results) if show_progress else results) + ] + + +def pairwise(iter): + """itertools.pairwise() for python < 3.10""" + a, b = tee(iter) + next(b, None) + return zip(a, b) + + +def progressbar(iter: Iterable, width=80): + """simple unicode progressbar""" + count = len(iter) + + def update(progress): + fill = int(width * progress / count) + print( + f"{int(progress/count*100):3d}%{u'│'}{u'█'*fill}{(u'░'*(width-fill))}{u'│'}{progress}/{count}", + end="\r", + file=sys.stdout, + flush=True, + ) + + update(0) + for i, item in enumerate(iter): + yield item + update(i + 1) + print("\n", flush=True, file=sys.stdout) + + +def get_binary_paths(yaml_file_with_binary_paths): + with open(yaml_file_with_binary_paths, "r") as f: + data = yaml.safe_load(f) + if data is None: + return {} + else: + return {key: Path(value) for key, value in data.items()} -- GitLab From f2d3f6e9a2d6383a7a29d943f4599eba783af71c Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Tue, 2 May 2023 15:28:16 +0200 Subject: [PATCH 02/27] formatting --- item_generation_scripts/__init__.py | 9 +- item_generation_scripts/constants.py | 6 +- item_generation_scripts/processing/config.py | 3 +- .../processing/process_ism_items.py | 167 ++++++++++-------- 4 files changed, 103 insertions(+), 82 deletions(-) diff --git a/item_generation_scripts/__init__.py b/item_generation_scripts/__init__.py index 989d61a6..64efb46d 100644 --- a/item_generation_scripts/__init__.py +++ b/item_generation_scripts/__init__.py @@ -30,11 +30,12 @@ # the United Nations Convention on Contracts on the International Sales of Goods. 
# -import os import logging +import os +import pdb from itertools import repeat + import yaml -import pdb from item_generation_scripts.constants import ( LOGGER_DATEFMT, @@ -42,7 +43,6 @@ from item_generation_scripts.constants import ( LOGGER_SUFFIX, ) from item_generation_scripts.processing import config, process_ism_items -from item_generation_scripts.processing import config from item_generation_scripts.utils import create_dir @@ -73,7 +73,6 @@ def logging_init(args, cfg): def main(args): - # parse configuration cfg = config.TestConfig(args.config) @@ -93,7 +92,7 @@ def main(args): cfg.input_path, cfg.output_path, cfg.scenes, - logger + logger, ) # copy configuration to output directory diff --git a/item_generation_scripts/constants.py b/item_generation_scripts/constants.py index 3b554800..c3d5061f 100644 --- a/item_generation_scripts/constants.py +++ b/item_generation_scripts/constants.py @@ -56,13 +56,13 @@ DEFAULT_CONFIG = { DEFAULT_CONFIG_ISM2 = { "format": "ISM2", - "input_path" : "./input", + "input_path": "./input", "output_path": "./output", # "cod": { - # "bin": find_binary("IVAS_cod", raise_error=False), + # "bin": find_binary("IVAS_cod", raise_error=False), # }, # "dec": { - # "bin": find_binary("IVAS_dec", raise_error=False), + # "bin": find_binary("IVAS_dec", raise_error=False), # }, } diff --git a/item_generation_scripts/processing/config.py b/item_generation_scripts/processing/config.py index 926689c4..0fa1fa5e 100644 --- a/item_generation_scripts/processing/config.py +++ b/item_generation_scripts/processing/config.py @@ -38,7 +38,7 @@ import yaml from item_generation_scripts.constants import ( DEFAULT_CONFIG, DEFAULT_CONFIG_ISM2, - REQUIRED_KEYS + REQUIRED_KEYS, ) @@ -127,4 +127,3 @@ class TestConfig: # Report missing keys to the user if MISSING_KEYS: raise KeyError(f"The following key(s) must be specified : {MISSING_KEYS}") - diff --git a/item_generation_scripts/processing/process_ism_items.py b/item_generation_scripts/processing/process_ism_items.py index 95bfb159..f4f58fc1 100644 --- a/item_generation_scripts/processing/process_ism_items.py +++ b/item_generation_scripts/processing/process_ism_items.py @@ -31,27 +31,27 @@ # +import csv +import logging import os -import sys import shutil -import numpy as np -import logging -import csv import subprocess as sp +import sys from pathlib import Path +import numpy as np + from item_generation_scripts.audiotools import ( audio, audioarray, audiofile, binauralobjectrenderer, - metadata + metadata, ) - from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness -from item_generation_scripts.audiotools import audio -# function for converting nd numpy array to strings with 2 decimal digits + +# function for converting nd numpy array to strings with 2 decimal digits def csv_formatdata(data): for row in data: yield ["%0.2f" % v for v in row] @@ -63,159 +63,182 @@ def generate_ism_items( input_path: Path, output_path: Path, scenes: dict, - logger: logging.Logger + logger: logging.Logger, ): + """Generate ISM items with metadata from mono items based on scene description""" - """Generate ISM items with metadata from mono items based on scene description """ - # get the number of scenes N_scenes = len(scenes) - + for scene_name, scene in scenes.items(): logger.info(f"Processing {scene_name} out of {N_scenes} scenes") - + # extract the number of audio sources - N_sources = len(np.atleast_1d(scene['source'])) + N_sources = len(np.atleast_1d(scene["source"])) y = None y_meta = None for i in range(N_sources): - - 
source_file = np.atleast_1d(scene['source'])[i] - source_azi = np.atleast_1d(scene['azimuth'])[i] - source_ele = np.atleast_1d(scene['elevation'])[i] - source_type = 'speech' #### !!!! TBD - support generic audio + background noise and speech in the .yml file - source_delay = np.atleast_1d(scene['delay'])[i] - - logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") - + source_file = np.atleast_1d(scene["source"])[i] + source_azi = np.atleast_1d(scene["azimuth"])[i] + source_ele = np.atleast_1d(scene["elevation"])[i] + source_type = "speech" #### !!!! TBD - support generic audio + background noise and speech in the .yml file + source_delay = np.atleast_1d(scene["delay"])[i] + + logger.info( + f"Encoding {source_file} at position(s) {source_azi},{source_ele}" + ) + # read source file # x, fs = audiofile.read(os.path.join(input_path, source_file)) #### !!!! TBD - check the support for headerless .raw files # pdb.set_trace() audio_object = audio.fromfile("MONO", os.path.join(input_path, source_file)) - - + x = audio_object.audio fs = audio_object.fs - + # find the number of frames N_frames = int(len(x) / fs * 50 + 1) - + # adjust the level of the source file - _, scale_factor = get_loudness(audio_object, target_level, "MONO") + _, scale_factor = get_loudness(audio_object, target_level, "MONO") # print(f"Scaling loudness with factor: {scale_factor}") x *= scale_factor - + # read azimuth information and create array if isinstance(source_azi, str): - if ':' in source_azi: - source_azi = source_azi.split(':') - azi = np.arange(float(eval(source_azi[0])), float(eval(source_azi[2])), float(eval(source_azi[1]))) + if ":" in source_azi: + source_azi = source_azi.split(":") + azi = np.arange( + float(eval(source_azi[0])), + float(eval(source_azi[2])), + float(eval(source_azi[1])), + ) else: azi = np.array(float(eval(source_azi)), ndmin=1)[:N_frames] else: azi = np.array(source_azi, ndmin=1)[:N_frames] - + # ensure that azimuth array has N_frames values if len(azi) > N_frames: # cut the array of azimuth values azi = azi[:N_frames] elif len(azi) < N_frames: # replicate the last azimuth - azi = np.append(azi, np.full( N_frames - len(azi), azi[-1])) - + azi = np.append(azi, np.full(N_frames - len(azi), azi[-1])) + # convert azimuth from 0 .. 360 to -180 .. +180 - azi = (azi + 180) % 360 - 180 + azi = (azi + 180) % 360 - 180 # check if azimuth is from -180 .. +180 if any(azi > 180) or any(azi < -180): - logger.error(f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}") - + logger.error( + f"Incorrect value(s) of azimuth: {azi[(azi > 180) | (azi < -180)]}" + ) + # read elevation information and create array if isinstance(source_ele, str): - if ':' in source_ele: - source_ele = source_ele.split(':') - ele = np.arange(float(eval(source_ele[0])), float(eval(source_ele[2])), float(eval(source_ele[1]))) + if ":" in source_ele: + source_ele = source_ele.split(":") + ele = np.arange( + float(eval(source_ele[0])), + float(eval(source_ele[2])), + float(eval(source_ele[1])), + ) else: ele = np.array(float(eval(source_ele)), ndmin=1)[:N_frames] else: ele = np.array(source_ele, ndmin=1)[:N_frames] - + # ensure that elevation array has N_frames values if len(ele) > N_frames: # cut the array of elevation values ele = ele[:N_frames] elif len(ele) < N_frames: # replicate the last elevation - ele = np.append(ele, np.full( N_frames - len(ele), ele[-1])) + ele = np.append(ele, np.full(N_frames - len(ele), ele[-1])) # check if elevation is from -90 .. 
+90 if any(ele > 90) or any(ele < -90): - logger.error(f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}") - + logger.error( + f"Incorrect value(s) of elevation: {ele[(ele > 90) | (ele < -90)]}" + ) + # additional metadata - dist = np.ones(N_frames) #### !!!! TBD - check what to do with these metadata + dist = np.ones( + N_frames + ) #### !!!! TBD - check what to do with these metadata spread = np.zeros(N_frames) gain = np.ones(N_frames) - + # arrange all metadata fields column-wise into a matrix x_meta = np.column_stack((azi, ele, dist, spread, gain)) - + # delay the source file if source_delay > 0: pre = np.zeros((int(source_delay * fs), x.shape[1])) x = np.concatenate([pre, x]) - + # apply delay to metadata as well - pre = np.tile([0.00,0.00,1.00,0.00,1.00], (int(source_delay * 50), 1)) + pre = np.tile( + [0.00, 0.00, 1.00, 0.00, 1.00], (int(source_delay * 50), 1) + ) # pre = np.zeros((int(source_delay * 50), x_meta.shape[1])) x_meta = np.concatenate([pre, x_meta]) - + # add source signal to the array of source signals if y is None: y = x else: # append zeros to have equal length of all source signals if x.shape[0] > y.shape[0]: - y = np.vstack((y, np.zeros((x.shape[0]-y.shape[0], y.shape[1])))) + y = np.vstack((y, np.zeros((x.shape[0] - y.shape[0], y.shape[1])))) elif y.shape[0] > x.shape[0]: - x = np.vstack((x, np.zeros((y.shape[0]-x.shape[0], x.shape[1])))) + x = np.vstack((x, np.zeros((y.shape[0] - x.shape[0], x.shape[1])))) y = np.hstack((y, x)) - + # add metadata to the array of all metadata - x_meta = x_meta[np.newaxis, :] # make sure x_meta is a 3d array + x_meta = x_meta[np.newaxis, :] # make sure x_meta is a 3d array if y_meta is None: y_meta = x_meta else: N_srcs = y_meta.shape[0] N_meta_features = y_meta.shape[2] - + # append postamble (create by repeating the last row of metadata) to have equal length of all metadata if x_meta.shape[1] > y_meta.shape[1]: N_delta = x_meta.shape[1] - y_meta.shape[1] - y_meta = y_meta.reshape(y_meta.shape[1], -1) # reshape to 2d array - y_meta = np.vstack((y_meta, np.tile(y_meta[-1,:], (N_delta, 1)))) # repeat last row N_delta times and append to the array - y_meta = y_meta.reshape(N_srcs, -1, N_meta_features) # reshape back to 3d array + y_meta = y_meta.reshape(y_meta.shape[1], -1) # reshape to 2d array + y_meta = np.vstack( + (y_meta, np.tile(y_meta[-1, :], (N_delta, 1))) + ) # repeat last row N_delta times and append to the array + y_meta = y_meta.reshape( + N_srcs, -1, N_meta_features + ) # reshape back to 3d array elif y_meta.shape[1] > x_meta.shape[1]: N_delta = y_meta.shape[1] - x_meta.shape[1] - x_meta = x_meta.reshape(x_meta.shape[1], -1) # reshape to 2d array - x_meta = np.vstack((x_meta, np.tile(x_meta[-1,:], (N_delta, 1)))) # repeat last row N_delta times and append to the array - x_meta = np.expand_dims(x_meta, axis=0) # reshape back to 3d array - + x_meta = x_meta.reshape(x_meta.shape[1], -1) # reshape to 2d array + x_meta = np.vstack( + (x_meta, np.tile(x_meta[-1, :], (N_delta, 1))) + ) # repeat last row N_delta times and append to the array + x_meta = np.expand_dims(x_meta, axis=0) # reshape back to 3d array + y_meta = np.concatenate([y_meta, x_meta]) - + # write individual ISM audio streams to the output file in an interleaved format - output_filename = scene['name'] - audiofile.write(os.path.join(output_path, output_filename), y, fs) ### !!!! 
replace all os.path.xxx operations with the Path object - - # write individual ISM metadata to output files in .csv format + output_filename = scene["name"] + audiofile.write( + os.path.join(output_path, output_filename), y, fs + ) ### !!!! replace all os.path.xxx operations with the Path object + + # write individual ISM metadata to output files in .csv format for i in range(N_sources): # generate .csv filename (should end with .0.csv, .1.csv, ...) csv_filename = os.path.normpath(f"{output_filename}.{i}.csv") - - with open(os.path.join(output_path, csv_filename), 'w') as f: + + with open(os.path.join(output_path, csv_filename), "w") as f: # create csv writer writer = csv.writer(f) - + # write all rows to the .csv file - writer.writerows(csv_formatdata(y_meta[i])) + writer.writerows(csv_formatdata(y_meta[i])) -- GitLab From 8d7b16e85bbf11c9c8fc28a899a600ae6cc1821a Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Tue, 2 May 2023 15:37:19 +0200 Subject: [PATCH 03/27] formatting --- ivas_processing_scripts/audiotools/wrappers/gen_patt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ivas_processing_scripts/audiotools/wrappers/gen_patt.py b/ivas_processing_scripts/audiotools/wrappers/gen_patt.py index f801b07b..d8737ef4 100644 --- a/ivas_processing_scripts/audiotools/wrappers/gen_patt.py +++ b/ivas_processing_scripts/audiotools/wrappers/gen_patt.py @@ -138,7 +138,7 @@ def create_error_pattern( gen_patt(100, "ep.g192", 5, working_dir=tmp_dir_test) if not tmp_sta_file_test.exists(): raise RuntimeError( - "Used version of gen-patt was detected to be faulty (unable to write \"sta\"-file). See bin/README.md for details." + 'Used version of gen-patt was detected to be faulty (unable to write "sta"-file). See bin/README.md for details.' 
) with TemporaryDirectory() as tmp_dir: -- GitLab From 872d533c7ce23e3d18649d1e71fab3b1b81fb24f Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Tue, 2 May 2023 15:46:39 +0200 Subject: [PATCH 04/27] formatting --- item_generation_scripts/__init__.py | 2 -- item_generation_scripts/constants.py | 2 +- item_generation_scripts/processing/config.py | 1 - .../processing/process_ism_items.py | 19 +++++-------------- 4 files changed, 6 insertions(+), 18 deletions(-) diff --git a/item_generation_scripts/__init__.py b/item_generation_scripts/__init__.py index 64efb46d..88951e80 100644 --- a/item_generation_scripts/__init__.py +++ b/item_generation_scripts/__init__.py @@ -32,8 +32,6 @@ import logging import os -import pdb -from itertools import repeat import yaml diff --git a/item_generation_scripts/constants.py b/item_generation_scripts/constants.py index c3d5061f..9509d069 100644 --- a/item_generation_scripts/constants.py +++ b/item_generation_scripts/constants.py @@ -33,7 +33,7 @@ from datetime import datetime from pathlib import Path -from item_generation_scripts.utils import find_binary, get_binary_paths +from item_generation_scripts.utils import get_binary_paths LOGGER_SUFFIX = ".log" LOGGER_FORMAT = ( diff --git a/item_generation_scripts/processing/config.py b/item_generation_scripts/processing/config.py index 0fa1fa5e..06f828bb 100644 --- a/item_generation_scripts/processing/config.py +++ b/item_generation_scripts/processing/config.py @@ -37,7 +37,6 @@ import yaml from item_generation_scripts.constants import ( DEFAULT_CONFIG, - DEFAULT_CONFIG_ISM2, REQUIRED_KEYS, ) diff --git a/item_generation_scripts/processing/process_ism_items.py b/item_generation_scripts/processing/process_ism_items.py index f4f58fc1..73267607 100644 --- a/item_generation_scripts/processing/process_ism_items.py +++ b/item_generation_scripts/processing/process_ism_items.py @@ -34,20 +34,11 @@ import csv import logging import os -import shutil -import subprocess as sp -import sys from pathlib import Path import numpy as np -from item_generation_scripts.audiotools import ( - audio, - audioarray, - audiofile, - binauralobjectrenderer, - metadata, -) +from item_generation_scripts.audiotools import audio, audiofile from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness @@ -82,7 +73,7 @@ def generate_ism_items( source_file = np.atleast_1d(scene["source"])[i] source_azi = np.atleast_1d(scene["azimuth"])[i] source_ele = np.atleast_1d(scene["elevation"])[i] - source_type = "speech" #### !!!! TBD - support generic audio + background noise and speech in the .yml file + # source_type = "speech" # !!!! TBD - support generic audio + background noise and speech in the .yml file source_delay = np.atleast_1d(scene["delay"])[i] logger.info( @@ -90,7 +81,7 @@ def generate_ism_items( ) # read source file - # x, fs = audiofile.read(os.path.join(input_path, source_file)) #### !!!! TBD - check the support for headerless .raw files + # x, fs = audiofile.read(os.path.join(input_path, source_file)) # !!!! TBD - check the support for headerless .raw files # pdb.set_trace() audio_object = audio.fromfile("MONO", os.path.join(input_path, source_file)) @@ -167,7 +158,7 @@ def generate_ism_items( # additional metadata dist = np.ones( N_frames - ) #### !!!! TBD - check what to do with these metadata + ) # !!!! 
TBD - check what to do with these metadata spread = np.zeros(N_frames) gain = np.ones(N_frames) @@ -229,7 +220,7 @@ def generate_ism_items( output_filename = scene["name"] audiofile.write( os.path.join(output_path, output_filename), y, fs - ) ### !!!! replace all os.path.xxx operations with the Path object + ) # !!!! TBD: replace all os.path.xxx operations with the Path object # write individual ISM metadata to output files in .csv format for i in range(N_sources): -- GitLab From 81628b69aa93028d232c242895bdc0e205c8b25c Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Tue, 2 May 2023 16:34:12 +0200 Subject: [PATCH 05/27] support of .raw format --- item_generation_scripts/__init__.py | 1 + .../config/ISM1_CONFIG.yml | 78 +++++++++---------- .../config/ISM2_CONFIG.yml | 6 +- item_generation_scripts/processing/config.py | 5 +- .../processing/process_ism_items.py | 13 +--- 5 files changed, 48 insertions(+), 55 deletions(-) diff --git a/item_generation_scripts/__init__.py b/item_generation_scripts/__init__.py index 88951e80..c08820ea 100644 --- a/item_generation_scripts/__init__.py +++ b/item_generation_scripts/__init__.py @@ -91,6 +91,7 @@ def main(args): cfg.output_path, cfg.scenes, logger, + fs=cfg.fs ) # copy configuration to output directory diff --git a/item_generation_scripts/config/ISM1_CONFIG.yml b/item_generation_scripts/config/ISM1_CONFIG.yml index f4e1ee31..cbe4eb71 100644 --- a/item_generation_scripts/config/ISM1_CONFIG.yml +++ b/item_generation_scripts/config/ISM1_CONFIG.yml @@ -13,7 +13,7 @@ format: "ISM1" # delete_tmp: true ### Output sampling rate in Hz needed for headerless audio files; default = 48000 -# fs: 32000 +fs: 48000 ### Any relative paths will be interpreted relative to the working directory the script is called from! ### Usage of absolute paths is recommended. @@ -21,10 +21,10 @@ format: "ISM1" ### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions ### Input path to mono files -input_path: "/mnt/c/Work/IVAS/3gpp_forge_gitlab/ivas-processing-scripts/items_mono" +input_path: "./items_mono" ### Output path for generated test items and metadata files -output_path: "/mnt/c/Work/IVAS/3gpp_forge_gitlab/ivas-processing-scripts/output" +output_path: "./output" ### Target loudness in LKFS; default = null (no loudness normalization applied) loudness: -26 @@ -51,7 +51,7 @@ scenes: a1: name: "G1S1.wav" description: "Talker sitting at a table" - source: "f2s5a_Talker1.wav" + source: "test_single.wav" azimuth: 0 elevation: 0 delay: 0 @@ -59,7 +59,7 @@ scenes: a2: name: "G6S2.wav" description: "Talker sitting at a table" - source: "f5s10a_Talker1.wav" + source: "test_single.wav" azimuth: 60 elevation: 0 delay: 0 @@ -67,7 +67,7 @@ scenes: a3: name: "G5S3.wav" description: "Talker sitting at a table" - source: "f2s5a_Talker1.wav" + source: "test_single.wav" azimuth: 120 elevation: 0 delay: 0 @@ -75,7 +75,7 @@ scenes: a4: name: "G4S4.wav" description: "Talker sitting at a table" - source: "m4s11b_Talker1.wav" + source: "test_single.wav" azimuth: 180 elevation: 0 delay: 0 @@ -83,7 +83,7 @@ scenes: a5: name: "G3S5.wav" description: "Talker sitting at a table" - source: "m1s4a_Talker1.wav" + source: "test_single.wav" azimuth: 240 elevation: 0 delay: 0 @@ -91,7 +91,7 @@ scenes: a6: name: "G2S6.wav" description: "Talker sitting at a table" - source: "f5s10a_Talker1.wav" + source: "test_single.wav" azimuth: 300 elevation: 0 delay: 0 @@ -99,7 +99,7 @@ scenes: b1: name: "G2S1.wav" description: "standing talker." 
- source: "f5s10b_Talker1.wav" + source: "test_single.wav" azimuth: 120 elevation: 35 delay: 0 @@ -107,7 +107,7 @@ scenes: b2: name: "G1S2.wav" description: "standing talker." - source: "f2s1a_Talker1.wav" + source: "test_single.wav" azimuth: 180 elevation: 35 delay: 0 @@ -115,7 +115,7 @@ scenes: b3: name: "G6S3.wav" description: "standing talker." - source: "f5s10b_Talker1.wav" + source: "test_single.wav" azimuth: 240 elevation: 35 delay: 0 @@ -123,7 +123,7 @@ scenes: b4: name: "G5S4.wav" description: "standing talker." - source: "f2s1a_Talker1.wav" + source: "test_single.wav" azimuth: 300 elevation: 35 delay: 0 @@ -131,7 +131,7 @@ scenes: b5: name: "G4S5.wav" description: "standing talker." - source: "m4s11a_Talker1.wav" + source: "test_single.wav" azimuth: 0 elevation: 35 delay: 0 @@ -139,7 +139,7 @@ scenes: b6: name: "G3S6.wav" description: "standing talker." - source: "m1s2b_Talker1.wav" + source: "test_single.wav" azimuth: 60 elevation: 35 delay: 0 @@ -147,7 +147,7 @@ scenes: c1: name: "G3S1.wav" description: "Smaller talker (child) walking around a table." - source: "m1s6b_Talker1.wav" + source: "test_single.wav" azimuth: "0:1:360" elevation: 0 delay: 0 @@ -155,7 +155,7 @@ scenes: c2: name: "G2S2.wav" description: "Smaller talker (child) walking around a table." - source: "f5s14a_Talker1.wav" + source: "test_single.wav" azimuth: "60:1:60+360" elevation: 0 delay: 0 @@ -163,7 +163,7 @@ scenes: c3: name: "G1S3.wav" description: "Smaller talker (child) walking around a table." - source: "f2s6a_Talker1.wav" + source: "test_single.wav" azimuth: "120:1:120+360" elevation: 0 delay: 0 @@ -171,7 +171,7 @@ scenes: c4: name: "G6S4.wav" description: "Smaller talker (child) walking around a table." - source: "f5s14a_Talker1.wav" + source: "test_single.wav" azimuth: "180:1:180+360" elevation: 0 delay: 0 @@ -179,7 +179,7 @@ scenes: c5: name: "G5S5.wav" description: "Smaller talker (child) walking around a table." - source: "f2s6a_Talker1.wav" + source: "test_single.wav" azimuth: "240:1:240+360" elevation: 0 delay: 0 @@ -187,7 +187,7 @@ scenes: c6: name: "G4S6.wav" description: "Smaller talker (child) walking around a table." - source: "m4s13a_Talker1.wav" + source: "test_single.wav" azimuth: "300:1:300+360" elevation: 0 delay: 0 @@ -195,7 +195,7 @@ scenes: d1: name: "G4S1.wav" description: "Talker walking around the table." - source: "m4s12b_Talker1.wav" + source: "test_single.wav" azimuth: "0:-1:-360" elevation: 35 delay: 0 @@ -203,7 +203,7 @@ scenes: d2: name: "G3S2.wav" description: "Talker walking around the table." - source: "m1s12a_Talker1.wav" + source: "test_single.wav" azimuth: "60:-1:60-360" elevation: 35 delay: 0 @@ -211,7 +211,7 @@ scenes: d3: name: "G3S2.wav" description: "Talker walking around the table." - source: "f5s15b_Talker1.wav" + source: "test_single.wav" azimuth: "120:-1:120-360" elevation: 35 delay: 0 @@ -219,7 +219,7 @@ scenes: d4: name: "G1S4.wav" description: "Talker walking around the table." - source: "f2s3b_Talker1.wav" + source: "test_single.wav" azimuth: "180:-1:180-360" elevation: 35 delay: 0 @@ -227,7 +227,7 @@ scenes: d5: name: "G6S5.wav" description: "Talker walking around the table." - source: "f5s15b_Talker1.wav" + source: "test_single.wav" azimuth: "240:-1:240-360" elevation: 35 delay: 0 @@ -235,7 +235,7 @@ scenes: d6: name: "G5S6.wav" description: "Talker walking around the table." 
- source: "f2s3b_Talker1.wav" + source: "test_single.wav" azimuth: "300:-1:300-360" elevation: 35 delay: 0 @@ -243,7 +243,7 @@ scenes: e1: name: "G5S1.wav" description: "Elevation displacement." - source: "f2s4a_Talker1.wav" + source: "test_single.wav" azimuth: 240 elevation: "-90:0.5:90" delay: 0 @@ -251,7 +251,7 @@ scenes: e2: name: "G4S2.wav" description: "Elevation displacement." - source: "m4s16a_Talker1.wav" + source: "test_single.wav" azimuth: 300 elevation: 0 delay: 0 @@ -259,7 +259,7 @@ scenes: e3: name: "G3S3.wav" description: "Elevation displacement." - source: "m1s16b_Talker1.wav" + source: "test_single.wav" azimuth: 0 elevation: "-90:0.5:90" delay: 0 @@ -267,7 +267,7 @@ scenes: e4: name: "G2S4.wav" description: "Elevation displacement." - source: "f5s19a_Talker1.wav" + source: "test_single.wav" azimuth: 60 elevation: "-90:0.5:90" delay: 0 @@ -275,7 +275,7 @@ scenes: e5: name: "G1S5.wav" description: "Elevation displacement." - source: "f2s4a_Talker1.wav" + source: "test_single.wav" azimuth: 120 elevation: "-90:0.5:90" delay: 0 @@ -283,7 +283,7 @@ scenes: e6: name: "G6S6.wav" description: "Elevation displacement." - source: "f5s19a_Talker1.wav" + source: "test_single.wav" azimuth: 180 elevation: "-90:0.5:90" delay: 0 @@ -291,7 +291,7 @@ scenes: f1: name: "G6S1.wav" description: "Azimuth and elevation displacement." - source: "f5s15a_Talker1.wav" + source: "test_single.wav" azimuth: "60:0.5:60+180" elevation: "35:-0.2:-35" delay: 0 @@ -299,7 +299,7 @@ scenes: f2: name: "G5S2.wav" description: "Azimuth and elevation displacement." - source: "f2s7b_Talker1.wav" + source: "test_single.wav" azimuth: "120:0.5:120+180" elevation: "35:-0.2:-35" delay: 0 @@ -307,7 +307,7 @@ scenes: f3: name: "G4S3.wav" description: "Azimuth and elevation displacement." - source: "m4s14a_Talker1.wav" + source: "test_single.wav" azimuth: "180:0.5:180+180" elevation: "35:-0.2:-35" delay: 0 @@ -315,7 +315,7 @@ scenes: f4: name: "G3S4.wav" description: "Azimuth and elevation displacement." - source: "m1s7a_Talker1.wav" + source: "test_single.wav" azimuth: "240:0.5:240+180" elevation: "35:-0.2:-35" delay: 0 @@ -323,7 +323,7 @@ scenes: f5: name: "G2S5.wav" description: "Azimuth and elevation displacement." - source: "f5s15a_Talker1.wav" + source: "test_single.wav" azimuth: "300:0.5:300+180" elevation: "35:-0.2:-35" delay: 0 @@ -331,7 +331,7 @@ scenes: f6: name: "G1S6.wav" description: "Azimuth and elevation displacement." - source: "f2s7b_Talker1.wav" + source: "test_single.wav" azimuth: "0:0.5:0+180" elevation: "35:-0.2:-35" delay: 0 diff --git a/item_generation_scripts/config/ISM2_CONFIG.yml b/item_generation_scripts/config/ISM2_CONFIG.yml index c4a65c07..8886f562 100644 --- a/item_generation_scripts/config/ISM2_CONFIG.yml +++ b/item_generation_scripts/config/ISM2_CONFIG.yml @@ -13,7 +13,7 @@ format: "ISM2" # delete_tmp: true ### Output sampling rate in Hz needed for headerless audio files; default = 48000 -# fs: 32000 +fs: 48000 ### Any relative paths will be interpreted relative to the working directory the script is called from! ### Usage of absolute paths is recommended. 
@@ -21,10 +21,10 @@ format: "ISM2" ### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions ### Input path to mono files -input_path: "/mnt/c/Work/IVAS/3gpp_forge_gitlab/ivas-processing-scripts/items_mono" +input_path: "./items_mono" ### Output path for generated test items and metadata files -output_path: "/mnt/c/Work/IVAS/3gpp_forge_gitlab/ivas-processing-scripts/output" +output_path: "./output" ### Target loudness in LKFS; default = null (no loudness normalization applied) loudness: -26 diff --git a/item_generation_scripts/processing/config.py b/item_generation_scripts/processing/config.py index 06f828bb..3e9aaaa5 100644 --- a/item_generation_scripts/processing/config.py +++ b/item_generation_scripts/processing/config.py @@ -35,10 +35,7 @@ from pathlib import Path import yaml -from item_generation_scripts.constants import ( - DEFAULT_CONFIG, - REQUIRED_KEYS, -) +from item_generation_scripts.constants import DEFAULT_CONFIG, REQUIRED_KEYS def merge_dicts(base: dict, other: dict) -> None: diff --git a/item_generation_scripts/processing/process_ism_items.py b/item_generation_scripts/processing/process_ism_items.py index 73267607..8f69a4c6 100644 --- a/item_generation_scripts/processing/process_ism_items.py +++ b/item_generation_scripts/processing/process_ism_items.py @@ -35,6 +35,7 @@ import csv import logging import os from pathlib import Path +from typing import Optional import numpy as np @@ -55,6 +56,7 @@ def generate_ism_items( output_path: Path, scenes: dict, logger: logging.Logger, + fs: Optional[int] = 48000, ): """Generate ISM items with metadata from mono items based on scene description""" @@ -73,7 +75,6 @@ def generate_ism_items( source_file = np.atleast_1d(scene["source"])[i] source_azi = np.atleast_1d(scene["azimuth"])[i] source_ele = np.atleast_1d(scene["elevation"])[i] - # source_type = "speech" # !!!! TBD - support generic audio + background noise and speech in the .yml file source_delay = np.atleast_1d(scene["delay"])[i] logger.info( @@ -81,10 +82,7 @@ def generate_ism_items( ) # read source file - # x, fs = audiofile.read(os.path.join(input_path, source_file)) # !!!! TBD - check the support for headerless .raw files - # pdb.set_trace() - audio_object = audio.fromfile("MONO", os.path.join(input_path, source_file)) - + audio_object = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs) x = audio_object.audio fs = audio_object.fs @@ -93,7 +91,6 @@ def generate_ism_items( # adjust the level of the source file _, scale_factor = get_loudness(audio_object, target_level, "MONO") - # print(f"Scaling loudness with factor: {scale_factor}") x *= scale_factor # read azimuth information and create array @@ -156,9 +153,7 @@ def generate_ism_items( ) # additional metadata - dist = np.ones( - N_frames - ) # !!!! TBD - check what to do with these metadata + dist = np.ones(N_frames) # !!!! 
TBD - check what to do with these metadata spread = np.zeros(N_frames) gain = np.ones(N_frames) -- GitLab From 086b23096564833d75d51c4374ed24195884df98 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Tue, 2 May 2023 17:40:30 +0200 Subject: [PATCH 06/27] support delay of mono items to crate some overlap --- .../config/ISM1_CONFIG.yml | 36 ----- .../config/ISM2_CONFIG.yml | 145 +++++++++--------- .../processing/process_ism_items.py | 9 +- 3 files changed, 80 insertions(+), 110 deletions(-) diff --git a/item_generation_scripts/config/ISM1_CONFIG.yml b/item_generation_scripts/config/ISM1_CONFIG.yml index cbe4eb71..8d85906b 100644 --- a/item_generation_scripts/config/ISM1_CONFIG.yml +++ b/item_generation_scripts/config/ISM1_CONFIG.yml @@ -54,7 +54,6 @@ scenes: source: "test_single.wav" azimuth: 0 elevation: 0 - delay: 0 a2: name: "G6S2.wav" @@ -62,7 +61,6 @@ scenes: source: "test_single.wav" azimuth: 60 elevation: 0 - delay: 0 a3: name: "G5S3.wav" @@ -70,7 +68,6 @@ scenes: source: "test_single.wav" azimuth: 120 elevation: 0 - delay: 0 a4: name: "G4S4.wav" @@ -78,7 +75,6 @@ scenes: source: "test_single.wav" azimuth: 180 elevation: 0 - delay: 0 a5: name: "G3S5.wav" @@ -86,7 +82,6 @@ scenes: source: "test_single.wav" azimuth: 240 elevation: 0 - delay: 0 a6: name: "G2S6.wav" @@ -94,7 +89,6 @@ scenes: source: "test_single.wav" azimuth: 300 elevation: 0 - delay: 0 b1: name: "G2S1.wav" @@ -102,7 +96,6 @@ scenes: source: "test_single.wav" azimuth: 120 elevation: 35 - delay: 0 b2: name: "G1S2.wav" @@ -110,7 +103,6 @@ scenes: source: "test_single.wav" azimuth: 180 elevation: 35 - delay: 0 b3: name: "G6S3.wav" @@ -118,7 +110,6 @@ scenes: source: "test_single.wav" azimuth: 240 elevation: 35 - delay: 0 b4: name: "G5S4.wav" @@ -126,7 +117,6 @@ scenes: source: "test_single.wav" azimuth: 300 elevation: 35 - delay: 0 b5: name: "G4S5.wav" @@ -134,7 +124,6 @@ scenes: source: "test_single.wav" azimuth: 0 elevation: 35 - delay: 0 b6: name: "G3S6.wav" @@ -142,7 +131,6 @@ scenes: source: "test_single.wav" azimuth: 60 elevation: 35 - delay: 0 c1: name: "G3S1.wav" @@ -150,7 +138,6 @@ scenes: source: "test_single.wav" azimuth: "0:1:360" elevation: 0 - delay: 0 c2: name: "G2S2.wav" @@ -158,7 +145,6 @@ scenes: source: "test_single.wav" azimuth: "60:1:60+360" elevation: 0 - delay: 0 c3: name: "G1S3.wav" @@ -166,7 +152,6 @@ scenes: source: "test_single.wav" azimuth: "120:1:120+360" elevation: 0 - delay: 0 c4: name: "G6S4.wav" @@ -174,7 +159,6 @@ scenes: source: "test_single.wav" azimuth: "180:1:180+360" elevation: 0 - delay: 0 c5: name: "G5S5.wav" @@ -182,7 +166,6 @@ scenes: source: "test_single.wav" azimuth: "240:1:240+360" elevation: 0 - delay: 0 c6: name: "G4S6.wav" @@ -190,7 +173,6 @@ scenes: source: "test_single.wav" azimuth: "300:1:300+360" elevation: 0 - delay: 0 d1: name: "G4S1.wav" @@ -198,7 +180,6 @@ scenes: source: "test_single.wav" azimuth: "0:-1:-360" elevation: 35 - delay: 0 d2: name: "G3S2.wav" @@ -206,7 +187,6 @@ scenes: source: "test_single.wav" azimuth: "60:-1:60-360" elevation: 35 - delay: 0 d3: name: "G3S2.wav" @@ -214,7 +194,6 @@ scenes: source: "test_single.wav" azimuth: "120:-1:120-360" elevation: 35 - delay: 0 d4: name: "G1S4.wav" @@ -222,7 +201,6 @@ scenes: source: "test_single.wav" azimuth: "180:-1:180-360" elevation: 35 - delay: 0 d5: name: "G6S5.wav" @@ -230,7 +208,6 @@ scenes: source: "test_single.wav" azimuth: "240:-1:240-360" elevation: 35 - delay: 0 d6: name: "G5S6.wav" @@ -238,7 +215,6 @@ scenes: source: "test_single.wav" azimuth: "300:-1:300-360" elevation: 35 - delay: 0 e1: name: 
"G5S1.wav" @@ -246,7 +222,6 @@ scenes: source: "test_single.wav" azimuth: 240 elevation: "-90:0.5:90" - delay: 0 e2: name: "G4S2.wav" @@ -254,7 +229,6 @@ scenes: source: "test_single.wav" azimuth: 300 elevation: 0 - delay: 0 e3: name: "G3S3.wav" @@ -262,7 +236,6 @@ scenes: source: "test_single.wav" azimuth: 0 elevation: "-90:0.5:90" - delay: 0 e4: name: "G2S4.wav" @@ -270,7 +243,6 @@ scenes: source: "test_single.wav" azimuth: 60 elevation: "-90:0.5:90" - delay: 0 e5: name: "G1S5.wav" @@ -278,7 +250,6 @@ scenes: source: "test_single.wav" azimuth: 120 elevation: "-90:0.5:90" - delay: 0 e6: name: "G6S6.wav" @@ -286,7 +257,6 @@ scenes: source: "test_single.wav" azimuth: 180 elevation: "-90:0.5:90" - delay: 0 f1: name: "G6S1.wav" @@ -294,7 +264,6 @@ scenes: source: "test_single.wav" azimuth: "60:0.5:60+180" elevation: "35:-0.2:-35" - delay: 0 f2: name: "G5S2.wav" @@ -302,7 +271,6 @@ scenes: source: "test_single.wav" azimuth: "120:0.5:120+180" elevation: "35:-0.2:-35" - delay: 0 f3: name: "G4S3.wav" @@ -310,7 +278,6 @@ scenes: source: "test_single.wav" azimuth: "180:0.5:180+180" elevation: "35:-0.2:-35" - delay: 0 f4: name: "G3S4.wav" @@ -318,7 +285,6 @@ scenes: source: "test_single.wav" azimuth: "240:0.5:240+180" elevation: "35:-0.2:-35" - delay: 0 f5: name: "G2S5.wav" @@ -326,7 +292,6 @@ scenes: source: "test_single.wav" azimuth: "300:0.5:300+180" elevation: "35:-0.2:-35" - delay: 0 f6: name: "G1S6.wav" @@ -334,5 +299,4 @@ scenes: source: "test_single.wav" azimuth: "0:0.5:0+180" elevation: "35:-0.2:-35" - delay: 0 \ No newline at end of file diff --git a/item_generation_scripts/config/ISM2_CONFIG.yml b/item_generation_scripts/config/ISM2_CONFIG.yml index 8886f562..798da9d0 100644 --- a/item_generation_scripts/config/ISM2_CONFIG.yml +++ b/item_generation_scripts/config/ISM2_CONFIG.yml @@ -37,6 +37,7 @@ loudness: -26 ### Each scene must start with the sceneN tag ### Specify the mono source filename (the program will search for it in the input_path folder) ### Specify azimuth and elevation for each input source +### Specify the delay in seconds for each input source ### Note 1: use [val1, val2, ...] for multiple sources in a scene ### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames @@ -51,288 +52,288 @@ scenes: a1: name: "G1S1.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." - source: ["f2s5a_Talker1.wav", "m2s16b_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [0, 50] elevation: [0, 0] - delay: [0, 0] + delay: [0, 1] a2: name: "G6S2.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." - source: ["f5s10a_Talker1.wav", "m3s2a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [50, 350] elevation: [0, 0] - delay: [0, 0] + delay: [0, 1] a3: name: "G5S3.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." - source: ["f2s5a_Talker1.wav", "m2s16b_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [40, 290] elevation: [0, 0] - delay: [0, 0] + delay: [0, 1] a4: name: "G4S4.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." 
- source: ["m4s11b_Talker1.wav", "f1s4b_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [30, 230] elevation: [15, 15] - delay: [0, 0] + delay: [0, 1] a5: name: "G3S5.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." - source: ["m1s4a_Talker1.wav", "f3s3a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [20, 170] elevation: [15, 15] - delay: [0, 0] + delay: [0, 1] a6: name: "G2S6.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." - source: ["f5s10a_Talker1.wav", "m3s2a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [10, 110] elevation: [15, 15] - delay: [0, 0] + delay: [0, 1] b1: name: "G2S1.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." - source: ["f5s10b_Talker1.wav", "m3s2b_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [20, 170] elevation: [30, 30] - delay: [0, 0] + delay: [0, 1] b2: name: "G1S2.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." - source: ["f2s1a_Talker1.wav", "m2s10a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [10, 110] elevation: [30, 30] - delay: [0, 0] + delay: [0, 1] b3: name: "G6S3.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." - source: ["f5s10b_Talker1.wav", "m3s2b_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [0, 50] elevation: [30, 30] - delay: [0, 0] + delay: [0, 1] b4: name: "G5S4.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." - source: ["f2s1a_Talker1.wav", "m2s10a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [50, 350] elevation: [60, 60] - delay: [0, 0] + delay: [0, 1] b5: name: "G4S5.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." - source: ["m4s11a_Talker1.wav", "f1s6a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [40, 290] elevation: [60, 60] - delay: [0, 0] + delay: [0, 1] b6: name: "G3S6.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." - source: ["m1s2b_Talker1.wav", "f3s5a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [30, 230] elevation: [60, 60] - delay: [0, 0] + delay: [0, 1] c1: name: "G3S1.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." - source: ["m1s6b_Talker1.wav", "f3s5b_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [40, 290] elevation: [0, 60] - delay: [0, 0] + delay: [0, 1] c2: name: "G2S2.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." - source: ["f5s14a_Talker1.wav", "m3s8a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [30, 230] elevation: [0, 60] - delay: [0, 0] + delay: [0, 1] c3: name: "G1S3.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." 
- source: ["f2s6a_Talker1.wav", "m2s13a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [20, 170] elevation: [0, 60] - delay: [0, 0] + delay: [0, 1] c4: name: "G6S4.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." - source: ["f5s14a_Talker1.wav", "m3s8a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [10, 110] elevation: [0, 60] - delay: [0, 0] + delay: [0, 1] c5: name: "G5S5.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." - source: ["f2s6a_Talker1.wav", "m2s13a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [0, 50] elevation: [0, 60] - delay: [0, 0] + delay: [0, 1] c6: name: "G4S6.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." - source: ["m4s13a_Talker1.wav", "f1s20a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [50, 350] elevation: [0, 60] - delay: [0, 0] + delay: [0, 1] d1: name: "G4S1.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." - source: ["m4s12b_Talker1.wav", "f1s12b_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [50, "180:1:120 + 360"] elevation: [0, 60] - delay: [0, 0] + delay: [0, 1] d2: name: "G3S2.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." - source: ["m1s12a_Talker1.wav", "f3s20a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [300, "-70:-1:-10 - 360"] elevation: [0, 60] - delay: [0, 0] + delay: [0, 1] d3: name: "G3S2.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." - source: ["f5s15b_Talker1.wav", "m3s1a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [250, "-20:-1:-320"] elevation: [0, 60] - delay: [0, 0] + delay: [0, 1] d4: name: "G1S4.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." - source: ["f2s3b_Talker1.wav", "m2s15a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [200, "30:-1:-270"] elevation: [0, 60] - delay: [0, 0] + delay: [0, 1] d5: name: "G6S5.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." - source: ["f5s15b_Talker1.wav", "m3s1a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [150, "80:1:20 + 360"] elevation: [0, 60] - delay: [0, 0] + delay: [0, 1] d6: name: "G5S6.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." 
- source: ["f2s3b_Talker1.wav", "m2s15a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: [100, "130:1:70 + 360"] elevation: [0, 60] - delay: [0, 0] + delay: [0, 1] e1: name: "G5S1.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["f2s4a_Talker1.wav", "m2s17b_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: ["80:1:20 + 360", "80:1:20 + 360"] elevation: [10, 60] - delay: [0, 0] + delay: [0, 1] e2: name: "G4S2.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["m4s16a_Talker1.wav", "f1s16b_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: ["130:1:70 + 360", "130:1:70 + 360"] elevation: [10, 60] - delay: [0, 0] + delay: [0, 1] e3: name: "G3S3.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["m1s16b_Talker1.wav", "f3s10b_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: ["180:1:120 + 360", "180:1:120 + 360"] elevation: [10, 60] - delay: [0, 0] + delay: [0, 1] e4: name: "G2S4.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["f5s19a_Talker1.wav", "m3s1b_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: ["-70:-1:-10 - 360", "-70:-1:-10 - 360"] elevation: [10, 60] - delay: [0, 0] + delay: [0, 1] e5: name: "G1S5.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["f2s4a_Talker1.wav", "m2s17b_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: ["-20:-1:-320", "-20:-1:-320"] elevation: [10, 60] - delay: [0, 0] + delay: [0, 1] e6: name: "G6S6.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["f5s19a_Talker1.wav", "m3s1b_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: ["30:-1:-270", "30:-1:-270"] elevation: [10, 60] - delay: [0, 0] + delay: [0, 1] f1: name: "G6S1.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." - source: ["f5s15a_Talker1.wav", "m3s8b_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: ["60:1:0 + 360", "60:-1:120 - 360"] elevation: [20, 50] - delay: [0, 0] + delay: [0, 1] f2: name: "G5S2.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." - source: ["f2s7b_Talker1.wav", "m2s6b_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: ["0:1:300", "0:-1:60 - 360"] elevation: [20, 50] - delay: [0, 0] + delay: [0, 1] f3: name: "G4S3.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." - source: ["m4s14a_Talker1.wav", "f1s7a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: ["300:1:240 + 360", "300:-1:0"] elevation: [20, 50] - delay: [0, 0] + delay: [0, 1] f4: name: "G3S4.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." - source: ["m1s7a_Talker1.wav", "f3s7a_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: ["240:1:180 + 360", "240:-1:-60"] elevation: [20, 50] - delay: [0, 0] + delay: [0, 1] f5: name: "G2S5.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." 
- source: ["f5s15a_Talker1.wav", "m3s8b_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: ["180:1:120 + 360", "180:-1:-120"] elevation: [20, 50] - delay: [0, 0] + delay: [0, 1] f6: name: "G1S6.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." - source: ["f2s7b_Talker1.wav", "m2s6b_Talker2.wav"] + source: ["test_double.wav", "test_double.wav"] azimuth: ["120:1:60 + 360", "120:-1:180 - 360"] elevation: [20, 50] - delay: [0, 0] + delay: [0, 1] \ No newline at end of file diff --git a/item_generation_scripts/processing/process_ism_items.py b/item_generation_scripts/processing/process_ism_items.py index 8f69a4c6..cf6ade22 100644 --- a/item_generation_scripts/processing/process_ism_items.py +++ b/item_generation_scripts/processing/process_ism_items.py @@ -72,11 +72,16 @@ def generate_ism_items( y = None y_meta = None for i in range(N_sources): + + # parse parameters from the scene description source_file = np.atleast_1d(scene["source"])[i] source_azi = np.atleast_1d(scene["azimuth"])[i] source_ele = np.atleast_1d(scene["elevation"])[i] - source_delay = np.atleast_1d(scene["delay"])[i] - + if 'delay' in scene.keys(): + source_delay = np.atleast_1d(scene["delay"])[i] + else: + source_delay = np.array([0]) + logger.info( f"Encoding {source_file} at position(s) {source_azi},{source_ele}" ) -- GitLab From 9db60d12ee647e33a07c77c446ddc670909d3000 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 4 May 2023 08:42:25 +0200 Subject: [PATCH 07/27] fix extra CRLF in .csv files on Windows --- .../config/ISM1_CONFIG.yml | 2 +- .../config/ISM2_CONFIG.yml | 72 ++--- .../config/STEREO_CONFIG.yml | 306 ++++++++++++++++++ .../processing/process_ism_items.py | 2 +- 4 files changed, 344 insertions(+), 38 deletions(-) create mode 100644 item_generation_scripts/config/STEREO_CONFIG.yml diff --git a/item_generation_scripts/config/ISM1_CONFIG.yml b/item_generation_scripts/config/ISM1_CONFIG.yml index 8d85906b..66f81617 100644 --- a/item_generation_scripts/config/ISM1_CONFIG.yml +++ b/item_generation_scripts/config/ISM1_CONFIG.yml @@ -189,7 +189,7 @@ scenes: elevation: 35 d3: - name: "G3S2.wav" + name: "G2S3.wav" description: "Talker walking around the table." 
source: "test_single.wav" azimuth: "120:-1:120-360" diff --git a/item_generation_scripts/config/ISM2_CONFIG.yml b/item_generation_scripts/config/ISM2_CONFIG.yml index 798da9d0..3bb200e2 100644 --- a/item_generation_scripts/config/ISM2_CONFIG.yml +++ b/item_generation_scripts/config/ISM2_CONFIG.yml @@ -55,7 +55,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [0, 50] elevation: [0, 0] - delay: [0, 1] + delay: [0, 0] a2: name: "G6S2.wav" @@ -63,7 +63,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [50, 350] elevation: [0, 0] - delay: [0, 1] + delay: [0, 0] a3: name: "G5S3.wav" @@ -71,7 +71,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [40, 290] elevation: [0, 0] - delay: [0, 1] + delay: [0, 0] a4: name: "G4S4.wav" @@ -79,7 +79,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [30, 230] elevation: [15, 15] - delay: [0, 1] + delay: [0, 0] a5: name: "G3S5.wav" @@ -87,7 +87,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [20, 170] elevation: [15, 15] - delay: [0, 1] + delay: [0, 0] a6: name: "G2S6.wav" @@ -95,7 +95,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [10, 110] elevation: [15, 15] - delay: [0, 1] + delay: [0, 0] b1: name: "G2S1.wav" @@ -103,7 +103,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [20, 170] elevation: [30, 30] - delay: [0, 1] + delay: [0, 1.5] b2: name: "G1S2.wav" @@ -111,7 +111,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [10, 110] elevation: [30, 30] - delay: [0, 1] + delay: [0, 1.5] b3: name: "G6S3.wav" @@ -119,7 +119,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [0, 50] elevation: [30, 30] - delay: [0, 1] + delay: [0, 1.5] b4: name: "G5S4.wav" @@ -127,7 +127,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [50, 350] elevation: [60, 60] - delay: [0, 1] + delay: [0, 1.5] b5: name: "G4S5.wav" @@ -135,7 +135,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [40, 290] elevation: [60, 60] - delay: [0, 1] + delay: [0, 1.5] b6: name: "G3S6.wav" @@ -143,7 +143,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [30, 230] elevation: [60, 60] - delay: [0, 1] + delay: [0, 1.5] c1: name: "G3S1.wav" @@ -151,7 +151,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [40, 290] elevation: [0, 60] - delay: [0, 1] + delay: [0, 0] c2: name: "G2S2.wav" @@ -159,7 +159,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [30, 230] elevation: [0, 60] - delay: [0, 1] + delay: [0, 0] c3: name: "G1S3.wav" @@ -167,7 +167,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [20, 170] elevation: [0, 60] - delay: [0, 1] + delay: [0, 0] c4: name: "G6S4.wav" @@ -183,7 +183,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [0, 50] elevation: [0, 60] - delay: [0, 1] + delay: [0, 0] c6: name: "G4S6.wav" @@ -191,7 +191,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [50, 350] elevation: [0, 60] - delay: [0, 1] + delay: [0, 0] d1: name: "G4S1.wav" @@ -199,7 +199,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [50, "180:1:120 + 360"] elevation: [0, 60] - delay: [0, 1] + delay: [0, 1.5] d2: name: "G3S2.wav" @@ -207,15 +207,15 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [300, "-70:-1:-10 - 360"] elevation: [0, 60] - delay: [0, 1] + delay: [0, 1.5] d3: - name: "G3S2.wav" + name: "G2S3.wav" description: "one talker sitting at a table, 
second talker walking around the table, ~30% overlapping utterances." source: ["test_double.wav", "test_double.wav"] azimuth: [250, "-20:-1:-320"] elevation: [0, 60] - delay: [0, 1] + delay: [0, 1.5] d4: name: "G1S4.wav" @@ -223,7 +223,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [200, "30:-1:-270"] elevation: [0, 60] - delay: [0, 1] + delay: [0, 1.5] d5: name: "G6S5.wav" @@ -231,7 +231,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [150, "80:1:20 + 360"] elevation: [0, 60] - delay: [0, 1] + delay: [0, 1.5] d6: name: "G5S6.wav" @@ -239,7 +239,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: [100, "130:1:70 + 360"] elevation: [0, 60] - delay: [0, 1] + delay: [0, 1.5] e1: name: "G5S1.wav" @@ -247,7 +247,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: ["80:1:20 + 360", "80:1:20 + 360"] elevation: [10, 60] - delay: [0, 1] + delay: [0, 1.5] e2: name: "G4S2.wav" @@ -255,7 +255,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: ["130:1:70 + 360", "130:1:70 + 360"] elevation: [10, 60] - delay: [0, 1] + delay: [0, 1.5] e3: name: "G3S3.wav" @@ -263,7 +263,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: ["180:1:120 + 360", "180:1:120 + 360"] elevation: [10, 60] - delay: [0, 1] + delay: [0, 1.5] e4: name: "G2S4.wav" @@ -271,7 +271,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: ["-70:-1:-10 - 360", "-70:-1:-10 - 360"] elevation: [10, 60] - delay: [0, 1] + delay: [0, 1.5] e5: name: "G1S5.wav" @@ -279,7 +279,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: ["-20:-1:-320", "-20:-1:-320"] elevation: [10, 60] - delay: [0, 1] + delay: [0, 1.5] e6: name: "G6S6.wav" @@ -287,7 +287,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: ["30:-1:-270", "30:-1:-270"] elevation: [10, 60] - delay: [0, 1] + delay: [0, 1.5] f1: name: "G6S1.wav" @@ -295,7 +295,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: ["60:1:0 + 360", "60:-1:120 - 360"] elevation: [20, 50] - delay: [0, 1] + delay: [0, 0] f2: name: "G5S2.wav" @@ -303,7 +303,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: ["0:1:300", "0:-1:60 - 360"] elevation: [20, 50] - delay: [0, 1] + delay: [0, 0] f3: name: "G4S3.wav" @@ -311,7 +311,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: ["300:1:240 + 360", "300:-1:0"] elevation: [20, 50] - delay: [0, 1] + delay: [0, 0] f4: name: "G3S4.wav" @@ -319,7 +319,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: ["240:1:180 + 360", "240:-1:-60"] elevation: [20, 50] - delay: [0, 1] + delay: [0, 0] f5: name: "G2S5.wav" @@ -327,7 +327,7 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: ["180:1:120 + 360", "180:-1:-120"] elevation: [20, 50] - delay: [0, 1] + delay: [0, 0] f6: name: "G1S6.wav" @@ -335,5 +335,5 @@ scenes: source: ["test_double.wav", "test_double.wav"] azimuth: ["120:1:60 + 360", "120:-1:180 - 360"] elevation: [20, 50] - delay: [0, 1] + delay: [0, 0] \ No newline at end of file diff --git a/item_generation_scripts/config/STEREO_CONFIG.yml b/item_generation_scripts/config/STEREO_CONFIG.yml new file mode 100644 index 00000000..adc08b4c --- /dev/null +++ b/item_generation_scripts/config/STEREO_CONFIG.yml @@ -0,0 +1,306 @@ +--- +################################################ +# General configuration +################################################ + +### Output format +format: "STEREO" + +### Date; default = YYYYMMDD_HH.MM.SS +# date: 
2023.06.30 + +### Deletion of temporary directories containing intermediate processing files, bitstreams etc.; default = false +# delete_tmp: true + +### Output sampling rate in Hz needed for headerless audio files; default = 48000 +fs: 48000 + +### Any relative paths will be interpreted relative to the working directory the script is called from! +### Usage of absolute paths is recommended. +### Do not use file names with dots "." in them! This is not supported, use "_" instead +### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions + +### Input path to mono files +input_path: "./items_mono" + +### Input path to stereo impulse response files +input_path_IR: "./IR" + +### Output path for generated test items and metadata files +output_path: "./output" + +### Target loudness in LKFS; default = null (no loudness normalization applied) +loudness: -26 + + +################################################ +### Scene description +################################################ + +### Each scene must start with the sceneN tag +### Specify the mono source filename (the program will search for it in the input_path folder) +### Specify azimuth and elevation for each input source +### Specify the delay in seconds for each input source +### Note 1: use [val1, val2, ...] for multiple sources in a scene +### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames + +### Note 3: we're using right-handed coordinate system with azi = 0 pointing from the nose to the screen +### azimuth: float, [-180,180]; positive indicates left +### elevation: float, [-90,90]; positive indicates up +### distance: float, tbd: default: 1 +### spread: float, [0,360]; spread in angles from 0 ... 360˚ +### gain: float, [0,1] + +scenes: + a1: + name: "G1S1.wav" + description: "Large anechoic room with AB microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["LAABP01.L.IR32", "LAABP01.R.IR32"] + delay: [0, 0] + + a2: + name: "G6S2.wav" + description: "Large anechoic room with AB microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["LAABP02.L.IR32", "LAABP02.R.IR32"] + delay: [0, 0] + + a3: + name: "G5S3.wav" + description: "Large anechoic room with AB microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["LAABP03.L.IR32", "LAABP03.R.IR32"] + delay: [0, 0] + + a4: + name: "G4S4.wav" + description: "Large anechoic room with AB microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["LAABP04.L.IR32", "LAABP04.R.IR32"] + delay: [0, 0] + + a5: + name: "G3S5.wav" + description: "Large anechoic room with AB microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["LAABP05.L.IR32", "LAABP05.R.IR32"] + delay: [0, 0] + + a6: + name: "G2S6.wav" + description: "Large anechoic room with AB microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["LAABP06.L.IR32", "LAABP06.R.IR32"] + delay: [0, 0] + + b1: + name: "G2S1.wav" + description: "Small anechoic room with AB microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] + delay: [0, 1.5] + + b2: + name: "G1S2.wav" + description: "Small anechoic room with AB microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] + delay: [0, 1.5] + + b3: + name: "G6S3.wav" + description: "Small anechoic room with AB microphone pickup." 
+ source: ["test_double.wav", "test_double.wav"] + IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] + delay: [0, 1.5] + + b4: + name: "G5S4.wav" + description: "Small anechoic room with AB microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] + delay: [0, 1.5] + + b5: + name: "G4S5.wav" + description: "Small anechoic room with AB microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] + delay: [0, 1.5] + + b6: + name: "G3S6.wav" + description: "Small anechoic room with AB microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] + delay: [0, 1.5] + + c1: + name: "G3S1.wav" + description: "Small anechoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] + delay: [0, 0] + + c2: + name: "G2S2.wav" + description: "Small anechoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] + delay: [0, 0] + + c3: + name: "G1S3.wav" + description: "Small anechoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] + delay: [0, 0] + + c4: + name: "G6S4.wav" + description: "Small anechoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] + delay: [0, 1] + + c5: + name: "G5S5.wav" + description: "Small anechoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] + delay: [0, 0] + + c6: + name: "G4S6.wav" + description: "Small anechoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] + delay: [0, 0] + + d1: + name: "G4S1.wav" + description: "Small echoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + delay: [0, 1.5] + + d2: + name: "G3S2.wav" + description: "Small echoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + delay: [0, 1.5] + + d3: + name: "G3S2.wav" + description: "Small echoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + delay: [0, 1.5] + + d4: + name: "G1S4.wav" + description: "Small echoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + delay: [0, 1.5] + + d5: + name: "G6S5.wav" + description: "Small echoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + delay: [0, 1.5] + + d6: + name: "G5S6.wav" + description: "Small echoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + delay: [0, 1.5] + + e1: + name: "G5S1.wav" + description: "Small echoic room with binaural microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] + delay: [0, 1.5] + + e2: + name: "G4S2.wav" + description: "Small echoic room with binaural microphone pickup." 
+ source: ["test_double.wav", "test_double.wav"] + IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] + delay: [0, 1.5] + + e3: + name: "G3S3.wav" + description: "Small echoic room with binaural microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] + delay: [0, 1.5] + + e4: + name: "G2S4.wav" + description: "Small echoic room with binaural microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] + delay: [0, 1.5] + + e5: + name: "G1S5.wav" + description: "Small echoic room with binaural microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] + delay: [0, 1.5] + + e6: + name: "G6S6.wav" + description: "Small echoic room with binaural microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] + delay: [0, 1.5] + + f1: + name: "G6S1.wav" + description: "Small echoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + delay: [0, 0] + + f2: + name: "G5S2.wav" + description: "Small echoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + delay: [0, 0] + + f3: + name: "G4S3.wav" + description: "Small echoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + delay: [0, 0] + + f4: + name: "G3S4.wav" + description: "Small echoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + delay: [0, 0] + + f5: + name: "G2S5.wav" + description: "Small echoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + delay: [0, 0] + + f6: + name: "G1S6.wav" + description: "Small echoic room with MS microphone pickup." + source: ["test_double.wav", "test_double.wav"] + IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + delay: [0, 0] + \ No newline at end of file diff --git a/item_generation_scripts/processing/process_ism_items.py b/item_generation_scripts/processing/process_ism_items.py index cf6ade22..f1b84cda 100644 --- a/item_generation_scripts/processing/process_ism_items.py +++ b/item_generation_scripts/processing/process_ism_items.py @@ -227,7 +227,7 @@ def generate_ism_items( # generate .csv filename (should end with .0.csv, .1.csv, ...) 
csv_filename = os.path.normpath(f"{output_filename}.{i}.csv") - with open(os.path.join(output_path, csv_filename), "w") as f: + with open(os.path.join(output_path, csv_filename), 'w', newline='', encoding='utf-8') as f: # create csv writer writer = csv.writer(f) -- GitLab From e0fbcf7a0eb7fd4ef3d48749e78611fecb5785c8 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Mon, 8 May 2023 18:36:16 +0200 Subject: [PATCH 08/27] fix 20ms frame alignment --- .../processing/process_ism_items.py | 83 ++++++++++++------- 1 file changed, 51 insertions(+), 32 deletions(-) diff --git a/item_generation_scripts/processing/process_ism_items.py b/item_generation_scripts/processing/process_ism_items.py index f1b84cda..4b33a84e 100644 --- a/item_generation_scripts/processing/process_ism_items.py +++ b/item_generation_scripts/processing/process_ism_items.py @@ -36,8 +36,8 @@ import logging import os from pathlib import Path from typing import Optional - import numpy as np +from math import floor from item_generation_scripts.audiotools import audio, audiofile from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness @@ -69,8 +69,14 @@ def generate_ism_items( # extract the number of audio sources N_sources = len(np.atleast_1d(scene["source"])) - y = None + # initialize output variables + if format == "ISM2": + y = audio.ChannelBasedAudio("STEREO") + else: + y = audio.ChannelBasedAudio("MONO") y_meta = None + + # repeat for all source files for i in range(N_sources): # parse parameters from the scene description @@ -87,16 +93,18 @@ def generate_ism_items( ) # read source file - audio_object = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs) - x = audio_object.audio - fs = audio_object.fs + x = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs) - # find the number of frames - N_frames = int(len(x) / fs * 50 + 1) + # get the number of frames (multiple of 20ms) + N_frames = int(len(x.audio) / x.fs * 50) + + # trim the source signal to align to 20ms boundary + len = int(N_frames * x.fs / 50) + x.audio = x.audio[:len] # adjust the level of the source file - _, scale_factor = get_loudness(audio_object, target_level, "MONO") - x *= scale_factor + _, scale_factor = get_loudness(x, target_level, "MONO") + x.audio *= scale_factor # read azimuth information and create array if isinstance(source_azi, str): @@ -167,59 +175,70 @@ def generate_ism_items( # delay the source file if source_delay > 0: - pre = np.zeros((int(source_delay * fs), x.shape[1])) - x = np.concatenate([pre, x]) + # ensure delay is a multiple of 20ms + N_delay = int(floor(source_delay * 50) / 50 * x.fs) + + # insert all-zero preamble + pre = np.zeros((N_delay, x.audio.shape[1])) + x.audio = np.concatenate([pre, x.audio]) - # apply delay to metadata as well + # insert neutral position as a pre-amble pre = np.tile( - [0.00, 0.00, 1.00, 0.00, 1.00], (int(source_delay * 50), 1) - ) - # pre = np.zeros((int(source_delay * 50), x_meta.shape[1])) + [0.00, 0.00, 1.00, 0.00, 1.00], (N_delay, 1) + ) # !!!! 
TBD - check whether we should insert the neutral position or the first position of the metadata
 x_meta = np.concatenate([pre, x_meta])
- # add source signal to the array of source signals
- if y is None:
- y = x
+ # add source signal to the array of all source signals
+ y.fs = x.fs
+ if y.audio is None:
+ y.audio = x.audio
 else:
 # append zeros to have equal length of all source signals
- if x.shape[0] > y.shape[0]:
- y = np.vstack((y, np.zeros((x.shape[0] - y.shape[0], y.shape[1]))))
- elif y.shape[0] > x.shape[0]:
- x = np.vstack((x, np.zeros((y.shape[0] - x.shape[0], x.shape[1]))))
- y = np.hstack((y, x))
+ if x.audio.shape[0] > y.audio.shape[0]:
+ y.audio = np.vstack((y.audio, np.zeros((x.audio.shape[0] - y.audio.shape[0], y.audio.shape[1]))))
+ elif y.audio.shape[0] > x.audio.shape[0]:
+ x.audio = np.vstack((x.audio, np.zeros((y.audio.shape[0] - x.audio.shape[0], x.audio.shape[1]))))
+ y.audio = np.hstack((y.audio, x.audio))
 # add metadata to the array of all metadata
- x_meta = x_meta[np.newaxis, :] # make sure x_meta is a 3d array
+ # make sure x_meta is a 3d array
+ x_meta = x_meta[np.newaxis, :]
 if y_meta is None:
 y_meta = x_meta
 else:
 N_srcs = y_meta.shape[0]
 N_meta_features = y_meta.shape[2]
- # append postamble (created by repeating the last row of metadata) to have equal length of all metadata
+ # append the last position of the metadata to have equal length of all metadata
 if x_meta.shape[1] > y_meta.shape[1]:
 N_delta = x_meta.shape[1] - y_meta.shape[1]
- y_meta = y_meta.reshape(y_meta.shape[1], -1) # reshape to 2d array
+ # reshape to 2d array
+ y_meta = y_meta.reshape(y_meta.shape[1], -1)
+ # repeat last row N_delta times and append to the array
 y_meta = np.vstack(
 (y_meta, np.tile(y_meta[-1, :], (N_delta, 1)))
- ) # repeat last row N_delta times and append to the array
+ )
+ # reshape back to 3d array
 y_meta = y_meta.reshape(
 N_srcs, -1, N_meta_features
- ) # reshape back to 3d array
+ )
 elif y_meta.shape[1] > x_meta.shape[1]:
 N_delta = y_meta.shape[1] - x_meta.shape[1]
- x_meta = x_meta.reshape(x_meta.shape[1], -1) # reshape to 2d array
+ # reshape to 2d array
+ x_meta = x_meta.reshape(x_meta.shape[1], -1)
+ # repeat last row N_delta times and append to the array
 x_meta = np.vstack(
 (x_meta, np.tile(x_meta[-1, :], (N_delta, 1)))
- ) # repeat last row N_delta times and append to the array
+ )
+ # reshape back to 3d array
+ x_meta = np.expand_dims(x_meta, axis=0)
- x_meta = np.expand_dims(x_meta, axis=0) # reshape back to 3d array
 y_meta = np.concatenate([y_meta, x_meta])
 # write individual ISM audio streams to the output file in an interleaved format
 output_filename = scene["name"]
 audiofile.write(
- os.path.join(output_path, output_filename), y, fs
+ os.path.join(output_path, output_filename), y.audio, y.fs
 ) # !!!! TBD: replace all os.path.xxx operations with the Path object
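Note on the 20 ms alignment introduced by this patch: at 50 frames per second the source is trimmed to a whole number of frames, and the start delay is rounded down to a frame multiple before the all-zero preamble is prepended. A minimal standalone sketch of the same arithmetic (the helper name is illustrative and not part of the scripts; it assumes a (samples, channels) float array as above):

import numpy as np
from math import floor

def align_to_20ms(x: np.ndarray, fs: int, delay_s: float = 0.0) -> np.ndarray:
    # trim to a whole number of 20 ms frames (50 frames per second)
    n_frames = int(x.shape[0] / fs * 50)
    x = x[: int(n_frames * fs / 50)]
    # round the delay down to a multiple of 20 ms, then prepend silence
    n_delay = int(floor(delay_s * 50) / 50 * fs)
    return np.concatenate([np.zeros((n_delay, x.shape[1])), x])

# e.g. fs=48000 and delay_s=1.5 give n_delay = 72000 samples, i.e. exactly 75 frames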
 # write individual ISM metadata to output files in .csv format
--
GitLab

From 9b9dead85aea5e0510521221614b50cff7598ff7 Mon Sep 17 00:00:00 2001
From: Vladimir Malenovsky
Date: Mon, 8 May 2023 18:36:50 +0200
Subject: [PATCH 09/27] stereo item generation

---
 item_generation_scripts/__init__.py | 19 +-
 .../audiotools/audiofile.py | 5 +-
 .../audiotools/wrappers/reverb.py | 186 ++++++++++++++++++
 .../config/STEREO_CONFIG.yml | 164 ++++++++-------
 item_generation_scripts/constants.py | 15 +-
 .../processing/process_stereo_items.py | 144 ++++++++++++++
 6 files changed, 431 insertions(+), 102 deletions(-)
 create mode 100644 item_generation_scripts/audiotools/wrappers/reverb.py
 create mode 100644 item_generation_scripts/processing/process_stereo_items.py

diff --git a/item_generation_scripts/__init__.py b/item_generation_scripts/__init__.py
index c08820ea..5afa3cc6 100644
--- a/item_generation_scripts/__init__.py
+++ b/item_generation_scripts/__init__.py
@@ -40,7 +40,7 @@ from item_generation_scripts.constants import (
 LOGGER_FORMAT,
 LOGGER_SUFFIX,
 )
-from item_generation_scripts.processing import config, process_ism_items
+from item_generation_scripts.processing import config, process_ism_items, process_stereo_items
 from item_generation_scripts.utils import create_dir
@@ -83,7 +83,7 @@ def main(args):
 # generate input items
 if cfg.format.startswith("ISM"):
- # generate ISM items according to scene description
+ # generate ISM items with metadata according to scene description
 process_ism_items.generate_ism_items(
 cfg.format,
 cfg.loudness,
@@ -93,7 +93,20 @@ def main(args):
 logger,
 fs=cfg.fs
 )
-
+ elif cfg.format == "STEREO":
+ # generate STEREO items according to scene description
+ process_stereo_items.generate_stereo_items(
+ cfg.format,
+ cfg.loudness,
+ cfg.input_path,
+ cfg.IR_path,
+ cfg.output_path,
+ cfg.scenes,
+ logger,
+ fs=cfg.fs,
+ IR_fs=cfg.IR_fs,
+ )
+
 # copy configuration to output directory
 with open(cfg.output_path.joinpath(f"{cfg.format}.yml"), "w") as f:
 yaml.safe_dump(cfg._yaml_dump, f)

diff --git a/item_generation_scripts/audiotools/audiofile.py b/item_generation_scripts/audiotools/audiofile.py
index 954c91f8..d5687a89 100644
--- a/item_generation_scripts/audiotools/audiofile.py
+++ b/item_generation_scripts/audiotools/audiofile.py
@@ -110,6 +110,7 @@ def write(
 filename: Union[str, Path],
 x: np.ndarray,
 fs: Optional[int] = 48000,
+ dtype: Optional[str] = "int16",
 ) -> None:
 """
 Write audio file (.pcm, .wav or .raw)
@@ -122,6 +123,8 @@ def write(
 Numpy 2D array of dimension: number of channels x number of samples
 fs: Optional[int]
 Sampling rate, required for .pcm or .raw input file, default = 48000 (Hz)
+ dtype: Optional[str]
+ Data type used when writing .pcm or .raw output files, default = 'int16'
 Returns
 -------
@@ -141,7 +144,7 @@ def write(
 x = x.astype(np.int16)
 wav.write(filename, fs, x)
 elif file_extension == ".pcm" or file_extension == ".raw":
- x = x.astype("int16").reshape(-1, 1)
+ x = x.astype(dtype).reshape(-1, 1)
 x.tofile(filename)
 else:
 raise ValueError("Wrong input format.
Use wav, pcm or raw") diff --git a/item_generation_scripts/audiotools/wrappers/reverb.py b/item_generation_scripts/audiotools/wrappers/reverb.py new file mode 100644 index 00000000..97fae8f5 --- /dev/null +++ b/item_generation_scripts/audiotools/wrappers/reverb.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. +# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# + +import os.path +import numpy as np +from scipy.fft import fft +from copy import copy +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Optional, Union + +from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES +from item_generation_scripts.utils import find_binary, run +from item_generation_scripts.audiotools.audio import Audio +from item_generation_scripts.audiotools.audiofile import read, write +from item_generation_scripts.audiotools.wrappers.filter import resample_itu + + +def reverb( + input: Audio, + IR: Audio, + align: Optional[float] = None, +) -> Audio: + """ + Wrapper for the ITU-T reverb binary to convolve mono audio signal with an impulse response + Note: The 'reverb' binary tool expects that the IR file is written in the 32b IEEE Standard 754 floating-point representation. 
+
+ Parameters
+ ----------
+ input: Audio
+ Input audio signal
+ IR: Audio
+ Impulse response
+ align: float
+ multiplicative factor applied to the reverberated signal to align its energy level with a reference signal
+
+ Returns
+ -------
+ output: Audio
+ Audio signal convolved with the IR
+ """
+
+ # find binary
+ if "reverb" in DEFAULT_CONFIG_BINARIES["binary_paths"]:
+ binary = find_binary(
+ DEFAULT_CONFIG_BINARIES["binary_paths"]["reverb"].name,
+ binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["reverb"].parent,
+ )
+ else:
+ binary = find_binary("reverb")
+
+ with TemporaryDirectory(dir="./tmp_reverb") as tmp_dir:
+ tmp_dir = Path(tmp_dir)
+
+ # resample input audio signal to that of the IR
+ old_fs = None
+ tmp_input = copy(input)
+ if input.fs != IR.fs:
+ old_fs = input.fs
+ tmp_input.audio = resample_itu(tmp_input, IR.fs)
+ tmp_input.fs = IR.fs
+
+ # write input audio signal to temporary file in .pcm format
+ tmp_input_file = tmp_dir.joinpath(f"tmp_reverbIn.pcm")
+ write(tmp_input_file, tmp_input.audio, tmp_input.fs)
+
+ # down-scale IR to prevent saturation
+ # max_value = np.max(np.abs(IR.audio))
+ # if max_value > 1.0:
+ # IR.audio = IR.audio / max_value
+
+ # write IR to temporary file in .pcm format
+ # note: the reverb tool expects 32b float format
+ tmp_IR_file = tmp_dir.joinpath(f"tmp_IR.pcm")
+ write(tmp_IR_file, IR.audio.astype("float32"), IR.fs, dtype="float32")
+
+ # set up the 'reverb' command line
+ cmd = [
+ str(binary),
+ ]
+
+ # append multiplicative factor, if provided
+ if align:
+ cmd.extend(["-align", str(align)])
+
+ # append temporary filenames
+ tmp_output_file = tmp_dir.joinpath(f"tmp_reverbOut.pcm")
+ cmd.extend([tmp_input_file, tmp_IR_file, tmp_output_file])
+
+ # run the 'reverb' command
+ run(cmd)
+
+ # read the reverberated output file
+ output = copy(tmp_input)
+ output.audio, _ = read(tmp_output_file, nchannels=1, fs=tmp_input.fs)
+
+ # reverse the resampling
+ if old_fs:
+ output.audio = resample_itu(output, old_fs)
+ output.fs = old_fs
+
+ return output
+
+def reverb_stereo(
+ input: Audio,
+ stereo_IR: Audio,
+ align: Optional[float] = None,
+) -> Audio:
+ """
+ Wrapper for the ITU-T reverb binary to convolve a mono audio signal with a stereo impulse response
+
+ Parameters
+ ----------
+ input: Audio
+ Input audio signal
+ stereo_IR: Audio
+ Stereo impulse response
+ align: float
+ multiplicative factor applied to the reverberated signal to align its energy level with a reference signal
+
+ Returns
+ -------
+ output: Audio
+ Audio signal convolved with the stereo IR
+ """
+
+ # convert to float32
+ stereo_IR.audio = np.float32(stereo_IR.audio)
+
+ # separate into left and right IR
+ IR_left = copy(stereo_IR)
+ IR_left.name = "MONO"
+ IR_left.num_channels = 1
+ IR_left.audio = np.reshape(stereo_IR.audio[:,0], (-1, 1))
+
+ IR_right = copy(stereo_IR)
+ IR_right.name = "MONO"
+ IR_right.num_channels = 1
+ IR_right.audio = np.reshape(stereo_IR.audio[:,1], (-1, 1))
+
+ # calculate the scaling factor such that the maximum gain of the IR filter across all frequencies is 0dB
+ if align is None:
+ H = fft(stereo_IR.audio, axis=0)
+ align = 1.0 / np.max(np.abs(H))
+ # stereo_IR.audio *= align
+
+ # convolve mono input with left and right IR
+ y_left = reverb(input, IR_left, align=align)
+ y_right = reverb(input, IR_right, align=align)
+
+ # combine into stereo output
+ y = copy(input)
+ y.name = "STEREO"
+ y.num_channels = 2
+ y.audio = np.column_stack([y_left.audio, y_right.audio])
+
+ return y
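Note on the align factor above: when no factor is given, reverb_stereo() normalizes the stereo IR so that its peak gain across all frequencies is 0 dB, using one shared factor for both channels so the left/right balance is preserved. The same computation in isolation (the function name and the random test IR are illustrative only, not part of the scripts):

import numpy as np
from scipy.fft import fft

def max_gain_align(stereo_ir: np.ndarray) -> float:
    # frequency response of each IR channel (FFT along the sample axis)
    H = fft(stereo_ir, axis=0)
    # one shared factor so the loudest bin of either channel ends up at 0 dB
    return float(1.0 / np.max(np.abs(H)))

# illustrative usage with a random 2-channel IR
ir = np.random.randn(32000, 2).astype("float32")
align = max_gain_align(ir)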
diff --git a/item_generation_scripts/config/STEREO_CONFIG.yml b/item_generation_scripts/config/STEREO_CONFIG.yml
index adc08b4c..65b9b7e4 100644
--- a/item_generation_scripts/config/STEREO_CONFIG.yml
+++ b/item_generation_scripts/config/STEREO_CONFIG.yml
@@ -15,6 +15,9 @@ format: "STEREO"
 ### Output sampling rate in Hz needed for headerless audio files; default = 48000
 fs: 48000
+### IR sampling rate in Hz needed for headerless audio files; default = 48000
+IR_fs: 32000
+
 ### Any relative paths will be interpreted relative to the working directory the script is called from!
 ### Usage of absolute paths is recommended.
 ### Do not use file names with dots "." in them! This is not supported, use "_" instead
@@ -24,7 +27,7 @@ fs: 48000
 input_path: "./items_mono"
 ### Input path to stereo impulse response files
-input_path_IR: "./IR"
+IR_path: "./IR"
 ### Output path for generated test items and metadata files
 output_path: "./output"
@@ -39,268 +42,261 @@ loudness: -26
 ### Each scene must start with the sceneN tag
 ### Specify the mono source filename (the program will search for it in the input_path folder)
-### Specify azimuth and elevation for each input source
+### Specify the stereo IR source filename (the program will search for it in the IR_path folder)
 ### Specify the delay in seconds for each input source
 ### Note 1: use [val1, val2, ...] for multiple sources in a scene
 ### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames
-### Note 3: we're using right-handed coordinate system with azi = 0 pointing from the nose to the screen
-### azimuth: float, [-180,180]; positive indicates left
-### elevation: float, [-90,90]; positive indicates up
-### distance: float, tbd: default: 1
-### spread: float, [0,360]; spread in angles from 0 ... 360˚
-### gain: float, [0,1]
-
 scenes:
 a1:
 name: "G1S1.wav"
- description: "Large anechoic room with AB microphone pickup."
- source: ["test_double.wav", "test_double.wav"]
- IR: ["LAABP01.L.IR32", "LAABP01.R.IR32"]
- delay: [0, 0]
+ description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room."
+ source: ["test_single.wav", "test_single.wav"]
+ IR: ["LEABP05.wav", "LEABP11.wav"]
+ delay: [0, 3]
 a2:
 name: "G6S2.wav"
 description: "Large anechoic room with AB microphone pickup."
- source: ["test_double.wav", "test_double.wav"]
- IR: ["LAABP02.L.IR32", "LAABP02.R.IR32"]
- delay: [0, 0]
+ source: ["test_single.wav", "test_single.wav"]
+ IR: ["LEABP05.wav", "LEABP11.wav"]
+ delay: [0, 3]
 a3:
 name: "G5S3.wav"
 description: "Large anechoic room with AB microphone pickup."
- source: ["test_double.wav", "test_double.wav"]
- IR: ["LAABP03.L.IR32", "LAABP03.R.IR32"]
+ source: ["test_single.wav", "test_single.wav"]
+ IR: ["LAABP03.wav", "LAABP03.wav"]
 delay: [0, 0]
 a4:
 name: "G4S4.wav"
 description: "Large anechoic room with AB microphone pickup."
- source: ["test_double.wav", "test_double.wav"]
- IR: ["LAABP04.L.IR32", "LAABP04.R.IR32"]
+ source: ["test_single.wav", "test_single.wav"]
+ IR: ["LAABP04.wav", "LAABP04.wav"]
 delay: [0, 0]
 a5:
 name: "G3S5.wav"
 description: "Large anechoic room with AB microphone pickup."
- source: ["test_double.wav", "test_double.wav"]
- IR: ["LAABP05.L.IR32", "LAABP05.R.IR32"]
+ source: ["test_single.wav", "test_single.wav"]
+ IR: ["LAABP05.wav", "LAABP05.wav"]
 delay: [0, 0]
 a6:
 name: "G2S6.wav"
 description: "Large anechoic room with AB microphone pickup."
- source: ["test_double.wav", "test_double.wav"] - IR: ["LAABP06.L.IR32", "LAABP06.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["LAABP06.wav", "LAABP06.wav"] delay: [0, 0] b1: name: "G2S1.wav" description: "Small anechoic room with AB microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] b2: name: "G1S2.wav" description: "Small anechoic room with AB microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] b3: name: "G6S3.wav" description: "Small anechoic room with AB microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] b4: name: "G5S4.wav" description: "Small anechoic room with AB microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] b5: name: "G4S5.wav" description: "Small anechoic room with AB microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] b6: name: "G3S6.wav" description: "Small anechoic room with AB microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SAABP01.L.IR32", "SAABP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SAABP01.wav", "SAABP01.wav"] delay: [0, 1.5] c1: name: "G3S1.wav" description: "Small anechoic room with MS microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 0] c2: name: "G2S2.wav" description: "Small anechoic room with MS microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 0] c3: name: "G1S3.wav" description: "Small anechoic room with MS microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 0] c4: name: "G6S4.wav" description: "Small anechoic room with MS microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 1] c5: name: "G5S5.wav" description: "Small anechoic room with MS microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 0] c6: name: "G4S6.wav" description: "Small anechoic room with MS microphone pickup." 
- source: ["test_double.wav", "test_double.wav"] - IR: ["SAMSP01.L.IR32", "SAMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SAMSP01.wav", "SAMSP01.wav"] delay: [0, 0] d1: name: "G4S1.wav" description: "Small echoic room with MS microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] d2: name: "G3S2.wav" description: "Small echoic room with MS microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] d3: name: "G3S2.wav" description: "Small echoic room with MS microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] d4: name: "G1S4.wav" description: "Small echoic room with MS microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] d5: name: "G6S5.wav" description: "Small echoic room with MS microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] d6: name: "G5S6.wav" description: "Small echoic room with MS microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 1.5] e1: name: "G5S1.wav" description: "Small echoic room with binaural microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] e2: name: "G4S2.wav" description: "Small echoic room with binaural microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] e3: name: "G3S3.wav" description: "Small echoic room with binaural microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] e4: name: "G2S4.wav" description: "Small echoic room with binaural microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] e5: name: "G1S5.wav" description: "Small echoic room with binaural microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] e6: name: "G6S6.wav" description: "Small echoic room with binaural microphone pickup." 
- source: ["test_double.wav", "test_double.wav"] - IR: ["SEBIP01.L.IR32", "SEBIP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEBIP01.wav", "SEBIP01.wav"] delay: [0, 1.5] f1: name: "G6S1.wav" description: "Small echoic room with MS microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] f2: name: "G5S2.wav" description: "Small echoic room with MS microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] f3: name: "G4S3.wav" description: "Small echoic room with MS microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] f4: name: "G3S4.wav" description: "Small echoic room with MS microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] f5: name: "G2S5.wav" description: "Small echoic room with MS microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] f6: name: "G1S6.wav" description: "Small echoic room with MS microphone pickup." - source: ["test_double.wav", "test_double.wav"] - IR: ["SEMSP01.L.IR32", "SEMSP01.R.IR32"] + source: ["test_single.wav", "test_single.wav"] + IR: ["SEMSP01.wav", "SEMSP01.wav"] delay: [0, 0] \ No newline at end of file diff --git a/item_generation_scripts/constants.py b/item_generation_scripts/constants.py index 9509d069..6b0d0681 100644 --- a/item_generation_scripts/constants.py +++ b/item_generation_scripts/constants.py @@ -42,10 +42,9 @@ LOGGER_FORMAT = ( LOGGER_DATEFMT = "%m-%d %H:%M:%S" SUPPORTED_FORMATS = { + "STEREO", "ISM1", "ISM2", - "ISM3", - "ISM4", } DEFAULT_CONFIG = { @@ -54,18 +53,6 @@ DEFAULT_CONFIG = { "delete_tmp": False, } -DEFAULT_CONFIG_ISM2 = { - "format": "ISM2", - "input_path": "./input", - "output_path": "./output", - # "cod": { - # "bin": find_binary("IVAS_cod", raise_error=False), - # }, - # "dec": { - # "bin": find_binary("IVAS_dec", raise_error=False), - # }, -} - DEFAULT_CONFIG_BINARIES = { "binary_paths": get_binary_paths( Path(__file__).parent.joinpath("binary_paths.yml") diff --git a/item_generation_scripts/processing/process_stereo_items.py b/item_generation_scripts/processing/process_stereo_items.py new file mode 100644 index 00000000..f8dcc43d --- /dev/null +++ b/item_generation_scripts/processing/process_stereo_items.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 + +# +# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository. All Rights Reserved. +# +# This software is protected by copyright law and by international treaties. 
+# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, +# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., +# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, +# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other +# contributors to this repository retain full ownership rights in their respective contributions in +# the software. This notice grants no license of any kind, including but not limited to patent +# license, nor is any license granted by implication, estoppel or otherwise. +# +# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making +# contributions. +# +# This software is provided "AS IS", without any express or implied warranties. The software is in the +# development stage. It is intended exclusively for experts who have experience with such software and +# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability +# and fitness for a particular purpose are hereby disclaimed and excluded. +# +# Any dispute, controversy or claim arising under or in relation to providing this software shall be +# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in +# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and +# the United Nations Convention on Contracts on the International Sales of Goods. +# + + +import csv +import logging +import os +from pathlib import Path +from typing import Optional +from copy import copy +import numpy as np +from math import floor + + +from item_generation_scripts.audiotools import audio, audiofile +from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness +from item_generation_scripts.audiotools.wrappers.reverb import reverb_stereo + + +# function for converting nd numpy array to strings with 2 decimal digits +def csv_formatdata(data): + for row in data: + yield ["%0.2f" % v for v in row] + + +def generate_stereo_items( + format: str, + target_level: int, + input_path: Path, + IR_path: Path, + output_path: Path, + scenes: dict, + logger: logging.Logger, + fs: Optional[int] = 48000, + IR_fs: Optional[int] = 48000, +): + """Generate STEREO items from mono items based on scene description""" + + # get the number of scenes + N_scenes = len(scenes) + + for scene_name, scene in scenes.items(): + logger.info(f"Processing scene: {scene_name} out of {N_scenes} scenes") + + # extract the number of audio sources + N_sources = len(np.atleast_1d(scene["source"])) + + # read the IR (check if stereo or two mono files were provided) + source_IR = np.atleast_1d(scene["IR"]) + + y = audio.ChannelBasedAudio("STEREO") + for i in range(N_sources): + + # parse parameters from the scene description + source_file = np.atleast_1d(scene["source"])[i] + IR_file = np.atleast_1d(scene["IR"])[i] + if 'delay' in scene.keys(): + source_delay = np.atleast_1d(scene["delay"])[i] + else: + source_delay = np.array([0]) + + logger.info( + f"Convolving {source_file} with {source_IR}" + ) + + # read source file + x = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs) + + # get the number of frames (multiple of 20ms) + N_frames = int(len(x.audio) / x.fs * 50) + + # trim the source signal to align to 20ms boundary + N_trim = int(N_frames * x.fs / 50) + x.audio = x.audio[:N_trim] + + # read the IR file + IR = 
audio.fromfile("STEREO", os.path.join(IR_path, IR_file), fs=IR_fs) + + # delay the source file + if source_delay > 0: + # ensure delay is a multiple of 20ms + N_delay = int(floor(source_delay * 50) / 50 * x.fs) + + # insert all-zero preamble + pre = np.zeros((N_delay, x.audio.shape[1])) + x.audio = np.concatenate([pre, x.audio]) + + # convolve with stereo IR + x_rev = reverb_stereo(x, IR) + + # adjust the level of the stereo signal + _, scale_factor = get_loudness(x_rev, target_level, "STEREO") + x_rev.audio *= scale_factor + + # add source signal to the array of source signals + y.fs = x.fs + if y.audio is None: + y.audio = x_rev.audio + else: + # append zeros to have equal length of all source signals + if x_rev.audio.shape[0] > y.audio.shape[0]: + y.audio = np.vstack((y.audio, np.zeros((x_rev.audio.shape[0] - y.audio.shape[0], y.audio.shape[1])))) + elif y.audio.shape[0] > x_rev.audio.shape[0]: + x_rev.audio = np.vstack((x_rev.audio, np.zeros((y.audio.shape[0] - x_rev.audio.shape[0], x_rev.audio.shape[1])))) + + # superimpose + y.audio += x_rev.audio + + # write the reverberated audio into output file + output_filename = scene["name"] + audiofile.write( + os.path.join(output_path, output_filename), y.audio, y.fs + ) # !!!! TBD: replace all os.path.xxx operations with the Path object + + return \ No newline at end of file -- GitLab From 714ab327aa7c82e82a984b8047f2aed200efc116 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Tue, 9 May 2023 09:25:04 +0200 Subject: [PATCH 10/27] fix incorrect usage of the len keyword --- item_generation_scripts/audiotools/wrappers/reverb.py | 5 ++--- item_generation_scripts/config/ISM1_CONFIG.yml | 2 +- item_generation_scripts/config/ISM2_CONFIG.yml | 2 +- item_generation_scripts/config/STEREO_CONFIG.yml | 2 +- item_generation_scripts/processing/process_ism_items.py | 5 ++--- 5 files changed, 7 insertions(+), 9 deletions(-) diff --git a/item_generation_scripts/audiotools/wrappers/reverb.py b/item_generation_scripts/audiotools/wrappers/reverb.py index 97fae8f5..1c4491bd 100644 --- a/item_generation_scripts/audiotools/wrappers/reverb.py +++ b/item_generation_scripts/audiotools/wrappers/reverb.py @@ -78,7 +78,7 @@ def reverb( else: binary = find_binary("reverb") - with TemporaryDirectory(dir="./tmp_reverb") as tmp_dir: + with TemporaryDirectory() as tmp_dir: tmp_dir = Path(tmp_dir) # resample input audio signal to that of the IR @@ -167,11 +167,10 @@ def reverb_stereo( IR_right.num_channels = 1 IR_right.audio = np.reshape(stereo_IR.audio[:,1], (-1, 1)) - # calculate the scaling factor such that the maximum gain of the IR filter across all frequencies is 0dB + # calculate the scaling (multiplicative) factor such that the maximum gain of the IR filter across all frequencies is 0dB if align is None: H = fft(stereo_IR.audio, axis=0) align = 1.0 / np.max(np.abs(H)) - # stereo_IR.audio *= align # convolve mono input with left and right IR y_left = reverb(input, IR_left, align=align) diff --git a/item_generation_scripts/config/ISM1_CONFIG.yml b/item_generation_scripts/config/ISM1_CONFIG.yml index 66f81617..560c48fe 100644 --- a/item_generation_scripts/config/ISM1_CONFIG.yml +++ b/item_generation_scripts/config/ISM1_CONFIG.yml @@ -24,7 +24,7 @@ fs: 48000 input_path: "./items_mono" ### Output path for generated test items and metadata files -output_path: "./output" +output_path: "./items_ISM1" ### Target loudness in LKFS; default = null (no loudness normalization applied) loudness: -26 diff --git a/item_generation_scripts/config/ISM2_CONFIG.yml 
b/item_generation_scripts/config/ISM2_CONFIG.yml index 3bb200e2..3329b440 100644 --- a/item_generation_scripts/config/ISM2_CONFIG.yml +++ b/item_generation_scripts/config/ISM2_CONFIG.yml @@ -24,7 +24,7 @@ fs: 48000 input_path: "./items_mono" ### Output path for generated test items and metadata files -output_path: "./output" +output_path: "./items_ISM2" ### Target loudness in LKFS; default = null (no loudness normalization applied) loudness: -26 diff --git a/item_generation_scripts/config/STEREO_CONFIG.yml b/item_generation_scripts/config/STEREO_CONFIG.yml index 65b9b7e4..b1095a4b 100644 --- a/item_generation_scripts/config/STEREO_CONFIG.yml +++ b/item_generation_scripts/config/STEREO_CONFIG.yml @@ -30,7 +30,7 @@ input_path: "./items_mono" IR_path: "./IR" ### Output path for generated test items and metadata files -output_path: "./output" +output_path: "./items_STEREO" ### Target loudness in LKFS; default = null (no loudness normalization applied) loudness: -26 diff --git a/item_generation_scripts/processing/process_ism_items.py b/item_generation_scripts/processing/process_ism_items.py index 4b33a84e..db931d48 100644 --- a/item_generation_scripts/processing/process_ism_items.py +++ b/item_generation_scripts/processing/process_ism_items.py @@ -30,7 +30,6 @@ # the United Nations Convention on Contracts on the International Sales of Goods. # - import csv import logging import os @@ -99,8 +98,8 @@ def generate_ism_items( N_frames = int(len(x.audio) / x.fs * 50) # trim the source signal to align to 20ms boundary - len = int(N_frames * x.fs / 50) - x.audio = x.audio[:len] + N_trim = int(N_frames * x.fs / 50) + x.audio = x.audio[:N_trim] # adjust the level of the source file _, scale_factor = get_loudness(x, target_level, "MONO") -- GitLab From bab5d25fe3aba6bdc20fc4fc95e9097a1332df80 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Tue, 9 May 2023 17:05:04 +0200 Subject: [PATCH 11/27] update of the example .yml config file for STEREO --- .../config/STEREO_CONFIG.yml | 226 +++++++++--------- 1 file changed, 113 insertions(+), 113 deletions(-) diff --git a/item_generation_scripts/config/STEREO_CONFIG.yml b/item_generation_scripts/config/STEREO_CONFIG.yml index b1095a4b..0933b1da 100644 --- a/item_generation_scripts/config/STEREO_CONFIG.yml +++ b/item_generation_scripts/config/STEREO_CONFIG.yml @@ -52,251 +52,251 @@ scenes: name: "G1S1.wav" description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." source: ["test_single.wav", "test_single.wav"] - IR: ["LEABP05.wav", "LEABP11.wav"] + IR: ["LEABP04.wav", "LEABP11.wav"] delay: [0, 3] a2: name: "G6S2.wav" - description: "Large anechoic room with AB microphone pickup." + description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." source: ["test_single.wav", "test_single.wav"] IR: ["LEABP05.wav", "LEABP11.wav"] delay: [0, 3] a3: name: "G5S3.wav" - description: "Large anechoic room with AB microphone pickup." + description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." source: ["test_single.wav", "test_single.wav"] - IR: ["LAABP03.wav", "LAABP03.wav"] - delay: [0, 0] + IR: ["LEABP06.wav", "LEABP11.wav"] + delay: [0, 3] a4: name: "G4S4.wav" - description: "Large anechoic room with AB microphone pickup." + description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." 
source: ["test_single.wav", "test_single.wav"] - IR: ["LAABP04.wav", "LAABP04.wav"] - delay: [0, 0] + IR: ["LEABP05.wav", "LEABP10.wav"] + delay: [0, 1.5] a5: name: "G3S5.wav" - description: "Large anechoic room with AB microphone pickup." + description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." source: ["test_single.wav", "test_single.wav"] - IR: ["LAABP05.wav", "LAABP05.wav"] - delay: [0, 0] + IR: ["LEABP05.wav", "LEABP11.wav"] + delay: [0, 1.5] a6: name: "G2S6.wav" - description: "Large anechoic room with AB microphone pickup." + description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." source: ["test_single.wav", "test_single.wav"] - IR: ["LAABP06.wav", "LAABP06.wav"] - delay: [0, 0] + IR: ["LEABP05.wav", "LEABP12.wav"] + delay: [0, 1.5] b1: name: "G2S1.wav" - description: "Small anechoic room with AB microphone pickup." + description: "Two speakers sitting at oval table side by side in a large anechoic conference room." source: ["test_single.wav", "test_single.wav"] - IR: ["SAABP01.wav", "SAABP01.wav"] - delay: [0, 1.5] + IR: ["LAABP05.wav", "LAABP06.wav"] + delay: [0, 35] b2: name: "G1S2.wav" - description: "Small anechoic room with AB microphone pickup." + description: "Two speakers sitting at oval table side by side in a large anechoic conference room." source: ["test_single.wav", "test_single.wav"] - IR: ["SAABP01.wav", "SAABP01.wav"] - delay: [0, 1.5] + IR: ["LAABP07.wav", "LAABP08.wav"] + delay: [0, 3] b3: name: "G6S3.wav" - description: "Small anechoic room with AB microphone pickup." + description: "Two speakers sitting at oval table side by side in a large anechoic conference room." source: ["test_single.wav", "test_single.wav"] - IR: ["SAABP01.wav", "SAABP01.wav"] - delay: [0, 1.5] + IR: ["LAABP09.wav", "LAABP10.wav"] + delay: [0, 3] b4: name: "G5S4.wav" - description: "Small anechoic room with AB microphone pickup." + description: "Two speakers sitting at oval table side by side in a large anechoic conference room." source: ["test_single.wav", "test_single.wav"] - IR: ["SAABP01.wav", "SAABP01.wav"] + IR: ["LAABP11.wav", "LAABP12.wav"] delay: [0, 1.5] b5: name: "G4S5.wav" - description: "Small anechoic room with AB microphone pickup." + description: "Two speakers sitting at oval table side by side in a large anechoic conference room." source: ["test_single.wav", "test_single.wav"] - IR: ["SAABP01.wav", "SAABP01.wav"] + IR: ["LAABP01.wav", "LAABP02.wav"] delay: [0, 1.5] b6: name: "G3S6.wav" - description: "Small anechoic room with AB microphone pickup." + description: "Two speakers sitting at oval table side by side in a large anechoic conference room." source: ["test_single.wav", "test_single.wav"] - IR: ["SAABP01.wav", "SAABP01.wav"] + IR: ["LAABP03.wav", "LAABP04.wav"] delay: [0, 1.5] c1: name: "G3S1.wav" - description: "Small anechoic room with MS microphone pickup." - source: ["test_single.wav", "test_single.wav"] - IR: ["SAMSP01.wav", "SAMSP01.wav"] - delay: [0, 0] + description: "One talker sitting at table in a small anechoic conference room." + source: ["test_single.wav"] + IR: ["SAMSP01.wav"] + delay: [0] c2: name: "G2S2.wav" - description: "Small anechoic room with MS microphone pickup." - source: ["test_single.wav", "test_single.wav"] - IR: ["SAMSP01.wav", "SAMSP01.wav"] - delay: [0, 0] + description: "One talker sitting at table in a small anechoic conference room." 
+ source: ["test_single.wav"] + IR: ["SAMSP04.wav"] + delay: [0] c3: name: "G1S3.wav" - description: "Small anechoic room with MS microphone pickup." - source: ["test_single.wav", "test_single.wav"] - IR: ["SAMSP01.wav", "SAMSP01.wav"] - delay: [0, 0] + description: "One talker sitting at table in a small anechoic conference room." + source: ["test_single.wav"] + IR: ["SAMSP07.wav"] + delay: [0] c4: name: "G6S4.wav" - description: "Small anechoic room with MS microphone pickup." - source: ["test_single.wav", "test_single.wav"] - IR: ["SAMSP01.wav", "SAMSP01.wav"] - delay: [0, 1] + description: "One talker sitting at table in a small echoic conference room." + source: ["test_single.wav"] + IR: ["SEABP01.wav"] + delay: [0] c5: name: "G5S5.wav" - description: "Small anechoic room with MS microphone pickup." - source: ["test_single.wav", "test_single.wav"] - IR: ["SAMSP01.wav", "SAMSP01.wav"] - delay: [0, 0] + description: "One talker sitting at table in a small echoic conference room." + source: ["test_single.wav"] + IR: ["SEABP03.wav"] + delay: [0] c6: name: "G4S6.wav" - description: "Small anechoic room with MS microphone pickup." - source: ["test_single.wav", "test_single.wav"] - IR: ["SAMSP01.wav", "SAMSP01.wav"] - delay: [0, 0] + description: "One talker sitting at table in a small echoic conference room." + source: ["test_single.wav"] + IR: ["SEABP06.wav"] + delay: [0] d1: name: "G4S1.wav" - description: "Small echoic room with MS microphone pickup." - source: ["test_single.wav", "test_single.wav"] - IR: ["SEMSP01.wav", "SEMSP01.wav"] - delay: [0, 1.5] + description: "One talker sitting at table in a small anechoic conference room." + source: ["test_single.wav"] + IR: ["SEBIP01.wav"] + delay: [0] d2: name: "G3S2.wav" - description: "Small echoic room with MS microphone pickup." - source: ["test_single.wav", "test_single.wav"] - IR: ["SEMSP01.wav", "SEMSP01.wav"] - delay: [0, 1.5] + description: "One talker sitting at table in a small anechoic conference room." + source: ["test_single.wav"] + IR: ["SEBIP04.wav"] + delay: [0] d3: name: "G3S2.wav" - description: "Small echoic room with MS microphone pickup." - source: ["test_single.wav", "test_single.wav"] - IR: ["SEMSP01.wav", "SEMSP01.wav"] - delay: [0, 1.5] + description: "One talker sitting at table in a small anechoic conference room." + source: ["test_single.wav"] + IR: ["SEBIP07.wav"] + delay: [0] d4: name: "G1S4.wav" - description: "Small echoic room with MS microphone pickup." - source: ["test_single.wav", "test_single.wav"] - IR: ["SEMSP01.wav", "SEMSP01.wav"] - delay: [0, 1.5] + description: "One talker sitting at table in a small echoic conference room." + source: ["test_single.wav"] + IR: ["SEBIP07.wav"] + delay: [0] d5: name: "G6S5.wav" - description: "Small echoic room with MS microphone pickup." - source: ["test_single.wav", "test_single.wav"] - IR: ["SEMSP01.wav", "SEMSP01.wav"] - delay: [0, 1.5] + description: "One talker sitting at table in a small echoic conference room." + source: ["test_single.wav"] + IR: ["SEBIP07.wav"] + delay: [0] d6: name: "G5S6.wav" - description: "Small echoic room with MS microphone pickup." - source: ["test_single.wav", "test_single.wav"] - IR: ["SEMSP01.wav", "SEMSP01.wav"] - delay: [0, 1.5] + description: "One talker sitting at table in a small echoic conference room." + source: ["test_single.wav"] + IR: ["SEBIP07.wav"] + delay: [0] e1: name: "G5S1.wav" - description: "Small echoic room with binaural microphone pickup." + description: "Two talkers sitting in a room." 
source: ["test_single.wav", "test_single.wav"] - IR: ["SEBIP01.wav", "SEBIP01.wav"] - delay: [0, 1.5] + IR: ["SEMSP01.wav", "SEMSP03.wav"] + delay: [0, 3] e2: name: "G4S2.wav" - description: "Small echoic room with binaural microphone pickup." + description: "Two talkers sitting in a room." source: ["test_single.wav", "test_single.wav"] - IR: ["SEBIP01.wav", "SEBIP01.wav"] - delay: [0, 1.5] + IR: ["SEMSP01.wav", "SEMSP05.wav"] + delay: [0, 3] e3: name: "G3S3.wav" - description: "Small echoic room with binaural microphone pickup." + description: "Two talkers sitting in a room." source: ["test_single.wav", "test_single.wav"] - IR: ["SEBIP01.wav", "SEBIP01.wav"] - delay: [0, 1.5] + IR: ["SEMSP01.wav", "SEMSP07.wav"] + delay: [0, 3] e4: name: "G2S4.wav" - description: "Small echoic room with binaural microphone pickup." + description: "Two talkers sitting in a room." source: ["test_single.wav", "test_single.wav"] - IR: ["SEBIP01.wav", "SEBIP01.wav"] - delay: [0, 1.5] + IR: ["SEMSP03.wav", "SEMSP04.wav"] + delay: [0, 1.5] e5: name: "G1S5.wav" - description: "Small echoic room with binaural microphone pickup." + description: "Two talkers sitting in a room." source: ["test_single.wav", "test_single.wav"] - IR: ["SEBIP01.wav", "SEBIP01.wav"] - delay: [0, 1.5] + IR: ["SEMSP05.wav", "SEMSP07.wav"] + delay: [0, 1.5] e6: name: "G6S6.wav" - description: "Small echoic room with binaural microphone pickup." + description: "Two talkers sitting in a room." source: ["test_single.wav", "test_single.wav"] - IR: ["SEBIP01.wav", "SEBIP01.wav"] - delay: [0, 1.5] + IR: ["SEMSP06.wav", "SEMSP02.wav"] + delay: [0, 1.5] f1: name: "G6S1.wav" - description: "Small echoic room with MS microphone pickup." + description: "Two talkers sitting in a room." source: ["test_single.wav", "test_single.wav"] - IR: ["SEMSP01.wav", "SEMSP01.wav"] - delay: [0, 0] + IR: ["SEBIP05.wav", "SEBIP01.wav"] + delay: [0, 3] f2: name: "G5S2.wav" - description: "Small echoic room with MS microphone pickup." + description: "Two talkers sitting in a room." source: ["test_single.wav", "test_single.wav"] - IR: ["SEMSP01.wav", "SEMSP01.wav"] - delay: [0, 0] + IR: ["SEBIP07.wav", "SEBIP01.wav"] + delay: [0, 3] f3: name: "G4S3.wav" - description: "Small echoic room with MS microphone pickup." + description: "Two talkers sitting in a room." source: ["test_single.wav", "test_single.wav"] - IR: ["SEMSP01.wav", "SEMSP01.wav"] - delay: [0, 0] + IR: ["SEBIP04.wav", "SEBIP01.wav"] + delay: [0, 3] f4: name: "G3S4.wav" - description: "Small echoic room with MS microphone pickup." + description: "Two talkers sitting in a room." source: ["test_single.wav", "test_single.wav"] - IR: ["SEMSP01.wav", "SEMSP01.wav"] - delay: [0, 0] + IR: ["SEBIP02.wav", "SEBIP06.wav"] + delay: [0, 1.5] f5: name: "G2S5.wav" - description: "Small echoic room with MS microphone pickup." + description: "Two talkers sitting in a room." source: ["test_single.wav", "test_single.wav"] - IR: ["SEMSP01.wav", "SEMSP01.wav"] - delay: [0, 0] + IR: ["SEBIP02.wav", "SEBIP06.wav"] + delay: [0, 1.5] f6: name: "G1S6.wav" - description: "Small echoic room with MS microphone pickup." + description: "Two talkers sitting in a room." 
source: ["test_single.wav", "test_single.wav"] - IR: ["SEMSP01.wav", "SEMSP01.wav"] - delay: [0, 0] + IR: ["SEBIP03.wav", "SEBIP04.wav"] + delay: [0, 1.5] \ No newline at end of file -- GitLab From 8a6542d4b6907ca378f9631b3be613cf065c97d1 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 11 May 2023 11:08:15 +0200 Subject: [PATCH 12/27] support for +- overlap in ISM items, expect trimmed sentences, support for low-level random noise addition --- item_generation_scripts/__init__.py | 7 +- .../config/ISM1_CONFIG.yml | 4 + .../config/ISM2_CONFIG.yml | 152 +++++++++--------- .../processing/process_ism_items.py | 92 +++++++++-- 4 files changed, 170 insertions(+), 85 deletions(-) diff --git a/item_generation_scripts/__init__.py b/item_generation_scripts/__init__.py index 5afa3cc6..8b3d8bae 100644 --- a/item_generation_scripts/__init__.py +++ b/item_generation_scripts/__init__.py @@ -91,7 +91,10 @@ def main(args): cfg.output_path, cfg.scenes, logger, - fs=cfg.fs + fs=cfg.fs, + preamble=cfg.preamble, + postamble=cfg.postamble, + add_low_level_random_noise=cfg.add_low_level_random_noise, ) elif cfg.format == "STEREO": # generate STEREO items according to scene description @@ -105,6 +108,8 @@ def main(args): logger, fs=cfg.fs, IR_fs=cfg.IR_fs, + preamble=cfg.preamble, + postamble=cfg.postamble, ) # copy configuration to output directory diff --git a/item_generation_scripts/config/ISM1_CONFIG.yml b/item_generation_scripts/config/ISM1_CONFIG.yml index 560c48fe..9ba070f7 100644 --- a/item_generation_scripts/config/ISM1_CONFIG.yml +++ b/item_generation_scripts/config/ISM1_CONFIG.yml @@ -29,6 +29,10 @@ output_path: "./items_ISM1" ### Target loudness in LKFS; default = null (no loudness normalization applied) loudness: -26 +### Pre-amble and Post-amble length in seconds (default = None) +preamble: 0.5 +postamble: 0.5 + ################################################ ### Scene description diff --git a/item_generation_scripts/config/ISM2_CONFIG.yml b/item_generation_scripts/config/ISM2_CONFIG.yml index 3329b440..198571d2 100644 --- a/item_generation_scripts/config/ISM2_CONFIG.yml +++ b/item_generation_scripts/config/ISM2_CONFIG.yml @@ -29,6 +29,12 @@ output_path: "./items_ISM2" ### Target loudness in LKFS; default = null (no loudness normalization applied) loudness: -26 +### Pre-amble and Post-amble length in seconds (default = 0.0) +preamble: 0.5 +postamble: 0.5 + +### Flag for adding low-level random background noise (amplitude +-4) instead of silence; default = false (silence) +add_low_level_random_noise: true ################################################ ### Scene description @@ -37,7 +43,7 @@ loudness: -26 ### Each scene must start with the sceneN tag ### Specify the mono source filename (the program will search for it in the input_path folder) ### Specify azimuth and elevation for each input source -### Specify the delay in seconds for each input source +### Specify the overlap length in seconds for each input source (negative value creates a gap) ### Note 1: use [val1, val2, ...] for multiple sources in a scene ### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames @@ -52,288 +58,288 @@ scenes: a1: name: "G1S1.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." 
- source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [0, 50] elevation: [0, 0] - delay: [0, 0] + overlap: -0.5 a2: name: "G6S2.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [50, 350] elevation: [0, 0] - delay: [0, 0] + overlap: -0.5 a3: name: "G5S3.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [40, 290] elevation: [0, 0] - delay: [0, 0] + overlap: -0.5 a4: name: "G4S4.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [30, 230] elevation: [15, 15] - delay: [0, 0] + overlap: -0.5 a5: name: "G3S5.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [20, 170] elevation: [15, 15] - delay: [0, 0] + overlap: -0.5 a6: name: "G2S6.wav" description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [10, 110] elevation: [15, 15] - delay: [0, 0] + overlap: -0.5 b1: name: "G2S1.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [20, 170] elevation: [30, 30] - delay: [0, 1.5] + overlap: 0.5 b2: name: "G1S2.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [10, 110] elevation: [30, 30] - delay: [0, 1.5] + overlap: 0.5 b3: name: "G6S3.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [0, 50] elevation: [30, 30] - delay: [0, 1.5] + overlap: 0.5 b4: name: "G5S4.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [50, 350] elevation: [60, 60] - delay: [0, 1.5] + overlap: 0.5 b5: name: "G4S5.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." 
- source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [40, 290] elevation: [60, 60] - delay: [0, 1.5] + overlap: 0.5 b6: name: "G3S6.wav" description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [30, 230] elevation: [60, 60] - delay: [0, 1.5] + overlap: 0.5 c1: name: "G3S1.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [40, 290] elevation: [0, 60] - delay: [0, 0] + overlap: -0.5 c2: name: "G2S2.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [30, 230] elevation: [0, 60] - delay: [0, 0] + overlap: -0.5 c3: name: "G1S3.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [20, 170] elevation: [0, 60] - delay: [0, 0] + overlap: -0.5 c4: name: "G6S4.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [10, 110] elevation: [0, 60] - delay: [0, 1] + shift: [0, 1] c5: name: "G5S5.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [0, 50] elevation: [0, 60] - delay: [0, 0] + overlap: -0.5 c6: name: "G4S6.wav" description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [50, 350] elevation: [0, 60] - delay: [0, 0] + overlap: -0.5 d1: name: "G4S1.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [50, "180:1:120 + 360"] elevation: [0, 60] - delay: [0, 1.5] + overlap: 0.5 d2: name: "G3S2.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [300, "-70:-1:-10 - 360"] elevation: [0, 60] - delay: [0, 1.5] + overlap: 0.5 d3: name: "G2S3.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." 
- source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [250, "-20:-1:-320"] elevation: [0, 60] - delay: [0, 1.5] + overlap: 0.5 d4: name: "G1S4.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [200, "30:-1:-270"] elevation: [0, 60] - delay: [0, 1.5] + overlap: 0.5 d5: name: "G6S5.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [150, "80:1:20 + 360"] elevation: [0, 60] - delay: [0, 1.5] + overlap: 0.5 d6: name: "G5S6.wav" description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [100, "130:1:70 + 360"] elevation: [0, 60] - delay: [0, 1.5] + overlap: 0.5 e1: name: "G5S1.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["80:1:20 + 360", "80:1:20 + 360"] elevation: [10, 60] - delay: [0, 1.5] + overlap: 0.5 e2: name: "G4S2.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["130:1:70 + 360", "130:1:70 + 360"] elevation: [10, 60] - delay: [0, 1.5] + overlap: 0.5 e3: name: "G3S3.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["180:1:120 + 360", "180:1:120 + 360"] elevation: [10, 60] - delay: [0, 1.5] + overlap: 0.5 e4: name: "G2S4.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["-70:-1:-10 - 360", "-70:-1:-10 - 360"] elevation: [10, 60] - delay: [0, 1.5] + overlap: 0.5 e5: name: "G1S5.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["-20:-1:-320", "-20:-1:-320"] elevation: [10, 60] - delay: [0, 1.5] + overlap: 0.5 e6: name: "G6S6.wav" description: "two talkers walking side-by-side around the table, ~30% overlapping utterances" - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["30:-1:-270", "30:-1:-270"] elevation: [10, 60] - delay: [0, 1.5] + overlap: 0.5 f1: name: "G6S1.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." 
- source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["60:1:0 + 360", "60:-1:120 - 360"] elevation: [20, 50] - delay: [0, 0] + overlap: -0.5 f2: name: "G5S2.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["0:1:300", "0:-1:60 - 360"] elevation: [20, 50] - delay: [0, 0] + overlap: -0.5 f3: name: "G4S3.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["300:1:240 + 360", "300:-1:0"] elevation: [20, 50] - delay: [0, 0] + overlap: -0.5 f4: name: "G3S4.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["240:1:180 + 360", "240:-1:-60"] elevation: [20, 50] - delay: [0, 0] + overlap: -0.5 f5: name: "G2S5.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["180:1:120 + 360", "180:-1:-120"] elevation: [20, 50] - delay: [0, 0] + overlap: -0.5 f6: name: "G1S6.wav" description: "two talkers walking around the table in opposite directions, non-overlapping utterances." - source: ["test_double.wav", "test_double.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["120:1:60 + 360", "120:-1:180 - 360"] elevation: [20, 50] - delay: [0, 0] + overlap: -0.5 \ No newline at end of file diff --git a/item_generation_scripts/processing/process_ism_items.py b/item_generation_scripts/processing/process_ism_items.py index db931d48..fe62f048 100644 --- a/item_generation_scripts/processing/process_ism_items.py +++ b/item_generation_scripts/processing/process_ism_items.py @@ -41,6 +41,7 @@ from math import floor from item_generation_scripts.audiotools import audio, audiofile from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness +SEED_RANDOM_NOISE = 0 # function for converting nd numpy array to strings with 2 decimal digits def csv_formatdata(data): @@ -56,6 +57,9 @@ def generate_ism_items( scenes: dict, logger: logging.Logger, fs: Optional[int] = 48000, + preamble: Optional[float] = 0.0, + postamble: Optional[float] = 0.0, + add_low_level_random_noise: Optional[bool] = False, ): """Generate ISM items with metadata from mono items based on scene description""" @@ -75,6 +79,12 @@ def generate_ism_items( y = audio.ChannelBasedAudio("MONO") y_meta = None + # read the overlap length + if 'overlap' in scene.keys(): + source_overlap = float(scene["overlap"]) + else: + source_overlap = 0.0 + # repeat for all source files for i in range(N_sources): @@ -82,10 +92,6 @@ def generate_ism_items( source_file = np.atleast_1d(scene["source"])[i] source_azi = np.atleast_1d(scene["azimuth"])[i] source_ele = np.atleast_1d(scene["elevation"])[i] - if 'delay' in scene.keys(): - source_delay = np.atleast_1d(scene["delay"])[i] - else: - source_delay = np.array([0]) logger.info( f"Encoding {source_file} at position(s) {source_azi},{source_ele}" @@ -93,13 +99,16 @@ def generate_ism_items( # read 
diff --git a/item_generation_scripts/processing/process_ism_items.py b/item_generation_scripts/processing/process_ism_items.py
index db931d48..fe62f048 100644
--- a/item_generation_scripts/processing/process_ism_items.py
+++ b/item_generation_scripts/processing/process_ism_items.py
@@ -41,6 +41,7 @@ from math import floor
 from item_generation_scripts.audiotools import audio, audiofile
 from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness
 
+SEED_RANDOM_NOISE = 0
 
 # function for converting nd numpy array to strings with 2 decimal digits
 def csv_formatdata(data):
@@ -56,6 +57,9 @@ def generate_ism_items(
     scenes: dict,
     logger: logging.Logger,
     fs: Optional[int] = 48000,
+    preamble: Optional[float] = 0.0,
+    postamble: Optional[float] = 0.0,
+    add_low_level_random_noise: Optional[bool] = False,
 ):
     """Generate ISM items with metadata from mono items based on scene description"""
 
@@ -75,6 +79,12 @@ def generate_ism_items(
         y = audio.ChannelBasedAudio("MONO")
         y_meta = None
 
+        # read the overlap length
+        if 'overlap' in scene.keys():
+            source_overlap = float(scene["overlap"])
+        else:
+            source_overlap = 0.0
+
         # repeat for all source files
         for i in range(N_sources):
 
@@ -82,10 +92,6 @@ def generate_ism_items(
             source_file = np.atleast_1d(scene["source"])[i]
             source_azi = np.atleast_1d(scene["azimuth"])[i]
             source_ele = np.atleast_1d(scene["elevation"])[i]
-            if 'delay' in scene.keys():
-                source_delay = np.atleast_1d(scene["delay"])[i]
-            else:
-                source_delay = np.array([0])
 
             logger.info(
                 f"Encoding {source_file} at position(s) {source_azi},{source_ele}"
             )
 
             # read source file
             x = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs)
 
             # get the number of frames (multiple of 20ms)
             N_frames = int(len(x.audio) / x.fs * 50)
 
-            # trim the source signal to align to 20ms boundary
-            N_trim = int(N_frames * x.fs / 50)
-            x.audio = x.audio[:N_trim]
+            # trimming to the 20ms boundary is no longer done here; input sentences
+            # are expected to be pre-trimmed and padding to a 20ms boundary is applied below
+            # N_trim = int(N_frames * x.fs / 50)
+            # x.audio = x.audio[:N_trim]
 
             # adjust the level of the source file
             _, scale_factor = get_loudness(x, target_level, "MONO")
@@ -171,11 +180,17 @@ def generate_ism_items(
 
             # arrange all metadata fields column-wise into a matrix
             x_meta = np.column_stack((azi, ele, dist, spread, gain))
-
-            # delay the source file
-            if source_delay > 0:
-
-                # ensure delay is a multiple of 20ms
-                N_delay = int(floor(source_delay * 50) / 50 * x.fs)
+
+            # shift the second (and all subsequent) source files
+            # (a positive overlap value overlaps with the previous source, a negative value creates a gap)
+            if i > 0 and source_overlap != 0.0:
+                # get the current length of the accumulated source signals
+                N_delay = len(y.audio[:, 0])
+
+                # subtract the overlap; alignment of the total length to a 20ms
+                # boundary is ensured by the padding step below
+                N_delay -= int(source_overlap * x.fs)
 
                 # insert all-zero preamble
                 pre = np.zeros((N_delay, x.audio.shape[1]))
@@ -186,13 +201,28 @@
                     [0.00, 0.00, 1.00, 0.00, 1.00], (N_delay, 1)
                 )  # !!!! TBD - check if we should insert neutral position or the first position of the metadata
                 x_meta = np.concatenate([pre, x_meta])
+
+            # pad with zeros to ensure that the signal length is a multiple of 20ms
+            N_frame = x.fs / 50
+            if len(x.audio) % N_frame != 0:
+                N_pad = int(N_frame - len(x.audio) % N_frame)
+
+                # insert all-zero preamble
+                pre = np.zeros((N_pad, x.audio.shape[1]))
+                x.audio = np.concatenate([pre, x.audio])
+
+                # insert neutral position as a pre-amble
+                pre = np.tile(
+                    [0.00, 0.00, 1.00, 0.00, 1.00], (N_pad, 1)
+                )  # !!!! TBD - check if we should insert neutral position or the first position of the metadata
+                x_meta = np.concatenate([pre, x_meta])
 
             # add source signal to the array of all source signals
             y.fs = x.fs
             if y.audio is None:
                 y.audio = x.audio
             else:
-                # append zeros to have equal length of all source signals
+                # pad with zeros to have the same length of all source signals
                 if x.audio.shape[0] > y.audio.shape[0]:
                     y.audio = np.vstack((y.audio, np.zeros((x.audio.shape[0] - y.audio.shape[0], y.audio.shape[1]))))
                 elif y.audio.shape[0] > x.audio.shape[0]:
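As a numbers-only illustration of the offset and padding logic above (example signal lengths, fs = 48000; not part of the patch):

fs = 48000
N_frame = fs // 50                          # 960 samples = one 20 ms frame

len_prev = 216_000                          # 4.5 s of already accumulated signal
for overlap in (0.5, -0.5):
    N_delay = len_prev - int(overlap * fs)  # start offset of the next source
    # overlap = +0.5 -> 192000: starts 0.5 s before the previous source ends
    # overlap = -0.5 -> 240000: leaves a 0.5 s gap of silence (or low-level noise)

sig_len = 192_000 + 123_456                 # example length, not frame aligned
N_pad = (N_frame - sig_len % N_frame) % N_frame
assert (sig_len + N_pad) % N_frame == 0     # front-padding restores whole 20 ms frames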
@@ -234,6 +264,46 @@ def generate_ism_items(
 
         y_meta = np.concatenate([y_meta, x_meta])
 
+        # append pre-amble and post-amble to all sources
+        if preamble != 0.0:
+            # ensure that the preamble length is a multiple of 20ms
+            N_pre = int(floor(preamble * 50) / 50 * y.fs)
+
+            # insert all-zero preamble to all sources
+            pre = np.zeros((N_pre, y.audio.shape[1]))
+            y.audio = np.concatenate([pre, y.audio])
+
+            # insert neutral position as a pre-amble to all sources
+            pre = np.tile(
+                [0.00, 0.00, 1.00, 0.00, 1.00], (y_meta.shape[0], N_pre, 1)
+            )  # !!!! TBD - check if we should insert neutral position or the first position of the metadata
+            y_meta = np.concatenate([pre, y_meta], axis=1)
+
+        if postamble != 0.0:
+            # ensure that the postamble length is a multiple of 20ms
+            N_post = int(floor(postamble * 50) / 50 * y.fs)
+
+            # append all-zero postamble to all sources
+            post = np.zeros((N_post, y.audio.shape[1]))
+            y.audio = np.concatenate([y.audio, post])
+
+            # append neutral position as a post-amble to all sources
+            post = np.tile(
+                [0.00, 0.00, 1.00, 0.00, 1.00], (y_meta.shape[0], N_post, 1)
+            )  # !!!! TBD - check if we should insert neutral position or the last position of the metadata
+            y_meta = np.concatenate([y_meta, post], axis=1)
+
+        # add random noise
+        if add_low_level_random_noise:
+            # create uniformly distributed noise between -4 and 4
+            np.random.seed(SEED_RANDOM_NOISE)
+            noise = np.random.randint(
+                low=-4, high=5, size=y.audio.shape
+            ).astype("float")
+
+            # superimpose
+            y.audio += noise
+
         # write individual ISM audio streams to the output file in an interleaved format
         output_filename = scene["name"]
         audiofile.write(
--
GitLab
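Each ISM stream written above is accompanied by a metadata matrix whose columns are (azimuth, elevation, distance, spread, gain); [0.00, 0.00, 1.00, 0.00, 1.00] is the neutral pose used for padding. A minimal sketch of dumping one such matrix as the two-decimal CSV that csv_formatdata produces; the file name and per-row granularity are illustrative assumptions:

import csv
import numpy as np

NEUTRAL = [0.00, 0.00, 1.00, 0.00, 1.00]        # azimuth, elevation, distance, spread, gain

meta = np.array(NEUTRAL * 25).reshape(25, 5)    # 25 rows of neutral padding
meta[:, 0] = np.linspace(0, 48, 25)             # e.g. sweep azimuth from 0 to 48 degrees

with open("G1S1.wav.0.csv", "w", newline="") as f:   # hypothetical metadata file name
    w = csv.writer(f)
    for row in meta:
        w.writerow([f"{v:.2f}" for v in row])        # 2 decimal digits, like csv_formatdata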
From 48039fc61c0295ff7e439005ff533defd0dc68dc Mon Sep 17 00:00:00 2001
From: Archit Tamarapu
Date: Thu, 11 May 2023 15:53:07 +0200
Subject: [PATCH 13/27] [cleanup] move item generation scripts into subfolder
 generation; see notes below

- created __init__.py and __main__.py for generation module
  !! now use python -m ivas_processing_scripts.generation !!
- moved reverb.py wrapper to main wrappers folder
- moved modified config.py to generation/config.py
- moved modified constants.py to generation/constants.py
- moved process_{ism,stereo}_items to generation/
- integrated modifications to audiotools.audiofile.py
---
 .../ISM1_CONFIG.yml                           |   0
 .../ISM2_CONFIG.yml                           |   0
 .../STEREO_CONFIG.yml                         |   0
 item_generation_scripts/audiotools/EFAP.py    | 922 ------
 .../audiotools/__init__.py                    | 286 ------
 .../audiotools/__main__.py                    |  36 -
 item_generation_scripts/audiotools/audio.py   | 428 --------
 .../audiotools/audioarray.py                  | 690 -------------
 .../audiotools/audiofile.py                   | 436 ---------
 .../BRIR_IISofficialMPEG222UC_FULL.mat        |   3 -
 .../BRIR_IISofficialMPEG222UC_LS.mat          |   3 -
 .../HRIR_ORANGE53_Dolby_SBA1.mat              |   3 -
 .../HRIR_ORANGE53_Dolby_SBA2.mat              |   3 -
 .../HRIR_ORANGE53_Dolby_SBA3.mat              |   3 -
 .../binaural_datasets/HRIR_ORANGE53_FULL.mat  |   3 -
 .../binaural_datasets/HRIR_ORANGE53_LS.mat    |   3 -
 .../audiotools/binaural_datasets/README.txt   |  34 -
 .../audiotools/binaural_datasets/__init__.py  |  31 -
 .../binaural_datasets/binaural_dataset.py     | 288 ------
 .../audiotools/binauralobjectrenderer.py      | 652 -------------
 .../audiotools/constants.py                   | 704 -------------
 .../audiotools/convert/__init__.py            | 323 ------
 .../audiotools/convert/binaural.py            | 108 --
 .../audiotools/convert/channelbased.py        | 390 --------
 .../audiotools/convert/masa.py                | 165 ----
 .../audiotools/convert/objectbased.py         | 352 -------
 .../audiotools/convert/scenebased.py          | 429 --------
 .../audiotools/metadata.py                    | 571 -----------
 .../audiotools/rotation.py                    | 379 -------
 item_generation_scripts/audiotools/utils.py   |  71 --
 .../audiotools/wrappers/__init__.py           |  31 -
 .../audiotools/wrappers/bs1770.py             | 291 ------
 .../audiotools/wrappers/eid_xor.py            | 193 ----
 .../audiotools/wrappers/esdru.py              | 130 ---
 .../audiotools/wrappers/filter.py             | 366 -------
 .../audiotools/wrappers/gen_patt.py           | 171 ----
 .../audiotools/wrappers/masaRenderer.py       | 117 ---
 .../audiotools/wrappers/networkSimulator.py   | 224 -----
 .../audiotools/wrappers/p50fbmnru.py          | 110 ---
 .../audiotools/wrappers/random_seed.py        |  92 --
 item_generation_scripts/binary_paths.yml      |  30 -
 .../processing/__init__.py                    |  31 -
 .../processing/preprocessing_2.py             | 155 ---
 .../processing/processing.py                  | 455 ---------
 item_generation_scripts/utils.py              | 297 ------
 .../audiotools/audiofile.py                   |   5 +-
 .../audiotools/wrappers/reverb.py             |  54 +-
 .../generation}/__init__.py                   |  12 +-
 .../generation}/__main__.py                   |   2 +-
 .../generation}/config.py                     |   4 +-
 .../generation}/constants.py                  |   6 +-
 .../generation}/process_ism_items.py          | 106 +-
 .../generation}/process_stereo_items.py       |  67 +-
 53 files changed, 147 insertions(+), 10118 deletions(-)
 rename {item_generation_scripts/config => item_gen_configs}/ISM1_CONFIG.yml (100%)
 rename {item_generation_scripts/config => item_gen_configs}/ISM2_CONFIG.yml (100%)
 rename {item_generation_scripts/config => item_gen_configs}/STEREO_CONFIG.yml (100%)
 delete mode 100644 item_generation_scripts/audiotools/EFAP.py
 delete mode 100644 item_generation_scripts/audiotools/__init__.py
 delete mode 100644 item_generation_scripts/audiotools/__main__.py
 delete mode 100644 item_generation_scripts/audiotools/audio.py
 delete mode 100644 item_generation_scripts/audiotools/audioarray.py
 delete mode 100644 item_generation_scripts/audiotools/audiofile.py
 delete mode 100644 item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_FULL.mat
 delete mode 100644 item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_LS.mat
 delete mode 100644 item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA1.mat
 delete mode 100644 item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA2.mat
 delete mode 100644 item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA3.mat
 delete mode 100644 item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_FULL.mat
 delete mode 100644 item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_LS.mat
 delete mode 100644 item_generation_scripts/audiotools/binaural_datasets/README.txt
 delete mode 100644 item_generation_scripts/audiotools/binaural_datasets/__init__.py
 delete mode 100644 item_generation_scripts/audiotools/binaural_datasets/binaural_dataset.py
 delete mode 100644 item_generation_scripts/audiotools/binauralobjectrenderer.py
 delete mode 100644 item_generation_scripts/audiotools/constants.py
 delete mode 100644 item_generation_scripts/audiotools/convert/__init__.py
 delete mode 100644 item_generation_scripts/audiotools/convert/binaural.py
 delete mode 100644 item_generation_scripts/audiotools/convert/channelbased.py
 delete mode 100644 item_generation_scripts/audiotools/convert/masa.py
 delete mode 100644 item_generation_scripts/audiotools/convert/objectbased.py
 delete mode 100644 item_generation_scripts/audiotools/convert/scenebased.py
 delete mode 100644 item_generation_scripts/audiotools/metadata.py
 delete mode 100644 item_generation_scripts/audiotools/rotation.py
 delete mode 100644 item_generation_scripts/audiotools/utils.py
 delete mode 100644 item_generation_scripts/audiotools/wrappers/__init__.py
 delete mode 100644 item_generation_scripts/audiotools/wrappers/bs1770.py
 delete mode 100644 item_generation_scripts/audiotools/wrappers/eid_xor.py
 delete mode 100644 item_generation_scripts/audiotools/wrappers/esdru.py
 delete mode 100644 item_generation_scripts/audiotools/wrappers/filter.py
 delete mode 100644 item_generation_scripts/audiotools/wrappers/gen_patt.py
 delete mode 100644 item_generation_scripts/audiotools/wrappers/masaRenderer.py
 delete mode 100644 item_generation_scripts/audiotools/wrappers/networkSimulator.py
 delete mode 100644 item_generation_scripts/audiotools/wrappers/p50fbmnru.py
 delete mode 100644 item_generation_scripts/audiotools/wrappers/random_seed.py
 delete mode 100644 item_generation_scripts/binary_paths.yml
 delete mode 100644 item_generation_scripts/processing/__init__.py
 delete mode 100644 item_generation_scripts/processing/preprocessing_2.py
 delete mode 100644 item_generation_scripts/processing/processing.py
 delete mode 100644 item_generation_scripts/utils.py
 rename {item_generation_scripts =>
ivas_processing_scripts}/audiotools/wrappers/reverb.py (90%) rename {item_generation_scripts => ivas_processing_scripts/generation}/__init__.py (90%) mode change 100644 => 100755 rename {item_generation_scripts => ivas_processing_scripts/generation}/__main__.py (98%) mode change 100644 => 100755 rename {item_generation_scripts/processing => ivas_processing_scripts/generation}/config.py (97%) rename {item_generation_scripts => ivas_processing_scripts/generation}/constants.py (95%) rename {item_generation_scripts/processing => ivas_processing_scripts/generation}/process_ism_items.py (86%) rename {item_generation_scripts/processing => ivas_processing_scripts/generation}/process_stereo_items.py (81%) diff --git a/item_generation_scripts/config/ISM1_CONFIG.yml b/item_gen_configs/ISM1_CONFIG.yml similarity index 100% rename from item_generation_scripts/config/ISM1_CONFIG.yml rename to item_gen_configs/ISM1_CONFIG.yml diff --git a/item_generation_scripts/config/ISM2_CONFIG.yml b/item_gen_configs/ISM2_CONFIG.yml similarity index 100% rename from item_generation_scripts/config/ISM2_CONFIG.yml rename to item_gen_configs/ISM2_CONFIG.yml diff --git a/item_generation_scripts/config/STEREO_CONFIG.yml b/item_gen_configs/STEREO_CONFIG.yml similarity index 100% rename from item_generation_scripts/config/STEREO_CONFIG.yml rename to item_gen_configs/STEREO_CONFIG.yml diff --git a/item_generation_scripts/audiotools/EFAP.py b/item_generation_scripts/audiotools/EFAP.py deleted file mode 100644 index b83d57e6..00000000 --- a/item_generation_scripts/audiotools/EFAP.py +++ /dev/null @@ -1,922 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. 
-# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -import argparse -from enum import Enum -from itertools import combinations -from pathlib import Path -from typing import Optional, Tuple, Union - -import numpy as np - - -def wrap_angles( - azi: float, - ele: float, - clip_ele: Optional[bool] = False, -) -> Tuple[float, float]: - """ - Wrap angles to (-180, 180] azimuth and [-90, 90] elevation - Takes into account hemisphere flips from large elevation changes unless clip_ele is specified - """ - if clip_ele: - ele = min(max(ele, -90), 90) - - if ele % 90 == 0 and ele % 180 != 0: - # if elevation is a multiple of 90, azimuth is irrelevant since we are at a pole - azi = 0 - while np.abs(ele) > 90: - ele -= 360 - else: - # wrap elevation value - while np.abs(ele) > 90: - # flip azimuth to other hemisphere - azi += 180 - - # compensate elevation accordingly - if ele > 90: - ele = 180 - ele - elif ele < -90: - ele = -180 - ele - - # wrap azimuth value - while azi > 180: - azi -= 360 - while azi <= -180: - azi += 360 - - return azi, ele - - -class EfapDmxType(Enum): - NONE = 0 - AMPLITUDE = 1 - INTENSITY = 2 - - -class EfapVertex: - """ - Vertex data structure for EFAP - - Initialises a vertex from the given spherical coordinate pair, - with a flag specifying if it is a ghost loudspeaker - - Parameters - ---------- - azi : float - Azimuth of vertex - ele : float - Elevation of vertex - is_ghost : bool - Whether the vertex is a ghost, default is False - dmx_type : EfapDmxType - Downmix type for ghost vertices - """ - - def __init__( - self, - azi: float, - ele: float, - is_ghost: Optional[bool] = False, - dmx_type: Optional[EfapDmxType] = EfapDmxType.INTENSITY, - ): - self.azi, self.ele = wrap_angles(azi, ele) - self.pos = np.array( - [ - np.cos(np.deg2rad(azi)) * np.cos(np.deg2rad(ele)), - np.sin(np.deg2rad(azi)) * np.cos(np.deg2rad(ele)), - np.sin(np.deg2rad(ele)), - ] - ) - - idx_azi = np.round(np.abs(90 - np.abs(self.azi))) - idx_ele = 90 - np.round(np.abs(self.ele)) - self.index = ( - idx_azi + 181 * idx_ele - ) # vertices on the median plane have lowest index - - self.is_ghost = is_ghost - self.dmx_type = dmx_type - - def __str__(self): - str_ = f"a{self.azi}e{self.ele}" - if self.is_ghost: - str_ += "*" - return str_ - - def __lt__(self, other): - return self.index < other.index - - -class EFAP: - """ - EFAP data structure - - Initialise EFAP data for computing panning gains - - Parameters - ---------- - azimuths : np.ndarray - Azimuth positions of the loudspeaker array - elevations : np.ndarray - Elevation postions of the loudspeaker array - intensity_panning : bool - Whether intensity panning is enabled or not - - Examples - -------- - >>> from EFAP import EFAP - >>> panner = EFAP([30, -30, 0, 110, -110], [0, 0, 0, 0, 0], False) - >>> panner.pan(15, 45) - array([0.66742381, 0.19069252, 0.66742381, 0.19069252, 0.19069252]) - """ - - _EFAP_HULL_TOL = 1e-4 # tolerance for a point to be added to the convex hull - _EFAP_MAX_AZI_GAP = 160 # maximum allowed angular gap in the middle layer - _EFAP_POLAR_ELE = 90 # elevation of north / south poles (zenith / nadir) - _EFAP_THRESH_COPLANAR = 1e-3 # tolerance for points to be considered coplanar - 
_EFAP_THRESH_MID_LAYER = 45 # elevation threshold for loudspeakers to be considered as in the middle layer - _EFAP_THRESH_POLES = 1e-6 # tolerance for a vertex to be considered polar - _EFAP_THRESH_TRI = 1e-10 # tolerance for a point to be inside a triangle - - def __init__( - self, - azimuths: Union[list, np.ndarray], - elevations: Union[list, np.ndarray], - intensity_panning: Optional[bool] = False, - ): - # validation - azimuths = np.array(azimuths) - elevations = np.array(elevations) - if np.squeeze(azimuths).ndim > 1: - raise ValueError("Too many dimensions for loudspeaker azimuth array") - if np.squeeze(elevations).ndim > 1: - raise ValueError("Too many dimensions for loudspeaker elevations array") - if azimuths.shape != elevations.shape: - raise ValueError("Mismatch between loudspeaker azimuths and elevations") - - # set EFIP flag - self.intensity_panning = intensity_panning - - # initialise vertices and add ghost loudspeakers if needed - self.verts = np.array( - [EfapVertex(azi, ele) for azi, ele in zip(azimuths, elevations)] - ) - self._add_ghost_speakers() - - # formulate initial tetrahedron for the convex hull - self._init_simplex() - - # add the remaining vertices to the convex hull in order of their index - for i in np.argsort(self.verts): - if self.verts[i] not in self.verts[self.tris]: - self._add_vertex_to_hull(i) - - # compute downmix matrix with remapped ghost speakers - self._remap_ghost_speakers() - - # set vertices near poles to have NaN azimuth - for v in self.verts: - if ( - v.ele > self._EFAP_POLAR_ELE - self._EFAP_THRESH_POLES - or v.ele < self._EFAP_THRESH_POLES - self._EFAP_POLAR_ELE - ): - v.azi = np.nan - - # combine triangles into polygons - self._tri2poly() - - def _add_ghost_speakers(self) -> None: - """ - Add ghost loudspeakers at the poles, or to fill large horizontal gaps - """ - ele = [v.ele for v in self.verts] - - dmx_type = EfapDmxType.INTENSITY - - # add ghost loudspeakers at the poles if necessary - if max(ele) < self._EFAP_POLAR_ELE: - if self.intensity_panning: - if max(ele) > self._EFAP_THRESH_MID_LAYER: - dmx_type = EfapDmxType.NONE - else: - dmx_type = EfapDmxType.AMPLITUDE - - self.verts = np.append(self.verts, EfapVertex(0, 90, True, dmx_type)) - - if min(ele) > -self._EFAP_POLAR_ELE: - if self.intensity_panning: - if min(ele) < -self._EFAP_THRESH_MID_LAYER: - dmx_type = EfapDmxType.NONE - else: - dmx_type = EfapDmxType.AMPLITUDE - - self.verts = np.append(self.verts, EfapVertex(0, -90, True, dmx_type)) - - # check for large gaps in the middle horizontal layer - mid_spkrs = [ - v.azi for v in self.verts if np.abs(v.ele) < self._EFAP_THRESH_MID_LAYER - ] - - # no speakers in middle layer; add a triangle of ghost speakers - if not mid_spkrs: - self.verts = np.append( - self.verts, - [ - EfapVertex(0, 0, True), - EfapVertex(180, 0, True), - EfapVertex(240, 0, True), - ], - ) - # only one speaker in the threshold; add two ghost speakers to form a triangle - elif len(mid_spkrs) == 1: - self.verts = np.append( - self.verts, - [ - EfapVertex(mid_spkrs[0] + 120, 0, True), - EfapVertex(mid_spkrs[0] + 240, 0, True), - ], - ) - # search for and fill gaps greater than MAX_AZI_GAP - else: - mid_spkrs = np.sort(mid_spkrs) - angle_diff = np.diff(np.concatenate([mid_spkrs, [mid_spkrs[0] + 360]])) - sectors = np.ceil(angle_diff / self._EFAP_MAX_AZI_GAP) - - for i, s in enumerate(sectors): - if s > 1: - new_diff = angle_diff[i] / s - num_new = s - 1 - for k in range(int(num_new)): - new_azi = mid_spkrs[i] + (k + 1) * new_diff - self.verts = 
np.append(self.verts, EfapVertex(new_azi, 0, True)) - - def _init_simplex(self) -> None: - """ - Create an initial tetrahedron / simplex for the convex hull from 4 vertices - """ - # take the first vertex as seed - t = [0] - - # attempt to form an edge with non-zero length - for i, v in enumerate(self.verts): - if ( - v.azi != self.verts[t[0]].azi or v.ele != self.verts[t[0]].ele - ) and i not in t: - t.append(i) - break - else: - raise ValueError("Vertices are conincident!") - - # attempt to form a triangle with non-zero area - for i, v in enumerate(self.verts): - if ( - np.linalg.norm( - np.cross( - self.verts[t[1]].pos - self.verts[t[0]].pos, - v.pos - self.verts[t[0]].pos, - ), - 2, - ) - > self._EFAP_HULL_TOL - and i not in t - ): - t.append(i) - break - else: - raise ValueError("Vertices are colinear!") - - # attempt to form a tetrahedron with non-zero volume - for i, v in enumerate(self.verts): - if ( - np.abs( - np.dot( - np.cross( - self.verts[t[1]].pos - self.verts[t[0]].pos, - self.verts[t[2]].pos - self.verts[t[0]].pos, - ), - v.pos - self.verts[t[0]].pos, - ) - ) - ) > self._EFAP_HULL_TOL and i not in t: - t.append(i) - break - else: - raise ValueError("Vertices are coplanar!") - - # create a list of the triangles of the initial simplex / tetrahedron - t = np.array(t) - self.tris = np.array([t[[0, 1, 2]], t[[0, 1, 3]], t[[0, 2, 3]], t[[1, 2, 3]]]) - - # orient the triangle surface planes outwards from the centroid - self.centroid = np.mean([self.verts[i].pos for i in t], axis=0) - for i, tri in enumerate(self.tris): - self.tris[i, :] = self._flip_plane(tri) - - def _add_vertex_to_hull(self, idx_new_vert: int) -> None: - """ - Add a vertex to the convex hull and update the list of triangles in the hull - """ - # compute the centroid of the current convex hull - self.centroid = np.mean( - [self.verts[i].pos for i in np.unique(self.tris)], axis=0 - ) - - tris_new = [] - visible = [] - - # find which hull surfaces are visible from the new vertex - for i, tri in enumerate(self.tris): - if self._vertex_dist(tri, idx_new_vert) > -1e-6: - visible.append(i) - else: - tris_new.append(tri) - - tris_new = np.array(tris_new) - visible = np.array(visible, dtype=int) - - # find edges of the visible hull surfaces - max_vert = np.amax(self.tris[visible]) + 1 - counter = np.zeros([max_vert, max_vert]) - for i, tri in enumerate(self.tris[visible]): - surface = np.append(tri, tri[0]) - for n in range(3): - a = surface[n] - b = surface[n + 1] - counter[a, b] = counter[a, b] + 1 - - counter += counter.T - - edges = [] - for a in range(max_vert - 1): - for b in range(a + 1, max_vert): - if counter[a, b] == 1: - edges.append([a, b]) - edges = np.vstack(edges) - - # break the edges visible from the new vertex and add the new triangle - for e in edges: - tris_new = np.vstack( - [tris_new, self._flip_plane(np.append(e, idx_new_vert))] - ) - - # update the list of triangles in the convex hull - self.tris = tris_new - - def _remap_ghost_speakers(self) -> None: - """ - Remove unused ghost speakers and compute a downmix matrix for the rest - """ - # find ghosts that are not part of the convex hull - ghosts = [i for i, v in enumerate(self.verts) if v.is_ghost] - unused_ghosts = np.compress( - np.isin(ghosts, np.unique(self.tris), invert=True), ghosts - ) - - if unused_ghosts.size > 0: - # remove the unused ghosts from the triangle array and also adjust indices - self.tris[self.tris > unused_ghosts.min()] -= unused_ghosts.size - # delete them from the vertex array - self.verts = np.delete(self.verts, 
unused_ghosts) - - # generate initial sound energy distribution matrix - n_vtx = len(self.verts) - n_ghost = len(ghosts) - len(unused_ghosts) - - M = np.eye(n_vtx) - for i, v in enumerate(self.verts): - if v.is_ghost: - neighbours = self._get_neighbours(i) - M[:, i] = np.zeros(n_vtx) - M[neighbours, i] = np.ones(len(neighbours)) / len(neighbours) - - # re-distribute sound energy from ghosts - M2 = M.copy() - for i, v in enumerate(self.verts): - if v.is_ghost: - vec = M[:, i] - while np.sum(vec[-n_ghost:]) > 1e-4: - vec = M @ vec - M2[:, i] = vec - - self.dmx_mat = M2[:-n_ghost, :] - - # amplitude downmix for real loudspeakers - self.dmx_mat[:, :-n_ghost] = np.sqrt(self.dmx_mat[:, :-n_ghost]) - - # distribute ghosts according to downmix type - for i, v in enumerate(self.verts): - if v.is_ghost: - if v.dmx_type == EfapDmxType.NONE: - self.dmx_mat[:, i] = 0 - elif v.dmx_type == EfapDmxType.AMPLITUDE: - pass - else: - self.dmx_mat[:, i] = np.sqrt(self.dmx_mat[:, i]) - - def _tri2poly(self) -> None: - """ - Merge hull triangles into polygons if they are coplanar - """ - polys = [] - - for tri in self.tris: - # find all vertices coplanar with this triangle (including those already in the triangle) - new_poly = np.array( - [ - i - for i, _ in enumerate(self.verts) - if np.abs(self._vertex_dist(tri, i)) < self._EFAP_THRESH_COPLANAR - ] - ) - - # check if we already found this polygon as a complete subset - is_subset = [ - i for i, poly in enumerate(polys) if np.all(np.isin(new_poly, poly)) - ] - is_superset = [ - i for i, poly in enumerate(polys) if np.all(np.isin(poly, new_poly)) - ] - - if is_subset: - continue - elif is_superset: - # remove the other polygon since it will be replaced by the superset polygon - polys_new = [p for i, p in enumerate(polys) if i not in is_superset] - polys = polys_new - - # orient the polygon plane in the same direction as the triangle - P1 = self.verts[tri[0]].pos - P2 = self.verts[tri[1]].pos - P3 = self.verts[tri[2]].pos - - # first base vector - U = P2 - P1 - U = U / np.linalg.norm(U) - - # second base vector - V = P3 - P2 - V = V - np.dot(U, V) * U - V = V / np.linalg.norm(V) - - # center of the first triangle - M = np.mean([P1, P2, P3], axis=0) - - # sort vertices - azi = np.zeros_like(new_poly, dtype=float) - for i, idx_v in enumerate(new_poly): - P = self.verts[idx_v].pos - M - X = np.dot(P, U) - Y = np.dot(P, V) - azi[i] = np.arctan2(Y, X) - - idx = np.argsort(azi) - new_poly = new_poly[idx] - - # add the polygon to the main list - polys.append(new_poly) - - self.polys = polys - - def _pan_EFAP_poly( - self, azimuth: float, elevation: float, poly: np.ndarray, mod: int - ) -> np.ndarray: - """ - Compute panning gains for each vertex in the given polygon - - Parameters - ---------- - azimuth : float - Azimuth of requested panning position - elevation : float - Elevation of requested panning position - poly : np.ndarray - Array of vertices defining the polygon - - Returns - ------- - poly_gain: np.ndarray - Gains for each vertex in the polygon - """ - poly_gain = np.zeros_like(poly, dtype=float) - - P = np.array([azimuth, elevation]) - # search for the triangle of the polygon in which P belongs - for i in range(1, poly.size + 1): - A = np.array([self.verts[poly[i - 1]].azi, self.verts[poly[i - 1]].ele]) - for j in range(i, poly.size - 2 + i): - idx1 = 1 + (j % poly.size) - idx2 = 1 + (idx1 % poly.size) - B = np.array( - [self.verts[poly[idx1 - 1]].azi, self.verts[poly[idx1 - 1]].ele] - ) - C = np.array( - [self.verts[poly[idx2 - 1]].azi, self.verts[poly[idx2 - 
1]].ele] - ) - - if mod: - if not np.isnan(A[0]): - A[0] %= mod - if not np.isnan(B[0]): - B[0] %= mod - if not np.isnan(C[0]): - C[0] %= mod - - if self._in_triangle(P, A, B, C): - N = np.transpose([B[1] - C[1], C[0] - B[0]]) - N = N / np.dot(N, B - A) - poly_gain[i - 1] = 1 - np.dot(P - A, N) - - """ DEBUGGING / TODO """ - # set gains <= -60dB to 0 - poly_gain[np.abs(poly_gain) < 1e-6] = 0 - - return poly_gain - - """ geometric / math helper functions """ - - def _get_neighbours(self, idx_vert: int) -> np.ndarray: - """ - Find triangles containing the given vertex index (neighbouring vertices) - """ - n = self.tris[np.any(np.isin(self.tris, idx_vert), axis=1)] - return np.unique(n[n != idx_vert]) - - def _get_azi_ele(self, idx_vert: int) -> Tuple[float, float]: - """ - Return a tuple of (azi, ele) for a vertex at the given index - """ - return self.verts[idx_vert].azi, self.verts[idx_vert].ele - - def _in_polygon( - self, azimuth: float, elevation: float, poly: np.ndarray - ) -> Tuple[bool, int]: - """ - Determine whether the panning position lies within the given polygon - by iteratively checking its triangles - - Parameters - ---------- - azimuth : float - Azimuth of requested panning position - elevation : float - Elevation of requested panning position - poly : np.ndarray - Array of vertices defining the polygon - - Returns - ------- - in_polygon, mod: Tuple[bool, int] - Flag indicating whether the point is inside the given polygon - Value of wrapping required if used - """ - azi = [self.verts[v].azi for v in poly] - - P = np.array([azimuth, elevation]) - - for tri in combinations(poly, 3): - A = np.array(self._get_azi_ele(tri[0])) - B = np.array(self._get_azi_ele(tri[1])) - C = np.array(self._get_azi_ele(tri[2])) - if self._in_triangle(P, A, B, C): - return True, None - - # if the azimuth difference is large, perform the 2D check again with azimuths wrapped to (-360, 0] and [0, 360) - if np.nanmax(azi) - np.nanmin(azi) > 180: - for tri in combinations(poly, 3): - A = np.array(self._get_azi_ele(tri[0])) - B = np.array(self._get_azi_ele(tri[1])) - C = np.array(self._get_azi_ele(tri[2])) - if not np.isnan(A[0]): - A[0] %= 360 - if not np.isnan(B[0]): - B[0] %= 360 - if not np.isnan(C[0]): - C[0] %= 360 - if self._in_triangle(P, A, B, C): - return True, 360 - - for tri in combinations(poly, 3): - A = np.array(self._get_azi_ele(tri[0])) - B = np.array(self._get_azi_ele(tri[1])) - C = np.array(self._get_azi_ele(tri[2])) - if not np.isnan(A[0]): - A[0] %= -360 - if not np.isnan(B[0]): - B[0] %= -360 - if not np.isnan(C[0]): - C[0] %= -360 - if self._in_triangle(P, A, B, C): - return True, -360 - - return False, None - - def _in_triangle( - self, P: np.ndarray, A: np.ndarray, B: np.ndarray, C: np.ndarray - ) -> bool: - """ - Determine whether the panning position lies within the given triangle - - Parameters - ---------- - P : float - Point under test - A : float - First vertex of the triangle - B : float - Second vertex of the triangle - C : float - Third vertex of the triangle - - Returns - ------- - bool - Flag indicating whether the point is inside the given triangle - """ - if np.isnan(A[0]): - A[0] = P[0] - - if np.isnan(B[0]): - B[0] = P[0] - - if np.isnan(C[0]): - C[0] = P[0] - - tmpMat = np.transpose([B - A, C - A]) - if (1 / np.linalg.cond(tmpMat)) < self._EFAP_THRESH_TRI: - return False - - Minv = np.linalg.inv(tmpMat) - S = Minv @ (P - A) - - if ( - S[0] < -self._EFAP_THRESH_TRI - or S[1] < -self._EFAP_THRESH_TRI - or S[0] + S[1] > 1 + self._EFAP_THRESH_TRI - ): - return False - 
- return True - - def _vertex_dist(self, surface: np.ndarray, idx_vert: int) -> float: - """ - Compute the distance of a vertex from a given plane - - Parameters - ---------- - surface : np.ndarray - Array of 3 ordered vertices defining the plane and its orientation - idx_vert: int - Index of the vertex to compute the distance for - - Returns - ------- - float - Distance of the vertex from the given plane - """ - return self._point_plane_dist( - self.verts[surface[0]].pos, - self.verts[surface[1]].pos, - self.verts[surface[2]].pos, - self.verts[idx_vert].pos, - ) - - def _point_plane_dist( - self, P1: np.ndarray, P2: np.ndarray, P3: np.ndarray, X: np.ndarray - ) -> float: - """ - Compute the distance of a vertex from a plane defined by three points - - Parameters - ---------- - P1 : np.ndarray - Cartesian coordinates of the first point - P2 : np.ndarray - Cartesian coordinates of the second point - P3 : np.ndarray - Cartesian coordinates of the third point - X: np.ndarray - Cartesian coordinates of the vertex - - Returns - ------- - float - Distance of the vertex from the given plane - """ - - if np.all(X == P1) or np.all(X == P2) or np.all(X == P3): - return 0 - else: - N = np.cross(P1 - P2, P1 - P3) - eps = np.finfo(float).eps - return np.dot(X - P1, N / (np.linalg.norm(N) + eps)) - - def _flip_plane(self, surface: np.ndarray) -> np.ndarray: - """ - Flip the orientation of a plane (invert normal vector) - - Parameters - ---------- - surface : np.ndarray - Array of 3 ordered vertices defining the plane and its orientation - - Returns - ------- - surface : np.ndarray - Reordered vertices with plane normal pointing outwards from the hull centroid - """ - if ( - self._point_plane_dist( - self.verts[surface[0]].pos, - self.verts[surface[1]].pos, - self.verts[surface[2]].pos, - self.centroid, - ) - > 0 - ): - surface = np.flip(surface.copy()) - - return surface - - def _compute_gains_point(self, azimuth: float, elevation: float) -> np.ndarray: - """ - Compute gains for the requested panning position - - Parameters - ---------- - azimuth : float - Azimuth of requested panning position - elevation : float - Elevation of requested panning position - - Returns - ------- - gains: np.ndarray - Panning gains for the loudspeaker layout - """ - if np.isnan(azimuth) or np.isnan(elevation): - raise ValueError(f"Angles cannot be NaNs : ({azimuth}, {elevation})") - - azimuth, elevation = wrap_angles(azimuth, elevation) - point_pos = [ - np.cos(np.deg2rad(azimuth)) * np.cos(np.deg2rad(elevation)), - np.sin(np.deg2rad(azimuth)) * np.cos(np.deg2rad(elevation)), - np.sin(np.deg2rad(elevation)), - ] - - # filter the polygon list with a quick 2d check - found_polys = [] - for poly in self.polys: - in_poly, mod = self._in_polygon(azimuth, elevation, poly) - if in_poly: - found_polys.append((poly, mod)) - - if not found_polys: - raise AssertionError("Unexpected error during panning") - - # find a visible polygon with the smallest distance - dist = [] - - for poly, mod in found_polys: - surface = self.verts[poly] - d = self._point_plane_dist( - surface[0].pos, - surface[1].pos, - surface[2].pos, - point_pos, - ) - if d >= 0: - dist.append(d) - else: - dist.append(np.inf) - - found_poly, mod = found_polys[np.argmin(dist)] - - # compute gains for the polygon vertices - poly_gain = self._pan_EFAP_poly(azimuth, elevation, found_poly, mod) - - # downmix ghost loudspeakers - gains = np.zeros(self.verts.size) - gains[found_poly] = poly_gain / np.linalg.norm(poly_gain) - gains = gains @ self.dmx_mat.T - gains = gains / 
np.linalg.norm(gains) - - if self.intensity_panning: - gains = np.sqrt(gains / np.sum(gains)) - - return gains - - """ public functions """ - - def pan( - self, - azimuths: float, - elevations: float, - intensity_panning: Optional[bool] = False, - ) -> np.ndarray: - """ - Compute gains for the requested panning position - - Parameters - ---------- - azimuths : float - Azimuth of requested panning position - elevations : float - Elevation of requested panning position - intensity_panning : bool - Flag whether to use intensity panning (Default is False == amplitude panning) - - Returns - ------- - gains: np.ndarray - Panning gains for the loudspeaker layout - """ - azimuths = np.array(azimuths) - elevations = np.array(elevations) - if azimuths.size == 1 and elevations.size == 1: - return self._compute_gains_point(azimuths, elevations) - elif np.squeeze(azimuths).ndim == 1 and np.squeeze(elevations).ndim == 1: - gains = [] - for a, e in zip(azimuths, elevations): - gains.append(self._compute_gains_point(a, e)) - return np.vstack(gains) - else: - raise ValueError( - "Azimuth and Elevation arrays cannot have more than one dimension and must be of equal size" - ) - - -def main(args): - """ - Parses a speaker layout text file and prints the panning gains - for the requested position - - Parameters - ---------- - args : Namespace - Command line arguments - """ - - speaker_positions = np.loadtxt(Path(args.input), delimiter=",", max_rows=2) - panner = EFAP(speaker_positions[0, :], speaker_positions[1, :], args.efip) - print(panner.pan(args.azimuth, args.elevation)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Edge-Fading Amplitude Panning") - parser.add_argument( - "-i", - "--input", - metavar="layout_file", - required=True, - type=str, - help="IVAS compatible loudspeaker layout file (Loudspeaker azimuths in first line, elevations in second, subsequent lines are ignored)", - ) - parser.add_argument( - "-efip", - "-intensity_panning", - default=False, - action="store_true", - help="Intensity panning mode (EFIP)", - ) - parser.add_argument( - "azimuth", - type=float, - help="Azimuth of direction to compute panning gains for (positive-left)", - ) - parser.add_argument( - "elevation", - type=float, - help="Elevation of direction to compute panning gains for (positive-up)", - ) - args = parser.parse_args() - main(args) diff --git a/item_generation_scripts/audiotools/__init__.py b/item_generation_scripts/audiotools/__init__.py deleted file mode 100644 index effc5a25..00000000 --- a/item_generation_scripts/audiotools/__init__.py +++ /dev/null @@ -1,286 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -import argparse -from itertools import repeat -from pathlib import Path - -from item_generation_scripts.audiotools.constants import AUDIO_FORMATS -from item_generation_scripts.audiotools.convert import convert_file -from item_generation_scripts.utils import apply_func_parallel - - -def add_processing_args(group, input=True): - # set up prefixes to avoid argument collision - if input: - p = "in" - ps = "i" - else: - p = "out" - ps = "o" - - group.add_argument( - f"-{ps}", - f"--{p}", - dest=f"{p}put", - required=True, - type=Path, - help="Path to *.{wav, pcm, raw} file or directory", - ) - group.add_argument( - f"-{ps}f", - f"--{p}_fmt", - required=input, - type=str, - help="Audio format (use -l, --list for a list / -L, --long for a detailed list)", - default=None, - ) - group.add_argument( - f"-{ps}s", - f"--{p}_fs", - type=int, - help="Sampling rate (Hz) (deduced for .wav input, same as input if output not specified, default = %(default)s)", - default=48000, - ) - group.add_argument( - f"-{ps}fc", - f"--{p}_cutoff", - type=int, - help="Cut-off frequency for low-pass filtering (default = %(default)s)", - default=None, - ) - group.add_argument( - f"-{ps}hp", - f"--{p}_hp50", - help="Apply 50 Hz high-pass filtering (default = %(default)s)", - action="store_true", - ) - group.add_argument( - f"-{ps}w", - f"--{p}_window", - type=float, - help="Window the start/end of the signal by this amount in milliseconds (default = %(default)s)", - default=None, - ) - group.add_argument( - f"-{ps}t", - f"--{p}_trim", - type=float, - nargs=2, - metavar=("PRE_TRIM", "POST_TRIM"), - help="Pre-/post-trim the signal by this amount in milliseconds (negative values pad silence), (default = %(default)s)", - ) - group.add_argument( - f"-{ps}pn", - f"--{p}_pad_noise", - help="Flag for padding with noise instead of zeros", - action="store_true", - ) - group.add_argument( - f"-{ps}d", - f"--{p}_delay", - type=float, - help="Delay the signal by this amount in milliseconds (negative values advance, default = %(default)s)", - default=None, - ) - group.add_argument( - f"-{ps}l", - f"--{p}_loudness", - type=float, - help="Normalize to given loudness with BS 
1770-4 (default = %(default)s)",
-        default=None,
-    )
-    group.add_argument(
-        f"-{ps}nf",
-        f"--{p}_loudness_fmt",
-        type=str,
-        help=f"Format used for loudness computation (only valid with -{ps}l/--{p}_loudness, default = {p.upper()}_FMT)",
-        default=None,
-    )
-
-
-def get_args():
-    parser = argparse.ArgumentParser(
-        description="Audiotools: Convert/Manipulate spatial audio files."
-    )
-
-    """ Input file arguments """
-    input_parser = parser.add_argument_group("Input (pre-) processing options")
-
-    # add common arguments
-    add_processing_args(input_parser)
-
-    # input only arguments
-    input_parser.add_argument(
-        "-im",
-        "--in_meta",
-        type=str,
-        nargs="+",
-        help="list of input metadata files (only relevant for ISM and MASA input)",
-        default=None,
-    )
-
-    """ Output file arguments """
-    output_parser = parser.add_argument_group("Output (post-) processing options")
-
-    # add common arguments
-    add_processing_args(output_parser, False)
-
-    # output only arguments
-    output_parser.add_argument(
-        "-lm",
-        "--limit",
-        help="Apply limiting to output (default = %(default)s)",
-        action="store_true",
-    )
-    output_parser.add_argument(
-        "-t",
-        "--trajectory",
-        type=str,
-        help="Head-tracking trajectory file for binaural output (default = %(default)s)",
-        default=None,
-    )
-    output_parser.add_argument(
-        "-bd",
-        "--bin_dataset",
-        type=str,
-        help="Use a custom binaural dataset (see README.md and audiotools/binaural_datasets/README.txt for further information)",
-        default=None,
-    )
-    output_parser.add_argument(
-        "-bl",
-        "--bin_lfe_gain",
-        type=float,
-        help="Render LFE to binaural output with the specified gain (only valid for channel-based input, default = %(default)s)",
-        default=None,
-    )
-    output_parser.add_argument(
-        "-mnru",
-        "--mnru_q",
-        type=float,
-        help="Q value for MNRU processing (default = %(default)s)",
-        default=None,
-    )
-    output_parser.add_argument(
-        "-esdru",
-        "--esdru_alpha",
-        type=float,
-        help="Alpha value for ESDRU processing (default = %(default)s)",
-        default=None,
-    )
-
-    """ Miscellaneous or meta arguments """
-    misc_parser = parser.add_argument_group("General options")
-
-    misc_parser.add_argument(
-        "-l",
-        "--list",
-        help="list all supported audio formats and exit",
-        action="store_true",
-    )
-    misc_parser.add_argument(
-        "-L",
-        "--long",
-        help="list all supported audio formats with long description and exit",
-        action="store_true",
-    )
-    misc_parser.add_argument(
-        "-mp",
-        "--multiprocessing",
-        help="Enable multiprocessing (default = %(default)s)",
-        action="store_true",
-    )
-
-    return parser.parse_args()
-
-
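Since `add_processing_args` is called twice with mirrored `in`/`out` prefixes, every pre-processing flag has a post-processing twin. A small self-contained sketch of the pattern (simplified to a single option; the names are illustrative, not taken from the module):

    import argparse

    parser = argparse.ArgumentParser()
    for p, ps in (("in", "i"), ("out", "o")):
        group = parser.add_argument_group(f"{p.capitalize()}put options")
        group.add_argument(f"-{ps}s", f"--{p}_fs", type=int, default=48000)

    args = parser.parse_args(["-is", "32000"])
    print(args.in_fs, args.out_fs)  # 32000 48000

The destination names (`in_fs`, `out_fs`) fall out of the long option strings, which is what lets `main()` below copy unset output values from their input counterparts.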
-def main():
-    args = get_args()
-
-    if args.list is True or args.long is True:
-        for fmt in AUDIO_FORMATS:
-            if args.long:
-                for f, d in fmt.items():
-                    print(f)
-                    [print(f"\t{k}: {v}", end=None) for k, v in d.items()]
-            else:
-                print(", ".join(fmt.keys()))
-        exit()
-
-    elif args.input is not None:
-        if not args.out_fs:
-            args.out_fs = args.in_fs
-
-        if not args.out_fmt:
-            args.out_fmt = args.in_fmt
-
-        if not args.out_loudness_fmt:
-            args.out_loudness_fmt = args.out_fmt
-
-        # List input files
-        args.input = Path(args.input)
-        in_files = []
-        if args.input.exists():
-            if args.input.is_dir():
-                in_files.extend(args.input.glob("*.wav"))
-                in_files.extend(args.input.glob("*.pcm"))
-                in_files.extend(args.input.glob("*.raw"))
-            else:
-                in_files = [args.input]
-        else:
-            raise ValueError(f"Input path {args.input} does not exist!")
-
-        if len(in_files) == 0:
-            raise ValueError(f"Input directory {args.input} empty!")
-
-        # Create output directory
-        args.output = Path(args.output)
-
-        if len(in_files) == 1 and args.input.is_file():
-            out_files = [args.output]
-        else:
-            args.output.mkdir(exist_ok=True)
-            out_files = [args.output.joinpath(i.name) for i in in_files]
-
-        # Multiprocessing
-        enable_multiprocessing = args.multiprocessing
-
-        # Remove unneeded keys to avoid passing to convert_file()
-        for k in ["list", "long", "multiprocessing", "input", "output"]:
-            args.__dict__.pop(k)
-
-        apply_func_parallel(
-            convert_file,
-            zip(in_files, out_files),
-            repeat(args.__dict__),
-            "mp" if enable_multiprocessing else None,
-        )
diff --git a/item_generation_scripts/audiotools/__main__.py b/item_generation_scripts/audiotools/__main__.py
deleted file mode 100644
index 9bdf64cd..00000000
--- a/item_generation_scripts/audiotools/__main__.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env python3
-
-#
-# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
-# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
-# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
-# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
-# contributors to this repository. All Rights Reserved.
-#
-# This software is protected by copyright law and by international treaties.
-# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
-# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
-# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
-# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
-# contributors to this repository retain full ownership rights in their respective contributions in
-# the software. This notice grants no license of any kind, including but not limited to patent
-# license, nor is any license granted by implication, estoppel or otherwise.
-#
-# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
-# contributions.
-#
-# This software is provided "AS IS", without any express or implied warranties. The software is in the
-# development stage. It is intended exclusively for experts who have experience with such software and
-# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
-# and fitness for a particular purpose are hereby disclaimed and excluded.
-#
-# Any dispute, controversy or claim arising under or in relation to providing this software shall be
-# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
-# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
-# the United Nations Convention on Contracts on the International Sales of Goods.
-#
-
-from item_generation_scripts.audiotools import main
-
-if __name__ == "__main__":
-    main()
diff --git a/item_generation_scripts/audiotools/audio.py b/item_generation_scripts/audiotools/audio.py
deleted file mode 100644
index 1804f5dd..00000000
--- a/item_generation_scripts/audiotools/audio.py
+++ /dev/null
@@ -1,428 +0,0 @@
-#!/usr/bin/env python3
-
-#
-# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
-# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -import warnings -from abc import ABC, abstractmethod -from pathlib import Path -from typing import Optional, Union - -import numpy as np - -from item_generation_scripts.audiotools.audiofile import read -from item_generation_scripts.audiotools.constants import ( - BINAURAL_AUDIO_FORMATS, - CHANNEL_BASED_AUDIO_ALTNAMES, - CHANNEL_BASED_AUDIO_FORMATS, - IVAS_FRAME_LEN_MS, - METADATA_ASSISTED_SPATIAL_AUDIO_FORMATS, - OBJECT_BASED_AUDIO_FORMATS, - SCENE_BASED_AUDIO_FORMATS, -) - -from .EFAP import wrap_angles - - -class Audio(ABC): - """Base class for audio data""" - - def __init__(self, name: str): - self.name = name.upper() - self.audio = None - self.fs = None - self.num_channels = None - # self.logger = None # TODO needed? - - def __repr__(self): - return f"{self.__class__} : {self.__dict__}" - - @classmethod - @abstractmethod - def _from_file(cls, name: str, filename: Path, fs: Optional[int] = None) -> "Audio": - """Create an Audio object from a file""" - out_audio = cls(name) - - filename = Path(filename) - if filename.suffix in [".pcm", ".raw"]: - if fs is None: - raise ValueError( - "Sampling rate must be specified for headerless files!" 
- ) - out_audio.audio, out_audio.fs = read(filename, out_audio.num_channels, fs) - elif filename.suffix == ".wav": - out_audio.audio, out_audio.fs = read(filename) - else: - raise NotImplementedError(f"Filetype {filename.suffix} is unsupported!") - - return out_audio - - @classmethod - @abstractmethod - def _from_filelist( - cls, name, files: list[Path], fs: Optional[int] = None - ) -> "Audio": - """Create an Audio object from a list of files with channels""" - out_audio = cls(name) - - for f in files: - f = Path(f) - - if f.suffix in [".pcm", ".raw"]: - if fs is None: - raise ValueError( - "Sampling rate must be specified for headerless files!" - ) - channel, fs = read(f, out_audio.num_channels, fs) - elif f.suffix == ".wav": - channel, fs = read(f) - else: - raise NotImplementedError(f"Filetype {f.suffix} is unsupported!") - - if out_audio.audio is None: - out_audio.audio = channel - out_audio.fs = fs - else: - if fs != out_audio.fs: - raise ValueError( - f"Sampling rate mismatch between input audio files, expected {out_audio.fs}, encountered {fs} for {f}!" - ) - - if channel.shape[0] > out_audio.audio.shape[0]: - channel = channel[: out_audio.audio.shape[0], :] - elif channel.shape[0] < out_audio.audio.shape[0]: - out_audio.audio = out_audio.audio[: channel.shape[0], :] - out_audio.audio = np.column_stack([out_audio.audio, channel]) - - return out_audio - - def apply(self, func, **kwargs) -> None: - """Apply a function to the audio array""" - self.audio = func(self.audio, self.fs, **kwargs) - - -class BinauralAudio(Audio): - """Sub-class for binaural audio""" - - def __init__(self, name: str): - super().__init__(name) - try: - self.__dict__.update(BINAURAL_AUDIO_FORMATS[name.upper()]) - except KeyError: - raise ValueError(f"Unsupported binaural audio format {name}") - - @classmethod - def _from_file( - cls, name: str, filename: Path, fs: Optional[int] = None - ) -> "BinauralAudio": - return super()._from_file(name, filename, fs) - - @classmethod - def _from_filelist( - cls, name: str, filename: Path, fs: Optional[int] = None - ) -> "BinauralAudio": - return super()._from_filelist(name, filename, fs) - - -class ChannelBasedAudio(Audio): - """Sub-class for channel-based audio""" - - def __init__(self, name: str): - if Path(name).exists() and Path(name).suffix == ".txt": - self.parse_custom_layout(name) - else: - # remap configuration name to internal naming - if name.upper() in CHANNEL_BASED_AUDIO_ALTNAMES.keys(): - name = CHANNEL_BASED_AUDIO_ALTNAMES[name.upper()] - - super().__init__(name) - try: - self.__dict__.update(CHANNEL_BASED_AUDIO_FORMATS[name.upper()]) - except KeyError: - raise ValueError(f"Unsupported channel-based audio format {name}") - - self.is_planar = np.all([e == 0 for e in self.ls_ele]) - - def parse_custom_layout(self, layout_file: Union[Path, str]): - layout_file = Path(layout_file) - with open(layout_file) as f_ls: - self.ls_azi = [float(x.strip()) for x in f_ls.readline().strip().split(",")] - self.ls_ele = [float(x.strip()) for x in f_ls.readline().strip().split(",")] - try: - self.lfe_index = [ - int(x.strip()) for x in f_ls.readline().strip().split(",") - ] - except Exception: - self.lfe_index = [] - - if self.lfe_index: - [self.ls_azi.insert(i, 0.0) for i in self.lfe_index] - [self.ls_ele.insert(i, 0.0) for i in self.lfe_index] - - self.name = layout_file.stem - self.num_channels = len(self.ls_azi) - self.layout_file = layout_file - - @classmethod - def _from_file( - cls, name: str, filename: Path, fs: Optional[int] = None - ) -> "ChannelBasedAudio": - return 
super()._from_file(name, filename, fs)
-
-    @classmethod
-    def _from_filelist(
-        cls, name: str, filename: Path, fs: Optional[int] = None
-    ) -> "ChannelBasedAudio":
-        return super()._from_filelist(name, filename, fs)
-
-
-class MetadataAssistedSpatialAudio(Audio):
-    """Sub-class for metadata-assisted spatial audio"""
-
-    def __init__(self, name: str):
-        super().__init__(name)
-        try:
-            self.__dict__.update(METADATA_ASSISTED_SPATIAL_AUDIO_FORMATS[name.upper()])
-        except KeyError:
-            raise ValueError(
-                f"Unsupported metadata assisted spatial audio format {name}"
-            )
-        self.metadata_files = []
-
-    @classmethod
-    def _from_file(
-        cls,
-        name: str,
-        filename: Path,
-        metadata_files: list[str],
-        fs: Optional[int] = None,
-    ) -> "MetadataAssistedSpatialAudio":
-        obj = super()._from_file(name, filename, fs)
-        obj.metadata_file = Path(metadata_files[0])
-        return obj
-
-    @classmethod
-    def _from_filelist(
-        cls,
-        name: str,
-        filename: Path,
-        metadata_files: list[str],
-        fs: Optional[int] = None,
-    ) -> "MetadataAssistedSpatialAudio":
-        obj = super()._from_filelist(name, filename, fs)
-        obj.metadata_file = Path(metadata_files[0])
-        return obj
-
-
-class ObjectBasedAudio(Audio):
-    """Sub-class for object-based audio"""
-
-    def __init__(self, name: str):
-        super().__init__(name)
-        try:
-            self.__dict__.update(OBJECT_BASED_AUDIO_FORMATS[name.upper()])
-        except KeyError:
-            raise ValueError(f"Unsupported object-based audio format {name}")
-        self.object_pos = []
-        self.metadata_files = []
-
-    @classmethod
-    def _from_file(
-        cls,
-        name: str,
-        filename: Union[str, Path],
-        metadata_files: list[Union[str, Path]],
-        fs: Optional[int] = None,
-    ) -> "ObjectBasedAudio":
-        obj = super()._from_file(name, filename, fs)
-        if metadata_files is not None:
-            obj.metadata_files = [Path(f) for f in metadata_files]
-        else:
-            # search for metadata with naming scheme: name.(wav, pcm).(0-3).csv
-            for obj_idx in range(obj.num_channels):
-                file_name_meta = filename.with_suffix(
-                    f"{filename.suffix}.{obj_idx}.csv"
-                )
-                if file_name_meta.is_file():
-                    obj.metadata_files.append(file_name_meta)
-                else:
-                    raise ValueError(f"Metadata file {file_name_meta} not found.")
-            warnings.warn(
-                f"No metadata files specified: The following files were found and used: \n {*obj.metadata_files,}"
-            )
-
-        obj.init_metadata()
-        return obj
-
-    @classmethod
-    def _from_filelist(
-        cls,
-        name: str,
-        filename: Path,
-        metadata_files: list[Union[str, Path]],
-        fs: Optional[int] = None,
-    ) -> "ObjectBasedAudio":
-        obj = super()._from_filelist(name, filename, fs)
-        obj.metadata_files = [Path(f) for f in metadata_files]
-        obj.init_metadata()
-        return obj
-
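For illustration, a hedged sketch of how an object-based item would be loaded through the module-level helper `fromfile` defined further below. The item and metadata file names are hypothetical, and `ISM2` is assumed to be a key in OBJECT_BASED_AUDIO_FORMATS:

    # two audio objects, one position CSV per object (hypothetical files)
    ism = fromfile(
        "ISM2",
        "item1.wav",
        in_meta=["item1.wav.0.csv", "item1.wav.1.csv"],
    )
    print(ism.num_channels, len(ism.object_pos))  # e.g. 2 2

If `in_meta` is omitted, `_from_file` above falls back to the `name.wav.N.csv` naming scheme and warns about the files it picked up.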
-    def init_metadata(self):
-        if self.audio.shape[1] != len(self.metadata_files):
-            raise ValueError(
-                f"Mismatch between number of channels in file [{self.audio.shape[1]}], and metadata [{len(self.metadata_files)}]"
-            )
-
-        self.object_pos = []
-        for i, f in enumerate(self.metadata_files):
-            pos = np.genfromtxt(f, delimiter=",")
-
-            # check if metadata has right number of columns
-            if pos.shape[1] < 5:
-                raise ValueError("Metadata incomplete. Columns are missing.")
-            elif pos.shape[1] > 5:
-                if pos.shape[1] == 7:
-                    pos = pos[:, :5]
-                else:
-                    raise ValueError(
-                        "Too many columns in metadata (possibly old version with frame index used)"
-                    )
-
-            # check if metadata is longer than file -> cut off
-            num_frames = int(
-                np.ceil(self.audio.shape[0] / (self.fs * IVAS_FRAME_LEN_MS / 1000))
-            )
-            if num_frames < pos.shape[0]:
-                pos = pos[:num_frames]
-            # check if metadata is shorter than file -> loop
-            elif num_frames > pos.shape[0]:
-                pos_loop = np.zeros((num_frames, pos.shape[1]))
-                pos_loop[: pos.shape[0]] = pos
-                for idx in range(pos.shape[0], num_frames):
-                    pos_loop[idx, :2] = pos[idx % pos.shape[0], :2]
-                pos = pos_loop
-
-            # wrap metadata to target value range
-            for j in range(num_frames):
-                pos[j, 0], pos[j, 1] = wrap_angles(pos[j, 0], pos[j, 1], clip_ele=True)
-
-            self.object_pos.append(pos)
-
-
-class SceneBasedAudio(Audio):
-    """Sub-class for scene-based audio"""
-
-    def __init__(self, name: str):
-        if name == "SBA1":
-            name = "FOA"
-        elif name == "SBA2":
-            name = "HOA2"
-        elif name == "SBA3":
-            name = "HOA3"
-
-        super().__init__(name)
-        try:
-            self.__dict__.update(SCENE_BASED_AUDIO_FORMATS[name.upper()])
-        except KeyError:
-            raise ValueError(f"Unsupported scene-based audio format {name}")
-
-        # self.ambi_order = ambi_order_from_nchan(self.num_channels)
-        self.ambi_order = int(np.sqrt(self.num_channels) - 1)
-
-    @classmethod
-    def _from_file(
-        cls, name: str, filename: Path, fs: Optional[int] = None
-    ) -> "SceneBasedAudio":
-        return super()._from_file(name, filename, fs)
-
-    @classmethod
-    def _from_filelist(
-        cls, name: str, filename: Path, fs: Optional[int] = None
-    ) -> "SceneBasedAudio":
-        return super()._from_filelist(name, filename, fs)
-
-
-def _get_audio_class(fmt) -> Audio:
-    """Return a child audio class corresponding to the specified format"""
-    if fmt in BINAURAL_AUDIO_FORMATS.keys():
-        return BinauralAudio
-    elif fmt in METADATA_ASSISTED_SPATIAL_AUDIO_FORMATS.keys():
-        return MetadataAssistedSpatialAudio
-    elif fmt in OBJECT_BASED_AUDIO_FORMATS.keys():
-        return ObjectBasedAudio
-    elif fmt in SCENE_BASED_AUDIO_FORMATS.keys():
-        return SceneBasedAudio
-    elif (
-        fmt in CHANNEL_BASED_AUDIO_FORMATS.keys()
-        or fmt in CHANNEL_BASED_AUDIO_ALTNAMES.keys()
-    ):
-        return ChannelBasedAudio
-    elif Path(fmt).suffix == ".txt":
-        return ChannelBasedAudio
-    else:
-        raise ValueError(f"Unknown audio format {fmt}!")
-
-
-def fromtype(fmt: str) -> Audio:
-    return _get_audio_class(fmt)(fmt)
-
-
-def fromarray(fmt: str, x: np.ndarray, fs: int) -> Audio:
-    """Wrap the given array into an audio format"""
-    if x is None or not fs:
-        raise ValueError("Both array and sampling rate must be specified!")
-
-    output = _get_audio_class(fmt)(fmt)
-
-    output.audio = x
-    output.fs = fs
-
-    return output
-
-
-def fromfile(
-    fmt: str,
-    filename: Union[str, Path],
-    fs: Optional[int] = None,
-    in_meta: Optional[list[Union[str, Path]]] = None,
-) -> Audio:
-    """Create an Audio object of the specified format from the given file"""
-    filename = Path(filename)
-    fmt_cls = _get_audio_class(fmt)
-    if fmt_cls is ObjectBasedAudio or fmt_cls is MetadataAssistedSpatialAudio:
-        return fmt_cls._from_file(fmt, filename, in_meta, fs)
-    else:
-        return fmt_cls._from_file(fmt, filename, fs)
-
-
-def fromfilelist(
-    fmt: str, files: list[Union[str, Path]], fs: Optional[int] = None
-) -> Audio:
-    """Create an Audio object of the specified format from the given list of files"""
-    return _get_audio_class(fmt)._from_filelist(fmt, files, fs)
diff --git 
a/item_generation_scripts/audiotools/audioarray.py b/item_generation_scripts/audiotools/audioarray.py deleted file mode 100644 index c0909c4c..00000000 --- a/item_generation_scripts/audiotools/audioarray.py +++ /dev/null @@ -1,690 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -import logging -import warnings -from typing import Iterator, Optional, Tuple, Union - -import numpy as np -import scipy.signal as sig - -from .constants import DELAY_COMPENSATION_FOR_FILTERING, SEED_PADDING - -logger = logging.getLogger("__main__") -logger.setLevel(logging.DEBUG) - - -"""Functions used in this module""" - - -def trim( - x: np.ndarray, - fs: Optional[int] = 48000, - limits: Optional[Tuple[int, int]] = None, - pad_noise: Optional[bool] = False, - samples: Optional[bool] = False, -) -> np.ndarray: - """ - Trim an audio array - - Parameters - ---------- - x: np.ndarray - Input array - fs: Optional[int] - Input sampling rate in Hz, default = 48000 - limits: Optional[Tuple[int, int]] - Pre- and post-trim duration in milliseconds (negative values pad) - pad_noise: Optional[bool] - If true noise will be padded otherwise zeros will be padded - samples: Optional[bool] - If true limits are interpreted as samples, otherwise as ms - - Returns - ------- - y : np.ndarray - Output trimmed array - """ - - if not limits: - return x - - if not samples: - pre_trim = int(limits[0] * fs // 1000) - post_trim = int(limits[1] * fs // 1000) - else: - pre_trim = limits[0] - post_trim = limits[1] - - if pre_trim < 0: - if pad_noise: - # pad with uniformly distributed noise between -4 and 4 - np.random.seed(SEED_PADDING) - noise = np.random.randint( - low=-4, high=5, size=(np.abs(pre_trim), np.shape(x)[1]) - ).astype("float") - x = np.concatenate((noise, x), axis=0) - else: - x = np.pad(x, [[np.abs(pre_trim), 0], [0, 0]]) - elif pre_trim > 0: - x = x[pre_trim:, :] - - if post_trim < 0: - if pad_noise: - # pad with uniformly distributed noise between -4 and 4 - np.random.seed(SEED_PADDING) - noise = np.random.randint( - low=-4, high=5, size=(np.abs(post_trim), np.shape(x)[1]) - ).astype("float") - x = np.concatenate((x, noise), axis=0) - else: - x = np.pad(x, [[0, np.abs(post_trim)], [0, 0]]) - elif post_trim > 0: - x = x[:-post_trim, :] - - return x - - -def window( - x: np.ndarray, - fs: Optional[int] = 48000, - len_ms: Optional[float] = 100, -) -> np.ndarray: - """ - Apply windowing to the start and end - of an audio array - - - Parameters - ---------- - x: np.ndarray - Input audio array - fs: Optional[int] - Input sampling rate in Hz, default = 48000 - len_ms: Optional[float] - Window length used at start and end of array in milliseconds, default = 100 ms - - Returns - ------- - y: np.ndarray - Output windowed array - """ - - wlen_smp = int(len_ms * fs // 1000) - - # if requested window length is larger than the signal, simply window the signal - if wlen_smp > x.shape[0]: - wlen_smp = x.shape[0] // 2 - - window = sig.windows.hann(2 * wlen_smp) - - # we only need half of the window - window = window[:wlen_smp, np.newaxis] - - x[:wlen_smp, :] *= window - x[-wlen_smp:, :] *= window[::-1, :] - - return x - - -def delay_compensation( - x: np.ndarray, - flt_type: str, - fs: Optional[int] = 48000, - up: Optional[bool] = False, - down: Optional[bool] = False, -) -> np.ndarray: - """ - Compensation for a delayed signal - - Parameters - ---------- - x: np.ndarray - Input array - flt_type: str - Name of filter type used for filtering - fs: Optional[int] - Input sampling rate - up: Optional[bool] - Flag for up-sampling - down: Optional[bool] - Flag for down-sampling - - Returns - ------- - x: np.ndarray - Delay compensated test array - """ - - # Get the delay in number of samples - if flt_type == "SHQ2" and up: - d_samples = DELAY_COMPENSATION_FOR_FILTERING["SHQ2"]["up"] - elif flt_type == "SHQ2" and 
down:
-        d_samples = DELAY_COMPENSATION_FOR_FILTERING["SHQ2"]["down"]
-    elif flt_type == "SHQ3" and up:
-        d_samples = DELAY_COMPENSATION_FOR_FILTERING["SHQ3"]["up"]
-    elif flt_type == "SHQ3" and down:
-        d_samples = DELAY_COMPENSATION_FOR_FILTERING["SHQ3"]["down"]
-    else:
-        d_samples = DELAY_COMPENSATION_FOR_FILTERING[flt_type]
-    # Delay compensation
-    x = delay(x, fs, -d_samples, samples=True)
-
-    return x
-
-
-def delay(
-    x: np.ndarray,
-    fs: Optional[int] = 48000,
-    delay: Optional[float] = 0,
-    samples: Optional[bool] = False,
-) -> np.ndarray:
-    """
-    Delay a signal by a specified duration (ms) or number of samples
-
-    Parameters
-    ----------
-    x: np.ndarray
-        Input array
-    fs: Optional[int]
-        Sampling rate
-    delay: Optional[float]
-        Delay in milliseconds or samples (negative values advance the signal)
-    samples: Optional[bool]
-        If true, delay is interpreted as samples, if false as milliseconds
-
-    Returns
-    -------
-    x: np.ndarray
-        Delayed audio signal
-    """
-
-    if not samples:
-        delay = int(delay * fs / 1000)
-
-    delay_abs = np.abs(delay)
-
-    x = np.roll(x, delay, axis=0)
-
-    if delay < 0:
-        x[-delay_abs:, :] = 0
-    elif delay > 0:
-        x[:delay_abs, :] = 0
-
-    return x
-
-
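`delay` is essentially `np.roll` plus zero-fill of the wrapped-around region; a quick check of the sign convention (positive delays, negative advances), using toy data:

    import numpy as np

    x = np.arange(1.0, 6.0).reshape(-1, 1)    # one channel: 1..5
    d = np.roll(x, 2, axis=0); d[:2, :] = 0   # delayed by 2 samples
    a = np.roll(x, -2, axis=0); a[-2:, :] = 0 # advanced by 2 samples
    print(d.ravel(), a.ravel())               # [0. 0. 1. 2. 3.] [3. 4. 5. 0. 0.]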
-def limiter(
-    x: np.ndarray,
-    fs: int,
-) -> np.ndarray:
-    """
-    Apply limiting to an audio signal
-
-    Parameters
-    ----------
-    x: np.ndarray
-        Input reference array
-    fs: int
-        Input sampling frequency
-
-    Returns
-    -------
-    x: np.ndarray
-        Limited audio signal
-    """
-
-    limiter_threshold = 32729  # -0.01 dB FS
-    limiter_attack_seconds = 0.005
-    attack_constant = 0.01 ** (1.0 / (limiter_attack_seconds * fs))
-    release_heuristics_mem = 0.0
-    gain = 1.0
-    strong_saturation_cnt = 0
-    limited = False
-
-    if x.ndim == 1:
-        n_samples_x = x.shape
-        n_chan_x = 1
-    else:
-        n_samples_x, n_chan_x = x.shape
-    # framing
-    framesize = fs // 50
-    nframes = n_samples_x // framesize
-    for fr in range(nframes):
-        apply_limiting = True
-        fr_sig = x[fr * framesize : ((fr + 1) * framesize), :]
-        sig_max = np.amax(np.absolute(fr_sig))
-        release_heuristic = release_heuristics_mem
-        if sig_max > limiter_threshold:
-            frame_gain = limiter_threshold / sig_max
-            release_heuristic = min(1.0, release_heuristic + (4.0 * framesize / fs))
-        else:
-            release_heuristic = max(0.0, release_heuristic - (framesize / fs))
-            if gain >= 1.0 - 1e-10:
-                apply_limiting = False
-
-            frame_gain = 1.0
-
-        if sig_max > 3 * limiter_threshold and strong_saturation_cnt > 0:
-            apply_strong_limiting = True
-        elif sig_max > 10 * limiter_threshold:
-            strong_saturation_cnt += 20
-            apply_strong_limiting = True
-        else:
-            strong_saturation_cnt -= 1
-            if strong_saturation_cnt < 0:
-                strong_saturation_cnt = 0
-            apply_strong_limiting = False
-
-        if apply_strong_limiting is True:
-            if frame_gain < 0.3:
-                frame_gain /= 3.0
-            else:
-                apply_strong_limiting = False
-
-        if frame_gain < 0.1 and apply_strong_limiting is False:
-            frame_gain = 0.1
-
-        if apply_limiting is True:
-            if frame_gain < gain:
-                fac = attack_constant ** (np.arange(1, framesize + 1, dtype=np.float32))
-            else:
-                release_constant = 0.01 ** (
-                    1.0 / (0.005 * (200.0**release_heuristic) * fs)
-                )
-                fac = release_constant ** (
-                    np.arange(1, framesize + 1, dtype=np.float32)
-                )
-
-            fr_gain = np.tile(gain * fac + frame_gain * (1.0 - fac), (n_chan_x, 1)).T
-            fr_sig *= fr_gain
-            gain = fr_gain[-1, 0]
-            limited = True
-        else:
-            gain = 1.0
-
-        release_heuristics_mem = release_heuristic
-        # hard limiting for everything that still sticks out
-        if (fr_sig > 32767).any() or (fr_sig < -32768).any():
-            limited = True
-            idx_max = np.where(fr_sig > 32767)
-            fr_sig[idx_max] = 32767
-            idx_min = np.where(fr_sig < -32768)
-            fr_sig[idx_min] = -32768
-
-    if limited:
-        warnings.warn("Limiting had to be applied")
-    return x
-
-
-def get_framewise(
-    x: np.ndarray,
-    chunk_size: int,
-    zero_pad: Optional[bool] = False,
-) -> Iterator:
-    """
-    Generator to yield a signal frame by frame
-    If the array size is not a multiple of chunk_size, the last frame contains the remainder
-
-    Parameters
-    ----------
-    x: np.ndarray
-        Input reference array
-    chunk_size: int
-        Size of frames to yield
-    zero_pad: Optional[bool]
-        Whether to zero pad the last chunk if there are not enough samples
-
-    Yields
-    -------
-    frame : np.ndarray
-        One frame of the input audio signal
-    """
-
-    n_frames = x.shape[0] // chunk_size
-    for i in range(n_frames):
-        yield x[i * chunk_size : (i + 1) * chunk_size, :]
-    if x.shape[0] % chunk_size:
-        last_chunk = x[n_frames * chunk_size :, :]
-        if zero_pad:
-            yield np.pad(
-                last_chunk, [[0, chunk_size - (x.shape[0] % chunk_size)], [0, 0]]
-            )
-        else:
-            yield last_chunk
-
-
-def framewise_io(
-    i: np.ndarray, o: np.ndarray, chunk_size: int, zero_pad: Optional[bool] = False
-) -> Iterator:
-    """
-    Return an iterator over frame_index, input_frame and output_frame
-
-    Parameters
-    ----------
-    i: np.ndarray
-        Input array
-    o: np.ndarray
-        Output array
-    chunk_size: int
-        Size of frames to yield
-    zero_pad: Optional[bool]
-        Whether to zero pad the last chunk if there are not enough samples
-
-    Yields
-    -------
-    frame : Iterator
-        Frame index, one frame of the input and output audio signal
-    """
-
-    return enumerate(
-        zip(
-            get_framewise(i, chunk_size, zero_pad),
-            get_framewise(o, chunk_size, zero_pad),
-        )
-    )
-
-
-"""Deprecated functions (partly replaced by ITU binaries)"""
-
-
-def resample(
-    x: np.ndarray,
-    in_freq: int,
-    out_freq: int,
-) -> np.ndarray:
-    """
-    Resample a multi-channel audio array
-
-    Parameters
-    ----------
-    x: np.ndarray
-        Input array
-    in_freq: int
-        Input sampling rate
-    out_freq: int
-        Output sampling rate
-
-    Returns
-    -------
-    y: np.ndarray
-        Output resampled array
-    """
-
-    if in_freq == out_freq or out_freq is None:
-        y = x
-    else:
-        datatype = x.dtype
-        if datatype.name.startswith("int"):
-            # cast necessary due to bug in resample_poly() with input of type int
-            x = x.astype("float")
-
-        y = sig.resample_poly(x, out_freq, in_freq)
-
-        if datatype.name.startswith("int"):
-            y = y.astype(datatype)
-
-    return y
-
-
-def lpfilter(
-    x: np.ndarray,
-    fc: int,
-    fs: int,
-) -> np.ndarray:
-    """
-    Low-pass filter a multi-channel audio array
-
-    Parameters
-    ----------
-    x: np.ndarray
-        Input array
-    fc: int
-        Cut-off frequency in Hz
-    fs: int
-        Sampling rate in Hz
-
-    Returns
-    -------
-    y: np.ndarray
-        Output low-pass filtered array
-    """
-
-    if (fc + 500) < (fs / 2.0):
-        # Design a Chebyshev Type II filter, band_pass-band_stop = 500 Hz
-        N, Wn = sig.cheb2ord(fc / (fs / 2), (fc + 500) / (fs / 2), 3, 60)
-        b, a = sig.cheby2(N, 60, Wn, "low")
-
-        # Apply the Chebyshev filter to each channel, across the time axis
-        # y = sig.lfilter(b, a, x, axis=0)  # non zero-phase filter
-        y = sig.filtfilt(b, a, x, axis=0)  # zero-phase filter, batch processing
-    else:
-        y = x
-
-    return y
-
-
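A short usage sketch for the `get_framewise` generator above (assuming the audioarray module shown here is importable; the array is toy data). With `zero_pad=True`, the trailing partial frame is padded to full length:

    import numpy as np

    x = np.arange(10).reshape(-1, 1)  # 10 samples, 1 channel
    frames = [f.ravel().tolist() for f in get_framewise(x, 4, zero_pad=True)]
    print(frames)  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 0, 0]]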
-def cut(
-    x: np.ndarray,
-    limits: Optional[Tuple[int, int]],
-) -> np.ndarray:
-    """
-    Cut an audio array
-
-    Parameters
-    ----------
-    x: np.ndarray
-        Input array
-    limits: Tuple[int, int]
-        First and last samples to extract
-
-    Returns
-    -------
-    y: np.ndarray
-        Output cut array
-    """
-
-    in_samples, in_channels = x.shape
-
-    first_sample = limits[0]
-    last_sample = limits[1]
-
-    if first_sample == 0 and (last_sample == -1 or last_sample == in_samples):
-        y = x
-    else:
-        if last_sample == -1:
-            last_sample = in_samples
-
-        signal_start = first_sample
-        signal_end = last_sample
-        insert_start = 0
-        insert_end = last_sample - first_sample
-        total_samples = last_sample - first_sample
-        if first_sample < 0:
-            samples_to_pad_begin = -first_sample
-            insert_start = samples_to_pad_begin
-            insert_end += samples_to_pad_begin
-        if last_sample > in_samples:
-            signal_end = in_samples
-            insert_end = insert_end - last_sample + in_samples
-        y = np.zeros([total_samples, in_channels], dtype=x.dtype)
-        y[insert_start:insert_end, :] = x[signal_start:signal_end, :]
-
-    return y
-
-
-def compare(
-    ref: np.ndarray,
-    test: np.ndarray,
-    fs: int,
-    per_frame: bool = False,
-) -> dict:
-    """
-    Compare two audio arrays
-
-    Parameters
-    ----------
-    ref: np.ndarray
-        Input reference array
-    test: np.ndarray
-        Input test array
-    fs: int
-        Input sampling rate in Hz
-    per_frame: bool
-        If true, also report per-frame difference statistics
-
-    Returns
-    -------
-    result: dict
-        Comparison results
-    """
-
-    framesize = fs // 50
-    diff = abs(test - ref)
-    max_diff = int(diff.max())
-    result = {
-        "bitexact": True,
-        "max_abs_diff": 0,
-        "max_abs_diff_pos_sample": 0,
-        "max_abs_diff_pos_channel": 0,
-        "nsamples_diff": 0,
-        "nsamples_diff_percentage": 0.0,
-        "first_diff_pos_sample": -1,
-        "first_diff_pos_channel": -1,
-        "first_diff_pos_frame": -1,
-    }
-    if per_frame:
-        result["max_abs_diff_pos_frame"] = 0
-        result["nframes_diff"] = 0
-        result["nframes_diff_percentage"] = 0.0
-
-    if max_diff != 0:
-        if diff.ndim == 1:
-            nsamples_total = diff.shape
-            nchannels = 1
-        else:
-            nsamples_total, nchannels = diff.shape
-        max_diff_pos = np.nonzero(diff == max_diff)
-        max_diff_pos = [
-            max_diff_pos[0][0],
-            max_diff_pos[0][0] // framesize,
-            max_diff_pos[1][0],
-        ]
-
-        first_diff_pos = np.nonzero(diff)
-        first_diff_pos = [
-            first_diff_pos[0][0],
-            first_diff_pos[0][0] // framesize,
-            first_diff_pos[1][0],
-        ]
-
-        nsamples_diff = np.nonzero(diff)[0].size
-        nsamples_diff_percentage = nsamples_diff / (nsamples_total * nchannels) * 100.0
-        nframes = nsamples_total // framesize
-        nframes_diff = 0
-
-        result = {
-            "bitexact": False,
-            "max_abs_diff": max_diff,
-            "max_abs_diff_pos_sample": max_diff_pos[0],
-            "max_abs_diff_pos_channel": max_diff_pos[2],
-            "nsamples_diff": nsamples_diff,
-            "nsamples_diff_percentage": nsamples_diff_percentage,
-            "first_diff_pos_sample": first_diff_pos[0],
-            "first_diff_pos_channel": first_diff_pos[2],
-            "first_diff_pos_frame": first_diff_pos[1],
-        }
-
-        if per_frame:
-            for fr in range(nframes):
-                diff_fr = diff[fr * framesize : ((fr + 1) * framesize), :]
-                nframes_diff += 1 if diff_fr.nonzero()[0].size > 0 else 0
-            nframes_diff_percentage = nframes_diff / nframes * 100.0
-            result["max_abs_diff_pos_frame"] = max_diff_pos[1]
-            result["nframes_diff"] = nframes_diff
-            result["nframes_diff_percentage"] = nframes_diff_percentage
-
-    return result
-
-
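A sketch of `compare` on a pair of toy arrays with a single injected difference (again assuming this module is importable):

    import numpy as np

    ref = np.zeros((960, 2), dtype=int)  # one 20 ms frame at 48 kHz, 2 channels
    test = ref.copy()
    test[100, 1] = 3                     # single-sample deviation
    r = compare(ref, test, fs=48000, per_frame=True)
    print(r["bitexact"], r["max_abs_diff"], r["nframes_diff"])  # False 3 1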
-def getdelay(
-    x: np.ndarray,
-    y: np.ndarray,
-) -> int:
-    """
-    Get the delay between two audio signals
-
-    Parameters
-    ----------
-    x: np.ndarray
-        Input reference array
-    y: np.ndarray
-        Input test array
-
-    Returns
-    -------
-    result: int
-        Delay of y in samples with respect to x (median of individual channel delays)
-    """
-
-    if x.ndim == 1:
-        n_samples_x = x.shape
-        n_chan_x = 1
-    else:
-        n_samples_x, n_chan_x = x.shape
-    if y.ndim == 1:
-        n_samples_y = y.shape
-        n_chan_y = 1
-    else:
-        n_samples_y, n_chan_y = y.shape
-    if n_chan_x != n_chan_y:
-        raise ValueError("Channel count mismatch between x and y")
-    lags = np.arange(-n_samples_x + 1, n_samples_y)
-    lag = np.zeros([n_chan_x, 1], dtype=int)
-    for chan in range(n_chan_x):
-        correlation = sig.correlate(y[:, chan], x[:, chan], mode="full")
-        lag[chan] = lags[np.argmax(correlation)]
-    return int(np.median(lag))
-
-
-def mono_downmix(x: np.ndarray) -> np.ndarray:
-    """
-    Creates a passive mono downmix for a multi-channel audio signal
-    """
-    return np.sum(x, axis=1)
-
-
-def mute_channels(
-    x: np.ndarray, mute: Optional[Union[list, np.ndarray]] = None
-) -> np.ndarray:
-    """
-    Mute audio channels in signal
-    """
-    if mute is not None:
-        x[:, mute] = 0
-    return x
diff --git a/item_generation_scripts/audiotools/audiofile.py b/item_generation_scripts/audiotools/audiofile.py
deleted file mode 100644
index d5687a89..00000000
--- a/item_generation_scripts/audiotools/audiofile.py
+++ /dev/null
@@ -1,436 +0,0 @@
-#!/usr/bin/env python3
-
-#
-# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
-# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
-# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
-# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
-# contributors to this repository. All Rights Reserved.
-#
-# This software is protected by copyright law and by international treaties.
-# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
-# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
-# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
-# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
-# contributors to this repository retain full ownership rights in their respective contributions in
-# the software. This notice grants no license of any kind, including but not limited to patent
-# license, nor is any license granted by implication, estoppel or otherwise.
-#
-# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
-# contributions.
-#
-# This software is provided "AS IS", without any express or implied warranties. The software is in the
-# development stage. It is intended exclusively for experts who have experience with such software and
-# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
-# and fitness for a particular purpose are hereby disclaimed and excluded.
-#
-# Any dispute, controversy or claim arising under or in relation to providing this software shall be
-# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
-# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
-# the United Nations Convention on Contracts on the International Sales of Goods.
-#
-
-import logging
-import struct
-from pathlib import Path
-from typing import Optional, Tuple, Union
-
-import numpy as np
-import scipy.io.wavfile as wav
-
-from .audioarray import trim, window
-
-logger = logging.getLogger("__main__")
-logger.setLevel(logging.DEBUG)
-
-
-def read(
-    filename: Union[str, Path],
-    nchannels: Optional[int] = 1,
-    fs: Optional[int] = 48000,
-    outdtype: Optional[str] = "float",
-) -> Tuple[np.ndarray, int]:
-    """
-    Read audio file (.pcm, .wav or .raw)
-
-    Parameters
-    ----------
-    filename: str
-        Input file path
-    nchannels: Optional[int]
-        Number of input channels, required for .pcm, otherwise default = 1
-    fs: Optional[int]
-        Input sampling rate, required for .pcm input file, otherwise default = 48000 (Hz)
-    outdtype: Optional[str]
-        Data type of output array, python builtin or np.dtype
-
-    Returns
-    -------
-    x: np.ndarray
-        audio signal array
-    fs: int
-        signal sampling frequency
-    """
-
-    file_extension = Path(filename).suffix
-
-    if file_extension == ".wav":
-        fs, data = wav.read(filename)
-        if data.dtype == np.int32:
-            data = np.interp(
-                data,
-                (np.iinfo(np.int32).min, np.iinfo(np.int32).max),
-                (np.iinfo(np.int16).min, np.iinfo(np.int16).max),
-            )
-        elif data.dtype == np.float32:
-            data = np.interp(
-                data,
-                (-1, 1),
-                (np.iinfo(np.int16).min, np.iinfo(np.int16).max),
-            )
-        x = np.array(data, dtype=outdtype)
-        file_len = x.shape[0]
-        if x.ndim == 1:
-            # force to be a matrix
-            x = np.reshape(x, (file_len, 1))
-    elif file_extension in [".pcm", ".raw"]:
-        x = np.fromfile(filename, dtype=np.int16).astype(outdtype)
-        signal_len = len(x) // nchannels
-        try:
-            x = x.reshape(signal_len, nchannels)
-        except ValueError:
-            raise ValueError("Wrong number of channels")
-    else:
-        raise ValueError("Wrong input format. Use wav, pcm or raw")
-
-    return x, fs
-
-
-def write(
-    filename: Union[str, Path],
-    x: np.ndarray,
-    fs: Optional[int] = 48000,
-    dtype: Optional[str] = "int16",
-) -> None:
-    """
-    Write audio file (.pcm, .wav or .raw)
-
-    Parameters
-    ----------
-    filename: str
-        Output file path (.pcm, .wav or .raw)
-    x: np.ndarray
-        Numpy 2D array of dimension: number of samples x number of channels
-    fs: Optional[int]
-        Sampling rate, only used for .wav output, default = 48000 (Hz)
-    dtype: Optional[str]
-        Data type format used for .pcm or .raw output, default = 'int16'
-
-    Returns
-    -------
-    None
-    """
-
-    file_extension = Path(filename).suffix
-
-    clipped_samples = np.sum(
-        np.logical_or(x < np.iinfo(np.int16).min, x > np.iinfo(np.int16).max)
-    )
-    if clipped_samples > 0:
-        logger.warning(f"  Warning: {clipped_samples} samples clipped")
-        x = np.clip(x, np.iinfo(np.int16).min, np.iinfo(np.int16).max)
-
-    if file_extension == ".wav":
-        x = x.astype(np.int16)
-        wav.write(filename, fs, x)
-    elif file_extension == ".pcm" or file_extension == ".raw":
-        x = x.astype(dtype).reshape(-1, 1)
-        x.tofile(filename)
-    else:
-        raise ValueError("Wrong input format. Use wav, pcm or raw")
-
-
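A minimal read/write round trip (hypothetical scratch file; `write` clips and converts to 16-bit, `read` returns an (n_samples, n_channels) float array):

    import numpy as np

    x = np.random.randn(48000, 2) * 1000.0  # 1 s of quiet stereo noise
    write("scratch.wav", x, fs=48000)
    y, fs = read("scratch.wav")
    print(fs, y.shape)                      # 48000 (48000, 2)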
Use wav, pcm or raw") - - -def concat( - in_filenames: list, - out_file: str, - silence_pre: Optional[int] = 0, - silence_post: Optional[int] = 0, - in_fs: Optional[int] = 48000, - num_channels: Optional[int] = None, - pad_noise: Optional[bool] = False, - preamble: Optional[int] = None, - pad_noise_preamble: Optional[bool] = False, -) -> list: - """ - Horizontally concatenates audio files into one long file - - Parameters - __________ - in_filenames: list - Input list of filenmames (.pcm, .raw or .wav) - out_file: str - Output multi-channel audio file name (.pcm, .raw or .wav) - silence_pre: int - Padded zeros before signal in samples - silence_post: int - Padded zeros after signal in samples - in_fs: Optional[int] - Input sampling rate, default 48000 Hz - pad_noise: Optional[bool] - If true noise will be padded otherwise zeros will be padded - - Returns - ------- - splits - List of sample indices to split the resulting file at - """ - - y = None - fs_compare = 0 - - # create a list of splits - splits = [0] - - # Read input files - for in_file in in_filenames: - x, fs = read(in_file, fs=in_fs, nchannels=num_channels) - if fs_compare and fs_compare != fs: - raise ValueError("Sampling rates of files to concatenate don't match") - else: - fs_compare = fs - - # pad with very low amplitude noise - x = trim( - x, in_fs, (-silence_pre, -silence_post), samples=True, pad_noise=pad_noise - ) - - # add the length to our splits list - splits.append(splits[-1] + x.shape[0]) - - # concatenate - y = np.concatenate([y, x]) if y is not None else x - - # add preamble - if preamble: - y = trim(y, in_fs, (-preamble, 0), pad_noise_preamble) - - write(out_file, y, fs=in_fs) - - return splits[1:] - - -def split( - in_filename: Union[str, Path], - out_folder: Union[str, Path], - split_filenames: list[Union[str, Path]], - splits: list[int], - in_fs: Optional[int] = 48000, - preamble: Optional[int] = 0, - loudness: Optional[float] = None, -) -> list[Union[str, Path]]: - """ - Horizontally splits audio files into multiple shorter files and applies windowing and scaling - - Parameters - __________ - in_filename: Union[str, Path] - Input filenmame (.pcm, .raw or .wav) - out_folder: Union[str, Path] - Output folder where to put the splits - split_filenames: list[Union[str, Path]] - List of names for the split files - splits: list[int] - List of sample indices where to cut the signal - in_fs: Optional[int] - Input sampling rate, default 48000 Hz - loudness: Optional[float] - Desired loudness of individual files - """ - - # create a list of output files - out_paths = [] - - # Read input file - x, fs = read(in_filename, fs=in_fs) - - # remove preamble - if preamble: - x = trim(x, fs, (preamble, 0)) - - split_old = 0 - for idx, split in enumerate(splits): - out_file = Path(out_folder) / Path(split_filenames[idx]).with_suffix( - in_filename.suffix - ) - - # add the path to our list - out_paths.append(out_file) - - # split - y = x[split_old:split, :] - - # windowing - y = window(y) - - # write file - write(out_file, y, fs=in_fs) - - split_old = split - - return out_paths - - -def combine( - in_filenames: list, - out_file: str, - in_fs: Optional[int] = 48000, -) -> None: - """ - Combines audio files into one multi-channel file - - Parameters - ---------- - in_filenames: list - Input list of filenmames (.pcm, .raw or .wav) - out_file: str - Output multi-channel audio file name (.pcm, .raw or .wav) - in_fs: Optional[int] - Input sampling rate, required for .pcm and .raw input file, default 48000 Hz - - Returns - ------- - None 
- """ - - y = None - fs_compare = 0 - - # Read input files - for in_file in in_filenames: - # assign correct channel - x, fs = read(in_file, fs=in_fs) - if fs_compare and fs_compare != in_fs: - raise ValueError("Sampling rates of files to combine don't match") - else: - fs_compare = fs - if y is None: - y = x - else: - if x.shape[0] > y.shape[0]: - x = x[: y.shape[0], :] - elif y.shape[0] > x.shape[0]: - y = y[: x.shape[0], :] - y = np.column_stack([y, x]) - - write(out_file, y, fs=in_fs) - - -def split_channels( - in_file: str, - out_filenames: list, - in_nchans: int, - in_fs: Optional[int] = 48000, -) -> None: - """ - Split multi-channel audio files into individual mono files - - Parameters - ---------- - in_file: str - Input file name (.pcm, .raw or .wav) - out_filenames: list - List of output file names (.pcm, .raw or .wav) - in_nchans: int - Input number of channels - in_fs: Optional[int] = 48000 - Input sampling rate, default 48000 Hz - - Returns - ------- - None - """ - - # validation - if in_nchans is None: - raise ValueError("Number of channels to split must be specified!") - if in_nchans != len(out_filenames): - print( - "Split: Mismatch between number of channels and output filenames length. Truncating output filenames list." - ) - out_filenames = out_filenames[:in_nchans] - - x, in_fs = read(in_file, nchannels=in_nchans, fs=in_fs) - - # Write output files - for idx, out_file in enumerate(out_filenames): - # extract correct channel - y = x[:, idx] - - write(out_file, y, fs=in_fs) - - -def parse_wave_header( - filename: str, -) -> dict: - """ - Get the format information from a WAV file. - Return a dictionary with the format information - - Parameters - ---------- - filename : string or open file handle - Input WAV file. - - Returns - ------- - Dictionary - """ - - with open(filename, "rb") as fid: - riff = fid.read(4) - - if riff == b"RIFF": - binary_format = "<" - elif riff == b"RIFX": - binary_format = ">" - else: - raise IOError("No RIFF chunk found!") - - wav_size = struct.unpack(f"{binary_format}I", fid.read(4))[0] - - wav_identifier = fid.read(4) - if wav_identifier != b"WAVE": - raise IOError("No WAVE chunk found!") - - fmt_chunk_id = fid.read(4) - - if fmt_chunk_id == b"fmt ": - fmt_size = struct.unpack(f"{binary_format}I", fid.read(4))[0] - wav_format = struct.unpack(f"{binary_format}H", fid.read(2))[0] - channels = struct.unpack(f"{binary_format}H", fid.read(2))[0] - fs = struct.unpack(f"{binary_format}I", fid.read(4))[0] - bytes_per_second = struct.unpack(f"{binary_format}I", fid.read(4))[0] - block_align = struct.unpack(f"{binary_format}H", fid.read(2))[0] - bit_depth = struct.unpack(f"{binary_format}H", fid.read(2))[0] - rem_bytes = fmt_size - 16 - ext_param_size = 0 - ext_param = None - if rem_bytes: - ext_param_size = struct.unpack(f"{binary_format}H", fid.read(2))[0] - - if ext_param_size: - ext_param = fid.read(ext_param_size) - else: - raise IOError("Missing or corrupt fmt chunk!") - - return { - "size": wav_size, - "format_tag": wav_format, - "channels": channels, - "fs": fs, - "bytes_per_second": bytes_per_second, - "block_align": block_align, - "bit_depth": bit_depth, - "ext_param_size": ext_param_size, - "ext_param": ext_param, - } diff --git a/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_FULL.mat b/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_FULL.mat deleted file mode 100644 index 42e702db..00000000 --- 
a/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_FULL.mat +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a3ddecef64dfcf8887904b5cc370c0d9723bd8fd1637e32232205cdcd739b80d -size 12623190 diff --git a/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_LS.mat b/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_LS.mat deleted file mode 100644 index 1d590edb..00000000 --- a/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_LS.mat +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e2c964b96d802532c0ecf1076092c7d246a54293a3a0c4c72995953c66bfec71 -size 6348499 diff --git a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA1.mat b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA1.mat deleted file mode 100644 index 4f59a8a9..00000000 --- a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA1.mat +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3a9ad5d8d874ac2fb851f5d2b0b303494f1d115612e9f6cab40e5eb33591b05c -size 4630 diff --git a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA2.mat b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA2.mat deleted file mode 100644 index 1ad2162a..00000000 --- a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA2.mat +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6fc2a15579b80493597a8096bd815e8b847fe1880bdba760d4405122878b0b0a -size 10323 diff --git a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA3.mat b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA3.mat deleted file mode 100644 index 0e7c3ef4..00000000 --- a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA3.mat +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:83822cfa090c345a6ece14d1ec1a92023626f467e2f8d982cf099c071dfc1080 -size 18229 diff --git a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_FULL.mat b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_FULL.mat deleted file mode 100644 index a2ab24e5..00000000 --- a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_FULL.mat +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf86a03f0b13932c5c138af22584f864b75c5733df1b01ac3fdf7750a1bdbe5f -size 14335913 diff --git a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_LS.mat b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_LS.mat deleted file mode 100644 index 65c2684c..00000000 --- a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_LS.mat +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2e25ef101e9e72c5d70a55bc1451a07d041d29f96a803d7d3f968f20fe403316 -size 20190 diff --git a/item_generation_scripts/audiotools/binaural_datasets/README.txt b/item_generation_scripts/audiotools/binaural_datasets/README.txt deleted file mode 100644 index 9fd37c96..00000000 --- a/item_generation_scripts/audiotools/binaural_datasets/README.txt +++ /dev/null @@ -1,34 +0,0 @@ -Files in this directory should contain impulse responses for use in rendering, in Matlab .mat format -A sampling rate of 48 kHz is assumed - -Files should adhere to the following naming scheme: - -{HRIR|BRIR}_{DATASETNAME}_{FULL|LS|SBA(1-3)}.mat - -- HRIR or BRIR - specifies the type of impulse response which will be used - for either BINAURAL or BINAURAL_ROOM output, respectively -- DATASETNAME - specifies the name used with the binaural_dataset command-line argument - or YAML key to enable selection of this dataset -- FULL or LS or SBA(1-3) - specifies the subset of impulse responses in the file: - FULL: all available measurements on the sphere - LS: superset of supported loudspeaker layouts - (see audiotools.constants.CHANNEL_BASED_AUDIO_FORMATS["LS"]) - SBA(1-3): impulse responses transformed to ambisonics by external conversion - if available, SBA1 is used for FOA, SBA2 for HOA2 and SBA3 for HOA3 - if not available, SBA3 is used and truncated for all Ambisonic formats - -Each Matlab file should contain the following variables: -- IR - Impulse responses with dimensions [ir_length x n_ears x n_channels] -- SourcePosition - array of {azimuth, elevation, radius} of dimensions [n_channels x 3] - required for FULL, optional otherwise -- latency_s - latency of the dataset in samples - optional, will be estimated if not provided - -LICENSES: -Please see HRIR.txt and BRIR.txt for license info \ No newline at end of file diff --git a/item_generation_scripts/audiotools/binaural_datasets/__init__.py b/item_generation_scripts/audiotools/binaural_datasets/__init__.py deleted file mode 100644 index aea270d8..00000000 --- a/item_generation_scripts/audiotools/binaural_datasets/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded.
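To illustrate the naming scheme and the required .mat variables from the README above, here is a minimal validation sketch; the helper check_binaural_dataset is hypothetical and not part of this patch:

    from pathlib import Path
    from scipy.io import loadmat

    def check_binaural_dataset(path: Path) -> None:
        # Hypothetical helper: check a dataset file against the README conventions.
        parts = path.stem.split("_")  # e.g. HRIR_ORANGE53_Dolby_SBA1
        prefix, dataset, subset = parts[0], "_".join(parts[1:-1]), parts[-1]
        assert prefix in ("HRIR", "BRIR"), "type must be HRIR or BRIR"
        assert subset in ("FULL", "LS", "SBA1", "SBA2", "SBA3"), "unknown subset"

        mat = loadmat(path)
        ir = mat["IR"]  # [ir_length x n_ears x n_channels], mandatory
        assert ir.ndim == 3 and ir.shape[1] == 2, "expected impulse responses for two ears"
        if subset == "FULL":
            # SourcePosition is required for FULL subsets, optional otherwise
            assert mat["SourcePosition"].shape == (ir.shape[2], 3)
        # latency_s is optional; loaders may estimate it from the IR peaks if absent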
-# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# diff --git a/item_generation_scripts/audiotools/binaural_datasets/binaural_dataset.py b/item_generation_scripts/audiotools/binaural_datasets/binaural_dataset.py deleted file mode 100644 index e6c4dbe7..00000000 --- a/item_generation_scripts/audiotools/binaural_datasets/binaural_dataset.py +++ /dev/null @@ -1,288 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
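Before the implementation, a brief usage sketch of the loaders defined in this file, assuming the package and its bundled ORANGE53 dataset are available (the call site itself is illustrative):

    from item_generation_scripts.audiotools.binaural_datasets.binaural_dataset import (
        find_ir,
        load_ir,
    )

    # Load HRIRs for rendering one audio object (FULL sphere subset) to BINAURAL,
    # then pick the measurement closest to 30 degrees azimuth, 0 degrees elevation.
    IR, SourcePosition, latency_smp = load_ir("ISM1", "BINAURAL")
    idx, dist = find_ir(SourcePosition, azi=30.0, ele=0.0, num_filter=1)
    hrir_pair = IR[:, :, idx[0]]  # [ir_length x 2]; latency_smp samples of delay apply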
-# - -import warnings -from pathlib import Path -from typing import Optional, Tuple, Union - -import numpy as np -from scipy.io import loadmat - -from item_generation_scripts.audiotools.audio import fromtype -from item_generation_scripts.audiotools.constants import ( - CHANNEL_BASED_AUDIO_FORMATS, - OBJECT_BASED_AUDIO_FORMATS, - SCENE_BASED_AUDIO_FORMATS, -) -from item_generation_scripts.audiotools.EFAP import wrap_angles - - -def load_hrtf( - filename: Union[str, Path], -) -> Tuple[np.ndarray, np.ndarray, int]: - """ - Read HRTFs from Matlab dictionary file mat - - Parameters - ---------- - filename: str - HRTFs file name (.mat) - - Returns - ------- - IR: np.ndarray - Array of impulse responses - SourcePosition: np.ndarray - Array of source positions corresponding to the impulse responses - latency_s: int - Latency in samples - """ - - if not filename.exists(): - raise FileNotFoundError( - f"File {filename.name} was not found in dataset folder!" - ) - - mat_contents = loadmat(filename) - - try: - IR = mat_contents["IR"] - except KeyError: - raise KeyError(f"Key 'IR' not found in .mat file: {filename} !") - - SourcePosition = mat_contents.get("SourcePosition") - latency_s = mat_contents.get("latency_s") - if latency_s is not None: - latency_s = latency_s.astype(np.int32)[0, 0] - - return IR, SourcePosition, latency_s - - -def load_ir( - in_fmt: str, - out_fmt: str, - dataset: Optional[str] = None, -) -> Tuple[np.ndarray, np.ndarray, int]: - """ - Load IRs for a specified rendering format - - Parameters - ---------- - in_fmt: str - Input format - out_fmt: str - Output format - dataset: Optional[str] - Name of desired dataset without prefix and suffix - - Returns - ------- - IR: np.ndarray - Array of impulse responses - SourcePosition: np.ndarray - Array of source positions corresponding to the impulse responses - latency_smp: int - Latency in samples - """ - - dataset_prefix = None - dataset_suffix = None - - if out_fmt.startswith("BINAURAL") and "ROOM" in out_fmt: - dataset_prefix = "BRIR" - if dataset is None: - dataset = "IISofficialMPEG222UC" - - if in_fmt.startswith("MOZART"): - dataset_suffix = "FULL" - elif in_fmt in CHANNEL_BASED_AUDIO_FORMATS.keys(): - dataset_suffix = "LS" - - elif out_fmt.startswith("BINAURAL"): - dataset_prefix = "HRIR" - if dataset is None: - dataset = "ORANGE53" - - if in_fmt in OBJECT_BASED_AUDIO_FORMATS.keys() or in_fmt.startswith( - "CUSTOM_LS" - ): - dataset_suffix = "FULL" - elif in_fmt in CHANNEL_BASED_AUDIO_FORMATS.keys() and in_fmt != "MONO": - dataset_suffix = "LS" - elif in_fmt in SCENE_BASED_AUDIO_FORMATS.keys(): - dataset = "ORANGE53_Dolby" - if in_fmt == "SBA1" or in_fmt == "FOA": - dataset_suffix = "SBA1" - # Use truncated SBA3 dataset if no SBA1 or 2 dataset exists - if not ( - Path(__file__).parent.joinpath( - f"{dataset_prefix}_{dataset}_{dataset_suffix}.mat" - ) - ).is_file(): - dataset_suffix = "SBA3" - warnings.warn("No SBA1 dataset found -> use truncated SBA3 dataset") - elif in_fmt.endswith("2"): - dataset_suffix = "SBA2" - # Use truncated SBA3 dataset if no SBA1 or 2 dataset exists - if not ( - Path(__file__).parent.joinpath( - f"{dataset_prefix}_{dataset}_{dataset_suffix}.mat" - ) - ).is_file(): - dataset_suffix = "SBA3" - warnings.warn("No SBA2 dataset found -> use truncated SBA3 dataset") - else: - dataset_suffix = "SBA3" - - path_dataset = Path(__file__).parent.joinpath( - f"{dataset_prefix}_{dataset}_{dataset_suffix}.mat" - ) - IR, SourcePosition, latency_s = load_hrtf(path_dataset) - - if latency_s is not None: - latency_smp = 
latency_s - else: - latency_smp = int(np.min(np.argmax(np.sum(np.abs(IR), axis=1), axis=0))) - warnings.warn( - f"No latency of HRTF dataset specified in {path_dataset} file -> computed latency: {latency_smp} sample(s)" - ) - - if in_fmt.startswith("STEREO"): - IR = IR[:, :, :2] # use L and R channels. - elif ( - in_fmt in CHANNEL_BASED_AUDIO_FORMATS.keys() - and not in_fmt.startswith("CUSTOM_LS") - and not in_fmt.startswith("MOZART") - ): - # extract positions from the loudspeaker file - in_fmt = fromtype(in_fmt) - tmp_fmt = fromtype("LS") - - IR_tmp = IR.copy() - IR = np.zeros([IR_tmp.shape[0], IR_tmp.shape[1], in_fmt.num_channels]) - - ir_index = 0 - for i in range(tmp_fmt.num_channels): - for j in range(in_fmt.num_channels): - if ( - tmp_fmt.ls_azi[i] == in_fmt.ls_azi[j] - and tmp_fmt.ls_ele[i] == in_fmt.ls_ele[j] - ): - if j != in_fmt.lfe_index[0]: - IR[:, :, ir_index] = IR_tmp[:, :, i] - ir_index += 1 - - return IR, SourcePosition, latency_smp - - -def find_ir( - SourcePosition: np.ndarray, - azi: float, - ele: float, - num_filter: Optional[int] = None, -) -> Tuple[np.ndarray, np.ndarray]: - """ - Find HRTF measurement closest to the selected direction - - Parameters - ---------- - SourcePosition: np.ndarray - Source IR positions - azi: float - Desired response azimuth - ele: float - Desired response elevation - num_filter: Optional[int] - Number of filters to return, if None return all - - Returns - ------- - i_dir: np.ndarray - Indices of nearest SourcePositions - dist_sort: np.ndarray - Distances corresponding to the indices - """ - - dist = dist_on_sphere(SourcePosition, azi, ele) - - if num_filter is None: - i_dir = np.argsort(dist) - dist_sort = np.sort(dist) - else: - i_dir = np.argsort(dist)[:num_filter] - dist_sort = np.sort(dist)[:num_filter] - - return i_dir, dist_sort - - -def dist_on_sphere( - positions: np.ndarray, - azi: float, - ele: float, -) -> np.ndarray: - """ - Compute great-circle distance - - Parameters - ---------- - positions: np.ndarray - Source IR positions - azi: float - Desired response azimuth - ele: float - Desired response elevation - - Returns - ------- - dist: np.ndarray - Distances from desired point - """ - - azi, ele = wrap_angles(azi, ele) - - delta_azi = np.deg2rad(np.abs(azi - positions[:, 0])) - - # compute great circle distance - a = np.sin(np.deg2rad(positions[:, 1])) * np.sin(np.deg2rad(ele)) + np.cos( - np.deg2rad(positions[:, 1]) - ) * np.cos(np.deg2rad(ele)) * np.cos(delta_azi) - if np.max(a) > 1.001 or np.min(a) < -1.001: - raise ValueError( - f"Absolute distance value larger than one! Min: {np.min(a)}, Max: {np.max(a)}" - ) - - # limiting to prevent errors in arccos due to numerical inaccuracies - a[a > 1] = 1 - a[a < -1] = -1 - dist = np.arccos(a) - - return dist diff --git a/item_generation_scripts/audiotools/binauralobjectrenderer.py b/item_generation_scripts/audiotools/binauralobjectrenderer.py deleted file mode 100644 index 548c4921..00000000 --- a/item_generation_scripts/audiotools/binauralobjectrenderer.py +++ /dev/null @@ -1,652 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. 
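As a quick numerical sanity check of the great-circle formula in dist_on_sphere above (the spherical law of cosines): two points 90 degrees apart on the equator should be pi/2 radians apart.

    import numpy as np

    positions = np.array([[90.0, 0.0, 1.0]])  # one measurement at azimuth 90, elevation 0
    azi, ele = 0.0, 0.0                       # desired direction

    delta_azi = np.deg2rad(np.abs(azi - positions[:, 0]))
    a = np.sin(np.deg2rad(positions[:, 1])) * np.sin(np.deg2rad(ele)) + np.cos(
        np.deg2rad(positions[:, 1])
    ) * np.cos(np.deg2rad(ele)) * np.cos(delta_azi)
    dist = np.arccos(np.clip(a, -1.0, 1.0))
    print(dist)  # [1.5707963...], i.e. pi/2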
-# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -import itertools -from itertools import repeat -from typing import Optional, Tuple - -import numpy as np -from scipy.signal import convolve - -from item_generation_scripts.audiotools.binaural_datasets.binaural_dataset import ( - find_ir, -) -from item_generation_scripts.audiotools.constants import IVAS_FRAME_LEN_MS -from item_generation_scripts.audiotools.EFAP import wrap_angles -from item_generation_scripts.utils import apply_func_parallel - - -def barycentric_weights( - azi_deg: np.ndarray, - ele_deg: np.ndarray, - pos_in: np.ndarray, - interp_1d: Optional[bool] = False, -) -> Tuple[int, int, int]: - """ - Computation of spherical barycentric weights - Implementation based on the paper "Spherical Barycentric Coordinates" - by T. Langer, A. Belyaev and H.
Seidel - - Parameters - ---------- - azi_deg: np.ndarray - Azimuthal coordinates of three points that form a triangle in degrees - ele_deg: np.ndarray - Elevation coordinates of three points that form a triangle in degrees - pos_in: np.ndarray - Azimuthal and elevation coordinates in degrees for point to compute weights - interp_1d: bool - 1d interpolation between two points - - Returns - ------- - W_1, W_2, W_3: scalar values - Barycentric weights for corresponding vertices - """ - - # check if point is equal to vertex - for k in range(3): - if azi_deg[k] == pos_in[0] and ele_deg[k] == pos_in[1]: - output = np.zeros(3) - output[k] = 1 - return tuple(output) - - pos = np.copy(pos_in) - - pos[0], pos[1] = wrap_angles(pos[0], pos[1]) - - # convert to radians - ele = ( - -np.deg2rad(ele_deg, dtype="float64") + np.pi / 2 - ) # different definition of elevation in metadata - azi = np.deg2rad(azi_deg, dtype="float64") - pos[0] = np.deg2rad(pos[0]) - pos[1] = -np.deg2rad(pos[1]) + np.pi / 2 - - """ spherical barycentric coordinates """ - - # convert to cartesian coordinates - x = np.sin(ele) * np.cos(azi) - y = np.sin(ele) * np.sin(azi) - z = np.cos(ele) - pos_x = np.sin(pos[1]) * np.cos(pos[0]) - pos_y = np.sin(pos[1]) * np.sin(pos[0]) - pos_z = np.cos(pos[1]) - - pos_cart = np.array([pos_x, pos_y, pos_z]) - v_1 = np.array([x[0], y[0], z[0]]) - v_2 = np.array([x[1], y[1], z[1]]) - v_3 = np.array([x[2], y[2], z[2]]) - - # rotate coordinate system - unit = np.array([0, 0, 1]) - a = np.cross(pos_cart, unit) - b = np.dot(pos_cart, unit) - a_matrix = np.array([[0, -a[2], a[1]], [a[2], 0, -a[0]], [-a[1], a[0], 0]]) - if b == -1: - rot_matrix = np.eye(3, 3) # a and b point to opposite directions - else: - rot_matrix = np.eye(3, 3) + a_matrix + np.dot(a_matrix, a_matrix) / (1 + b) - - v_1 = rot_matrix @ v_1 - v_2 = rot_matrix @ v_2 - v_3 = rot_matrix @ v_3 - # test_vec = rot_matrix @ pos_cart # should be [0, 0, 1] - - # scale vertices to tangent plane - v_1_plane = v_1 / v_1[2] - v_2_plane = v_2 / v_2[2] - v_3_plane = v_3 / v_3[2] - eps = 10**-10 - - # compute planar barycentric coordinates - denom = (v_2_plane[1] - v_3_plane[1]) * (v_1_plane[0] - v_3_plane[0]) + ( - v_3_plane[0] - v_2_plane[0] - ) * (v_1_plane[1] - v_3_plane[1]) - # denom is proportional to area of triangle -> when area is zero, use linear 1d interpolation - if abs(denom) <= 10**-15: - interp_1d = True - - if not interp_1d: - W_1_plane = ( - (v_2_plane[1] - v_3_plane[1]) * (0 - v_3_plane[0]) - + (v_3_plane[0] - v_2_plane[0]) * (0 - v_3_plane[1]) - ) / (denom + eps) - W_2_plane = ( - (v_3_plane[1] - v_1_plane[1]) * (0 - v_3_plane[0]) - + (v_1_plane[0] - v_3_plane[0]) * (0 - v_3_plane[1]) - ) / (denom + eps) - W_3_plane = 1 - W_1_plane - W_2_plane - else: - v_diff = np.array( - [v_1_plane[:-1], v_2_plane[:-1], v_3_plane[:-1]] - ) # z entry always one - dist_all = np.linalg.norm(v_diff, axis=1) - v_diff_norm = np.divide(v_diff, dist_all[:, None]) - dot_v_ind = np.array( - [[0, 1], [1, 2], [2, 0]] - ) # the three possible combinations of points - # compute dot product between all vertices to find pairs that lie in opposite directions w.r.t.
the point - # in this case the dot product is -1 (due to normalization) - dot = np.empty(3) - k = 0 - for ind_i, ind_j in dot_v_ind: - dot[k] = np.dot(v_diff_norm[ind_i], v_diff_norm[ind_j]) - k += 1 - - margin = 10**-5 - indices_minus_one = np.array(np.abs(dot + 1) < margin) - if indices_minus_one.any(): # test if one entry is -1 - v_ind = dot_v_ind[indices_minus_one] - # use vertex pair with smallest distance from origin (current position) - if np.shape(v_ind)[0] >= 2: - used_vertices = v_ind[ - np.argmin( - np.array([sum(dist_all[v_ind[0]]), sum(dist_all[v_ind[1]])]) - ) - ] - else: - used_vertices = v_ind[0] - dist = dist_all[used_vertices[0]] / sum(dist_all[used_vertices]) - if 0 in used_vertices and 1 in used_vertices: - W_1_plane = 1 - dist - W_2_plane = dist - W_3_plane = 0 - elif 1 in used_vertices and 2 in used_vertices: - W_1_plane = 0 - W_2_plane = 1 - dist - W_3_plane = dist - elif 2 in used_vertices and 0 in used_vertices: - W_1_plane = dist - W_2_plane = 0 - W_3_plane = 1 - dist - else: - raise ValueError("problem in 1d interpolation") - else: - # point does not lie on line spanned by two of the points - W_1_plane = -1 - W_2_plane = -1 - W_3_plane = -1 - - # compute spherical weights from planar weights - W_1 = W_1_plane * np.dot(v_1, v_1_plane) - W_2 = W_2_plane * np.dot(v_2, v_2_plane) - W_3 = W_3_plane * np.dot(v_3, v_3_plane) - - # avoid rejection of triangles due to numerical errors since point lies on edge of triangle - threshold_error = -1 * 10**-8 - if threshold_error < W_1 < 0: - W_1 = 0 - if threshold_error < W_2 < 0: - W_2 = 0 - if threshold_error < W_3 < 0: - W_3 = 0 - - return W_1, W_2, W_3 - - -def get_tri_weights( - pos: np.ndarray, - SourcePosition: np.ndarray, -) -> Tuple[np.ndarray, np.ndarray]: - """ - Finds a suitable triangle of data points on the surface in which the defined point lies - - Parameters - ---------- - pos: np.ndarray - Point of interest given as [azimuthal, elevation] - SourcePosition: np.ndarray - Positions of the source in the measurements in IR - - Returns - ------- - combination_vertices: np.ndarray - Indices of the three vertices in SourcePosition - W: np.ndarray - Barycentric weights of point in triangle; - if negative, no suitable triangle was found - """ - - W_1, W_2, W_3 = -1, -1, -1 - index_triangle = 3 - # get indices of source positions sorted by distance on the plane from pos - index_vertices, _ = find_ir(SourcePosition, pos[0], pos[1]) - pos = np.array(wrap_angles(pos[0], pos[1])) - combination_vertices = None - while W_1 < 0 or W_2 < 0 or W_3 < 0: - if ( - SourcePosition[index_vertices[0], 0] == pos[0] - and SourcePosition[index_vertices[0], 1] == pos[1] - ): - # if the position coincides with a position in the data set, take the first triangle that includes the point - combination_vertices = index_vertices[:3] - W_1, W_2, W_3 = (1, 0, 0) - break - index_HRIR = index_vertices[:index_triangle] # get nearest positions - y_ele_all = SourcePosition[index_HRIR, 1] - if pos[1] > np.max(y_ele_all) or pos[1] < np.min(y_ele_all): - # no need to compute weights since all possible triangles lie completely above or below point - # attention: this can be problematic if no point is available at [0, +-90] - pass - else: - # test all triangle combinations with new point - for combination_vertices_tmp in itertools.combinations(index_HRIR[:-1], 2): - combination_vertices = np.concatenate( - (index_HRIR[-1, None], combination_vertices_tmp), axis=0 - ) - - x_azi = SourcePosition[combination_vertices, 0] - y_ele = SourcePosition[combination_vertices, 1] - W_1, W_2, W_3 =
barycentric_weights(x_azi, y_ele, pos) - if W_1 >= 0 and W_2 >= 0 and W_3 >= 0: - # found suitable triangle - break - index_triangle += 1 - if index_triangle > 30: - # stop after too many iterations - return np.array(combination_vertices), np.array([-1, -1, -1]) - - W = np.array([W_1, W_2, W_3]) - return np.array(combination_vertices), W - - -def interpolate_2d( - azi_in: np.ndarray, - ele_in: np.ndarray, - values: np.ndarray, - pos: np.ndarray, - interp_1d: Optional[bool] = False, - weights: Optional[np.ndarray] = None, - ghost: Optional[list[bool]] = None, - SourcePosition: Optional[np.ndarray] = None, - IR: Optional[np.ndarray] = None, - phase: Optional[bool] = False, -) -> np.ndarray: - """ - Compute HRIR for point on surface spanned by three points via barycentric coordinates - - Parameters - ---------- - azi_in: np.ndarray - Azimuthal coordinates of three points that form a triangle in degrees - ele_in: np.ndarray - Elevation coordinates of three points that form a triangle in degrees - values: np.ndarray - Values to interpolate, here either HRIRs or magnitude or phase of HRTFs - pos: np.ndarray - Position of desired interpolation value - interp_1d: bool - 1d interpolation between two points - weights: tuple - If barycentric weights are already known these values are used - ghost: list of bool - If north and/or south pole is ghost source - SourcePosition: np.ndarray - Only necessary if at least one element in ghost is true - IR: np.ndarray - Only necessary if at least one element in ghost is true - phase: bool - If interpolated values are phases and should be wrapped - - Returns - ------- - HRIR: np.ndarray - Interpolated value at point pos - """ - - if ghost is None: - ghost = [False, False] - - if weights is None: - W_1, W_2, W_3 = barycentric_weights( - azi_in, ele_in, pos, interp_1d - ) # compute barycentric weights - else: - (W_1, W_2, W_3) = weights - - if ( - W_1 + W_2 + W_3 > 1.5 - ): # on sphere sum of weights is not necessarily equal to one! - raise ValueError( - f"Sum of positive barycentric weights larger than expected: {W_1 +W_2 +W_3}" - ) - - threshold_error = -1 * 10**-10 - if W_1 < threshold_error or W_2 < threshold_error or W_3 < threshold_error: - raise ValueError("Point lies outside of triangle! 
No interpolation possible") - - # do some phase unwrapping - if phase: - values = np.unwrap(values, axis=1) - - # treat potential ghost sources at the north and south pole - if (ghost[0] and 90 in ele_in) or (ghost[1] and -90 in ele_in): - if SourcePosition is None or IR is None: - raise ValueError( - "Source positions and IRs are required in interpolation if ghost source is used" - ) - ele_ghost = [] - additional_term = 0 - weights_copy = np.copy(weights) - if ghost[0] and 90 in ele_in: - ele_ghost.append(90) - if ghost[1] and -90 in ele_in: - ele_ghost.append(-90) - for ele_g in ele_ghost: - ind_dist, dist = find_ir(SourcePosition[: -len(ele_ghost)], 0, ele_g) - ind_dist = ind_dist[dist == dist[0]] - weight_spread = weights_copy[ele_in == ele_g] / len(ind_dist) - weights_copy[ele_in == ele_g] = 0 - additional_term += np.sum(IR[:, ind_dist], axis=1) * weight_spread - - HRIR = ( - values[:, 0] * W_1 - + values[:, 1] * W_2 - + values[:, 2] * W_3 - + additional_term - ) - - else: - HRIR = ( - values[:, 0] * W_1 + values[:, 1] * W_2 + values[:, 2] * W_3 - ) # apply weights - - return HRIR - - -def add_ghost_speaker_bary( - SourcePosition: np.ndarray, - IR: np.ndarray, -) -> Tuple[list[bool], np.ndarray, np.ndarray]: - """ - Adds a ghost speaker at the poles if necessary and indicates the result by bool values - - Parameters - ---------- - SourcePosition: np.ndarray - All source positions - IR: np.ndarray - IRs at corresponding source positions - - Returns - ------- - ghost_pos: list of bool - If entry is True a ghost speaker is introduced at the north or south pole, respectively - SourcePosition: np.ndarray - All source positions plus poles if ghost_pos is True - IR: np.ndarray - IRs at corresponding source positions - """ - - ghost_pos = [False, False] - if 90 not in SourcePosition[:, 1]: - # if north pole is not in dataset add it - ghost_pos[0] = True - pole = np.array([0, 90, 1]) - SourcePosition = np.concatenate((SourcePosition, pole[None, :]), axis=0) - IR = np.concatenate((IR, np.zeros((*np.shape(IR)[:2], 1))), axis=2) - if -90 not in SourcePosition[:, 1]: - # if south pole is not in dataset add it - ghost_pos[1] = True - pole = np.array([0, -90, 1]) - SourcePosition = np.concatenate((SourcePosition, pole[None, :]), axis=0) - IR = np.concatenate((IR, np.zeros((*np.shape(IR)[:2], 1))), axis=2) - - return ghost_pos, SourcePosition, IR - - -def binaural_fftconv_framewise( - x: np.ndarray, - IR: np.ndarray, - SourcePosition: np.ndarray, - azi: Optional[np.ndarray] = None, - ele: Optional[np.ndarray] = None, - frame_len: Optional[int] = (IVAS_FRAME_LEN_MS // 4) * 48, -) -> np.ndarray: - """ - Binauralization using fft convolution with frame-wise processing - supports rotation on trajectories with interpolation between measured source - positions, reimplemented roughly along the lines of ConvBinauralRenderer.m - - Parameters - ---------- - x: np.ndarray - Input multi-channel array - IR: np.ndarray - HRIRs array - SourcePosition: np.ndarray - Positions of the source in the measurements in IR - azi: np.ndarray - Azimuth angles for all frames - ele: np.ndarray - Elevation angles for all frames - frame_len: int - Frame length in samples, optional, default = (IVAS_FRAME_LEN_MS // 4) * 48 (240 samples, i.e. a 5 ms subframe at 48 kHz) - - Returns - ------- - y: np.ndarray - Output binaural signal array - """ - - sig_len = x.shape[0] - N_frames = int( - sig_len / frame_len - ) # TODO add ceil function for non-integer frame length multiples - num_points_interp = 3 # interpolation in triangle - - N_HRIR_taps = IR.shape[0] - - if azi is None or ele is None: -
azi = np.repeat([0.0], N_frames) - ele = np.repeat([0.0], N_frames) - elif len(azi) < N_frames or len(ele) < N_frames: - azi = np.concatenate( - [np.repeat(azi, N_frames // len(azi)), azi[: N_frames % len(azi)]] - ) - ele = np.concatenate( - [np.repeat(ele, N_frames // len(ele)), ele[: N_frames % len(ele)]] - ) - - indices_HRIR = np.empty([N_frames, num_points_interp], dtype=int) - IR_2d = np.empty((N_frames, N_HRIR_taps, 2, num_points_interp)) - Bary_weights = np.empty((N_frames, 3)) - - # find three points to form a triangle for interpolation - # test if point lies within triangle spanned by these points by checking the signs of the barycentric coordinates - # if all weights are >= 0 the point lies within the triangle - for index in range(np.shape(SourcePosition)[0]): - SourcePosition[index, 0:2] = np.array( - wrap_angles(SourcePosition[index, 0], SourcePosition[index, 1]) - ) - - # add ghost speaker to poles if necessary - ghost_pos, SourcePosition, IR = add_ghost_speaker_bary(SourcePosition, IR) - for i_frame in range(N_frames): - if ( - i_frame - and azi[i_frame] == azi[i_frame - 1] - and ele[i_frame] == ele[i_frame - 1] - ): - IR_2d[i_frame] = IR_2d[i_frame - 1] - indices_HRIR[i_frame] = indices_HRIR[i_frame - 1] - Bary_weights[i_frame] = Bary_weights[i_frame - 1] - continue - pos = np.array([azi[i_frame], ele[i_frame]]) - combination_vertices, W = get_tri_weights(pos, SourcePosition) - if (W < 0).all(): - raise ValueError("No suitable triangle found in frame " + str(i_frame)) - IR_2d[i_frame] = IR[:, :, np.array(combination_vertices)] - indices_HRIR[i_frame] = combination_vertices - Bary_weights[i_frame] = W - - T_rev = frame_len + N_HRIR_taps - 1 - N_rev = int(np.ceil(T_rev / frame_len)) - - fade_in = np.arange(frame_len) / (frame_len - 1) - fade_in = fade_in[:, np.newaxis] - fade_out = 1.0 - fade_in - - # compute both ears in parallel - i_ear = list(range(2)) - result = apply_func_parallel( - render_ear, - zip( - i_ear, - repeat(frame_len), - repeat(N_frames), - repeat(N_rev), - repeat(T_rev), - repeat(fade_in), - repeat(fade_out), - repeat(x), - repeat(sig_len), - repeat(N_HRIR_taps), - repeat(azi), - repeat(ele), - repeat(SourcePosition), - repeat(IR_2d), - repeat(Bary_weights), - repeat(ghost_pos), - repeat(IR), - repeat(indices_HRIR), - ), - None, - "mp", - False, - ) - - y = np.stack(result, axis=1) - - return y[0:sig_len] - - -def render_ear( - i_ear, - frame_len, - N_frames, - N_rev, - T_rev, - fade_in, - fade_out, - x, - sig_len, - N_HRIR_taps, - azi, - ele, - SourcePosition, - IR_2d, - Bary_weights, - ghost_pos, - IR, - indices_HRIR, -) -> np.ndarray: - # function to process one ear used in multiprocessing - G = np.empty((N_frames, N_HRIR_taps)) - - for frame in range(N_frames): - pos = np.array([azi[frame], ele[frame]]) - # Interpolation of time-domain signals - G[frame] = interpolate_2d( - SourcePosition[indices_HRIR[frame], 0], - SourcePosition[indices_HRIR[frame], 1], - IR_2d[frame, :, i_ear], - pos, - weights=Bary_weights[frame], - ghost=ghost_pos, - SourcePosition=SourcePosition, - IR=IR[:, i_ear], - ) - - # frame-wise parallel computation is slow (many frames, small computational load per frame) - i_frame = list(range(N_frames)) - result = apply_func_parallel( - convolve_frame, - zip( - i_frame, - repeat(frame_len), - repeat(N_frames), - repeat(N_rev), - repeat(T_rev), - repeat(i_ear), - repeat(fade_in), - repeat(fade_out), - repeat(G), - repeat(x), - repeat(sig_len), - repeat(N_HRIR_taps), - ), - None, - "mt", - False, - ) - - return np.hstack(result) - - -def
convolve_frame( - i_frame, - frame_len, - N_frames, - N_rev, - T_rev, - i_ear, - fade_in, - fade_out, - G, - x, - sig_len, - N_HRIR_taps, -) -> np.ndarray: - # function to process one frame used in multiprocessing - i1 = i_frame * frame_len - i2 = (i_frame + 1) * frame_len - - y0 = np.zeros([2, sig_len + N_HRIR_taps - 1, 2]) - - G0 = G[i_frame] - G1 = G[min(i_frame + 1, N_frames - 1)] - - for j_frame in range(max(0, i_frame - N_rev), min(i_frame + 1, N_frames)): - j1 = j_frame * frame_len - j2 = (j_frame + 1) * frame_len - j2p = j1 + T_rev - - y0[0, j1:j2p, i_ear] += convolve(np.squeeze(x[j1:j2]), G0) - y0[1, j1:j2p, i_ear] += convolve(np.squeeze(x[j1:j2]), G1) - - y_frame = ( - np.squeeze(fade_out) * y0[0, i1:i2, i_ear] - + np.squeeze(fade_in) * y0[1, i1:i2, i_ear] - ) - return y_frame diff --git a/item_generation_scripts/audiotools/constants.py b/item_generation_scripts/audiotools/constants.py deleted file mode 100644 index c3af9d29..00000000 --- a/item_generation_scripts/audiotools/constants.py +++ /dev/null @@ -1,704 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
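The tables that follow map format names to channel counts and loudspeaker geometry; a small sketch of a typical lookup (alternate spellings are first normalized via CHANNEL_BASED_AUDIO_ALTNAMES, defined below):

    from item_generation_scripts.audiotools.constants import (
        CHANNEL_BASED_AUDIO_ALTNAMES,
        CHANNEL_BASED_AUDIO_FORMATS,
    )

    fmt = "5.1"  # user-facing spelling
    fmt = CHANNEL_BASED_AUDIO_ALTNAMES.get(fmt, fmt)  # canonical name "5_1"
    layout = CHANNEL_BASED_AUDIO_FORMATS[fmt]
    print(layout["num_channels"], layout["lfe_index"])  # 6 [3]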
-# - -import numpy as np - -BINAURAL_AUDIO_FORMATS = { - "BINAURAL": { - "num_channels": 2, - }, - "BINAURAL_ROOM": { - "num_channels": 2, - }, -} - -BINAURAL_LFE_GAIN = 10 ** (5.5 / 20) - -LFE_INDEX_DEFAULT = 3 - -LS_AZI_MONO = [0] -LS_ELE_MONO = [0] - -LS_AZI_STEREO = [30, -30] -LS_ELE_STEREO = [0, 0] - -LS_AZI_CICP6 = [30, -30, 0, 0, 110, -110] -LS_ELE_CICP6 = [0, 0, 0, 0, 0, 0] - -LS_AZI_CICP12 = [30, -30, 0, 0, 110, -110, 135, -135] -LS_ELE_CICP12 = [0, 0, 0, 0, 0, 0, 0, 0] - -LS_AZI_CICP14 = [30, -30, 0, 0, 110, -110, 30, -30] -LS_ELE_CICP14 = [0, 0, 0, 0, 0, 0, 35, 35] - -LS_AZI_CICP16 = [30, -30, 0, 0, 110, -110, 30, -30, 110, -110] -LS_ELE_CICP16 = [0, 0, 0, 0, 0, 0, 35, 35, 35, 35] - -LS_AZI_CICP19 = [30, -30, 0, 0, 135, -135, 90, -90, 30, -30, 135, -135] -LS_ELE_CICP19 = [0, 0, 0, 0, 0, 0, 0, 0, 35, 35, 35, 35] - - -CHANNEL_BASED_AUDIO_FORMATS = { - "MONO": { - "num_channels": 1, - "ls_azi": LS_AZI_MONO, - "ls_ele": LS_ELE_MONO, - "lfe_index": [], - }, - "STEREO": { - "num_channels": 2, - "ls_azi": LS_AZI_STEREO, - "ls_ele": LS_ELE_STEREO, - "lfe_index": [], - }, - "5_1": { - "num_channels": 6, - "ls_azi": LS_AZI_CICP6, - "ls_ele": LS_ELE_CICP6, - "lfe_index": [LFE_INDEX_DEFAULT], - }, - "5_1_2": { - "num_channels": 8, - "ls_azi": LS_AZI_CICP14, - "ls_ele": LS_ELE_CICP14, - "lfe_index": [LFE_INDEX_DEFAULT], - }, - "5_1_4": { - "num_channels": 10, - "ls_azi": LS_AZI_CICP16, - "ls_ele": LS_ELE_CICP16, - "lfe_index": [LFE_INDEX_DEFAULT], - }, - "7_1": { - "num_channels": 8, - "ls_azi": LS_AZI_CICP12, - "ls_ele": LS_ELE_CICP12, - "lfe_index": [LFE_INDEX_DEFAULT], - }, - "7_1_4": { - "num_channels": 12, - "ls_azi": LS_AZI_CICP19, - "ls_ele": LS_ELE_CICP19, - "lfe_index": [LFE_INDEX_DEFAULT], - }, - "LS": { - "num_channels": 15, - "ls_azi": [ - 30, - -30, - 0, - 135, - -135, - 110, - -110, - 90, - -90, - 30, - -30, - 110, - -110, - 135, - -135, - ], - "ls_ele": [0, 0, 0, 0, 0, 0, 0, 0, 0, 35, 35, 35, 35, 35, 35], - "lfe_index": [], - }, - "MOZART": { - "num_channels": 30, - "ls_azi": [ - 0, - 0, - 135, - -135, - 30, - -30, - 180, - 0, - 90, - -90, - 45, - -45, - 0, - 0, - 135, - -135, - 90, - -90, - 180, - 0, - 45, - -45, - 60, - -60, - 110, - -110, - 30, - -30, - 110, - -110, - ], - "ls_ele": [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 35, - 35, - 35, - 90, - 35, - 35, - 35, - 35, - 35, - -15, - -15, - -15, - 0, - 0, - 0, - 0, - 35, - 35, - 35, - 35, - ], - "lfe_index": [1, 7], - }, - "CUSTOM_LS": { - "num_channels": -1, - "ls_azi": None, - "ls_ele": None, - "lfe_index": None, - }, -} - -# Support a variety of names for multichannel configs -CHANNEL_BASED_AUDIO_ALTNAMES = { - # 5_1 - 51: "5_1", # YAML by default will interpret underscore delimited numbers as integers, similar to python - "5d1": "5_1", - "5.1": "5_1", - "CICP6": "5_1", - # 7_1 - 71: "7_1", - "7d1": "7_1", - "7.1": "7_1", - "CICP12": "7_1", - # 5_1_2 - 512: "5_1_2", - "5d1p2": "5_1_2", - "5.1+2": "5_1_2", - "5.1.2": "5_1_2", - "CICP14": "5_1_2", - # 5_1_4 - 514: "5_1_4", - "5d1p4": "5_1_4", - "5.1+4": "5_1_4", - "5.1.4": "5_1_4", - "CICP16": "5_1_4", - # 7_1_4 - 714: "7_1_4", - "7d1p4": "7_1_4", - "7.1+4": "7_1_4", - "7.1.4": "7_1_4", - "CICP19": "7_1_4", -} - -METADATA_ASSISTED_SPATIAL_AUDIO_FORMATS = { - "MASA1": { - "num_channels": 1, - }, - "MASA2": { - "num_channels": 2, - }, -} -OBJECT_BASED_AUDIO_FORMATS = { - "ISM1": { - "num_channels": 1, - }, - "ISM2": { - "num_channels": 2, - }, - "ISM3": { - "num_channels": 3, - }, - "ISM4": { - "num_channels": 4, - }, -} - - -SCENE_BASED_AUDIO_FORMATS = { - 
"FOA": { - "num_channels": 4, - "is_planar": False, - }, - "HOA2": { - "num_channels": 9, - "is_planar": False, - }, - "HOA3": { - "num_channels": 16, - "is_planar": False, - }, - "PLANARFOA": { - "num_channels": 4, - "is_planar": True, - }, - "PLANARHOA2": { - "num_channels": 9, - "is_planar": True, - }, - "PLANARHOA3": { - "num_channels": 16, - "is_planar": True, - }, - "SBA1": { - "num_channels": 4, - "is_planar": False, - }, - "SBA2": { - "num_channels": 9, - "is_planar": False, - }, - "SBA3": { - "num_channels": 16, - "is_planar": False, - }, -} - -SCENE_METADATA_FORMATS = {"META"} - -AUDIO_FORMATS = [ - BINAURAL_AUDIO_FORMATS, - CHANNEL_BASED_AUDIO_FORMATS, - METADATA_ASSISTED_SPATIAL_AUDIO_FORMATS, - OBJECT_BASED_AUDIO_FORMATS, - SCENE_BASED_AUDIO_FORMATS, -] - - -IVAS_FRAME_LEN_MS = 20 - -IVAS_CICPX_TO_MONO = np.array( - [ - [ - 1, - 1, - 1, - 1, - 0.79999995, - 0.79999995, - 0.79999995, - 0.79999995, - 0.849999964, - 0.849999964, - 0.849999964, - 0.849999964, - ] - ] -).T - -IVAS_CICPX_TO_STEREO = np.array( - [ - [1, 0], - [0, 1], - [np.sqrt(0.5), np.sqrt(0.5)], - [np.sqrt(0.5), np.sqrt(0.5)], - [0.79999995, 0], - [0, 0.79999995], - [0.79999995, 0], - [0, 0.79999995], - [0.849999964, 0], - [0, 0.849999964], - [0.849999964, 0], - [0, 0.849999964], - ] -) - -# downmix matrices -IVAS_CICP12_TO_6 = np.zeros(8 * 6) -IVAS_CICP12_TO_6[[0, 7, 14, 21, 28, 35, 40, 47]] = 1 -IVAS_CICP12_TO_6 = IVAS_CICP12_TO_6.reshape(8, 6) - -IVAS_CICP14_TO_6 = np.zeros(8 * 6) -IVAS_CICP14_TO_6[[0, 7, 14, 21, 28, 35]] = 1 -IVAS_CICP14_TO_6[[36, 43]] = 0.849999964 -IVAS_CICP14_TO_6 = IVAS_CICP14_TO_6.reshape(8, 6) - -IVAS_CICP16_TO_6 = np.zeros(10 * 6) -IVAS_CICP16_TO_6[[0, 7, 14, 21, 28, 35]] = 1 -IVAS_CICP16_TO_6[[36, 43, 52, 59]] = 0.849999964 -IVAS_CICP16_TO_6 = IVAS_CICP16_TO_6.reshape(10, 6) - -IVAS_CICP16_TO_12 = np.zeros(10 * 8) -IVAS_CICP16_TO_12[[0, 9, 18, 27, 36, 45]] = 1 -IVAS_CICP16_TO_12[[48, 57, 68, 77]] = 0.849999964 -IVAS_CICP16_TO_12 = IVAS_CICP16_TO_12.reshape(10, 8) - -IVAS_CICP16_TO_14 = np.zeros(10 * 8) -IVAS_CICP16_TO_14[[0, 9, 18, 27, 36, 45, 54, 63]] = 1 -IVAS_CICP16_TO_14[[68, 77]] = 0.849999964 -IVAS_CICP16_TO_14 = IVAS_CICP16_TO_14.reshape(10, 8) - -IVAS_CICP19_TO_6 = np.zeros(12 * 6) -IVAS_CICP19_TO_6[[0, 7, 14, 21, 28, 35]] = 1 -IVAS_CICP19_TO_6[[36, 43]] = 0.367322683 -IVAS_CICP19_TO_6[[48, 55, 64, 71]] = 0.849999964 -IVAS_CICP19_TO_6[[40, 47]] = 0.930093586 -IVAS_CICP19_TO_6 = IVAS_CICP19_TO_6.reshape(12, 6) - -IVAS_CICP19_TO_12 = np.zeros(12 * 8) -IVAS_CICP19_TO_12[[0, 9, 18, 27, 38, 47]] = 1 -IVAS_CICP19_TO_12[[48, 57]] = 0.367322683 -IVAS_CICP19_TO_12[[64, 73, 84, 93]] = 0.849999964 -IVAS_CICP19_TO_12[[52, 61]] = 0.930093586 -IVAS_CICP19_TO_12 = IVAS_CICP19_TO_12.reshape(12, 8) - -IVAS_CICP19_TO_14 = np.zeros(12 * 8) -IVAS_CICP19_TO_14[[0, 9, 18, 27, 36, 45, 70, 79]] = 1 -IVAS_CICP19_TO_14[[48, 57]] = 0.367322683 -IVAS_CICP19_TO_14[[84, 93]] = 0.849999964 -IVAS_CICP19_TO_14[[52, 61]] = 0.930093586 -IVAS_CICP19_TO_14 = IVAS_CICP19_TO_14.reshape(12, 8) - -IVAS_CICP19_TO_16 = np.zeros(12 * 10) -IVAS_CICP19_TO_16[[0, 11, 22, 33, 44, 55, 86, 97, 108, 119]] = 1 -IVAS_CICP19_TO_16[[60, 71]] = 0.367322683 -IVAS_CICP19_TO_16[[64, 75]] = 0.930093586 -IVAS_CICP19_TO_16 = IVAS_CICP19_TO_16.reshape(12, 10) - -# upmix matrices -IVAS_MONO_TO_CICPX = np.zeros([1, 12]) -IVAS_MONO_TO_CICPX[0, 2] = 1 - -IVAS_STEREO_TO_CICPX = np.zeros([2, 12]) -IVAS_STEREO_TO_CICPX[0, 0] = 1 -IVAS_STEREO_TO_CICPX[1, 1] = 1 - -IVAS_CICP12_TO_14 = np.zeros(8 * 8) -IVAS_CICP12_TO_14[[0, 9, 18, 27, 36, 45, 52, 
61]] = 1 -IVAS_CICP12_TO_14 = IVAS_CICP12_TO_14.reshape(8, 8) - -IVAS_CICP12_TO_16 = np.zeros(8 * 10) -IVAS_CICP12_TO_16[[0, 11, 22, 33, 44, 55, 64, 75]] = 1 -IVAS_CICP12_TO_16 = IVAS_CICP12_TO_16.reshape(8, 10) - -IVAS_CICP12_TO_19 = np.zeros(8 * 12) -IVAS_CICP12_TO_19[[0, 13, 26, 39, 54, 67, 76, 89]] = 1 -IVAS_CICP12_TO_19 = IVAS_CICP12_TO_19.reshape(8, 12) - -IVAS_CICP14_TO_19 = np.zeros(8 * 12) -IVAS_CICP14_TO_19[[0, 13, 26, 39, 52, 65, 80, 93]] = 1 -IVAS_CICP14_TO_19 = IVAS_CICP14_TO_19.reshape(8, 12) - -IVAS_CICP16_TO_19 = np.zeros(10 * 12) -IVAS_CICP16_TO_19[[0, 13, 26, 39, 52, 65, 80, 93, 106, 119]] = 1 -IVAS_CICP16_TO_19 = IVAS_CICP16_TO_19.reshape(10, 12) - -# mapping dict -IVAS_MC_CONVERSION = { - "MONO": { - # upmix - "5_1": IVAS_MONO_TO_CICPX[:, :6], - "7_1": IVAS_MONO_TO_CICPX[:, :8], - "5_1_2": IVAS_MONO_TO_CICPX[:, :8], - "5_1_4": IVAS_MONO_TO_CICPX[:, :10], - "7_1_4": IVAS_MONO_TO_CICPX[:, :12], - }, - "STEREO": { - # upmix - "5_1": IVAS_STEREO_TO_CICPX[:, :6], - "7_1": IVAS_STEREO_TO_CICPX[:, :8], - "5_1_2": IVAS_STEREO_TO_CICPX[:, :8], - "5_1_4": IVAS_STEREO_TO_CICPX[:, :10], - "7_1_4": IVAS_STEREO_TO_CICPX[:, :12], - }, - "5_1": { - # downmix - "MONO": IVAS_CICPX_TO_MONO[:6, :], - "STEREO": IVAS_CICPX_TO_STEREO[:6, :], - # upmix - "7_1": np.pad(np.eye(6), [[0, 0], [0, 2]]), - "5_1_2": np.pad(np.eye(6), [[0, 0], [0, 2]]), - "5_1_4": np.pad(np.eye(6), [[0, 0], [0, 4]]), - "7_1_4": np.pad(np.eye(6), [[0, 0], [0, 6]]), - }, - "7_1": { - # downmix - "MONO": IVAS_CICPX_TO_MONO[:8, :], - "STEREO": IVAS_CICPX_TO_STEREO[:8, :], - "5_1": IVAS_CICP12_TO_6, - # upmix - "5_1_2": IVAS_CICP12_TO_14, - "5_1_4": IVAS_CICP12_TO_16, - "7_1_4": IVAS_CICP12_TO_19, - }, - "5_1_2": { - # downmix - "MONO": np.vstack([IVAS_CICPX_TO_MONO[:6, :], IVAS_CICPX_TO_MONO[-2:, :]]), - "STEREO": np.vstack( - [IVAS_CICPX_TO_STEREO[:6, :], IVAS_CICPX_TO_STEREO[-2:, :]] - ), - "5_1": IVAS_CICP14_TO_6, - "7_1": np.pad(IVAS_CICP14_TO_6, [[0, 0], [0, 2]]), - # upmix - "5_1_4": np.pad(np.eye(8), [[0, 0], [0, 2]]), - "7_1_4": IVAS_CICP14_TO_19, - }, - "5_1_4": { - # downmix - "MONO": np.vstack([IVAS_CICPX_TO_MONO[:6, :], IVAS_CICPX_TO_MONO[-4:, :]]), - "STEREO": np.vstack( - [IVAS_CICPX_TO_STEREO[:6, :], IVAS_CICPX_TO_STEREO[-4:, :]] - ), - "5_1": IVAS_CICP16_TO_6, - "7_1": IVAS_CICP16_TO_12, - "5_1_2": IVAS_CICP16_TO_14, - # upmix - "7_1_4": IVAS_CICP16_TO_19, - }, - "7_1_4": { - # downmix - "MONO": IVAS_CICPX_TO_MONO, - "STEREO": IVAS_CICPX_TO_STEREO, - "5_1": IVAS_CICP19_TO_6, - "7_1": IVAS_CICP19_TO_12, - "5_1_2": IVAS_CICP19_TO_14, - "5_1_4": IVAS_CICP19_TO_16, - }, -} - -# LFE 120 Hz LPF filter coefficients -IVAS_LPF_4_BUTTER_48K_SOS = np.array( - [ - [ - 5.12617881476274e-09, - 1.02523584294987e-08, - 5.12617879059970e-09, - 1, - -1.96875982668433, - 0.969044914826862, - ], - [ - 1, - 1.99999984394358, - 1.00000000471366, - 1, - -1.98677297369091, - 0.987060670205863, - ], - ] -) - -T_DESIGN_11_AZI = np.array( - [ - 132.927291884332, - -83.9349499672527, - 8.47410038634525, - -113.340833834572, - -103.265909909537, - -33.2370360923825, - 21.8564347471830, - -156.539486489880, - -64.2647531387317, - 165.779530068738, - -25.2028339893249, - -97.0037973959711, - 27.8546391256925, - 153.214218975132, - -155.061608694663, - -11.8421354925543, - 80.5387312016125, - -42.0561606270165, - -31.2233262205060, - 38.8379041944063, - 93.7606877469492, - -84.7560200078398, - 7.75536818082863, - -122.276883381108, - 46.8012705252113, - -24.7686335284573, - 99.8904719062334, - -134.783996960185, - -83.0880230164493, - 
60.1281736000420, - 152.644656278084, - 29.7576658909417, - 40.7793187974476, - 110.183927562412, - 165.652065916454, - -12.9926632105736, - 79.7359893585681, - -50.5245271190884, - 118.923930267733, - 47.2202861862577, - 171.925276523721, - -62.5145800558502, - -11.1156697680531, - 132.018041099963, - -135.355486412425, - 102.370921576708, - 112.739282398012, - -178.304963670831, - -122.319932198534, - 59.0763464570905, - 151.704200334501, - 21.3763364190503, - -169.005476417779, - 118.980811786769, - -116.089295979010, - 9.64767870353308, - 60.8933243657771, - -156.021526862757, - -63.4602993325163, - 174.929787427393, - -175.288768596346, - -105.951907934032, - -50.1928304519800, - 131.358266702971, - -136.296815007542, - 93.5644603506407, - -97.0840116473627, - -169.158278888619, - -44.1323835471345, - 81.4795403841382, - ] -) - -T_DESIGN_11_ELE = np.array( - [ - 7.69254738757899, - -23.7300652200871, - 23.5127556185301, - 70.4225940747938, - -9.89694439538752, - -70.7513316063095, - -26.4618527647561, - 47.7764936689044, - -7.72047049524459, - 44.5343602375216, - 26.3897904767450, - -44.6578850137166, - 9.76703456924600, - -47.7053318175498, - 7.45302934155972, - -23.5901209534773, - 23.7194484034707, - 70.4382693912270, - -9.83541588740259, - -70.4980825105727, - -26.2949218109204, - 47.6148028805222, - -7.51718499746626, - 44.2862347125773, - 26.6442619674660, - -44.5693707254340, - 9.91271928508000, - -47.9599550372574, - 7.29679922953795, - -23.3445981426306, - 23.6415261666079, - 70.6843143997832, - -9.58140351749889, - -70.3934534122902, - -26.4258159091605, - 47.7510668062369, - -7.30853603036844, - 44.2632768570349, - 26.7140614474957, - -44.3149733480527, - 9.75899721561506, - -48.0361913333593, - 7.43965099805872, - -23.3326075548841, - 23.3868959687598, - 70.8219078016791, - -9.48596399169388, - -70.5801867828491, - -26.6740262349265, - 47.9978414043199, - -7.38276167631068, - 44.4970603752708, - 26.5024990214418, - -44.2461913308458, - 9.51845076548334, - -47.8281351088411, - 7.68427447425834, - -23.5706842106942, - 23.3074499244045, - 70.6586472132300, - -9.68088860263008, - -70.8026785673948, - -26.6963451935976, - 48.0136296461397, - -7.63734823159200, - 44.6651234222196, - 26.3023490002159, - -44.4576351865647, - 9.52341455917443, - -47.6242211091394, - ] -) -PLANAR_HOA_CHANNELS_ACN = np.array([0, 1, 3, 4, 8, 9, 15]) -VERT_HOA_CHANNELS_ACN = np.array([2, 5, 6, 7, 10, 11, 12, 13, 14]) - -SEED_PADDING = 0 - -# delay in number of samples -DELAY_COMPENSATION_FOR_FILTERING = { - "SHQ2": { - "up": 436, - "down": 218, - }, - "SHQ3": { - "up": 436, - "down": 145, - }, - "MSIN": 92, - "LP1p5": 322, - "LP35": 232, - "LP7": 117, - "LP10": 82, - "LP12": 164, - "LP14": 234, - "LP20": 161, - "HP50_32KHZ": 559, - "HP50_48KHZ": 839, -} diff --git a/item_generation_scripts/audiotools/convert/__init__.py b/item_generation_scripts/audiotools/convert/__init__.py deleted file mode 100644 index 4ec23739..00000000 --- a/item_generation_scripts/audiotools/convert/__init__.py +++ /dev/null @@ -1,323 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. 
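PLANAR_HOA_CHANNELS_ACN and VERT_HOA_CHANNELS_ACN above partition the ACN channel order into horizontal-only and vertically dependent components. A plausible use, sketched here as an assumption (this patch does not show the call site), is zeroing the vertical channels to derive the PLANAR* ambisonic formats:

    import numpy as np

    from item_generation_scripts.audiotools.constants import VERT_HOA_CHANNELS_ACN

    def make_planar(hoa: np.ndarray) -> np.ndarray:
        # hoa: [n_samples x n_channels] in ACN order; zero vertically dependent channels
        planar = hoa.copy()
        vert = VERT_HOA_CHANNELS_ACN[VERT_HOA_CHANNELS_ACN < hoa.shape[1]]
        planar[:, vert] = 0.0
        return planar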
All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -import logging -from pathlib import Path, PurePath -from typing import Optional, Union - -from item_generation_scripts.audiotools import audio, audioarray, metadata -from item_generation_scripts.audiotools.audiofile import write -from item_generation_scripts.audiotools.convert.channelbased import convert_channelbased -from item_generation_scripts.audiotools.convert.masa import convert_masa -from item_generation_scripts.audiotools.convert.objectbased import convert_objectbased -from item_generation_scripts.audiotools.convert.scenebased import convert_scenebased -from item_generation_scripts.audiotools.wrappers.bs1770 import loudness_norm -from item_generation_scripts.audiotools.wrappers.esdru import esdru -from item_generation_scripts.audiotools.wrappers.filter import ( - hp50filter_itu, - lpfilter_itu, - resample_itu, -) -from item_generation_scripts.audiotools.wrappers.p50fbmnru import p50fbmnru - -from ..metadata import write_ISM_metadata_in_file - - -def convert_file( - in_file: Union[str, Path], - out_file: Union[str, Path], - in_fs: int, - in_fmt: Union[str, Path], - out_fmt: Optional[Union[str, Path]] = None, - out_fs: Optional[int] = None, - in_meta: Optional[list] = None, - logger: Optional[logging.Logger] = None, - **kwargs, -) -> None: - """Conversion function for one audio file""" - - if not in_fmt: - raise ValueError("Input audio format must be specified!") - - # get audio class object - can be either a regular single audio or scene description .txt - if not isinstance(in_fmt, PurePath) and in_fmt.startswith("META"): - input = metadata.Metadata(in_file) - else: - input = audio.fromfile(in_fmt, in_file, in_fs, in_meta) - - # try to set reasonable defaults if missing - if not in_fs: - in_fs = input.fs - if not out_fs: - out_fs = input.fs - - if not out_fmt: - if isinstance(input, metadata.Metadata): - raise ValueError( - "Output 
format must be specified for scene description files!" - ) - else: - out_fmt = input.name - - output = audio.fromtype(out_fmt) - if isinstance(output, audio.ObjectBasedAudio): - try: - output.object_pos = input.object_pos - output.metadata_files = input.metadata_files - except Exception: - raise ValueError( - "ISM is not supported as an output for rendering! Only usable as pass-through" - ) - - if isinstance(input, metadata.Metadata): - if logger: - logger.debug(f"Converting metadata to {out_fmt} : {in_file} -> {out_file}") - - # render each audio instance separately - for audio_in in input.audio: - output.fs = out_fs - tmp = audio.fromtype(out_fmt) - tmp.fs = in_fs # resampling not yet applied - convert(audio_in, tmp, in_fs=in_fs, out_fs=out_fs, logger=logger, **kwargs) - if output.audio is not None: - output.audio += tmp.audio - else: - output.audio = tmp.audio - else: - if logger: - logger.debug(f"Converting {in_fmt} to {out_fmt} : {in_file} -> {out_file}") - # run main conversion method - output.fs = in_fs # resampling not yet applied - convert(input, output, in_fs=in_fs, out_fs=out_fs, logger=logger, **kwargs) - - # write output - write(out_file, output.audio, output.fs) - if isinstance(output, audio.ObjectBasedAudio): - write_ISM_metadata_in_file(output.object_pos, [out_file], automatic_naming=True) - - -def convert( - input: audio.Audio, - output: audio.Audio, - in_trim: Optional[list] = None, - in_pad_noise: Optional[bool] = False, - in_delay: Optional[float] = None, - in_fs: Optional[int] = None, - in_cutoff: Optional[int] = None, - in_hp50: Optional[bool] = None, - in_window: Optional[list] = None, - in_loudness: Optional[float] = None, - in_loudness_fmt: Optional[str] = None, - out_trim: Optional[list] = None, - out_pad_noise: Optional[bool] = False, - out_delay: Optional[float] = None, - out_fs: Optional[int] = None, - out_cutoff: Optional[int] = None, - out_hp50: Optional[bool] = None, - out_window: Optional[list] = None, - out_loudness: Optional[float] = None, - out_loudness_fmt: Optional[str] = None, - limit: Optional[bool] = False, - mnru_q: Optional[float] = None, - esdru_alpha: Optional[float] = None, - logger: Optional[logging.Logger] = None, - **kwargs, -) -> None: - """Perform pre-processing, conversion and post-processing""" - - """pre-processing""" - process_audio( - x=input, - trim=in_trim, - pad_noise=in_pad_noise, - delay=in_delay, - fs=in_fs, - fc=in_cutoff, - hp50=in_hp50, - window=in_window, - loudness=in_loudness, - loudness_fmt=in_loudness_fmt, - logger=logger, - ) - - """format conversion""" - format_conversion(input, output, logger=logger, **kwargs) - - """post-processing""" - process_audio( - x=output, - trim=out_trim, - pad_noise=out_pad_noise, - delay=out_delay, - fs=out_fs, - fc=out_cutoff, - hp50=out_hp50, - window=out_window, - loudness=out_loudness, - loudness_fmt=out_loudness_fmt, - limit=limit, - mnru_q=mnru_q, - esdru_alpha=esdru_alpha, - logger=logger, - ) - - -def process_audio( - x: audio.Audio, - trim: Optional[list] = None, - pad_noise: Optional[bool] = False, - delay: Optional[float] = None, - fs: Optional[int] = None, - fc: Optional[int] = None, - hp50: Optional[bool] = False, - window: Optional[float] = None, - loudness: Optional[float] = None, - loudness_fmt: Optional[str] = None, - limit: Optional[bool] = False, - mnru_q: Optional[float] = None, - esdru_alpha: Optional[float] = None, - logger: Optional[logging.Logger] = None, -) -> None: - """Perform (pre-/post-) processing of audio""" - - if fs is None: - fs = x.fs - - """delay audio"""
- if delay is not None: - if logger: - logger.debug(f"Delaying audio by {delay} ms") - x.audio = audioarray.delay(x.audio, x.fs, delay) - - """trim or pad audio""" - if trim is not None: - if isinstance(x, audio.ObjectBasedAudio): - # metadata concatenation necessary for ISM - metadata.trim_meta(x, tuple(trim), pad_noise) - else: - x.audio = audioarray.trim(x.audio, x.fs, tuple(trim), pad_noise) - - """windowing""" - if window is not None: - if logger: - logger.debug(f"Windowing audio with {window} ms Hann window") - x.audio = audioarray.window(x.audio, x.fs, window) - - """high-pass (50 Hz) filtering""" - if hp50: - if logger: - logger.debug("Applying 50 Hz high-pass filter using ITU STL filter") - x.audio = hp50filter_itu(x) - - """resampling""" - if x.fs != fs: - if logger: - logger.debug(f"Resampling from {x.fs} to {fs} using ITU STL filter") - x.audio = resample_itu(x, fs) - x.fs = fs - - """loudness normalization""" - if loudness is not None: - if logger: - logger.debug( - f"Applying loudness adjustment to {loudness} LKFS for format {loudness_fmt} using ITU STL bs1770demo" - ) - x.audio = loudness_norm(x, loudness, loudness_fmt) - - """low-pass filtering""" - if fc is not None: - if logger: - logger.debug( - f"Applying low-pass filter with cutoff {fc} Hz using ITU STL filter" - ) - x.audio = lpfilter_itu(x, fc) - - """MNRU""" - if mnru_q is not None: - if logger: - logger.debug("Applying P.50 Fullband MNRU") - x.audio = p50fbmnru(x, mnru_q) - - """ESDRU""" - if esdru_alpha is not None: - if logger: - logger.debug("Applying ESDRU Recommendation ITU-T P.811") - x.audio = esdru(x, esdru_alpha) - - """limiting""" - if limit: - if logger: - logger.debug("Applying limiter") - audioarray.limiter(x.audio, x.fs) - - -def format_conversion( - input: audio.Audio, - output: audio.Audio, - logger: Optional[logging.Logger] = None, - **kwargs, -) -> None: - """Convert one audio format to another""" - - # validation - if isinstance(output, audio.MetadataAssistedSpatialAudio): - raise NotImplementedError("MASA is not supported as an output for rendering!") - - if isinstance(output, audio.ObjectBasedAudio) and input.name != output.name: - raise NotImplementedError( - "ISM is not supported as an output for rendering! Only usable as pass-through" - ) - - if logger: - logger.debug(f"Format conversion: {input.name} -> {output.name}") - - if input.name == output.name or ( - input.name.startswith("BINAURAL") and output.name.startswith("BINAURAL") - ): - output.audio = input.audio - else: - if isinstance(input, audio.BinauralAudio): - raise NotImplementedError( - f"{input.name} is not supported as an input for rendering!" 
- ) - elif isinstance(input, audio.ChannelBasedAudio): - convert_channelbased(input, output, **kwargs) - elif isinstance(input, audio.MetadataAssistedSpatialAudio): - convert_masa(input, output, **kwargs) - elif isinstance(input, audio.ObjectBasedAudio): - convert_objectbased(input, output, **kwargs) - elif isinstance(input, audio.SceneBasedAudio): - convert_scenebased(input, output, **kwargs) - else: - raise NotImplementedError( - f"Unknown or unsupported audio format {input.name}" - ) diff --git a/item_generation_scripts/audiotools/convert/binaural.py b/item_generation_scripts/audiotools/convert/binaural.py deleted file mode 100644 index b23e69ee..00000000 --- a/item_generation_scripts/audiotools/convert/binaural.py +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -from typing import Optional - -import numpy as np -from scipy.signal import fftconvolve - - -def NS2SA( - fs: float, - x: float, -) -> int: - """ - Converts from nanoseconds to number of samples - - Parameters - ---------- - fs: float - Sampling rate - x: float - Duration in nano seconds - - Returns - ------- - Number of samples - """ - - return int(int(fs / 100) * (x / 100) / 100000) - - -def binaural_fftconv( - x: np.ndarray, - IR: np.ndarray, - nchannels: int, - lfe_index: Optional[list[int]] = None, -) -> np.ndarray: - """ - Binauralization using fft convolution - - Parameters - ---------- - x: np.ndarray - Input multi-channel array - IR: np.ndarray - HRIRs array - nchannels: int - Maximum number of channels to process - lfe_index: Optional[list[int]] - List of LFE channel indices - - Returns - ------- - y: np.ndarray - Output convolved signal array - """ - - if lfe_index is None: - lfe_index = [] - - y = np.zeros([x.shape[0], 2]) - for chan_idx in range(min(x.shape[1], nchannels)): - if chan_idx not in lfe_index: - y[:, 0] = np.add( - y[:, 0], - fftconvolve(x[:, chan_idx].astype(float), IR[:, 0, chan_idx]).astype( - float - )[: x.shape[0]], - ) - y[:, 1] = np.add( - y[:, 1], - fftconvolve(x[:, chan_idx].astype(float), IR[:, 1, chan_idx]).astype( - float - )[: x.shape[0]], - ) - else: - ... - - return y diff --git a/item_generation_scripts/audiotools/convert/channelbased.py b/item_generation_scripts/audiotools/convert/channelbased.py deleted file mode 100644 index a8d941e2..00000000 --- a/item_generation_scripts/audiotools/convert/channelbased.py +++ /dev/null @@ -1,390 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. 
-# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -from pathlib import Path -from typing import Optional, Tuple, Union - -import numpy as np - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.audioarray import delay, framewise_io -from item_generation_scripts.audiotools.binaural_datasets.binaural_dataset import ( - load_ir, -) -from item_generation_scripts.audiotools.constants import ( - BINAURAL_LFE_GAIN, - IVAS_FRAME_LEN_MS, - IVAS_MC_CONVERSION, -) -from item_generation_scripts.audiotools.convert import scenebased -from item_generation_scripts.audiotools.convert.binaural import binaural_fftconv -from item_generation_scripts.audiotools.EFAP import EFAP -from item_generation_scripts.audiotools.rotation import Quat2RotMat, rotateAziEle -from item_generation_scripts.audiotools.wrappers.filter import resample_itu - -""" ChannelBasedAudio functions """ - - -def convert_channelbased( - cba: audio.ChannelBasedAudio, - out: audio.Audio, - **kwargs, -) -> audio.Audio: - """Convert channel-based audio to the requested output format""" - # CBA -> Binaural - if isinstance(out, audio.BinauralAudio): - render_cba_to_binaural(cba, out, **kwargs) - - # CBA -> CBA - elif isinstance(out, audio.ChannelBasedAudio): - render_cba_to_cba(cba, out) - - # CBA -> SBA - elif isinstance(out, audio.SceneBasedAudio): - render_cba_to_sba(cba, out) - - else: - raise NotImplementedError( - f"Conversion from {cba.name} to {out.name} is unsupported!" 
- ) - - return out - - -def render_cba_to_binaural( - cba: audio.ChannelBasedAudio, - bin: audio.BinauralAudio, - trajectory: Optional[Union[str, Path]] = None, - bin_dataset: Optional[str] = None, - bin_lfe_gain: Optional[float] = None, - **kwargs, -) -> None: - """ - Binauralization of channel-based audio - - Parameters - ---------- - cba: audio.ChannelBasedAudio - Channel-based input audio - bin: audio.BinauralAudio - Binaural output audio - trajectory: Optional[Union[str, Path]] - Head rotation trajectory path - bin_dataset: Optional[str] - Name of binaural dataset without prefix or suffix - bin_lfe_gain: Optional[float] - LFE gain for binaural rendering - """ - - if cba.name == "MONO": - # no binauralization possible for mono -> render to stereo and assume binaural signal - cba_stereo = audio.fromtype("STEREO") - cba_stereo.fs = bin.fs - render_cba_to_cba(cba, cba_stereo) - bin.audio = cba_stereo.audio - return - - cba.audio = resample_itu(cba, 48000) - old_fs = cba.fs - cba.fs = 48000 - bin.fs = 48000 - - if trajectory is not None: - cba.audio = rotate_cba(cba, trajectory) - - IR, _, latency_smp = load_ir(cba.name, bin.name, bin_dataset) - - # render LFE - if bin_lfe_gain is not None: - bin_lfe, lfe_delay_ns = render_lfe_to_binaural( - cba.audio, cba.fs, cba.lfe_index, bin_lfe_gain - ) - - # render rest of the signal - bin.audio = binaural_fftconv(cba.audio, IR, cba.num_channels, cba.lfe_index) - # compensate delay from binaural dataset - bin.audio = delay(bin.audio, bin.fs, -latency_smp, samples=True) - - # add LFE and rest - if bin_lfe_gain is not None: - bin.audio += bin_lfe - - bin.audio = resample_itu(bin, old_fs) - bin.fs = old_fs - - -def render_custom_ls_binaural( - custom_ls: audio.ChannelBasedAudio, - output: audio.BinauralAudio, - IR: np.ndarray, - SourcePosition: np.ndarray, - trajectory: str, -): - # TODO rework impl.
(with EFAP) - # logger.info(" Processing channels on custom LS layout") - # azis = ", ".join([f"{a:7.2f}" for a in ls_azi_all]) - # eles = ", ".join([f"{e:7.2f}" for e in ls_ele_all]) - # logger.info(f" azi: {azis}") - # logger.info(f" ele: {eles}") - # logger.info(f" lfe_index: {lfe_index_all}") - - # if output.name == "BINAURAL_ROOM": - # tmp = get_audio_type("MOZART") - # convert_channel_based(custom_ls, tmp) - # logger.info(f" {custom_ls.name} -> {tmp.name} -> {output.name}") - # custom_ls.audio = tmp.audio - # else: - # tmp = custom_ls - # - # ls_azi_all = tmp.ls_azi - # ls_ele_all = tmp.ls_ele - # lfe_index_all = tmp.lfe_index - # - # frame_len = (IVAS_FRAME_LEN_MS // 4) * (fs // 1000) - # sig_len = custom_ls.audio.shape[0] - # N_frames = int(sig_len / frame_len) - # - # i_ls = 0 - # y = np.zeros([sig_len, 2]) - # for i_chan in range(custom_ls.audio.shape[1]): - # - # # skip LFE - # if i_chan in lfe_index_all: - # continue - # - # # skip silent (or very low volume) channels - # if np.allclose(custom_ls.audio[:, i_chan], 0.0, atol=32.0): - # continue - # - # ls_azi = np.repeat(ls_azi_all[i_ls], N_frames) - # ls_ele = np.repeat(ls_ele_all[i_ls], N_frames) - # - # azi, ele = rotateISM(ls_azi, ls_ele, trajectory=trajectory) - # - # y += binaural_fftconv_framewise( - # custom_ls.audio[:, i_chan], - # IR, - # SourcePosition, - # frame_len=frame_len, - # azi=azi, - # ele=ele, - # ) - # i_ls += 1 - # - # return y - return - - -def render_cba_to_cba( - cba_in: audio.ChannelBasedAudio, cba_out: audio.ChannelBasedAudio -) -> None: - """ - Rendering of channel-based input signal to channel-based output - - Parameters - ---------- - cba_in: audio.ObjectBasedAudio - Channel-based input audio - cba_out: audio.ChannelBasedAudio - Channel-based output audio - """ - - # Stereo to Mono - if cba_in.name == "STEREO" and cba_out.name == "MONO": - render_mtx = np.vstack([[0.5], [0.5]]) - else: - try: - render_mtx = IVAS_MC_CONVERSION[cba_in.name][cba_out.name] - except KeyError: - # Use EFAP panning if no matrix was found - panner = EFAP( - np.delete(cba_out.ls_azi, cba_out.lfe_index).astype(float), - np.delete(cba_out.ls_ele, cba_out.lfe_index).astype(float), - ) - - render_mtx = np.vstack( - [ - panner.pan(a, e).T - for i, (a, e) in enumerate(zip(cba_in.ls_azi, cba_in.ls_ele)) - if i not in cba_in.lfe_index - ] - ) - - # pass-through for LFE - for index in np.sort(cba_in.lfe_index): - render_mtx = np.insert(render_mtx, index, 0, axis=0) - render_mtx = np.insert(render_mtx, cba_out.lfe_index, 0, axis=1) - render_mtx[cba_in.lfe_index, cba_out.lfe_index] = 1 - - if cba_out.num_channels <= 2: - render_mtx[cba_in.lfe_index, :] = 0 - - cba_out.audio = cba_in.audio @ render_mtx - - -def render_cba_to_sba(cba: audio.ChannelBasedAudio, sba: audio.SceneBasedAudio) -> None: - """ - Rendering of channel-based input signal to SBA output - - Parameters - ---------- - cba: audio.ObjectBasedAudio - Channel-based input audio - sba: audio.ChannelBasedAudio - SBA output audio - """ - - if cba.name == "MONO": - raise ValueError(f"Rendering from MONO to {sba.name} is not supported.") - - # SH response for loudspeaker positions - render_mtx = np.hstack( - [ - scenebased.getRSH(np.array([a]), np.array([e]), sba.ambi_order) - for a, e in zip(cba.ls_azi, cba.ls_ele) - ] - ).T - render_mtx[cba.lfe_index] = 0 - - sba.audio = cba.audio @ render_mtx - # do not add LFE to output - if sba.is_planar: - scenebased.zero_vert_channels(sba) - - -def rotate_cba( - cba: audio.ChannelBasedAudio, - trajectory: str, -) -> np.ndarray: - """ - 
Rotate MC signal by applying a rotation matrix calculated from the current quaternion - in each subframe - - Parameters: - ---------- - x: np.ndarray - Input multichannel signal - trajectory: str - Path to trajectory file - - Returns: - ---------- - y: np.ndarray - Rotated multichannel signal - """ - - trj_data = np.genfromtxt(trajectory, delimiter=",") - trj_frames = trj_data.shape[0] - - sig_len = cba.audio.shape[0] - sig_dim = cba.audio.shape[1] - frame_len = (IVAS_FRAME_LEN_MS // 4) * 48 - - out = np.zeros([sig_len, sig_dim]) - - panner = EFAP(cba.ls_azi, cba.ls_ele) - - fade_in = np.arange(frame_len) / (frame_len - 1) - fade_in = fade_in[:, np.newaxis] - fade_out = 1.0 - fade_in - - R_old = np.eye(cba.num_channels) - - for i, (frame_in, frame_out) in framewise_io(cba.audio, out, frame_len): - # update the crossfade if we have a smaller last frame - if frame_out.shape[0] != frame_len: - frame_size = frame_out.shape[0] - fade_in = np.arange(frame_size) / (frame_size - 1) - fade_in = fade_in[:, np.newaxis] - fade_out = 1.0 - fade_in - - q = trj_data[i % trj_frames, :] - rotated_pos = np.array( - [rotateAziEle(a, e, Quat2RotMat(q)) for a, e in zip(cba.ls_azi, cba.ls_ele)] - ) - R = panner.pan(rotated_pos[:, 0], rotated_pos[:, 1]) - R[:, [cba.lfe_index]] = 0 - R[[cba.lfe_index], :] = 0 - R[cba.lfe_index, cba.lfe_index] = 1 - - frame_out[:, :] = (fade_in * frame_in @ R) + (fade_out * frame_in @ R_old) - - R_old = R.copy() - - return out - - -""" Helper functions """ - - -def render_lfe_to_binaural( - x: np.ndarray, - fs: Optional[int] = 48000, - lfe_index: Optional[list] = None, - LFE_gain: Optional[float] = BINAURAL_LFE_GAIN, -) -> Tuple[np.ndarray, int]: - """ - Extract LFE from the given input and render - it binaurally, accounting for delay - """ - - lfe = x[:, lfe_index].copy() - - # if there is more than one LFE sum them into one - if lfe.shape[1] > 1: - lfe = np.sum(lfe, axis=1) - - """ - # 120 Hz low-pass filtering for LFE using IVAS filter coefficients - if fs == 48000: - lfe = sig.sosfilt(IVAS_LPF_4_BUTTER_48K_SOS, lfe, axis=0) - else: - raise NotImplementedError("Only 48 kHz supported at the moment!") - - # 3.5ms LP filter delay from IVAS ROM - lfe_delay_ns = 0.0035 * 1e9 - lfe_delay_smp = round(lfe_delay_ns * fs / 1e9) - - # Delay LFE by the same amount as the HRTF delay - lfe = np.roll(lfe, round(latency_smp), axis=0) - lfe[0 : round(latency_smp), :] = 0 - """ - lfe_delay_ns = 0 - - # apply gain - lfe *= LFE_gain - - # duplicate for each binaural channel - if len(np.shape(lfe)) < 2: - lfe = lfe[:, np.newaxis] - lfe = np.hstack([lfe, lfe]) - - return lfe, lfe_delay_ns diff --git a/item_generation_scripts/audiotools/convert/masa.py b/item_generation_scripts/audiotools/convert/masa.py deleted file mode 100644 index 15f1c683..00000000 --- a/item_generation_scripts/audiotools/convert/masa.py +++ /dev/null @@ -1,165 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. 
-# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -from pathlib import Path -from typing import Optional, Union -from warnings import warn - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.convert import channelbased -from item_generation_scripts.audiotools.wrappers.masaRenderer import masaRenderer - -""" MetadataAssistedSpatialAudio functions """ - - -def convert_masa( - masa: audio.MetadataAssistedSpatialAudio, - out: audio.Audio, - **kwargs, -) -> audio.Audio: - """Convert Metadata Assisted Spatial audio to the requested output format""" - - # MASA -> Binaural - if isinstance(out, audio.BinauralAudio): - render_masa_to_binaural(masa, out, **kwargs) - - # MASA -> CBA - elif isinstance(out, audio.ChannelBasedAudio): - render_masa_to_cba(masa, out) - - # MASA -> SBA - elif isinstance(out, audio.SceneBasedAudio): - render_masa_to_sba(masa, out) - - else: - raise NotImplementedError( - f"Conversion from {masa.name} to {out.name} is unsupported!" - ) - - return out - - -def render_masa_to_binaural( - masa: audio.MetadataAssistedSpatialAudio, - bin: audio.BinauralAudio, - trajectory: Optional[Union[str, Path]] = None, - bin_dataset: Optional[str] = None, - **kwargs, -) -> None: - """ - Binauralization of MASA audio - - Parameters - ---------- - masa: audio.MetadataAssistedSpatialAudio - MASA input audio - bin: audio.BinauralAudio - Output binaural audio - trajectory: Optional[Union[str, Path]] - Head rotation trajectory path - bin_dataset: Optional[str] - Name of binaural dataset without prefix or suffix - """ - - if "ROOM" in bin.name: - cba_tmp = audio.fromtype("7_1_4") - cba_tmp.fs = masa.fs - - render_masa_to_cba(masa, cba_tmp) - - channelbased.render_cba_to_binaural(cba_tmp, bin, trajectory) - else: - if trajectory is not None: - warn( - f"Head-rotation not supported by MasaRenderer! Trajectory {trajectory} will be ignored!" 
- ) - if bin_dataset is not None: - warn( - "Binaural dataset selection not supported by MasaRenderer - please copy the required hrir.bin manually!" - ) - - bin.audio = masaRenderer(masa, "BINAURAL") - - -def render_masa_to_cba( - masa: audio.MetadataAssistedSpatialAudio, - cba: audio.ChannelBasedAudio, -) -> None: - """ - Rendering of MASA input signal to Channel-based format - - Parameters - ---------- - masa: audio.MetadataAssistedSpatialAudio - MASA input audio - cba: audio.ChannelBasedAudio - Channel-based output audio - """ - - if cba.name not in ["5_1", "7_1_4"]: - warn( - f"MasaRenderer does not support {cba.name} natively. Using 7_1_4 as an intermediate format." - ) - - cba_tmp = audio.fromtype("7_1_4") - cba_tmp.fs = masa.fs - cba_tmp.audio = masaRenderer(masa, cba_tmp.name) - - channelbased.render_cba_to_cba(cba_tmp, cba) - else: - cba.audio = masaRenderer(masa, cba.name) - - -def render_masa_to_sba( - masa: audio.MetadataAssistedSpatialAudio, - sba: audio.SceneBasedAudio, -) -> None: - """ - Rendering of MASA input signal to SBA format - - Parameters - ---------- - masa: audio.MetadataAssistedSpatialAudio - MASA input audio - sba: audio.SceneBasedAudio - SBA output audio - """ - - warn( - f"MasaRenderer does not support {sba.name} natively. Using 7_1_4 as an intermediate format." - ) - - cba_tmp = audio.fromtype("7_1_4") - cba_tmp.fs = masa.fs - cba_tmp.audio = masaRenderer(masa, cba_tmp.name) - - channelbased.render_cba_to_sba(cba_tmp, sba) diff --git a/item_generation_scripts/audiotools/convert/objectbased.py b/item_generation_scripts/audiotools/convert/objectbased.py deleted file mode 100644 index 9fb74ed1..00000000 --- a/item_generation_scripts/audiotools/convert/objectbased.py +++ /dev/null @@ -1,352 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. 
-# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -from itertools import repeat -from pathlib import Path -from typing import Optional, Tuple, Union - -import numpy as np - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.audioarray import delay, framewise_io -from item_generation_scripts.audiotools.binaural_datasets.binaural_dataset import ( - load_ir, -) -from item_generation_scripts.audiotools.binauralobjectrenderer import ( - binaural_fftconv_framewise, -) -from item_generation_scripts.audiotools.constants import IVAS_FRAME_LEN_MS -from item_generation_scripts.audiotools.convert.channelbased import ( - render_cba_to_binaural, -) -from item_generation_scripts.audiotools.convert.scenebased import getRSH -from item_generation_scripts.audiotools.EFAP import EFAP, wrap_angles -from item_generation_scripts.audiotools.rotation import Quat2RotMat, rotateAziEle -from item_generation_scripts.audiotools.wrappers.filter import resample_itu -from item_generation_scripts.utils import apply_func_parallel - -""" ObjectBasedAudio functions """ - - -def convert_objectbased( - oba: audio.ObjectBasedAudio, - out: audio.Audio, - **kwargs, -) -> audio.Audio: - """Convert an ISM signal to the requested output format""" - - # OBA -> Binaural - if isinstance(out, audio.BinauralAudio): - render_oba_to_binaural(oba, out, **kwargs) - - # OBA -> CBA - elif isinstance(out, audio.ChannelBasedAudio): - render_oba_to_cba(oba, out) - - # OBA -> SBA - elif isinstance(out, audio.SceneBasedAudio): - render_oba_to_sba(oba, out) - else: - raise NotImplementedError( - f"Conversion from {oba.name} to {out.name} is unsupported!" 
- ) - - return out - - -def render_oba_to_binaural( - oba: audio.ObjectBasedAudio, - bin: audio.BinauralAudio, - trajectory: Optional[Union[str, Path]] = None, - bin_dataset: Optional[str] = None, - **kwargs, -) -> None: - """ - Binauralization of ISM input signal - - Parameters - ---------- - oba: audio.ObjectBasedAudio - Object based input audio - bin: audio.BinauralAudio - Binaural output audio - trajectory: Optional[Union[str, Path]] - Head rotation trajectory - bin_dataset: Optional[str] - Name of binaural dataset, if None default dataset is used - """ - - # bin.audio = np.zeros([oba.audio.shape[0], bin.num_channels]) - - if "ROOM" in bin.name: - cba_tmp = audio.fromtype("7_1_4") - cba_tmp.fs = oba.fs - - render_oba_to_cba(oba, cba_tmp) - - render_cba_to_binaural(cba_tmp, bin, trajectory) - else: - IR, SourcePosition, latency_smp = load_ir(oba.name, bin.name, bin_dataset) - - oba.audio = resample_itu(oba, 48000) - fs_old = oba.fs - oba.fs = 48000 - - # apply processing for every object in parallel - obj_pos = oba.object_pos - obj_idx = list(range(oba.num_channels)) - result = apply_func_parallel( - render_object, - zip( - obj_idx, - obj_pos, - repeat(oba), - repeat(trajectory), - repeat(IR), - repeat(SourcePosition), - ), - None, - "mt", - False, - ) - - # sum results over all objects - bin.audio = np.sum(np.stack(result, axis=2), axis=2) - - # compensate delay from binaural dataset - bin.audio = delay(bin.audio, bin.fs, -latency_smp, samples=True) - - bin.audio = resample_itu(bin, fs_old) - bin.fs = fs_old - - -def render_oba_to_cba( - oba: audio.ObjectBasedAudio, - cba: audio.ChannelBasedAudio, -) -> None: - """ - Rendering of ISM input signal to channel-based format - - Parameters - ---------- - oba: audio.ObjectBasedAudio - Object based input audio - cba: audio.ChannelBasedAudio - Channel-based output audio - """ - - cba.audio = np.zeros([oba.audio.shape[0], cba.num_channels]) - - for obj_idx, obj_pos in enumerate(oba.object_pos): - obj_audio = oba.audio[:, [obj_idx]] - pos_frames = obj_pos.shape[0] - - frame_len = IVAS_FRAME_LEN_MS * (oba.fs // 1000) - - fade_in = np.arange(frame_len) / (frame_len - 1) - fade_in = fade_in[:, np.newaxis] - fade_out = 1.0 - fade_in - - # use EFAP for rendering - panner = EFAP( - np.delete(cba.ls_azi, cba.lfe_index), np.delete(cba.ls_ele, cba.lfe_index) - ) - gains_old = None - - for i, (frame_in, frame_out) in framewise_io(obj_audio, cba.audio, frame_len): - # update the crossfade if we have a smaller last frame - if frame_out.shape[0] != frame_len: - frame_size = frame_out.shape[0] - fade_in = np.arange(frame_size) / (frame_size - 1) - fade_in = fade_in[:, np.newaxis] - fade_out = 1.0 - fade_in - - azi, ele = wrap_angles(*obj_pos[i % pos_frames, :2], clip_ele=True) - gains = panner.pan(azi, ele) - for lfe in np.sort(cba.lfe_index): - gains = np.insert(gains, lfe, 0) - gains = gains[np.newaxis, :] - - if gains_old is None: - gains_old = gains.copy() - - frame_out[:] += (fade_in * frame_in @ gains) + ( - fade_out * frame_in @ gains_old - ) - - gains_old = gains.copy() - - -def render_oba_to_sba( - oba: audio.ObjectBasedAudio, - sba: audio.SceneBasedAudio, -) -> None: - """ - Rendering of ISM input signal to SBA format - - Parameters - ---------- - oba: audio.ObjectBasedAudio - Object based input audio - sba: audio.SceneBasedAudio - SBA output audio - """ - - sba.audio = np.zeros([oba.audio.shape[0], sba.num_channels]) - - for obj_idx, obj_pos in enumerate(oba.object_pos): - obj_audio = oba.audio[:, [obj_idx]] - pos_frames = obj_pos.shape[0] - - 
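# Worked example for the frame arithmetic below (assuming the usual IVAS frame
# length of 20 ms): at fs = 48000, frame_len = 20 * (48000 // 1000) = 960
# samples, so the spherical-harmonic gains are crossfaded once per 20 ms frame.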
frame_len = IVAS_FRAME_LEN_MS * (oba.fs // 1000) - - fade_in = np.arange(frame_len) / (frame_len - 1) - fade_in = fade_in[:, np.newaxis] - fade_out = 1.0 - fade_in - - gains_old = None - - for i, (frame_in, frame_out) in framewise_io(obj_audio, sba.audio, frame_len): - # update the crossfade if we have a smaller last frame - if frame_out.shape[0] != frame_len: - frame_size = frame_out.shape[0] - fade_in = np.arange(frame_size) / (frame_size - 1) - fade_in = fade_in[:, np.newaxis] - fade_out = 1.0 - fade_in - - pos = obj_pos[i % pos_frames, :] - gains = getRSH(np.array([pos[0]]), np.array([pos[1]]), sba.ambi_order) - - if gains_old is None: - gains_old = gains.copy() - - frame_out[:] += (fade_in * frame_in @ gains.T) + ( - fade_out * frame_in @ gains_old.T - ) - - gains_old = gains.copy() - - -def rotate_oba( - azi: np.ndarray, - ele: np.ndarray, - trajectory: Optional[str] = None, -) -> Tuple[np.ndarray, np.ndarray]: - """ - Application of head tracking trajectory - - Parameters: - ---------- - azi: np.ndarray - Azimuth coordinates of objects - ele: np.ndarray - Elevation coordinates of objects - trajectory: str - Head-tracking trajectory path - - Returns: - ---------- - azi_rot: np.ndarray - Azimuth coordinates after application of trajectory - ele_rot: np.ndarray - Elevation coordinates after application of trajectory - """ - - if trajectory is None: - return azi, ele - - trj_data = np.genfromtxt(trajectory, delimiter=",") - trj_frames = trj_data.shape[0] - - N_frames = azi.shape[0] - if ele.shape[0] != azi.shape[0]: - raise ValueError("Inconsistent input in azi and ele") - - azi_rot = np.zeros([N_frames]) - ele_rot = np.zeros([N_frames]) - - for i_frame in range(N_frames): - q = trj_data[i_frame % trj_frames, :] - azi_rot[i_frame], ele_rot[i_frame] = rotateAziEle( - azi[i_frame], ele[i_frame], Quat2RotMat(q) - ) - - return azi_rot, ele_rot - - -def render_object( - obj_idx: int, - obj_pos: np.ndarray, - oba: audio.ObjectBasedAudio, - trajectory: str, - IR: np.ndarray, - SourcePosition: np.ndarray, -) -> np.ndarray: - """ - Binaural rendering for one ISM object - - Parameters: - ---------- - obj_idx: int - Index of object in list of all objects - obj_pos: np.ndarray - Position of object - oba: audio.ObjectBasedAudio - Input ISM audio object - trajectory: str - Head-tracking trajectory path - IR: np.ndarray - HRIRs for binauralization - SourcePosition: np.ndarray - Positions of HRIR measurements - - Returns: - ---------- - result_audio: np.ndarray - Binaurally rendered object - """ - - # repeat each value four times since head rotation data is on sub-frame basis - azi = np.repeat(obj_pos[:, 0], 4) - ele = np.repeat(obj_pos[:, 1], 4) - # apply head-rotation trajectory - obj_audio = oba.audio[:, [obj_idx]] - azi, ele = rotate_oba(azi, ele, trajectory) - # convolve signal with HRIRs - result_audio = binaural_fftconv_framewise( - obj_audio, - IR, - SourcePosition, - azi, - ele, - ) - return result_audio diff --git a/item_generation_scripts/audiotools/convert/scenebased.py b/item_generation_scripts/audiotools/convert/scenebased.py deleted file mode 100644 index a7e89b4f..00000000 --- a/item_generation_scripts/audiotools/convert/scenebased.py +++ /dev/null @@ -1,429 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -from pathlib import Path -from typing import Optional, Union -from warnings import warn - -import numpy as np -from scipy.special import lpmv - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.audioarray import delay, framewise_io -from item_generation_scripts.audiotools.binaural_datasets.binaural_dataset import ( - load_ir, -) -from item_generation_scripts.audiotools.constants import ( - IVAS_FRAME_LEN_MS, - T_DESIGN_11_AZI, - T_DESIGN_11_ELE, - VERT_HOA_CHANNELS_ACN, -) -from item_generation_scripts.audiotools.convert import channelbased -from item_generation_scripts.audiotools.convert.binaural import binaural_fftconv -from item_generation_scripts.audiotools.EFAP import EFAP -from item_generation_scripts.audiotools.rotation import Quat2RotMat, SHrotmatgen -from item_generation_scripts.audiotools.wrappers.filter import resample_itu - -""" SceneBasedAudio functions """ - - -def convert_scenebased( - sba: audio.SceneBasedAudio, - out: audio.Audio, - **kwargs, -) -> audio.Audio: - """Convert scene-based audio to the requested output format""" - - # SBA -> Binaural - if isinstance(out, audio.BinauralAudio): - render_sba_to_binaural(sba, out, **kwargs) - - # SBA -> CBA - elif isinstance(out, audio.ChannelBasedAudio): - render_sba_to_cba(sba, out) - - # SBA -> SBA - elif isinstance(out, audio.SceneBasedAudio): - render_sba_to_sba(sba, out) - else: - raise NotImplementedError( - f"Conversion from {sba.name} to {out.name} is unsupported!" 
- ) - - return out - - -def render_sba_to_binaural( - sba: audio.SceneBasedAudio, - bin: audio.BinauralAudio, - trajectory: Optional[Union[str, Path]] = None, - bin_dataset: Optional[str] = None, - **kwargs, -) -> None: - """ - Binauralization of scene-based audio - - Parameters - ---------- - sba: audio.SceneBasedAudio - Input SBA audio - bin: audio.BinauralAudio - Output binaural audio - trajectory: Optional[Union[str, Path]] - Head rotation trajectory path - bin_dataset: Optional[str] - Name of binaural dataset without prefix or suffix - """ - - if trajectory is not None: - sba.audio = rotate_sba(sba, trajectory) - - if "ROOM" in bin.name: - cba_tmp = audio.fromtype("7_1_4") - cba_tmp.fs = sba.fs - - render_sba_to_cba(sba, cba_tmp) - - channelbased.render_cba_to_binaural(cba_tmp, bin, trajectory) - else: - IR, _, latency_smp = load_ir(sba.name, bin.name, bin_dataset) - - sba.audio = resample_itu(sba, 48000) - fs_old = sba.fs - sba.fs = 48000 - - bin.audio = binaural_fftconv(sba.audio, IR, sba.num_channels) - - # compensate delay from binaural dataset - bin.audio = delay(bin.audio, bin.fs, -latency_smp, samples=True) - - bin.audio = resample_itu(bin, fs_old) - bin.fs = fs_old - - -def render_sba_to_cba( - sba: audio.SceneBasedAudio, - cba: audio.ChannelBasedAudio, -) -> None: - """ - Rendering of SBA input signal to channel-based format - - Parameters - ---------- - sba: audio.SceneBasedAudio - Scene-based input audio - cba: audio.ChannelBasedAudio - Channel-based output audio - """ - - render_mtx = get_allrad_mtx(sba.ambi_order, cba) - cba.audio = sba.audio @ render_mtx.T - - -def render_sba_to_sba( - sba_in: audio.SceneBasedAudio, - sba_out: audio.SceneBasedAudio, -) -> None: - """ - Rendering of SBA input signal to SBA output format - - Parameters - ---------- - sba_in: audio.SceneBasedAudio - Scene-based input audio - sba_out: audio.SceneBasedAudio - Scene-based output audio - """ - - if sba_out.ambi_order > sba_in.ambi_order: - sba_out.audio = np.pad( - sba_in.audio, [[0, 0], [0, sba_out.num_channels - sba_in.num_channels]] - ) - elif sba_out.ambi_order < sba_in.ambi_order: - sba_out.audio = sba_in.audio[:, : sba_out.num_channels] - else: - # equal ambisonics orders: pass audio through unchanged - sba_out.audio = sba_in.audio - - if sba_out.is_planar: - zero_vert_channels(sba_out) - - -def rotate_sba( - sba: audio.SceneBasedAudio, - trajectory: str, -) -> np.ndarray: - """ - Rotate HOA signal by applying a rotation matrix calculated from the current quaternion - in each subframe - - Parameters: - ---------- - sba: audio.SceneBasedAudio - Input SBA signal, up to HOA3 - trajectory: str - Path to trajectory file - - Returns: - ---------- - y: np.ndarray - Rotated HOA signal - """ - - trj_data = np.genfromtxt(trajectory, delimiter=",") - trj_frames = trj_data.shape[0] - - sig_len = sba.audio.shape[0] - sig_dim = sba.audio.shape[1] - frame_len = (IVAS_FRAME_LEN_MS // 4) * 48 - - if sig_dim not in [4, 9, 16]: - raise ValueError("rotate_sba can only handle FOA, HOA2 or HOA3 signals!") - - out = np.zeros([sig_len, sig_dim]) - - fade_in = np.arange(frame_len) / (frame_len - 1) - fade_in = fade_in[:, np.newaxis] - fade_out = 1.0 - fade_in - - R = np.eye(sig_dim) - R_old = np.eye(sig_dim) - for i, (frame_in, frame_out) in framewise_io(sba.audio, out, frame_len): - # update the crossfade if we have a smaller last frame - if frame_out.shape[0] != frame_len: - frame_size = frame_out.shape[0] - fade_in = np.arange(frame_size) / (frame_size - 1) - fade_in = fade_in[:, np.newaxis] - fade_out = 1.0 - fade_in - - R_r = Quat2RotMat(trj_data[i % trj_frames, :]) - R[:, :] = SHrotmatgen(R_r,
order=ambi_order_from_nchan(sig_dim)) - - frame_out[:, :] = (fade_in * frame_in @ R.T) + (fade_out * frame_in @ R_old.T) - - R_old[:, :] = R.copy() - - return out - - -""" Helper functions """ - - -def zero_vert_channels(sba: audio.SceneBasedAudio) -> None: - """Remove all ambisonics parts with vertical components""" - sba.audio[:, VERT_HOA_CHANNELS_ACN[VERT_HOA_CHANNELS_ACN < sba.num_channels]] = 0 - - -def nchan_from_ambi_order(ambi_order: int) -> int: - """Compute number of channels based on ambisonics order""" - return (ambi_order + 1) ** 2 - - -def ambi_order_from_nchan(nchan: int) -> int: - """Compute ambisonics order based on number of channels""" - return int(np.sqrt(nchan) - 1) - - -def rE_weight(order: int) -> np.ndarray: - """Compute max-rE weighting matrix""" - return np.array( - [ - lpmv(0, l, np.cos(np.deg2rad(137.9) / (order + 1.51))) - for l in range(order + 1) - for _ in range(-l, l + 1) - ] - ).T - - -def n2sn(order: int) -> np.ndarray: - """Compute conversion matrix for N3D to SN3D normalization""" - return np.array( - [1.0 / np.sqrt(2 * l + 1) for l in range(order + 1) for _ in range(-l, l + 1)] - ) - - -def sn2n(order: int) -> np.ndarray: - """Compute conversion matrix for SN3D to N3D normalization""" - return np.array( - [np.sqrt(2 * l + 1) for l in range(order + 1) for _ in range(-l, l + 1)] - ) - - -def getRSH( - azi: np.ndarray, - ele: np.ndarray, - ambi_order: int, - norm: Optional[str] = "sn3d", - degrees: Optional[bool] = True, -) -> np.ndarray: - """ - Returns real spherical harmonic response for the given position(s) - - Parameters: - ---------- - azi: np.ndarray - Azimuth angles - ele: np.ndarray - Elevation angles - ambi_order: int - Ambisonics order - norm: Optional[str] - Normalization of ambisonic bases. - Possible values: "sn3d", "n3d", everything else is interpreted as orthogonal - degrees: Optional[bool] - If true azi and ele are interpreted as angles in degrees, otherwise as radians - - Returns: - ---------- - response: np.ndarray - Real spherical harmonic response - """ - - if degrees: - azi = np.deg2rad(azi) - ele = np.deg2rad(ele) - - azi = azi.astype("float64") - ele = ele.astype("float64") - - LM = np.array([(l, m) for l in range(ambi_order + 1) for m in range(-l, l + 1)]) - - response = np.zeros([LM.shape[0], azi.shape[0]]) - - # trig_term * legendre * uncondon - for i, (l, m) in enumerate(LM): - # N3D norm - response[i, :] = np.sqrt( - ((2 * l + 1) * float(np.math.factorial(l - np.abs(m)))) - / (4 * np.pi * float(np.math.factorial(l + np.abs(m)))) - ) - - # trig term - if m < 0: - response[i, :] *= np.sqrt(2) * np.sin(azi * np.abs(m)) - elif m == 0: - pass # response[i,:] *= 1 - else: - response[i, :] *= np.sqrt(2) * np.cos(azi * m) - - # legendre polynomial - a = lpmv(np.abs(m), l, np.sin(ele)) * ((-1) ** np.abs(m)) - if np.inf in a or -np.inf in a: - a[a == np.inf] = np.finfo(np.float64).max - a[a == -np.inf] = np.finfo(np.float64).min - warn( - "Warning: order too large -> leads to overflow. Inf values are discarded!" 
- ) - response[i, :] *= a - - if norm == "sn3d": - response *= np.sqrt(4 * np.pi) - response[:] = np.diag(n2sn(ambi_order)) @ response - elif norm == "n3d": - response *= np.sqrt(4 * np.pi) - else: - pass # ortho - - return response - - -def get_allrad_mtx( - ambi_order: int, - cba: audio.ChannelBasedAudio, - norm: Optional[str] = "sn3d", - rE_weight_bool: Optional[bool] = False, - intensity_panning: Optional[bool] = True, -) -> np.ndarray: - """ - Returns ALLRAD matrix - - Parameters: - ---------- - ambi_order: int - Ambisonics order - cba: audio.ChannelBasedAudio - Channel-based audio object - norm: Optional[str] - Normalization of ambisonic bases. - Possible values: "sn3d", "ortho", everything else is interpreted as n3d - rE_weight_bool: Optional[bool] - Flag for max-rE weighting - intensity_panning: Optional[bool] - Flag for intensity panning - - Returns: - ---------- - hoa_dec: np.ndarray - ALLRAD matrix - """ - - n_harm = nchan_from_ambi_order(ambi_order) - - if cba.name == "MONO": - hoa_dec = np.zeros([1, n_harm]) - hoa_dec[0, 0] = 1 - elif cba.name == "STEREO": - hoa_dec = np.zeros([2, n_harm]) - # Cardioids +/- 90 degrees - hoa_dec[0, 0] = 0.5 - hoa_dec[0, 1] = 0.5 - hoa_dec[1, 0] = 0.5 - hoa_dec[1, 1] = -0.5 - else: - Y_td = getRSH( - T_DESIGN_11_AZI, - T_DESIGN_11_ELE, - ambi_order, - norm="ortho", - ) - Y_td *= np.sqrt(4 * np.pi) - - n_ls_woLFE = cba.num_channels - len(cba.lfe_index) - ls_azi_woLFE = np.delete(cba.ls_azi, cba.lfe_index).astype(float) - ls_ele_woLFE = np.delete(cba.ls_ele, cba.lfe_index).astype(float) - - panner = EFAP(ls_azi_woLFE, ls_ele_woLFE, intensity_panning) - G_td = panner.pan(T_DESIGN_11_AZI, T_DESIGN_11_ELE) - - hoa_dec = (G_td.T @ Y_td.T) / T_DESIGN_11_AZI.size - - if norm == "sn3d": - hoa_dec = hoa_dec @ np.diag(sn2n(ambi_order)) - elif norm == "ortho": - hoa_dec *= np.sqrt(4 * np.pi) - - if rE_weight_bool: - a_n = rE_weight(ambi_order) - nrg_pre = np.sqrt(n_ls_woLFE / np.sum(a_n**2)) - hoa_dec = hoa_dec @ np.diag(a_n) * nrg_pre - - hoa_dec = np.insert(hoa_dec, cba.lfe_index, np.zeros(n_harm), axis=0) - - return hoa_dec diff --git a/item_generation_scripts/audiotools/metadata.py b/item_generation_scripts/audiotools/metadata.py deleted file mode 100644 index 0a4631ae..00000000 --- a/item_generation_scripts/audiotools/metadata.py +++ /dev/null @@ -1,571 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software.
This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -import csv -from pathlib import Path -from typing import Optional, TextIO, Tuple, Union - -import numpy as np - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.audio import fromtype -from item_generation_scripts.audiotools.audioarray import trim -from item_generation_scripts.audiotools.audiofile import read -from item_generation_scripts.audiotools.constants import IVAS_FRAME_LEN_MS - - -class Metadata: - def __init__(self, meta_file: Union[str, Path]): - self.meta_file = Path(meta_file) - - if not self.meta_file.exists(): - raise FileNotFoundError( - f"Scene description file {self.meta_file} does not exist!" - ) - - with open(self.meta_file) as f: - audio_file = self.meta_file.parent.joinpath(f.readline().strip()).absolute() - - if audio_file.suffix != ".wav": - raise ValueError( - "Scene description files can only be used with WAVE input!" 
- ) - - self.audio_array, self.fs = read(audio_file) - self.audio = [] - - num_audio = int(f.readline().strip()) - for _ in range(num_audio): - in_fmt = f.readline().strip().upper() - - if in_fmt == "ISM": - self.parse_ism_input(f) - elif in_fmt == "MASA": - self.parse_masa_input(f) - elif in_fmt == "MC": - self.parse_mc_input(f) - elif in_fmt == "SBA": - self.parse_sba_input(f) - else: - raise KeyError(f"Unknown input type in metadata file {in_fmt}") - - def parse_ism_input(self, f: TextIO): - start = int(f.readline().strip()) - 1 - - ism = fromtype("ISM1") - ism.audio = self.audio_array[:, start : start + 1] - ism.fs = self.fs - - line = f.readline().strip() - tmp_path = self.meta_file.parent.joinpath(line).absolute() - if tmp_path.exists(): - # csv metadata - ism.metadata_files = [tmp_path] - ism.init_metadata() - else: - # manually specified metadata - positions = [f.readline().strip() for _ in range(int(line))] - positions = np.genfromtxt( - positions, delimiter="," - ) # TODO can use ndmin = 2 with numpy > 1.23.0; check support - if positions.ndim == 1: - positions = positions[np.newaxis, :] - - obj_pos = [] - # repeat based on first column - for p in positions: - repeats = int(p[0]) - obj_pos.append(np.tile(p[1:], [repeats, 1])) - obj_pos = np.vstack(obj_pos) - - ism.object_pos = [obj_pos] - - self.audio.append(ism) - - def parse_masa_input(self, f: TextIO): - start = int(f.readline().strip()) - 1 - masa_tc = int(f.readline().strip()) - - masa = fromtype(f"MASA{masa_tc}") - masa.audio = self.audio_array[:, start : start + masa_tc] - masa.fs = self.fs - masa.metadata_files = [ - self.meta_file.parent.joinpath(f.readline().strip()).absolute() - ] - masa.init_metadata() - - self.audio.append(masa) - - def parse_mc_input(self, f: TextIO): - start = int(f.readline().strip()) - 1 - mc_fmt = f.readline().strip() - - mc = fromtype(mc_fmt) - mc.audio = self.audio_array[:, start : start + mc.num_channels] - mc.fs = self.fs - - self.audio.append(mc) - - def parse_sba_input(self, f: TextIO): - start = int(f.readline().strip()) - 1 - sba_order = int(f.readline().strip()) - - sba = fromtype(f"SBA{sba_order}") - sba.audio = self.audio_array[:, start : start + sba.num_channels] - sba.fs = self.fs - - self.audio.append(sba) - - def parse_optional_values(self, f: TextIO): - raise NotImplementedError( - "Additional configuration keys in metadata currently unsupported!" - ) - - # opts = {} - # original_pos = f.tell() - # key_value = f.readline().strip() - - # try to parse a key, otherwise reset read pointer - # for key in OPT_KEYS: - # if key_value.startswith(key): - # opts[key] = key_value.replace(key, "").replace(":", "") - # original_pos = f.tell() - # key_value = f.readline.strip() - # else: - # f.seek(original_pos) - # - - -def write_ISM_metadata_in_file( - metadata: list[np.ndarray], - file_name: list[Union[str, Path]], - automatic_naming: Optional[bool] = False, -) -> list[str, Path]: - """ - Write ISM metadata into csv file(s) - - Parameters - ---------- - metadata: list[np.ndarray] - List of metadata arrays - file_name: list[Union[str, Path]] - List of file names for csv files - automatic_naming: Optional[bool] - If true files are named automatically name.0.csv, name.1.csv, ... 
with name as the first entry of file_name - - Returns - ---------- - file_names: list[Union[str, Path]] - List of actually used file names - """ - - if not automatic_naming and len(metadata) != len(file_name): - raise ValueError("Number of metadata objects and file names has to match") - number_objects = len(metadata) - - if automatic_naming: - file_names = [] - for m_object in range(number_objects): - file_names.append(f"{file_name[0]}.{m_object}.csv") - else: - file_names = file_name - - for i, csv_file in enumerate(file_names): - number_frames = metadata[i].shape[0] - with open(csv_file, "w", newline="") as file: - writer = csv.writer(file) - for k in range(number_frames): - row_list = [ - "%+07.2f" % np.round(metadata[i][k, 0], 2), - "%+06.2f" % np.round(metadata[i][k, 1], 2), - "01.00", - "000.00", - "1.00", - ] - writer.writerow(row_list) - - return file_names - - -def trim_meta( - x: audio.ObjectBasedAudio, - limits: Optional[Tuple[int, int]] = None, - pad_noise: Optional[bool] = False, - samples: Optional[bool] = False, -) -> None: - """ - Trim or pad ISM including metadata - positive limits trim, negative limits pad - - Parameters - ---------- - x: audio.ObjectBasedAudio - ISM audio object - limits: Optional[Tuple[int, int]] - Number of samples to trim or pad at beginning and end - pad_noise: Optional[bool] - Flag for padding noise instead of silence - samples: Optional[bool] - Flag for interpreting limits as samples, otherwise milliseconds - """ - - if not limits: - return - - frame_length = int(IVAS_FRAME_LEN_MS * x.fs // 1000) - - # check if trim values are multiples of the frame length - if not samples: - pre_trim = int(limits[0] * x.fs // 1000) - post_trim = int(limits[1] * x.fs // 1000) - else: - pre_trim = limits[0] - post_trim = limits[1] - - if pre_trim % frame_length != 0 or post_trim % frame_length != 0: - raise ValueError( - f"ISM metadata padding and trimming only possible if pad/trim length is multiple of frame length. " - f"Frame length: {IVAS_FRAME_LEN_MS}ms" - ) - - # check if audio is multiple of frame length - if np.shape(x.audio)[0] % frame_length != 0: - raise ValueError( - f"ISM metadata padding and trimming only possible if audio length is multiple of frame length. " - f"Frame length: {IVAS_FRAME_LEN_MS}ms" - ) - - # check if metadata length fits exactly to audio length - for meta in x.object_pos: - if np.shape(meta)[0] * frame_length != np.shape(x.audio)[0]: - raise ValueError( - f"ISM metadata padding and trimming only possible if audio length is multiple of frame " - f"length and audio and metadata length match.
Frame length: {IVAS_FRAME_LEN_MS}ms" - ) - - # trim audio - x.audio = trim(x.audio, x.fs, limits, pad_noise, samples) - - # trim metadata - trim_frames_pre = int(pre_trim / frame_length) - trim_frames_post = int(post_trim / frame_length) - for i in range(len(x.object_pos)): - x.object_pos[i] = trim( - x.object_pos[i], - limits=(trim_frames_pre, trim_frames_post), - pad_noise=False, - samples=True, - ) - - # add radius 1 - if trim_frames_pre < 0: - x.object_pos[i][: abs(trim_frames_pre), 2] = 1 - if trim_frames_post < 0: - x.object_pos[i][abs(trim_frames_post) :, 2] = 1 - - return - - -def concat_meta_from_file( - audio_files: list[str], - meta_files: list[list[str]], - out_file: list[str], - input_fmt: str, - silence_pre: Optional[int] = 0, - silence_post: Optional[int] = 0, - preamble: Optional[int] = None, -) -> None: - """ - Concatenate ISM metadata from files - - Parameters - ---------- - audio_files: list[str] - List of audio file names - meta_files: list[list[str]] - List of corresponding metadata file names - out_file: list[str] - Name of concatenated output file - input_fmt: str - Input audio format - silence_pre: Optional[int] - Silence inserted before each item - silence_post: Optional[int] - Silence inserted after each item - preamble: Optional[int] - Length of preamble in milliseconds - """ - - # create audio objects - audio_objects = [] - fs = None - for i, audio_file in enumerate(audio_files): - # metadata is cut/looped to signal length in init of audio object - audio_object = audio.fromfile(input_fmt, audio_file, in_meta=meta_files[i]) - audio_objects.append(audio_object) - if fs: - if audio_object.fs != fs: - raise ValueError("Sampling rates of files to concatenate don't match") - else: - fs = audio_object.fs - - frame_length = int(IVAS_FRAME_LEN_MS * audio_objects[0].fs // 1000) - - # pad and concatenate - concat_meta_all_obj = [None] * audio_objects[0].num_channels - - for audio_item in audio_objects: - # check if audio is multiple of frame length - if np.shape(audio_item.audio)[0] % frame_length != 0: - raise ValueError( - f"ISM metadata padding and trimming only possible if audio length is multiple of frame length. " - f"Frame length: {IVAS_FRAME_LEN_MS}ms" - ) - - # check if metadata length fits exactly to audio length - for meta in audio_item.object_pos: - if np.shape(meta)[0] * frame_length != np.shape(audio_item.audio)[0]: - raise ValueError( - f"ISM metadata padding and trimming only possible if audio length is multiple of frame " - f"length and audio and metadata length match. 
Frame length: {IVAS_FRAME_LEN_MS}ms" - ) - - # pad - trim_meta( - audio_item, (-silence_pre, -silence_post) - ) # use negative value since we want to pad, not trim - - # concatenate - for idx, obj_pos in enumerate(audio_item.object_pos): - concat_meta_all_obj[idx] = ( - np.concatenate([concat_meta_all_obj[idx], obj_pos]) - if concat_meta_all_obj[idx] is not None - else obj_pos - ) - - # add preamble - if preamble: - concat_meta_all_obj = add_remove_preamble(concat_meta_all_obj, preamble) - - write_ISM_metadata_in_file(concat_meta_all_obj, out_file) - - return - - -def split_meta_in_file( - in_filename: Union[str, Path], - out_folder: Union[str, Path], - split_filenames: list[Union[str, Path]], - splits: list[int], - input_fmt: str, - meta_files: Optional[list[Union[str, Path]]] = None, - in_fs: Optional[int] = 48000, - preamble: Optional[int] = 0, -): - """ - Splits ISM metadata files into multiple shorter files - - Parameters - ---------- - in_filename: Union[str, Path] - Input filename (.pcm, .raw or .wav) - out_folder: Union[str, Path] - Output folder where to put the splits - split_filenames: list[Union[str, Path]] - List of names for the split files - splits: list[int] - List of sample indices where to cut the signal - input_fmt: str - Input audio format - meta_files: Optional[list[Union[str, Path]]] - List of ISM metadata file names - in_fs: Optional[int] - Input sampling rate, default 48000 Hz - preamble: Optional[int] - Length of preamble in milliseconds to remove before splitting - """ - - # create a list of output files - out_paths = [] - - # Read input file by creating ISM audio object - audio_object = audio.fromfile(input_fmt, in_filename, in_meta=meta_files, fs=in_fs) - - split_old = 0 - for idx, split in enumerate(splits): - out_paths_obj = [] - for obj in range(audio_object.num_channels): - out_file = ( - Path(out_folder) - / f"{Path(split_filenames[idx]).with_suffix(in_filename.suffix)}.{obj}.csv" - ) - - # add the path to our list - out_paths_obj.append(out_file) - - # remove preamble - if preamble: - preamble_frames = int(preamble / IVAS_FRAME_LEN_MS) - y = trim( - audio_object.object_pos[obj], - audio_object.fs, - (preamble_frames, 0), - samples=True, - ) - else: - y = audio_object.object_pos[obj] - - # split - split_start = int(split_old / IVAS_FRAME_LEN_MS / audio_object.fs * 1000) - split_end = int(split / IVAS_FRAME_LEN_MS / audio_object.fs * 1000) - y = y[split_start:split_end, :] - - # write file - write_ISM_metadata_in_file([y], [out_file]) - - out_paths.append(out_paths_obj) - - split_old = split - - return out_paths - - -def check_ISM_metadata( - in_meta: dict, - num_objects: int, - num_items: int, - item_names: Optional[list] = None, -) -> list: - """Find ISM metadata""" - - list_meta = [] - if in_meta is None: - for item in item_names: - list_item = metadata_search(Path(item).parent, [item], num_objects) - list_meta.append(list_item) - else: - if len(in_meta) == 1 and num_items != 1: - # automatic search for metadata files in folder for all items and objects - try: - path_meta = in_meta["all_items"] - except KeyError: - raise ValueError( - 'Only one metadata path is given but not with key "all_items".'
- ) - - list_meta = metadata_search(path_meta, item_names, num_objects) - - elif num_items == len(in_meta): - # search for every item individually - for item_idx in range(num_items): - # try to use item_names as keys - try: - if item_names: - current_item = in_meta[item_names[item_idx].name] - else: - raise KeyError - except KeyError: - current_item = in_meta[f"item{item_idx + 1}"] - - if len(current_item) == 1: - # automatic search in folder - list_item = metadata_search( - current_item[0], [item_names[item_idx]], num_objects - ) - - elif len(current_item) == num_objects: - # just read out - list_item = current_item - else: - raise ValueError("Number of objects and metadata does not match.") - list_meta.append(list_item) - else: - raise ValueError("Number of metadata inputs does not match number of items") - - # return list of lists of metadata files - return list_meta - - -def metadata_search( - in_meta_path: Union[str, Path], - item_names: list[Union[str, Path]], - num_objects: int, -) -> list[list[Union[Path, str]]]: - """Search for ISM metadata with structure item_name.{0-3}.csv in in_meta folder""" - - if not item_names: - raise ValueError("Item names not provided, can't search for metadata") - - list_meta = [] - for item in item_names: - list_item = [] - for obj_idx in range(num_objects): - file_name_meta = in_meta_path / Path(item.stem).with_suffix( - f"{item.suffix}.{obj_idx}.csv" - ) - # check if file exists and add to list - if file_name_meta.is_file(): - list_item.append(file_name_meta) - else: - raise ValueError(f"Metadata file {file_name_meta} not found.") - if len(item_names) == 1: - list_meta = list_item - else: - list_meta.append(list_item) - - return list_meta - - -def add_remove_preamble( - metadata, - preamble, - add: Optional[bool] = True, -): - preamble_frames = preamble / IVAS_FRAME_LEN_MS - if not preamble_frames.is_integer(): - raise ValueError( - f"Application of preamble for ISM metadata is only possible if preamble length is multiple of frame length. " - f"Frame length: {IVAS_FRAME_LEN_MS}ms" - ) - for obj_idx in range(len(metadata)): - if metadata is not None and metadata[obj_idx] is not None: - if add: - metadata[obj_idx] = trim( - metadata[obj_idx], - limits=(-int(preamble_frames), 0), - samples=True, - ) - - # add radius 1 - metadata[obj_idx][: int(preamble_frames), 2] = 1 - else: - metadata[obj_idx] = trim( - metadata[obj_idx], - limits=(int(preamble_frames), 0), - samples=True, - ) - - return metadata diff --git a/item_generation_scripts/audiotools/rotation.py b/item_generation_scripts/audiotools/rotation.py deleted file mode 100644 index 742548a8..00000000 --- a/item_generation_scripts/audiotools/rotation.py +++ /dev/null @@ -1,379 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -from typing import Optional, Tuple - -import numpy as np - -""" -Helper functions used by Ruedenberg, -an implementation of the algorithm in -Ivanic, J. & Ruedenberg, K., J. Phys. Chem. 100, 6342 (1996) -translated from ivas_rotation.c -""" - - -def SHrot_p( - i: int, - l: int, - a: int, - b: int, - SHrotmat: np.ndarray, - R_lm1: np.ndarray, -) -> float: - """Helper function to calculate the ps""" - - ri1 = SHrotmat[i + 1 + 1][1 + 1 + 1] - rim1 = SHrotmat[i + 1 + 1][-1 + 1 + 1] - ri0 = SHrotmat[i + 1 + 1][0 + 1 + 1] - - if b == -l: - R_lm1_1 = R_lm1[a + l - 1][0] - R_lm1_2 = R_lm1[a + l - 1][2 * l - 2] - p = ri1 * R_lm1_1 + rim1 * R_lm1_2 - else: - if b == l: - R_lm1_1 = R_lm1[a + l - 1][2 * l - 2] - R_lm1_2 = R_lm1[a + l - 1][0] - p = ri1 * R_lm1_1 - rim1 * R_lm1_2 - else: - R_lm1_1 = R_lm1[a + l - 1][b + l - 1] - p = ri0 * R_lm1_1 - - return p - - -def SHrot_u( - l: int, - m: int, - n: int, - SHrotmat: np.ndarray, - R_lm1: np.ndarray, -) -> float: - """Helper function to calculate the us""" - - return SHrot_p(0, l, m, n, SHrotmat, R_lm1) - - -def SHrot_v( - l: int, - m: int, - n: int, - SHrotmat: np.ndarray, - R_lm1: np.ndarray, -) -> float: - """Helper function to calculate the vs""" - - if m == 0: - p0 = SHrot_p(1, l, 1, n, SHrotmat, R_lm1) - p1 = SHrot_p(-1, l, -1, n, SHrotmat, R_lm1) - return p0 + p1 - else: - if m > 0: - d = 1.0 if (m == 1) else 0.0 - p0 = SHrot_p(1, l, m - 1, n, SHrotmat, R_lm1) - p1 = SHrot_p(-1, l, -m + 1, n, SHrotmat, R_lm1) - return p0 * np.sqrt(1.0 + d) - p1 * (1.0 - d) - else: - d = 1.0 if (m == -1) else 0.0 - p0 = SHrot_p(1, l, m + 1, n, SHrotmat, R_lm1) - p1 = SHrot_p(-1, l, -m - 1, n, SHrotmat, R_lm1) - return p0 * (1.0 - d) + p1 * np.sqrt(1.0 + d) - - -def SHrot_w( - l: int, - m: int, - n: int, - SHrotmat: np.ndarray, - R_lm1: np.ndarray, -) -> float: - """Helper function to calculate the w""" - - if m == 0: - raise ValueError("ERROR should not be called\n") - else: - if m > 0: - p0 = SHrot_p(1, l, m + 1, n, SHrotmat, R_lm1) - p1 = SHrot_p(-1, l, -m - 1, n, SHrotmat, R_lm1) - return p0 + p1 - else: - p0 = SHrot_p(1, l, m - 1, n, SHrotmat, R_lm1) - p1 = SHrot_p(-1, l, -m + 1, n, SHrotmat, 
R_lm1) - return p0 - p1 - - -def SHrotmatgen( - R: np.ndarray, - order: Optional[int] = 3, -) -> np.ndarray: - """ - Calculate SHD rotation matrix from that in real space - translated from ivas_rotation.c - - Parameters: - ---------- - R: np.ndarray - real-space rotation matrix - order: Optional[int] - Ambisonics order, default = 3 - - Returns: - ---------- - SHrotmat: np.ndarray - SHD rotation matrix - """ - - dim = (order + 1) * (order + 1) - - SHrotmat = np.zeros([dim, dim]) - R_lm1 = np.zeros([dim, dim]) - R_l = np.zeros([dim, dim]) - - SHrotmat[0][0] = 1.0 - - SHrotmat[1][1] = R[1][1] - SHrotmat[1][2] = R[1][2] - SHrotmat[1][3] = R[1][0] - - SHrotmat[2][1] = R[2][1] - SHrotmat[2][2] = R[2][2] - SHrotmat[2][3] = R[2][0] - - SHrotmat[3][1] = R[0][1] - SHrotmat[3][2] = R[0][2] - SHrotmat[3][3] = R[0][0] - - for i in range(2 * 1 + 1): - for j in range(2 * 1 + 1): - R_lm1[i][j] = SHrotmat[i + 1][j + 1] - - band_idx = 4 - for l in range(2, order + 1): - R_l[:, :] = 0.0 - - for m in range(-l, l + 1): - d = 1 if (m == 0) else 0 - absm = abs(m) - sql2mm2 = np.sqrt((l * l - m * m)) - sqdabsm = np.sqrt(((1 + d) * (l + absm - 1) * (l + absm))) - sqlabsm = np.sqrt(((l - absm - 1) * (l - absm))) - - for n in range(-l, l + 1): - if abs(n) == l: - sqdenom = np.sqrt((2 * l) * (2 * l - 1)) - else: - sqdenom = np.sqrt(l * l - n * n) - - u = sql2mm2 / sqdenom - v = sqdabsm / sqdenom * (1 - 2 * d) * 0.5 - w = sqlabsm / sqdenom * (1 - d) * (-0.5) - - if u != 0: - u = u * SHrot_u(l, m, n, SHrotmat, R_lm1) - if v != 0: - v = v * SHrot_v(l, m, n, SHrotmat, R_lm1) - if w != 0: - w = w * SHrot_w(l, m, n, SHrotmat, R_lm1) - R_l[m + l][n + l] = u + v + w - - for i in range(2 * l + 1): - for j in range(2 * l + 1): - SHrotmat[band_idx + i][band_idx + j] = R_l[i][j] - - for i in range(2 * l + 1): - for j in range(2 * l + 1): - R_lm1[i][j] = R_l[i][j] - - band_idx += 2 * l + 1 - - return SHrotmat - - -def Quat2Euler( - quat: np.ndarray, - degrees: bool = True, -) -> np.ndarray: - """Convert Quaternion to Euler angles""" - - sinr = +2.0 * (quat[..., 0] * quat[..., 1] + quat[..., 2] * quat[..., 3]) - cosr = +1.0 - 2.0 * (quat[..., 1] * quat[..., 1] + quat[..., 2] * quat[..., 2]) - roll = np.arctan2(sinr, cosr) - - sinp = +2.0 * (quat[..., 0] * quat[..., 2] - quat[..., 3] * quat[..., 1]) - pitch = np.where(np.fabs(sinp) >= 1, np.copysign(np.pi / 2, sinp), np.arcsin(sinp)) - - siny = +2.0 * (quat[..., 0] * quat[..., 3] + quat[..., 1] * quat[..., 2]) - cosy = +1.0 - 2.0 * (quat[..., 2] * quat[..., 2] + quat[..., 3] * quat[..., 3]) - yaw = np.arctan2(siny, cosy) - - ypr = np.array([yaw, pitch, roll]).T - - if degrees: - ypr = np.rad2deg(ypr) - - return ypr - - -def Euler2Quat( - ypr: np.ndarray, - degrees: bool = True, -) -> np.ndarray: - """Convert Euler angles to Quaternion""" - - if degrees: - ypr = np.deg2rad(ypr) - - if len(ypr.shape) == 2: - N_quat = ypr.shape[0] - quat = np.zeros([N_quat, 4]) - yaw = ypr[:, 0] - pitch = ypr[:, 1] - roll = ypr[:, 2] - else: - quat = np.zeros([4]) - yaw = ypr[0] - pitch = ypr[1] - roll = ypr[2] - - c1 = np.cos(0.5 * yaw) - c2 = np.cos(0.5 * pitch) - c3 = np.cos(0.5 * roll) - - s1 = np.sin(0.5 * yaw) - s2 = np.sin(0.5 * pitch) - s3 = np.sin(0.5 * roll) - - quat[..., 0] = c3 * c2 * c1 + s3 * s2 * s1 - quat[..., 1] = s3 * c2 * c1 - c3 * s2 * s1 - quat[..., 2] = s3 * c2 * s1 + c3 * s2 * c1 - quat[..., 3] = c3 * c2 * s1 - s3 * s2 * c1 - - return quat - - -def Quat2RotMat( - quat: np.ndarray, -) -> np.ndarray: - """Convert quaternion to rotation matrix""" - - R = np.zeros([3, 3]) - - if 
quat[0] != -3: - # Quaternions - # formula taken from ivas_rotation.c - - R[0, 0] = ( - quat[0] * quat[0] - + quat[1] * quat[1] - - quat[2] * quat[2] - - quat[3] * quat[3] - ) - R[0, 1] = 2.0 * (quat[1] * quat[2] - quat[0] * quat[3]) - R[0, 2] = 2.0 * (quat[1] * quat[3] + quat[0] * quat[2]) - - R[1, 0] = 2.0 * (quat[1] * quat[2] + quat[0] * quat[3]) - R[1, 1] = ( - quat[0] * quat[0] - - quat[1] * quat[1] - + quat[2] * quat[2] - - quat[3] * quat[3] - ) - R[1, 2] = 2.0 * (quat[2] * quat[3] - quat[0] * quat[1]) - - R[2, 0] = 2.0 * (quat[1] * quat[3] - quat[0] * quat[2]) - R[2, 1] = 2.0 * (quat[2] * quat[3] + quat[0] * quat[1]) - R[2, 2] = ( - quat[0] * quat[0] - - quat[1] * quat[1] - - quat[2] * quat[2] - + quat[3] * quat[3] - ) - - else: - # Euler angles in R_X(roll)*R_Y(pitch)*R_Z(yaw) convention - # - # yaw: rotate scene counter-clockwise in the horizontal plane - # pitch: rotate scene in the median plane, increase elevation with positive values - # roll: rotate scene from the right ear to the top - # - # formula taken from ivas_rotation.c - - c1 = np.cos(quat[3] / 180.0 * np.pi) - c2 = np.cos(quat[2] / 180.0 * np.pi) - c3 = np.cos(quat[1] / 180.0 * np.pi) - - s1 = np.sin(quat[3] / 180.0 * np.pi) - s2 = np.sin(-quat[2] / 180.0 * np.pi) - s3 = np.sin(quat[1] / 180.0 * np.pi) - - R[0, 0] = c2 * c3 - R[0, 1] = -c2 * s3 - R[0, 2] = s2 - - R[1, 0] = c1 * s3 + c3 * s1 * s2 - R[1, 1] = c1 * c3 - s1 * s2 * s3 - R[1, 2] = -c2 * s1 - - R[2, 0] = s1 * s3 - c1 * c3 * s2 - R[2, 1] = c3 * s1 + c1 * s2 * s3 - R[2, 2] = c1 * c2 - - return R - - -def rotateAziEle( - azi: float, - ele: float, - R: np.ndarray, - is_planar: bool = False, -) -> Tuple[float, float]: - """Rotate azimuth and elevation angles with rotation matrix""" - - w = np.cos(np.deg2rad(ele)) - dv = np.array( - [ - w * np.cos(np.deg2rad(azi)), - w * np.sin(np.deg2rad(azi)), - np.sin(np.deg2rad(ele)), - ] - ) - - dv_rot = R @ dv - - azi = np.rad2deg(np.arctan2(dv_rot[1], dv_rot[0])) - if is_planar: - ele = 0 - else: - ele = np.rad2deg(np.arctan2(dv_rot[2], np.sqrt(np.sum(dv_rot[:2] ** 2)))) - - return azi, ele diff --git a/item_generation_scripts/audiotools/utils.py b/item_generation_scripts/audiotools/utils.py deleted file mode 100644 index 6aaf5fa9..00000000 --- a/item_generation_scripts/audiotools/utils.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. 
This notice grants no license of any kind, including but not limited to patent
-# license, nor is any license granted by implication, estoppel or otherwise.
-#
-# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
-# contributions.
-#
-# This software is provided "AS IS", without any express or implied warranties. The software is in the
-# development stage. It is intended exclusively for experts who have experience with such software and
-# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
-# and fitness for a particular purpose are hereby disclaimed and excluded.
-#
-# Any dispute, controversy or claim arising under or in relation to providing this software shall be
-# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
-# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
-# the United Nations Convention on Contracts on the International Sales of Goods.
-#
-
-from pathlib import Path
-
-import numpy as np
-
-from item_generation_scripts.audiotools.rotation import Euler2Quat, Quat2Euler
-
-
-def read_trajectory(trj_file: Path, return_quat=True):
-    trj = np.genfromtxt(trj_file, delimiter=",")
-
-    if np.all(trj[:, 0] == -3):
-        # Euler
-        if return_quat:
-            return Euler2Quat(trj[:, 1:])
-        else:
-            return trj[:, 1:]
-    else:
-        # Quat
-        if return_quat:
-            return trj
-        else:
-            return Quat2Euler(trj)
-
-
-def write_trajectory(trj, out_file, write_quat=True):
-    if trj.shape[1] == 3:
-        # Euler
-        if write_quat:
-            trj = Euler2Quat(trj)
-        else:
-            trj = np.insert(trj, 0, -3.0, axis=1)
-    elif not write_quat:
-        trj = Quat2Euler(trj)
-        trj = np.insert(trj, 0, -3.0, axis=1)
-
-    with open(out_file, "w") as f:
-        for pos in trj:
-            f.write(", ".join([f"{q:.6f}" for q in pos]))
-            f.write("\n")
diff --git a/item_generation_scripts/audiotools/wrappers/__init__.py b/item_generation_scripts/audiotools/wrappers/__init__.py
deleted file mode 100644
index aea270d8..00000000
--- a/item_generation_scripts/audiotools/wrappers/__init__.py
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/usr/bin/env python3
-
-#
-# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
-# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
-# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
-# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
-# contributors to this repository. All Rights Reserved.
-#
-# This software is protected by copyright law and by international treaties.
-# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
-# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
-# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
-# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
-# contributors to this repository retain full ownership rights in their respective contributions in
-# the software. This notice grants no license of any kind, including but not limited to patent
-# license, nor is any license granted by implication, estoppel or otherwise.
-#
-# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
-# contributions.
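A minimal usage sketch for the trajectory helpers above (file names are hypothetical; it assumes the CSV conventions handled by read_trajectory, i.e. quaternion rows, or Euler rows marked by a leading -3 column):

from pathlib import Path

from item_generation_scripts.audiotools.utils import read_trajectory, write_trajectory

# read a head-rotation trajectory as quaternions (N, 4), regardless of the on-disk convention
quat = read_trajectory(Path("head_rotation.csv"))

# write it back as Euler angles; write_trajectory prepends the -3.0 marker column
write_trajectory(quat, "head_rotation_euler.csv", write_quat=False)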
-# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# diff --git a/item_generation_scripts/audiotools/wrappers/bs1770.py b/item_generation_scripts/audiotools/wrappers/bs1770.py deleted file mode 100644 index d238bec3..00000000 --- a/item_generation_scripts/audiotools/wrappers/bs1770.py +++ /dev/null @@ -1,291 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
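A minimal usage sketch for the loudness wrappers below (hypothetical file name; it assumes the bs1770demo binary is discoverable via find_binary or configured in binary_paths):

from item_generation_scripts.audiotools import audio
from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness, loudness_norm

item = audio.fromfile("STEREO", "item1.wav")

# one-shot measurement: loudness in LKFS and the scale factor towards the target
measured, scale = get_loudness(item, target_loudness=-26)

# iterative normalization: scales item.audio until within 0.5 LKFS of the target
normalized = loudness_norm(item, target_loudness=-26)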
-#
-
-import copy
-import logging
-from pathlib import Path
-from tempfile import TemporaryDirectory
-from typing import Optional, Tuple, Union
-from warnings import warn
-
-import numpy as np
-
-from item_generation_scripts.audiotools import audio, convert
-from item_generation_scripts.audiotools.audiofile import write
-from item_generation_scripts.audiotools.wrappers.filter import resample_itu
-from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES
-from item_generation_scripts.utils import find_binary, get_devnull, run
-
-logger = logging.getLogger("__main__")
-logger.setLevel(logging.DEBUG)
-
-
-def bs1770demo(
-    input: audio.Audio,
-    target_loudness: Optional[float] = -26,
-) -> Tuple[float, float]:
-    """
-    Wrapper for ITU-R BS.1770-4, requires bs1770demo binary
-
-    Parameters
-    ----------
-    input: Audio
-        Input audio
-    target_loudness: Optional[float]
-        Desired loudness in LKFS
-
-    Returns
-    -------
-    measured_loudness: float
-        Measured loudness of input
-    scale_factor: float
-        Scale factor to achieve desired loudness
-    """
-
-    null_file = get_devnull()
-
-    if "bs1770demo" in DEFAULT_CONFIG_BINARIES["binary_paths"]:
-        binary = find_binary(
-            DEFAULT_CONFIG_BINARIES["binary_paths"]["bs1770demo"].name,
-            binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["bs1770demo"].parent,
-        )
-    else:
-        binary = find_binary("bs1770demo")
-
-    if not isinstance(input, audio.BinauralAudio) and not isinstance(
-        input, audio.ChannelBasedAudio
-    ):
-        raise NotImplementedError(f"{input.name} is unsupported in ITU-R BS.1770-4.")
-
-    if input.fs != 48000:
-        warn(
-            "ITU-R BS.1770-4 only supports 48kHz sampling rate. Temporarily resampling signal for measurement."
-        )
-        tmp_sig = resample_itu(input, 48000)
-    else:
-        tmp_sig = input.audio
-
-    with TemporaryDirectory() as tmp_dir:
-        tmp_dir = Path(tmp_dir)
-        tmp_file = tmp_dir.joinpath("tmp_loudness.pcm")
-
-        """
-        ITU-R BS.1770-4
-        """
-
-        cmd = [
-            str(binary),
-            "-nchan",
-            str(input.num_channels),  # input nchan
-            "-lev",
-            str(target_loudness),  # level
-            "-conf",
-            "",  # config string
-            str(tmp_file),
-            null_file,
-        ]
-
-        if isinstance(input, audio.BinauralAudio):
-            cmd[6] = "00"  # -conf
-        elif isinstance(input, audio.ChannelBasedAudio):
-            # if loudspeaker position fulfills the criteria, set the config string to 1 for that index
-            conf_str = [
-                str(int(abs(e) < 30 and (60 <= abs(a) <= 120)))
-                for a, e in zip(input.ls_azi, input.ls_ele)
-            ]
-            for lfe in input.lfe_index:
-                conf_str[lfe] = "L"
-
-            cmd[6] = "".join(conf_str)
-
-        # write temporary file
-        write(tmp_file, tmp_sig, 48000)
-
-        # run command
-        result = run(cmd, logger=logger)
-
-        # parse output
-        measured_loudness = float(result.stdout.splitlines()[3].split(":")[1])
-        scale_factor = float(result.stdout.splitlines()[-3].split(":")[1])
-
-    return measured_loudness, scale_factor
-
-
-def get_loudness(
-    input: audio.Audio,
-    target_loudness: Optional[float] = -26,
-    loudness_format: Optional[str] = None,
-) -> Tuple[float, float]:
-    """
-    Loudness measurement using ITU-R BS.1770-4
-
-    Parameters
-    ----------
-    input: Audio
-        Input audio
-    target_loudness: Optional[float]
-        Desired loudness in LKFS
-    loudness_format: Optional[str]
-        Loudness format to render to for loudness computation (default input format if possible)
-
-    Returns
-    -------
-    measured_loudness: float
-        Measured loudness (after conversion to loudness_format if specified)
-    scale_factor: float
-        Scale factor to achieve desired loudness
-    """
-
-    if target_loudness > 0:
-        raise ValueError("Desired loudness is too high!")
-
-    if
loudness_format is None: - # for some formats rendering is necessary prior to loudness measurement - if isinstance(input, audio.SceneBasedAudio) or isinstance( - input, audio.MetadataAssistedSpatialAudio - ): - loudness_format = "7_1_4" - elif isinstance(input, audio.ObjectBasedAudio): - loudness_format = "BINAURAL" - elif hasattr(input, "layout_file"): - loudness_format = input.layout_file - else: - # default use input format - loudness_format = input.name - - # configure intermediate format - tmp = audio.fromtype(loudness_format) - tmp.fs = input.fs - - if input.name != loudness_format: - convert.format_conversion(input, tmp) - else: - tmp.audio = input.audio - - return bs1770demo(tmp, target_loudness) - - -def loudness_norm( - input: audio.Audio, - target_loudness: Optional[float] = -26, - loudness_format: Optional[str] = None, -) -> np.ndarray: - """ - Iterative loudness normalization using ITU-R BS.1770-4 - Signal is iteratively scaled after rendering to the specified format - until loudness converges to the target value - - Parameters - ---------- - input : Audio - Input audio - target_loudness: Optional[float] - Desired loudness level in LKFS - loudness_format: Optional[str] - Loudness format to render to for loudness computation (default input format) - - Returns - ------- - norm : Audio - Normalized audio - """ - - # repeat until convergence of loudness - measured_loudness = np.inf - scale_factor = 1 - num_iter = 1 - - while np.abs(measured_loudness - target_loudness) > 0.5 and num_iter < 10: - measured_loudness, scale_factor_new = get_loudness( - input, target_loudness, loudness_format - ) - - # scale input - input.audio *= scale_factor_new - - # update scale factor - scale_factor *= scale_factor_new - - num_iter += 1 - - if num_iter >= 10: - warn( - f"Loudness did not converge to desired value, stopping at: {measured_loudness:.2f}" - ) - - return input.audio - - -def scale_files( - file_list: list[list[Union[Path, str]]], - fmt: str, - loudness: float, - fs: Optional[int] = 48000, - in_meta: Optional[list] = None, -) -> None: - """ - Scales audio files to desired loudness - - Parameters - ---------- - file_list : list[list[Union[Path, str]]] - List of file paths in a list of the condition folders - fmt: str - Audio format of files in list - loudness: float - Desired loudness level in LKFS/dBov - fs: Optional[int] - Sampling rate - in_meta: Optional[list] - Metadata for ISM with same structure as file_list but one layer more - for the list of metadata for one file - """ - - if fmt.startswith("ISM") and in_meta: - meta_bool = True - else: - in_meta = copy.copy(file_list) - meta_bool = False - - for folder, meta_folder in zip(file_list, in_meta): - for file, meta in zip(folder, meta_folder): - # create audio object - if meta_bool: - audio_obj = audio.fromfile(fmt, file, fs, meta) - else: - audio_obj = audio.fromfile(fmt, file, fs) - - # adjust loudness - scaled_audio = loudness_norm(audio_obj, loudness) - - # write into file - write(file, scaled_audio, audio_obj.fs) diff --git a/item_generation_scripts/audiotools/wrappers/eid_xor.py b/item_generation_scripts/audiotools/wrappers/eid_xor.py deleted file mode 100644 index 0b807d94..00000000 --- a/item_generation_scripts/audiotools/wrappers/eid_xor.py +++ /dev/null @@ -1,193 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
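A usage sketch for the FER wrappers below (paths are hypothetical; signal length and preamble are in frames, the error rate in percent):

from pathlib import Path

from item_generation_scripts.audiotools.wrappers.eid_xor import create_and_apply_error_pattern

# apply 5% frame erasures to a G.192 bitstream of 3000 frames,
# leaving a 500-frame preamble error-free
create_and_apply_error_pattern(
    Path("item1.192"),      # input bitstream
    Path("item1_fer.192"),  # output bitstream
    len_sig=3000,
    error_rate=5,
    preamble=500,
)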
-# - -import os.path -from pathlib import Path -from typing import Optional, Union - -from item_generation_scripts.audiotools.wrappers.gen_patt import create_error_pattern -from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES -from item_generation_scripts.utils import find_binary, run - - -def eid_xor( - error_pattern: Union[str, Path], - in_bitstream: Union[str, Path], - out_bitstream: Union[str, Path], -) -> None: - """ - Wrapper for eid-xor binary to apply error patterns for the bitstream processing - - Parameters - ---------- - error_pattern: Union[str, Path] - Path to error pattern file - in_bitstream: Union[str, Path] - Path to input bitstream file - out_bitstream: Union[str, Path] - Output path for modified bitstream - """ - - # find binary - if "eid-xor" in DEFAULT_CONFIG_BINARIES["binary_paths"]: - binary = find_binary( - DEFAULT_CONFIG_BINARIES["binary_paths"]["eid-xor"].name, - binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["eid-xor"].parent, - ) - else: - binary = find_binary("eid-xor") - - # check for valid inputs - if not Path(in_bitstream).is_file(): - raise ValueError( - f"Input bitstream file {in_bitstream} for bitstream processing does not exist" - ) - elif not Path(error_pattern).is_file(): - raise ValueError( - f"Error pattern file {error_pattern} for bitstream processing does not exist" - ) - - # set up command line - cmd = [ - str(binary), - "-vbr", # Enables variable bit rate operation - "-fer", # Error pattern is a frame erasure pattern - in_bitstream, - error_pattern, - out_bitstream, - ] - - # run command - run(cmd) - - return - - -def create_and_apply_error_pattern( - in_bitstream: Union[Path, str], - out_bitstream: Union[Path, str], - len_sig: int, - error_pattern: Optional[Union[Path, str]] = None, - error_rate: Optional[float] = None, - preamble: Optional[int] = 0, - master_seed: Optional[int] = 0, - prerun_seed: Optional[int] = 0, -) -> None: - """ - Function to create (or use existing) frame error pattern for bitstream processing - - Parameters - ---------- - in_bitstream: Union[Path, str] - Path of input bitstream - out_bitstream: Union[Path, str] - Path of output bitstream - len_sig: int - Length of signal in frames - error_pattern: Optional[Union[Path, str]] - Path to existing error pattern - error_rate: float - Error rate in percent - preamble: Optional[int] - Length of preamble in frames - master_seed: Optional[int] - Master seed for error pattern generation - prerun_seed: Optional[int] - Number of preruns in seed generation - """ - - if error_pattern is None: - # create error pattern - if error_rate is not None: - error_pattern = in_bitstream.parent.joinpath("error_pattern").with_suffix( - ".192" - ) - create_error_pattern( - len_sig, error_pattern, error_rate, preamble, master_seed, prerun_seed - ) - else: - raise ValueError( - "Either error pattern or error rate has to be specified for bitstream processing" - ) - elif error_rate is not None: - raise ValueError( - "Error pattern and error rate are specified for bitstream processing. 
Can't use both"
-        )
-
-    # apply error pattern
-    eid_xor(error_pattern, in_bitstream, out_bitstream)
-
-    return
-
-
-def validate_error_pattern_application(
-    error_pattern: Optional[Union[Path, str]] = None,
-    error_rate: Optional[int] = None,
-) -> None:
-    """
-    Validate settings for frame error pattern application
-
-    Parameters
-    ----------
-    error_pattern: Optional[Union[Path, str]]
-        Path to existing error pattern
-    error_rate: Optional[int]
-        Frame error rate
-    """
-
-    if find_binary("gen-patt") is None:
-        raise FileNotFoundError(
-            "The binary gen-patt for error pattern generation was not found! Please check the configuration."
-        )
-    if find_binary("eid-xor") is None:
-        raise FileNotFoundError(
-            "The binary eid-xor for error pattern application was not found! Please check the configuration."
-        )
-    if error_pattern is not None:
-        if not os.path.exists(os.path.realpath(error_pattern)):
-            raise FileNotFoundError(
-                f"The frame error profile file {error_pattern} was not found! Please check the configuration."
-            )
-        if error_rate is not None:
-            raise ValueError(
-                "Frame error pattern and error rate are specified for bitstream processing. Can't use both! Please check the configuration."
-            )
-    else:
-        if error_rate is None:
-            raise ValueError(
-                "Either error rate or error pattern has to be specified for FER bitstream processing."
-            )
-        elif error_rate < 0 or error_rate > 100:
-            raise ValueError(
-                f"Specified error rate of {error_rate}% is either too large or too small."
-            )
-    return
diff --git a/item_generation_scripts/audiotools/wrappers/esdru.py b/item_generation_scripts/audiotools/wrappers/esdru.py
deleted file mode 100644
index 7785a586..00000000
--- a/item_generation_scripts/audiotools/wrappers/esdru.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#!/usr/bin/env python3
-
-#
-# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
-# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
-# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
-# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
-# contributors to this repository. All Rights Reserved.
-#
-# This software is protected by copyright law and by international treaties.
-# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
-# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
-# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
-# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
-# contributors to this repository retain full ownership rights in their respective contributions in
-# the software. This notice grants no license of any kind, including but not limited to patent
-# license, nor is any license granted by implication, estoppel or otherwise.
-#
-# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
-# contributions.
-#
-# This software is provided "AS IS", without any express or implied warranties. The software is in the
-# development stage. It is intended exclusively for experts who have experience with such software and
-# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
-# and fitness for a particular purpose are hereby disclaimed and excluded.
-# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Optional - -import numpy as np - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.audiofile import read, write -from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES -from item_generation_scripts.utils import find_binary, run - - -def esdru( - input: audio.Audio, - alpha: float, - sf: Optional[int] = 48000, - e_step: Optional[float] = 0.5, - seed: Optional[int] = 1, -) -> np.ndarray: - """ - Wrapper for ESDRU (Ericsson spatial distortion reference unit) Recommendation ITU-T P.811, requires esdru binary - - Parameters - ---------- - input : Audio - Input audio (16 bit Stereo PCM) - alpha: float - Alpha value [0.0 ... 1.0] - sf: Optional[int] - Sampling frequency FS Hz (Default: 48000 Hz) - e_step: Optional[float] - Max step S during high energy [0.0 ... 1.0] (Default: 0.5) - seed: Optional[int] - Set random seed I [unsigned int] (Default: 1) - - Returns - ------- - output: np.ndarray - Output array (16 bit Stereo PCM) - """ - if "esdru" in DEFAULT_CONFIG_BINARIES["binary_paths"]: - binary = find_binary( - DEFAULT_CONFIG_BINARIES["binary_paths"]["esdru"].name, - binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["esdru"].parent, - ) - else: - binary = find_binary("esdru") - - if not isinstance(input, audio.BinauralAudio) and not input.name == "STEREO": - raise Exception( - "ESDRU condition only available for STEREO or BINAURAL output format" - ) - - if alpha < 0.0 or alpha > 1.0: - raise Exception( - "Alpha value is out of bounds. Please choose a value between 0.0 and 1.0." - ) - - if e_step < 0.0 or e_step > 1.0: - raise Exception( - "Step value is out of bounds. Please choose a value between 0.0 and 1.0." - ) - - tmp_input_signal = input.audio - tmp_output_signal = np.ones((48000, 2)) - - with TemporaryDirectory() as tmp_dir: - tmp_dir = Path(tmp_dir) - tmp_input_file = tmp_dir.joinpath("tmp_input_signal.raw") - tmp_output_file = tmp_dir.joinpath("tmp_output_signal.raw") - - """ - ITU-T Recommendation P.811, ESDRU - """ - - cmd = [ - str(binary), - "-sf", - str(sf), - "-e_step", - str(e_step), - "-seed", - str(seed), - str(alpha), - str(tmp_input_file), - str(tmp_output_file), - ] - - # write temporary file - write(tmp_input_file, tmp_input_signal, sf) - write(tmp_output_file, tmp_output_signal, sf) - - # run command - run(cmd) - - tmp_output_signal, out_fs = read(tmp_output_file, 2, sf) - - return tmp_output_signal diff --git a/item_generation_scripts/audiotools/wrappers/filter.py b/item_generation_scripts/audiotools/wrappers/filter.py deleted file mode 100644 index 4c7b61b4..00000000 --- a/item_generation_scripts/audiotools/wrappers/filter.py +++ /dev/null @@ -1,366 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
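A usage sketch for the resampling wrapper below (hypothetical file name). Note that resample_itu returns the resampled samples but does not update input.fs, so the caller has to do that, as lpfilter_itu and hp50filter_itu do internally:

from item_generation_scripts.audiotools import audio
from item_generation_scripts.audiotools.wrappers.filter import resample_itu

item = audio.fromfile("STEREO", "item1_32k.wav", fs=32000)

# 32 kHz -> 48 kHz is the 3/2 ratio (SHQ3 up-sampling followed by SHQ2 down-sampling)
item.audio = resample_itu(item, 48000)
item.fs = 48000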
-#
-
-import re
-from copy import copy
-from pathlib import Path
-from tempfile import TemporaryDirectory
-from typing import Optional
-from warnings import warn
-
-import numpy as np
-
-from item_generation_scripts.audiotools.audio import Audio, ChannelBasedAudio
-from item_generation_scripts.audiotools.audioarray import delay_compensation
-from item_generation_scripts.audiotools.audiofile import read, write
-from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES
-from item_generation_scripts.utils import find_binary, run
-
-FILTER_TYPES_REGEX = r"[\n][\s]{3}[A-Z0-9]\w+\s+"
-
-
-def filter_itu(
-    input: Audio,
-    flt_type: str,
-    block_size: Optional[int] = None,
-    mod: Optional[bool] = False,
-    up: Optional[bool] = False,
-    down: Optional[bool] = False,
-    is_async: Optional[bool] = False,
-    delay: Optional[int] = None,
-    skip_channel: Optional[list[int]] = None,
-) -> np.ndarray:
-    """
-    Filter a multi-channel audio array with the ITU-T STL filter binary
-
-    Parameters
-    ----------
-    input: Audio
-        Input array
-    flt_type: str
-        Name of filter type used for filtering
-    block_size: Optional[int]
-        Processing block size in number of samples (default 256 samples)
-    mod: Optional[bool]
-        Flag for using the modified IRS characteristic
-    up: Optional[bool]
-        Flag for up-sampling
-    down: Optional[bool]
-        Flag for down-sampling
-    is_async: Optional[bool]
-        Flag for asynchronous operation
-    delay: Optional[int]
-        Delay in number of samples
-    skip_channel: Optional[list[int]]
-        List of channel indices which should not be filtered
-
-    Returns
-    -------
-    output: np.ndarray
-        Output filtered array
-    """
-
-    if "filter" in DEFAULT_CONFIG_BINARIES["binary_paths"]:
-        binary = find_binary(
-            DEFAULT_CONFIG_BINARIES["binary_paths"]["filter"].name,
-            binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["filter"].parent,
-        )
-    else:
-        binary = find_binary("filter")
-
-    # check if filter type is supported
-    tmp = run([binary], check=False)
-
-    FILTER_TYPES = [
-        f.group().strip() for f in re.finditer(FILTER_TYPES_REGEX, tmp.stdout)
-    ]
-
-    if flt_type not in FILTER_TYPES:
-        raise ValueError(
-            f"Filter type {flt_type} does not seem to be supported by the binary: {FILTER_TYPES}"
-        )
-
-    # create command line
-    cmd = [
-        binary,
-        "-q",
-    ]
-
-    if mod:
-        cmd.append("-mod")
-    if up and down:
-        raise ValueError("Either up-sampling or down-sampling has to be chosen")
-    if up:
-        cmd.append("-up")
-    elif down:
-        cmd.append("-down")
-    if is_async:
-        cmd.append("-async")
-    if delay:
-        cmd.extend(["-delay", str(delay)])
-
-    cmd.append(str(flt_type))
-
-    # create output array with according size
-    if up:
-        # upsampling -> size increases
-        if flt_type == "SHQ2":
-            output = np.zeros((np.shape(input.audio)[0] * 2, np.shape(input.audio)[1]))
-        elif flt_type == "SHQ3":
-            output = np.zeros((np.shape(input.audio)[0] * 3, np.shape(input.audio)[1]))
-        else:
-            raise ValueError(f"No upsampling with {flt_type} possible")
-    elif down:
-        # downsampling -> size decreases
-        if flt_type == "SHQ2":
-            output = np.zeros(
-                (int(np.ceil(np.shape(input.audio)[0] / 2)), np.shape(input.audio)[1])
-            )
-        elif flt_type == "SHQ3":
-            output = np.zeros(
-                (int(np.ceil(np.shape(input.audio)[0] / 3)), np.shape(input.audio)[1])
-            )
-        else:
-            raise ValueError(f"No downsampling with {flt_type} possible")
-    else:
-        # normal filtering -> size remains
-        output = np.zeros_like(input.audio)
-
-    with TemporaryDirectory() as tmp_dir:
-        tmp_dir = Path(tmp_dir)
-
-        # process channels separately
-        for channel in range(input.num_channels):
-            if skip_channel and channel
in skip_channel:
-                continue
-
-            cmd_in_out = cmd.copy()
-
-            tmp_in = tmp_dir.joinpath(f"tmp_filterIn{channel}.pcm")
-            tmp_out = tmp_dir.joinpath(f"tmp_filterOut{channel}.pcm")
-
-            cmd_in_out.append(str(tmp_in))
-            cmd_in_out.append(str(tmp_out))
-
-            if block_size:
-                cmd_in_out.append(str(block_size))
-
-            write(tmp_in, input.audio[:, channel], input.fs)
-
-            run(cmd_in_out)
-
-            a, _ = read(tmp_out, nchannels=1, fs=input.fs)
-            output[:, channel][:, None] = a
-
-    return output
-
-
-def lpfilter_itu(
-    x: Audio,
-    fc: int,
-) -> np.ndarray:
-    """
-    Low-pass filter a multi-channel audio array
-
-    Parameters
-    ----------
-    x: Audio
-        Input audio
-    fc: int
-        Cut-off frequency in Hz
-
-    Returns
-    -------
-    y: np.ndarray
-        Output low-pass filtered array
-    """
-
-    # find right filter type for cut-off frequency
-    flt_types = ["LP1p5", "LP35", "LP7", "LP10", "LP12", "LP14", "LP20"]
-    flt_vals = [1500, 3500, 7000, 10000, 12000, 14000, 20000]
-    try:
-        flt_type = flt_types[flt_vals.index(fc)]
-    except Exception:
-        raise ValueError(f"LP cut-off frequency {fc}Hz not supported.")
-
-    # resample if sampling rate is not supported
-    old_fs = None
-    tmp = copy(x)
-    if x.fs != 48000:
-        warn(
-            f"Filter type {flt_type} only supported for 48 kHz sampling rate, not for {x.fs}Hz -> resampling"
-        )
-        old_fs = x.fs
-        tmp.audio = resample_itu(tmp, 48000)
-        tmp.fs = 48000
-
-    # apply filter
-    y = filter_itu(tmp, flt_type=flt_type, block_size=960)
-
-    # delay compensation
-    y = delay_compensation(y, flt_type=flt_type, fs=tmp.fs)
-
-    # reverse resampling
-    if old_fs:
-        tmp.audio = y
-        y = resample_itu(tmp, old_fs)
-
-    return y
-
-
-def hp50filter_itu(
-    x: Audio,
-) -> np.ndarray:
-    """
-    High-pass 50Hz filter a multi-channel audio array
-
-    Parameters
-    ----------
-    x: Audio
-        Input audio
-
-    Returns
-    -------
-    y: np.ndarray
-        Output high-pass filtered array
-    """
-
-    # set filter type and check if sampling rate is supported
-    old_fs = None
-    tmp = copy(x)
-    if x.fs == 48000:
-        flt_type = "HP50_48KHZ"
-    elif x.fs == 32000:
-        flt_type = "HP50_32KHZ"
-    else:
-        # resample if sampling rate is not supported
-        warn(
-            f"Filter type HP50 only supported for 48 kHz and 32 kHz sampling rates, not for {x.fs}Hz -> resampling"
-        )
-        flt_type = "HP50_48KHZ"
-        old_fs = x.fs
-        tmp.audio = resample_itu(tmp, 48000)
-        tmp.fs = 48000
-
-    # don't apply high-pass filtering to LFE channel
-    if isinstance(x, ChannelBasedAudio):
-        skip_channel = x.lfe_index
-    else:
-        skip_channel = None
-
-    # apply filter
-    y = filter_itu(tmp, flt_type=flt_type, skip_channel=skip_channel)
-
-    # delay compensation
-    y = delay_compensation(y, flt_type=flt_type, fs=tmp.fs)
-
-    # reverse resampling
-    if old_fs:
-        tmp.audio = y
-        y = resample_itu(tmp, old_fs)
-
-    return y
-
-
-def resample_itu(
-    x: Audio,
-    fs_new: int,
-) -> np.ndarray:
-    """
-    Resampling of multi-channel audio array
-
-    Parameters
-    ----------
-    x: Audio
-        Input audio
-    fs_new: int
-        Target sampling rate in Hz
-
-    Returns
-    -------
-    y: np.ndarray
-        Output resampled array
-    """
-
-    fs_old = x.fs
-
-    # if the sampling rate is unchanged, do nothing
-    if fs_new == fs_old:
-        return x.audio
-
-    ratio_fs = fs_new / fs_old
-    up = [False]
-    down = [False]
-
-    # select suitable processing to achieve the target sampling rate
-    if ratio_fs == 2:
-        flt_type = ["SHQ2"]
-        up = [True]
-    elif ratio_fs == 0.5:
-        flt_type = ["SHQ2"]
-        down = [True]
-    elif ratio_fs == 3:
-        flt_type = ["SHQ3"]
-        up = [True]
-    elif ratio_fs == 1 / 3:
-        flt_type = ["SHQ3"]
-        down = [True]
-    elif ratio_fs == 2 / 3:
-        flt_type = ["SHQ2", "SHQ3"]
-        up = [True,
False]
-        down = [False, True]
-    elif ratio_fs == 3 / 2:
-        flt_type = ["SHQ3", "SHQ2"]
-        up = [True, False]
-        down = [False, True]
-    else:
-        raise ValueError("Ratio of input and output sampling frequency not supported")
-
-    # apply filter
-    y = copy(x)
-    for i, flt in enumerate(flt_type):
-        y.audio = filter_itu(y, flt_type=flt, up=up[i], down=down[i])
-        y.audio = delay_compensation(
-            y.audio, flt_type=flt, fs=y.fs, up=up[i], down=down[i]
-        )
-        # if up[i]:
-        #     if flt == "SHQ2":
-        #         y.fs = y.fs * 2
-        #     elif flt == "SHQ3":
-        #         y.fs = y.fs * 3
-        # elif down[i]:
-        #     if flt == "SHQ2":
-        #         y.fs = int(y.fs / 2)
-        #     elif flt == "SHQ3":
-        #         y.fs = int(y.fs / 3)
-
-    return y.audio
diff --git a/item_generation_scripts/audiotools/wrappers/gen_patt.py b/item_generation_scripts/audiotools/wrappers/gen_patt.py
deleted file mode 100644
index a68706a7..00000000
--- a/item_generation_scripts/audiotools/wrappers/gen_patt.py
+++ /dev/null
@@ -1,171 +0,0 @@
-#!/usr/bin/env python3
-
-#
-# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
-# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
-# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
-# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
-# contributors to this repository. All Rights Reserved.
-#
-# This software is protected by copyright law and by international treaties.
-# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
-# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
-# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
-# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
-# contributors to this repository retain full ownership rights in their respective contributions in
-# the software. This notice grants no license of any kind, including but not limited to patent
-# license, nor is any license granted by implication, estoppel or otherwise.
-#
-# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
-# contributions.
-#
-# This software is provided "AS IS", without any express or implied warranties. The software is in the
-# development stage. It is intended exclusively for experts who have experience with such software and
-# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
-# and fitness for a particular purpose are hereby disclaimed and excluded.
-#
-# Any dispute, controversy or claim arising under or in relation to providing this software shall be
-# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
-# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
-# the United Nations Convention on Contracts on the International Sales of Goods.
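A usage sketch for the pattern generator below (hypothetical output path; len_sig and preamble are in frames, the rate in percent):

from pathlib import Path

from item_generation_scripts.audiotools.wrappers.gen_patt import create_error_pattern

# 10% FER pattern for 3000 frames; the first 500 frames (preamble) stay error-free
create_error_pattern(
    len_sig=3000,
    path_pattern=Path("error_pattern.192"),
    frame_error_rate=10,
    preamble=500,
    master_seed=1234,
)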
-#
-
-from os import getcwd
-from pathlib import Path
-from tempfile import TemporaryDirectory
-from typing import Optional, Union
-
-from item_generation_scripts.audiotools.wrappers.random_seed import random_seed
-from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES
-from item_generation_scripts.utils import find_binary, run
-
-ERROR_PATTERNS_DIR = Path(__file__).parent.parent.parent.joinpath("error_patterns")
-
-
-def gen_patt(
-    len_sig: int,
-    path_pattern: Union[Path, str],
-    error_rate: float,
-    start: Optional[int] = 0,
-    working_dir: Optional[Union[Path, str]] = None,
-) -> None:
-    """
-    Wrapper for gen-patt binary to create error patterns for the bitstream processing
-
-    Parameters
-    ----------
-    len_sig: int
-        Length of signal in frames
-    path_pattern: Union[Path, str]
-        Path of output pattern
-    error_rate: float
-        Error rate in percent
-    start: Optional[int]
-        Start frame of error pattern (length of preamble)
-    working_dir: Optional[Union[Path, str]]
-        Directory where the binary should be called (the sta file has to be in this dir if desired)
-    """
-
-    # find binary
-    if "gen-patt" in DEFAULT_CONFIG_BINARIES["binary_paths"]:
-        binary = find_binary(
-            DEFAULT_CONFIG_BINARIES["binary_paths"]["gen-patt"].name,
-            binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["gen-patt"].parent,
-        )
-    else:
-        binary = find_binary("gen-patt")
-
-    if working_dir is None:
-        working_dir = getcwd()
-
-    # set up command line
-    cmd = [
-        str(binary),
-        "-tailstat",  # Statistics performed on the tail
-        "-fer",  # Frame erasure mode using Gilbert model
-        "-g192",  # Save error pattern in 16-bit G.192 format
-        "-gamma",  # Correlation for BER|FER modes
-        str(0),
-        "-rate",
-        str(error_rate / 100),
-        "-tol",  # Max deviation of specified BER/FER/BFER
-        str(0.001),
-        "-reset",  # Reset EID state in between iterations
-        "-n",
-        str(int(len_sig)),
-        "-start",
-        str(int(start) + 1),
-        path_pattern,
-    ]
-
-    # run command
-    run(cmd, cwd=working_dir)
-
-    return
-
-
-def create_error_pattern(
-    len_sig: int,
-    path_pattern: Union[Path, str],
-    frame_error_rate: float,
-    preamble: Optional[int] = 0,
-    master_seed: Optional[int] = 0,
-    prerun_seed: Optional[int] = 0,
-) -> None:
-    """
-    Creates error pattern with desired frame error rate for bitstream processing
-
-    Parameters
-    ----------
-    len_sig: int
-        Length of signal in frames
-    path_pattern: Union[Path, str]
-        Path of output pattern
-    frame_error_rate: float
-        Error rate in percent
-    preamble: Optional[int]
-        Length of preamble in frames
-    master_seed: Optional[int]
-        Master seed for error pattern generation
-    prerun_seed: Optional[int]
-        Number of preruns in seed generation
-    """
-
-    with TemporaryDirectory() as tmp_dir:
-        tmp_dir = Path(tmp_dir)
-
-        sta_file = ERROR_PATTERNS_DIR.joinpath("sta_template")
-        tmp_sta_file = tmp_dir.joinpath("sta")
-
-        # compute seed
-        seed = random_seed((0, 99999999), master_seed, prerun_seed)
-
-        # open file and modify
-        lines = []
-        with open(sta_file, "r") as sta_file_txt:
-            lines.append(sta_file_txt.readline())  # not changed
-            lines.append(f"{sta_file_txt.readline()[:-2]}{frame_error_rate/100}\n")
-            lines.append(sta_file_txt.readline())  # not changed
-            lines.append(f"{sta_file_txt.readline()[:-2]}{seed}\n")
-            lines.append(sta_file_txt.readline())  # not changed
-            lines.append(
-                f"{sta_file_txt.readline()[:-2]}{1-(frame_error_rate/100*2)}\n"
-            )
-            lines.append(sta_file_txt.readline())  # not changed
-            lines.append(
-                f"{sta_file_txt.readline()[:-2]}{1-(frame_error_rate/100*2)}\n"
-            )
-            lines.append(sta_file_txt.readline())
# not changed - - with open(tmp_sta_file, "w") as tmp_sta_file_txt: - tmp_sta_file_txt.write("".join(lines)) - - gen_patt( - len_sig=len_sig, - error_rate=frame_error_rate, - path_pattern=path_pattern, - start=preamble, - working_dir=tmp_dir, - ) - - return diff --git a/item_generation_scripts/audiotools/wrappers/masaRenderer.py b/item_generation_scripts/audiotools/wrappers/masaRenderer.py deleted file mode 100644 index a5987b1e..00000000 --- a/item_generation_scripts/audiotools/wrappers/masaRenderer.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
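A minimal sketch of how the MASA rendering wrapper below is typically called; it assumes an already constructed MetadataAssistedSpatialAudio object (see audiotools.audio) with its metadata_files attribute set:

import numpy as np

from item_generation_scripts.audiotools import audio
from item_generation_scripts.audiotools.wrappers.masaRenderer import masaRenderer


def render_masa(masa: audio.MetadataAssistedSpatialAudio) -> np.ndarray:
    # renders to a (N, 2) binaural array at the sampling rate of the input;
    # "5_1" and "7_1_4" are the other supported output formats
    return masaRenderer(masa, "BINAURAL")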
-# - -from pathlib import Path -from tempfile import TemporaryDirectory - -import numpy as np - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.audiofile import read, write -from item_generation_scripts.audiotools.wrappers.filter import resample_itu -from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES -from item_generation_scripts.utils import find_binary, run - - -def masaRenderer( - masa: audio.MetadataAssistedSpatialAudio, - out_fmt: str, -) -> np.ndarray: - """ - Wrapper for masaRenderer (from MASA reference software) - - Parameters - ---------- - masa : MetadataAssistedSpatialAudio - Input MASA audio - out_fmt: str - Desired output format (only 5_1, 7_1_4 and BINAURAL supported) - - Returns - ------- - output : np.ndarray - MASA rendered to out_fmt - """ - - if "masaRenderer" in DEFAULT_CONFIG_BINARIES["binary_paths"]: - binary = find_binary( - DEFAULT_CONFIG_BINARIES["binary_paths"]["masaRenderer"].name, - binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["masaRenderer"].parent, - ) - else: - binary = find_binary("masaRenderer") - - if out_fmt not in ["5_1", "7_1_4", "BINAURAL"]: - raise ValueError(f"Output format {out_fmt} is not supported by MasaRenderer!") - - if out_fmt == "5_1": - output_mode = "-LS51" - num_channels = 6 - elif out_fmt == "7_1_4": - output_mode = "-LS714" - num_channels = 12 - else: - output_mode = "-BINAURAL" - num_channels = 2 - - cmd = [ - str(binary), - output_mode, - "", # 2 -> inputPcm - str(masa.metadata_files.resolve()), - "", # 4 -> outputPcm - ] - - with TemporaryDirectory() as tmp_dir: - tmp_dir = Path(tmp_dir) - tmp_in = tmp_dir.joinpath("tmp_masaRendIn.pcm") - tmp_out = tmp_dir.joinpath("tmp_masaRendOut.pcm") - - cmd[2] = str(tmp_in) - cmd[4] = str(tmp_out) - - tmp_audio = resample_itu(masa, 48000) - old_fs = masa.fs - - write(tmp_in, tmp_audio, 48000) - - # we need to run in the masaRenderer directory to use the .bin files it requires - run(cmd, cwd=binary.resolve().parent) - - output, _ = read(tmp_out, num_channels) - - output_audio = audio.fromtype(out_fmt) - output_audio.audio = output - output_audio.fs = 48000 - output = resample_itu(output_audio, old_fs) - - return output diff --git a/item_generation_scripts/audiotools/wrappers/networkSimulator.py b/item_generation_scripts/audiotools/wrappers/networkSimulator.py deleted file mode 100644 index 4e74c3ce..00000000 --- a/item_generation_scripts/audiotools/wrappers/networkSimulator.py +++ /dev/null @@ -1,224 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -import logging -import os.path -from pathlib import Path -from typing import Optional, Union - -from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES -from item_generation_scripts.utils import find_binary, run - -LIST_JBM_PROFILES = range(12) -ERROR_PATTERNS_DIR = Path(__file__).parent.parent.parent.joinpath("dly_error_profiles") - - -def validate_network_simulator( - error_pattern: Optional[Union[Path, str]] = None, - error_profile: Optional[int] = None, - n_frames_per_packet: Optional[int] = None, -) -> None: - """ - Validate settings for the network simulator - - Parameters - ---------- - error_pattern: Optional[Union[Path, str]] - Path to existing error pattern - error_profile: Optional[int] - Index of existing error pattern - n_frames_per_packet: Optional[int] - Number of frames per packet - """ - - if "networkSimulator_g192" in DEFAULT_CONFIG_BINARIES["binary_paths"]: - binary = find_binary( - DEFAULT_CONFIG_BINARIES["binary_paths"]["networkSimulator_g192"].name, - binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"][ - "networkSimulator_g192" - ].parent, - ) - else: - binary = find_binary("networkSimulator_g192") - - if binary is None: - raise FileNotFoundError( - "The network simulator binary was not found! Please check the configuration." - ) - if error_pattern is not None: - if not os.path.exists(os.path.realpath(error_pattern)): - raise FileNotFoundError( - f"The network simulator error profile file {error_pattern} was not found! Please check the configuration." - ) - if error_profile is not None: - raise ValueError( - "JBM pattern and JBM profile number are specified for bitstream processing. Can't use both! Please check the configuration." - ) - elif error_profile is not None: - if error_profile not in LIST_JBM_PROFILES: - raise ValueError( - f"JBM profile number {error_profile} does not exist, should be between {LIST_JBM_PROFILES[0]} and {LIST_JBM_PROFILES[-1]}" - ) - if n_frames_per_packet is not None and n_frames_per_packet not in [1, 2]: - raise ValueError( - f"n_frames_per_packet is {n_frames_per_packet}. Should be 1 or 2. Please check your configuration."
- ) - - return - - -def network_simulator( - error_pattern: Union[str, Path], - in_bitstream: Union[str, Path], - out_bitstream: Union[str, Path], - n_frames_per_packet: int, - offset: int, - logger: Optional[logging.Logger] = None, -) -> None: - """ - Wrapper for the networkSimulator_g192 binary to apply error patterns for bitstream processing - - Parameters - ---------- - error_pattern: Union[str, Path] - Path to error pattern file - in_bitstream: Union[str, Path] - Path to input bitstream file - out_bitstream: Union[str, Path] - Output path for modified bitstream - n_frames_per_packet: int - Number of frames per packet [1, 2] - offset: int - Delay offset - logger: Optional[logging.Logger] - Logger instance - """ - - # find binary - if "networkSimulator_g192" in DEFAULT_CONFIG_BINARIES["binary_paths"]: - binary = find_binary( - DEFAULT_CONFIG_BINARIES["binary_paths"]["networkSimulator_g192"].name, - binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"][ - "networkSimulator_g192" - ].parent, - ) - else: - binary = find_binary("networkSimulator_g192") - - # check for valid inputs - if not Path(in_bitstream).is_file(): - raise ValueError( - f"Input bitstream file {in_bitstream} for bitstream processing does not exist" - ) - elif not Path(error_pattern).is_file(): - raise ValueError( - f"Error pattern file {error_pattern} for bitstream processing does not exist" - ) - - # set up command line - cmd = [ - str(binary), - error_pattern, - in_bitstream, - out_bitstream, - f"{out_bitstream}_tracefile_sim", - str(n_frames_per_packet), - str(offset), - ] - - # run command - run(cmd, logger=logger) - - return - - -def apply_network_simulator( - in_bitstream: Union[Path, str], - out_bitstream: Union[Path, str], - error_pattern: Optional[Union[Path, str]] = None, - error_profile: Optional[int] = None, - n_frames_per_packet: Optional[int] = None, - offset: Optional[int] = 0, - logger: Optional[logging.Logger] = None, -) -> None: - """ - Function to apply a network simulator profile to a bitstream - - Parameters - ---------- - in_bitstream: Union[Path, str] - Path of input bitstream - out_bitstream: Union[Path, str] - Path of output bitstream - error_pattern: Optional[Union[Path, str]] - Path to existing error pattern - error_profile: Optional[int] - Index of existing error pattern - n_frames_per_packet: Optional[int] - Number of frames per packet - offset: Optional[int] - Delay offset - logger: Optional[logging.Logger] - Logger instance - """ - - if error_pattern is None: - # create error pattern - if error_profile is not None: - if error_profile in LIST_JBM_PROFILES: - error_pattern = ERROR_PATTERNS_DIR.joinpath( - f"dly_error_profile_{error_profile}.dat" - ) - else: - raise ValueError( - f"JBM profile number {error_profile} does not exist, should be between {LIST_JBM_PROFILES[0]} and {LIST_JBM_PROFILES[-1]}" - ) - else: - raise ValueError( - "Either error pattern or error profile number has to be specified for network simulator bitstream processing" - ) - elif error_profile is not None: - raise ValueError( - "JBM pattern and JBM profile number are specified for bitstream processing.
Can't use both" - ) - - if n_frames_per_packet is None: - n_frames_per_packet = 1 - if error_profile is not None and error_profile == 5: - n_frames_per_packet = 2 - - # apply error pattern - network_simulator( - error_pattern, in_bitstream, out_bitstream, n_frames_per_packet, offset, logger - ) - - return diff --git a/item_generation_scripts/audiotools/wrappers/p50fbmnru.py b/item_generation_scripts/audiotools/wrappers/p50fbmnru.py deleted file mode 100644 index 2f4c19ef..00000000 --- a/item_generation_scripts/audiotools/wrappers/p50fbmnru.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -from pathlib import Path -from tempfile import TemporaryDirectory -from warnings import warn - -import numpy as np - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.audiofile import read, write -from item_generation_scripts.audiotools.wrappers.filter import resample_itu -from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES -from item_generation_scripts.utils import find_binary, run - - -def p50fbmnru( - input: audio.Audio, - q_db: float, -) -> np.ndarray: - """ - Wrapper for P.50 Fullband MNRU (Modulated Noise Reference Unit), requires p50fbmnru binary - The mode is M (Modulated Noise) as specified in section 5.2.1 of S4-141392 - EVS-7c Processing functions for characterization phase v110.doc - - Parameters - ---------- - input : Audio - Input audio - q_db: float - The ratio, in dB, of speech power to modulated noise power - - Returns - ------- - output: np.ndarray - Output array - """ - - if "p50fbmnru" in DEFAULT_CONFIG_BINARIES["binary_paths"]: - binary = find_binary( - DEFAULT_CONFIG_BINARIES["binary_paths"]["p50fbmnru"].name, - binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["p50fbmnru"].parent, - ) - else: - binary = find_binary("p50fbmnru") - - if input.fs != 48000: - warn("P.50 Fullband MNRU requires a sampling rate of 48kHz.") - tmp_sig = resample_itu(input, 48000) - else: - tmp_sig = input.audio - - tmp_input_signal = tmp_sig - tmp_output_signal = np.ones((48000, input.num_channels)) - - with TemporaryDirectory() as tmp_dir: - tmp_dir = Path(tmp_dir) - tmp_input_file = tmp_dir.joinpath("tmp_input_signal.raw") - tmp_output_file = tmp_dir.joinpath("tmp_output_signal.raw") - - """ - P.50 Fullband MNRU - """ - - cmd = [ - str(binary), - str(tmp_input_file), - str(tmp_output_file), - str(q_db), - "M", - ] - - # write temporary file - write(tmp_input_file, tmp_input_signal) - write(tmp_output_file, tmp_output_signal) - - # run command - run(cmd) - - tmp_output_signal, out_fs = read(tmp_output_file, input.num_channels) - - return tmp_output_signal diff --git a/item_generation_scripts/audiotools/wrappers/random_seed.py b/item_generation_scripts/audiotools/wrappers/random_seed.py deleted file mode 100644 index 01cf0870..00000000 --- a/item_generation_scripts/audiotools/wrappers/random_seed.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. 
This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -from typing import Optional, Tuple - -from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES -from item_generation_scripts.utils import find_binary, run - - -def random_seed( - range: Tuple[int, int], - master_seed: Optional[int] = 0, - prerun_seed: Optional[int] = 0, - hexa: Optional[bool] = True, -) -> int: - """ - Wrapper for the random binary to generate one random value within a range - - Parameters - ---------- - range: Tuple[int, int] - Inclusive range for the generated value - master_seed: Optional[int] - Master seed for error pattern generation - prerun_seed: Optional[int] - Number of preruns in seed generation - hexa: Optional[bool] - Flag if output should be in hexadecimal or decimal format - - Returns - ------- - result: int - One random value (a hexadecimal string if hexa is True) - """ - - # find binary - if "random" in DEFAULT_CONFIG_BINARIES["binary_paths"]: - binary = find_binary( - DEFAULT_CONFIG_BINARIES["binary_paths"]["random"].name, - binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["random"].parent, - ) - else: - binary = find_binary("random") - - # set up command line - cmd = [ - str(binary), - "-n", # Number of items - str(1), - "-s", - str(master_seed), - "-d", - str(prerun_seed), - "-r", # value range for results - str(range[0]), - str(range[1]), - ] - - # run command - result = run(cmd) - result = int(result.stdout[:-1]) - - if hexa: - result = hex(result) - - return result diff --git a/item_generation_scripts/binary_paths.yml b/item_generation_scripts/binary_paths.yml deleted file mode 100644 index bafcacfc..00000000 --- a/item_generation_scripts/binary_paths.yml +++ /dev/null @@ -1,30 +0,0 @@ ---- -################################################ -# Binary paths -################################################ -### Custom binary paths and names can be specified here. -### If not defined here, the binaries in item_generation_scripts/bin will be used -### If binaries are neither specified here nor found in the bin folder, the scripts will look for them in $PATH -### DO NOT change the location of this file. -### DO NOT USE relative paths. The paths have to be absolute. -### DO NOT change the default keys.
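# Editor's note: a short sketch of how these YAML entries are consumed by the
# scripts (see get_binary_paths() and find_binary() in utils.py; the 'filter'
# key is just an example):
#
#     paths = get_binary_paths("binary_paths.yml")  # {} if everything is commented out
#     if "filter" in paths:
#         binary = find_binary(paths["filter"].name, binary_path=paths["filter"].parent)
#     else:
#         binary = find_binary("filter")  # falls back to the bin folder, then $PATH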
-### For example, if the user has renamed the 'filter' binary to 'foo' then use --> filter: path/to/binary/foo - -# ### Binary for resampling and filtering -# filter: "path/to/binary/filter_new" -# ### Binary for loudness adjustment -# bs1770demo: "path/to/binary/bs1880" -# ### Binary for MNRU -# p50fbmnru: "path/to/binary/p50fbmnru" -# ### Binary for ESDRU -# esdru: "path/to/binary/esdru" -# ### Binary for frame error pattern application -# eid-xor: "path/to/binary/eid-xor" -# ### Binary for error pattern generation -# gen-patt: "path/to/binary/gen-patt" -# ### Binary for random offset/seed generation -# random: "path/to/binary/random" -# ### Binary for JBM network simulator -# networkSimulator_g192: "path/to/binary/networkSimulator_g192" -# ### Binary for MASA rendering -# masaRenderer: "path/to/binary/masaRenderer" \ No newline at end of file diff --git a/item_generation_scripts/processing/__init__.py b/item_generation_scripts/processing/__init__.py deleted file mode 100644 index aea270d8..00000000 --- a/item_generation_scripts/processing/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods.
-# diff --git a/item_generation_scripts/processing/preprocessing_2.py b/item_generation_scripts/processing/preprocessing_2.py deleted file mode 100644 index 1152ccc7..00000000 --- a/item_generation_scripts/processing/preprocessing_2.py +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -import logging -from pathlib import Path -from warnings import warn - -import numpy as np - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.audioarray import delay, trim -from item_generation_scripts.audiotools.audiofile import write -from item_generation_scripts.audiotools.metadata import ( - add_remove_preamble, - write_ISM_metadata_in_file, -) -from item_generation_scripts.audiotools.wrappers.bs1770 import ( - get_loudness, - loudness_norm, -) -from item_generation_scripts.audiotools.wrappers.random_seed import random_seed -from item_generation_scripts.processing.processing import Processing - - -class Preprocessing2(Processing): - def __init__(self, attrs: dict): - super().__init__(attrs) - self.name = "pre_2" - - def process(self, in_file: Path, out_file: Path, in_meta, logger: logging.Logger): - logger.debug(f"Preprocessing2 configuration: {self.__dict__}") - logger.debug(f"Preprocessing2 {in_file.absolute()} -> {out_file.absolute()}") - - # load in file - audio_object = audio.fromfile( - self.in_fmt, in_file, fs=self.in_fs, in_meta=in_meta - ) - - # add preamble - if self.preamble: - # also apply preamble to ISM metadata - if self.in_fmt.startswith("ISM"): - # read out old - metadata = [] - for meta in in_meta: - metadata.append(np.genfromtxt(meta, delimiter=",")) - - # modify metadata - metadata = add_remove_preamble(metadata, self.preamble) - meta_files = write_ISM_metadata_in_file(metadata, [out_file], True) - - # modify audio object - audio_object.metadata_files = meta_files - audio_object.object_pos = metadata - - # add preamble to actual signal - audio_object.audio = trim( - audio_object.audio, - audio_object.fs, - (-self.preamble, 0), - self.pad_noise_preamble, - ) - - # add background noise - if self.background_noise: - audio_object.audio = self.add_background_noise(audio_object, in_meta) - - # save file - write(out_file, audio_object.audio, fs=audio_object.fs) - - return - - def add_background_noise(self, audio_object: audio.Audio, in_meta) -> np.ndarray: - # range for random delay - range_delay = (1, 2400000) - - # load background noise - noise_object = audio.fromfile( - self.in_fmt, - self.background_noise["background_noise_path"], - fs=self.in_fs, - in_meta=in_meta, - ) - - # if noise is too short raise error - if len(noise_object.audio) < len(audio_object.audio): - raise ValueError("Background noise too short for audio signal") - if len(noise_object.audio) - range_delay[1] < len(audio_object.audio): - warn( - "Background noise may be too short for audio signal when considering the random delay" - ) - - # measure loudness of audio signal based on output format - tmp_object = audio.fromtype(self.out_fmt) - if ( - isinstance(tmp_object, audio.ObjectBasedAudio) - or isinstance(tmp_object, audio.SceneBasedAudio) - or isinstance(tmp_object, audio.MetadataAssistedSpatialAudio) - ): - out_format = None - else: - out_format = self.out_fmt - - loudness_signal, _ = get_loudness(audio_object, loudness_format=out_format) - - # compute desired loudness of background noise - loudness_noise = loudness_signal - self.background_noise["snr"] - - # apply random delay and cut signal - rand_delay = random_seed( - range=range_delay, - master_seed=self.background_noise["master_seed"], - prerun_seed=self.background_noise["seed_delay"], - hexa=False, - ) - noise_object.audio = delay( - noise_object.audio, delay=-rand_delay, samples=True, fs=noise_object.fs - )[: len(audio_object.audio)] - - # scale background noise to desired loudness based on
output format - noise_object.audio = loudness_norm(noise_object, loudness_noise, out_format) - - # add array to signal - audio_object.audio = noise_object.audio + audio_object.audio - - return audio_object.audio diff --git a/item_generation_scripts/processing/processing.py b/item_generation_scripts/processing/processing.py deleted file mode 100644 index ad2cf272..00000000 --- a/item_generation_scripts/processing/processing.py +++ /dev/null @@ -1,455 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
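# Editor's note: an illustrative recap, not from the original sources, of the
# background-noise scaling in add_background_noise() above. Loudness values are
# in dB, so the noise target is the measured signal loudness minus the SNR:
#
#     loudness_signal = -26.0                 # measured via get_loudness()
#     snr = 20.0                              # self.background_noise["snr"]
#     loudness_noise = loudness_signal - snr  # -46.0, passed to loudness_norm()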
-# - -import logging -from abc import ABC, abstractmethod -from itertools import repeat -from pathlib import Path -from shutil import copyfile -from typing import Iterable, Union -from warnings import warn - -import numpy as np - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.audiofile import ( - concat, - read, - split, - trim, - write, -) -from item_generation_scripts.audiotools.metadata import ( - add_remove_preamble, - concat_meta_from_file, - metadata_search, - split_meta_in_file, - write_ISM_metadata_in_file, -) -from item_generation_scripts.audiotools.wrappers.bs1770 import scale_files -from item_generation_scripts.constants import LOGGER_DATEFMT, LOGGER_FORMAT -from item_generation_scripts.processing.config import TestConfig -from item_generation_scripts.utils import apply_func_parallel, list_audio, pairwise - - -class Processing(ABC): - def __init__(self, attrs: dict): - self.__dict__.update(attrs) - - @abstractmethod - def process( - self, in_file: Path, out_file: Path, in_meta, logger: logging.Logger - ) -> None: - pass - - -def reorder_items_list(items_list: list, concatenation_order: list) -> list: - name_to_full = {Path(full_file).name: full_file for full_file in items_list} - ordered_full_files = [ - name_to_full[name] for name in concatenation_order if name in name_to_full - ] - return ordered_full_files - - -def concat_setup(cfg: TestConfig, chain, logger: logging.Logger): - n_items_list = len(cfg.items_list) - cfg_pre2 = chain[0] - - # check for text files - if any([i for i in cfg.items_list if i.suffix == ".txt"]): - raise SystemExit("Concatenation for text files is unsupported") - - # apply concatenation order - if cfg_pre2.concatenation_order is not None: - n_concatenation_order = len(cfg_pre2.concatenation_order) - if n_concatenation_order != n_items_list: - warn( - f"Warning: Mismatch in specified concatenation order and number of items to process!\n" - f"Number of items specified in concatenation order: {n_concatenation_order}\n" - f"Number of items in the directory: {n_items_list}\n" - f"Concatenation will use the following order:\n{cfg_pre2.concatenation_order}" - ) - - logger.info(f"Concatenating input files in directory {cfg.input_path}") - - # concatenate ISM metadata - if cfg.input["fmt"].startswith("ISM"): - cfg.concat_meta = [] - for obj_idx in range(len(cfg.metadata_path[0])): - cfg.concat_meta.append( - cfg.tmp_dirs[0].joinpath( - f"{cfg.input_path.name}_concatenated.wav.{obj_idx}.csv" - ) - ) - concat_meta_from_file( - cfg.items_list, - cfg.metadata_path, - cfg.concat_meta, - cfg.input["fmt"], - ) - - # set input to the concatenated file we have just written to the output dir - cfg.metadata_path = [cfg.concat_meta] - - # concatenate audio - cfg.concat_file = cfg.tmp_dirs[0].joinpath( - f"{cfg.input_path.name}_concatenated.wav" - ) - - # determine number of channels for pcm and raw files - tmp_audio = audio.fromtype(cfg_pre2.in_fmt) - tmp_num_chans = tmp_audio.num_channels - - cfg.splits = concat( - cfg.items_list, - cfg.concat_file, - in_fs=cfg.input.get("fs", 48000), - num_channels=tmp_num_chans, - ) - - # save item naming for splits naming in the end - cfg.split_names = [] - for name in cfg.items_list: - cfg.split_names.append(Path(name).stem.split(".")[0]) - # set input to the concatenated file we have just written to the output dir - cfg.items_list = [cfg.concat_file] - - # write out splits - with open(cfg.concat_file.with_suffix(".splits.log"), "w") as f: - print(", ".join([str(s) for s in cfg.splits]), 
file=f) - print(", ".join([str(sn) for sn in cfg.split_names]), file=f) - print(", ".join([str(i.stem) for i in cfg.items_list]), file=f) - - logger.info(f"Splits written to file {cfg.concat_file.with_suffix('.splits.log')}") - - -def concat_teardown(cfg: TestConfig, logger: logging.Logger): - if not cfg.splits: - raise ValueError("Splitting not possible without split marker") - - output_format = cfg.postprocessing["fmt"] - - out_files = [] - out_meta = [] - - logger.info(f"Splitting output file in directory {cfg.output_path}") - - for odir in cfg.out_dirs: - path_input = odir / cfg.items_list[0].name - out_paths = split( - path_input, - odir, - cfg.split_names, - cfg.splits, - in_fs=cfg.postprocessing["fs"], - ) - - logger.debug( - f"Resulting split files condition {odir.name}: {', '.join([str(op) for op in out_paths])}" - ) - out_files.append(out_paths) - - # split ISM metadata - if output_format.startswith("ISM"): - for odir in cfg.out_dirs: - path_input = odir / cfg.items_list[0].name - out_meta_paths = split_meta_in_file( - path_input, - odir, - cfg.split_names, - cfg.splits, - output_format, - meta_files=cfg.metadata_path[0], - ) - out_meta.append(out_meta_paths) - - # remove concatenated file - if cfg.delete_tmp: - cfg.concat_file.unlink(missing_ok=True) - - return out_files, out_meta - - -def preprocess(cfg, logger): - preprocessing = cfg.proc_chains[0] - chain = preprocessing["processes"] - - logger.info(f" Generating condition: {preprocessing['name']}") - - # run preprocessing - apply_func_parallel( - process_item, - zip( - cfg.items_list, - repeat(cfg.tmp_dirs[0]), - repeat(cfg.out_dirs[0]), - repeat(chain), - repeat(logger), - cfg.metadata_path, - ), - None, - "mp" if cfg.multiprocessing else None, - ) - - # update the configuration to use preprocessing outputs as new inputs - cfg.items_list = list_audio( - cfg.out_dirs[0], absolute=False, select_list=getattr(cfg, "input_select", None) - ) - - # Re-ordering items based on concatenation order - if ( - hasattr(cfg, "preprocessing_2") - and cfg.preprocessing_2.get("concatenate_input", False) - and cfg.preprocessing_2.get("concatenation_order", None) is not None - ): - cfg.items_list = reorder_items_list(cfg.items_list, cfg.concatenation_order) - - if cfg.metadata_path[0] is not None: - for item_idx in range(len(cfg.metadata_path)): - for obj_idx in range(len(cfg.metadata_path[item_idx])): - if cfg.metadata_path[item_idx][obj_idx]: - cfg.metadata_path[item_idx][obj_idx] = cfg.out_dirs[0] / Path( - f"{cfg.items_list[item_idx].stem}.wav.{obj_idx}.csv" - ) - # remove already applied processing stage - cfg.proc_chains = cfg.proc_chains[1:] - cfg.tmp_dirs = cfg.tmp_dirs[1:] - cfg.out_dirs = cfg.out_dirs[1:] - - -def preprocess_2(cfg, logger): - preprocessing_2 = cfg.proc_chains[0] - chain = preprocessing_2["processes"] - - logger.info(f" Generating condition: {preprocessing_2['name']}") - - # concatenate items if required - if chain[0].concatenate_input: - concat_setup(cfg, chain, logger) - - # run preprocessing 2 - apply_func_parallel( - process_item, - zip( - cfg.items_list, - repeat(cfg.tmp_dirs[0]), - repeat(cfg.out_dirs[0]), - repeat(chain), - repeat(logger), - cfg.metadata_path, - ), - None, - "mp" if cfg.multiprocessing else None, - ) - - # update the configuration to use preprocessing 2 outputs as new inputs - cfg.items_list = list_audio( - cfg.out_dirs[0], absolute=False, select_list=getattr(cfg, "input_select", None) - ) - - # Re-ordering items based on concatenation order - if ( - hasattr(cfg, "preprocessing_2") - and 
cfg.preprocessing_2.get("concatenate_input", False) - and cfg.preprocessing_2.get("concatenation_order", None) is not None - ): - cfg.items_list = reorder_items_list(cfg.items_list, cfg.concatenation_order) - - if cfg.metadata_path[0] is not None: - for item_idx in range(len(cfg.metadata_path)): - for obj_idx in range(len(cfg.metadata_path[item_idx])): - if cfg.metadata_path[item_idx][obj_idx]: - cfg.metadata_path[item_idx][obj_idx] = cfg.out_dirs[0] / Path( - f"{cfg.items_list[item_idx].stem}.wav.{obj_idx}.csv" - ) - # remove already applied processing stage - cfg.proc_chains = cfg.proc_chains[1:] - cfg.tmp_dirs = cfg.tmp_dirs[1:] - cfg.out_dirs = cfg.out_dirs[1:] - - return - - -def reverse_process_2(cfg, logger): - # remove preamble - if cfg.pre2.preamble: - remove_preamble(cfg) - - # reverse concatenation - if cfg.pre2.concatenate_input: - # write out the splits, optionally remove file - out_paths_splits, out_meta_splits = concat_teardown(cfg, logger) - else: - # if no concatenation read files from folder - out_paths_splits = [] - for out_dir in cfg.out_dirs: - list_audio_dir = list_audio(out_dir, absolute=True) - out_paths_splits.append(list_audio_dir) - if cfg.postprocessing["fmt"].startswith("ISM"): - out_meta_splits = [] - for i, condition in enumerate(out_paths_splits): - meta_condition = metadata_search( - cfg.out_dirs[i], - condition, - num_objects=int(cfg.postprocessing["fmt"][-1]), - ) - out_meta_splits.append(meta_condition) - else: - out_meta_splits = None - - # scale individual files - if cfg.postprocessing.get("loudness", False): - scale_files( - out_paths_splits, - cfg.postprocessing["fmt"], - cfg.postprocessing["loudness"], - cfg.postprocessing["fs"], - out_meta_splits, - ) - return - - -def process_item( - in_file: Union[Path, str], - tmp_dir: Union[Path, str], - out_dir: Union[Path, str], - chain: Iterable, - logger: logging.Logger, - in_meta, -) -> None: - tmp_file = tmp_dir.joinpath(in_file.name) - tmp_file_meta = [] - if in_meta: - for im in in_meta: - tmp_file_meta.append(tmp_dir.joinpath(Path(im).name)) - - # assemble a list of files to be used during the processing chain - out_dir_wav = False - processing_paths = [in_file] - processing_paths_meta = [in_meta] - for p in chain: - if Path(in_file.name).suffix == ".txt" and p.out_fmt is not None: - processing_paths.append(tmp_file.with_suffix(f".{p.name}.wav")) - out_dir_wav = True - else: - processing_paths.append(tmp_file.with_suffix(f".{p.name}{tmp_file.suffix}")) - try: - out_format = p.out_fmt - except AttributeError: - # EVS has no attribute out_fmt - out_format = p.in_fmt - try: - bool_ism = out_format.startswith("ISM") - except Exception: - bool_ism = out_format.name.startswith("ISM") - - if bool_ism: - list_meta_step = [] - for idx, tfm in enumerate(tmp_file_meta): - list_meta_step.append( - tfm.parent - / f"{in_file.stem.split('.')[0]}.{p.name}.wav.{idx}.csv" - ) - processing_paths_meta.append(list_meta_step) - else: - processing_paths_meta.append(None) - # TODO: support txt file writing for META pass-through - - if out_dir_wav: - out_file = out_dir.joinpath(in_file.name).with_suffix(".wav") - else: - out_file = out_dir.joinpath(in_file.name) - - out_meta = [] - if in_meta: - for im in range(len(in_meta)): - out_meta.append(out_dir.joinpath(f"{Path(out_file).stem}.wav.{im}.csv")) - - # execute each process sequentially, feed output into input of next process - for p, (input, output), input_meta in zip( - chain, pairwise(processing_paths), processing_paths_meta[:-1] - ): - # setup logging for the output - 
item_logger = logger.getChild(output.stem) - fh = logging.FileHandler(output.with_suffix(".log"), mode="w") - fh.setLevel(logging.DEBUG) - fh.setFormatter(logging.Formatter(LOGGER_FORMAT, datefmt=LOGGER_DATEFMT)) - item_logger.addHandler(fh) - - p.process(input, output, input_meta, item_logger) - - # copy output and metadata from final process to output file - copyfile(processing_paths[-1], out_file) - if processing_paths_meta[-1]: - for idx, ppm in enumerate(processing_paths_meta[-1]): - copyfile(ppm, out_meta[idx]) - - -def remove_preamble(cfg): - # get number of channels from output format - num_channels = audio.fromtype(cfg.postprocessing["fmt"]).num_channels - for odir in cfg.out_dirs: - for item in cfg.items_list: - path_input = odir / item.name - - # remove preamble for ISM metadata - if cfg.postprocessing["fmt"].startswith("ISM"): - # search for metadata - meta_item = metadata_search( - odir, [Path(item.name)], num_objects=num_channels - ) - metadata_array = [] - for meta_i in meta_item: - metadata_array.append(np.genfromtxt(meta_i, delimiter=",")) - - # remove preamble - metadata_array = add_remove_preamble( - metadata_array, cfg.pre2.preamble, add=False - ) - - # write csv files - write_ISM_metadata_in_file( - metadata_array, [path_input], automatic_naming=True - ) - - # read file - x, fs = read( - path_input, nchannels=num_channels, fs=cfg.postprocessing["fs"] - ) - - # remove preamble - x = trim(x, fs, (cfg.pre2.preamble, 0)) - - # write file - write(path_input, x, fs) - - return diff --git a/item_generation_scripts/utils.py b/item_generation_scripts/utils.py deleted file mode 100644 index 1e21b0db..00000000 --- a/item_generation_scripts/utils.py +++ /dev/null @@ -1,297 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. 
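# Editor's note: a minimal sketch (hypothetical file names) of how
# process_item() above chains the stages: pairwise() turns the path list into
# consecutive (input, output) pairs, so each stage reads its predecessor's
# output:
#
#     paths = [in_file, "item.pre_2.wav", "item.cod.wav"]
#     for p, (src, dst) in zip(chain, pairwise(paths)):
#         p.process(src, dst, in_meta, item_logger)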
-# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -import logging -import shutil -import subprocess as sp -import sys -from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor -from itertools import repeat, tee -from os import devnull -from pathlib import Path -from shutil import which -from typing import Callable, Iterable, Optional, Union - -import yaml - -ALLOWED_INPUT_EXT = (".wav", ".pcm", ".txt", ".raw") -BIN_DIR = Path(__file__).parent.joinpath("bin") - - -""" -Directory/path handling -""" - - -def create_dir(p: str) -> None: - p = Path(p) - p.mkdir(exist_ok=True, parents=True) - - -def delete_dir(p: str) -> None: - p = Path(p) - if p.exists() and p.is_dir(): - shutil.rmtree(p) - - -class DirManager: - """ - Context manager that creates directories if not already present and - automatically cleans up (i.e. deletes) all specified paths - """ - - def __init__( - self, create_paths: Union[str, list], delete_paths: Union[str, list] = list() - ): - self.create_paths = ( - create_paths if isinstance(create_paths, list) else [create_paths] - ) - self.delete_paths = ( - delete_paths if isinstance(delete_paths, list) else [delete_paths] - ) - - def __enter__(self): - for path in self.create_paths: - create_dir(path) - - def __exit__(self, exc_type, exc_value, exc_traceback): - for path in self.delete_paths: - if path in self.create_paths: - delete_dir(path) - else: - print( - f"Tmp dir '{path}' was not present in creation paths - skipping deletion." - ) - - -def list_audio(path: str, absolute: bool = False, select_list: list = None) -> list: - """ - Return list with all files with ALLOWED_INPUT_EXT found under the given path. - - If path is a directory, all files in it are included, if it is a file, just the file - will be in the list. If a select list is provided, files are filtered accordingly.
- """ - path = Path(path) - audio_list = [] - - if path.exists(): - if path.is_dir(): - if absolute: - [audio_list.extend(list(path.glob(ext))) for ext in ALLOWED_INPUT_EXT] - audio_list = [ - path.joinpath(f) - for f in path.iterdir() - if f.suffix in ALLOWED_INPUT_EXT - ] - else: - audio_list = [ - f for f in path.iterdir() if f.suffix in ALLOWED_INPUT_EXT - ] - else: - if not absolute: - path = path.name - ext = path.suffix - if ext in ALLOWED_INPUT_EXT: - audio_list.append(path) - - # filter according to select list - if select_list: - select_set = set([Path(i).stem for i in select_list]) - audio_list = [ - f for f in audio_list if any([pattern in f.stem for pattern in select_set]) - ] - - return audio_list - - -def get_nickname(p: Path) -> str: - return f"{p.parent.name}/{p.name}" - - -""" -System interaction -""" - - -def find_binary( - binary: str, - raise_error: Optional[bool] = True, - logger: Optional[logging.Logger] = None, - binary_path: Optional[str] = None, -) -> Union[Path, None]: - """Attempt to find and return the path to the given binary""" - # prioritise binaries placed in the directory over $PATH - if binary_path is not None: - bin = which(binary, path=binary_path) - else: - bin = which(binary, path=BIN_DIR) - if not bin: - bin = which(binary) - - if not bin and raise_error: - raise FileNotFoundError( - f"Binary {binary} was neither found in {binary_path.absolute()} nor in {BIN_DIR.absolute()} or in $PATH!" - ) - elif not bin: - if logger: - logger.debug(f"Couldn't find binary {binary}") - return None - else: - if logger: - logger.debug(f"Found binary {bin}") - return Path(bin) - - -def get_devnull(): - return devnull - - -def get_gitsha(): - try: - git_sha = sp.check_output( - ["git", "rev-parse", "HEAD"], stderr=sp.STDOUT, text=True - ).strip() - except sp.CalledProcessError: - git_sha = "git repository not found!" 
- - return git_sha - - -def run(cmd, cwd=None, check=True, logger: Optional[logging.Logger] = None): - if logger: - logger.debug(f"Running command {' '.join([str(c) for c in cmd])}; cwd = {cwd}") - - try: - result = sp.run(cmd, check=check, capture_output=True, text=True, cwd=cwd) - except sp.CalledProcessError as e: - raise SystemError( - f"Command returned non-zero exit status ({e.returncode}): {' '.join([str(c) for c in e.cmd])}\n{e.stderr}\n{e.stdout}" - ) - - if logger: - logger.debug(result.stderr.strip()) - logger.debug(result.stdout.strip()) - - return result - - -""" -Utility functions -""" - - -def apply_func_parallel( - func: Callable, - args: Iterable, - kwargs: Optional[Iterable] = None, - type: Optional[str] = None, - show_progress: Optional[bool] = True, -) -> list: - """ - Apply a function iteratively to a list of arguments and keyword arguments - Optionally with multiprocessing or multithreading - - Parameters - ---------- - func : Callable - Function to use - args : Iterable - List of positional arguments - kwargs: Optional[Iterable] - List of keyword arguments - type: Optional[str] - Type of parallel processing to use, "mp" for multiprocessing or "mt" for threading, default = None (sequential processing) - show_progress: Optional[bool] - Flag whether to show progress bar - - Returns - ------- - List of function results - """ - - # if no kwargs are specified, repeat the empty dict to avoid issues with zipping and unpacking - if not kwargs: - kwargs = repeat({}) - - args_zip = zip(args, kwargs) - - if type == "mp": - executor = ProcessPoolExecutor - elif type == "mt": - executor = ThreadPoolExecutor - else: - return [ - func(*a, **k) - for a, k in (progressbar(list(args_zip)) if show_progress else args_zip) - ] - - with executor() as e: - results = [e.submit(func, *a, **k) for a, k in args_zip] - return [ - r.result() for r in (progressbar(results) if show_progress else results) - ] - - -def pairwise(iter): - """itertools.pairwise() for python < 3.10""" - a, b = tee(iter) - next(b, None) - return zip(a, b) - - -def progressbar(iter: Iterable, width=80): - """simple unicode progressbar""" - count = len(iter) - - def update(progress): - fill = int(width * progress / count) - print( - f"{int(progress/count*100):3d}%{u'│'}{u'█'*fill}{(u'░'*(width-fill))}{u'│'}{progress}/{count}", - end="\r", - file=sys.stdout, - flush=True, - ) - - update(0) - for i, item in enumerate(iter): - yield item - update(i + 1) - print("\n", flush=True, file=sys.stdout) - - -def get_binary_paths(yaml_file_with_binary_paths): - with open(yaml_file_with_binary_paths, "r") as f: - data = yaml.safe_load(f) - if data is None: - return {} - else: - return {key: Path(value) for key, value in data.items()} diff --git a/ivas_processing_scripts/audiotools/audiofile.py b/ivas_processing_scripts/audiotools/audiofile.py index 954c91f8..d5687a89 100755 --- a/ivas_processing_scripts/audiotools/audiofile.py +++ b/ivas_processing_scripts/audiotools/audiofile.py @@ -110,6 +110,7 @@ def write( filename: Union[str, Path], x: np.ndarray, fs: Optional[int] = 48000, + dtype: Optional[str] = "int16", ) -> None: """ Write audio file (.pcm, .wav or .raw) @@ -122,6 +123,8 @@ def write( Numpy 2D array of dimension: number of channels x number of samples fs: Optional[int] Sampling rate, required for .pcm or .raw input file, default = 48000 (Hz) + dtype: Optional[str] + Data type format required for .pcm or .raw input file, default = 'int16' Returns ------- @@ -141,7 +144,7 @@ def write( x = x.astype(np.int16) wav.write(filename, 
fs, x) elif file_extension == ".pcm" or file_extension == ".raw": - x = x.astype("int16").reshape(-1, 1) + x = x.astype(dtype).reshape(-1, 1) x.tofile(filename) else: raise ValueError("Wrong input format. Use wav, pcm or raw") diff --git a/item_generation_scripts/audiotools/wrappers/reverb.py b/ivas_processing_scripts/audiotools/wrappers/reverb.py similarity index 90% rename from item_generation_scripts/audiotools/wrappers/reverb.py rename to ivas_processing_scripts/audiotools/wrappers/reverb.py index 1c4491bd..46f4ee33 100644 --- a/item_generation_scripts/audiotools/wrappers/reverb.py +++ b/ivas_processing_scripts/audiotools/wrappers/reverb.py @@ -31,18 +31,19 @@ # import os.path -import numpy as np -from scipy.fft import fft from copy import copy from pathlib import Path from tempfile import TemporaryDirectory from typing import Optional, Union -from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES -from item_generation_scripts.utils import find_binary, run -from item_generation_scripts.audiotools.audio import Audio -from item_generation_scripts.audiotools.audiofile import read, write -from item_generation_scripts.audiotools.wrappers.filter import resample_itu +import numpy as np +from scipy.fft import fft + +from ivas_processing_scripts.audiotools.audio import Audio +from ivas_processing_scripts.audiotools.audiofile import read, write +from ivas_processing_scripts.audiotools.wrappers.filter import resample_itu +from ivas_processing_scripts.constants import DEFAULT_CONFIG_BINARIES +from ivas_processing_scripts.utils import find_binary, run def reverb( @@ -62,13 +63,13 @@ def reverb( Impulse response align: float multiplicative factor to apply to the reverberated sound in order to align its energy level with a second file - + Returns ------- output: Audio Convolved audio signal with IR """ - + # find binary if "reverb" in DEFAULT_CONFIG_BINARIES["binary_paths"]: binary = find_binary( @@ -77,10 +78,10 @@ def reverb( ) else: binary = find_binary("reverb") - + with TemporaryDirectory() as tmp_dir: tmp_dir = Path(tmp_dir) - + # resample input audio signal to that of the IR old_fs = None tmp_input = copy(input) @@ -92,12 +93,12 @@ def reverb( # write input audio signal to temporary file in .pcm format tmp_input_file = tmp_dir.joinpath(f"tmp_reverbIn.pcm") write(tmp_input_file, tmp_input.audio, tmp_input.fs) - + # down-scale IR to prevent saturation # max_value = np.max(np.abs(IR.audio)) # if max_value > 1.0: - # IR.audio = IR.audio / max_value - + # IR.audio = IR.audio / max_value + # write IR to temporary file in .pcm format # note: the reverb tool expects 32b float format tmp_IR_file = tmp_dir.joinpath(f"tmp_IR.pcm") @@ -111,7 +112,7 @@ def reverb( # append multiplicative factor, if provided if align: cmd.extend(["-align", str(align)]) - + # append temporary filenames tmp_output_file = tmp_dir.joinpath(f"tmp_reverbOut.pcm") cmd.extend([tmp_input_file, tmp_IR_file, tmp_output_file]) @@ -119,17 +120,18 @@ def reverb( # run the 'reverb' command run(cmd) - # read the reverberated output file + # read the reverberated output file output = copy(tmp_input) output.audio, _ = read(tmp_output_file, nchannels=1, fs=tmp_input.fs) - + # reverse the resampling if old_fs: output.audio = resample_itu(output, old_fs) output.fs = old_fs - + return output + def reverb_stereo( input: Audio, stereo_IR: Audio, align: Optional[float] = None, ) -> Audio: @@ -146,13 +148,13 @@ def reverb_stereo( Impulse response align: float multiplicative factor to apply to the reverberated sound in order to align its energy level with
the second file - + Returns ------- output: Audio Convolved audio signal with stereo IR """ - + # convert to float32 stereo_IR.audio = np.float32(stereo_IR.audio) @@ -160,26 +162,26 @@ def reverb_stereo( IR_left = copy(stereo_IR) IR_left.name = "MONO" IR_left.num_channels = 1 - IR_left.audio = np.reshape(stereo_IR.audio[:,0], (-1, 1)) - + IR_left.audio = np.reshape(stereo_IR.audio[:, 0], (-1, 1)) + IR_right = copy(stereo_IR) IR_right.name = "MONO" IR_right.num_channels = 1 - IR_right.audio = np.reshape(stereo_IR.audio[:,1], (-1, 1)) + IR_right.audio = np.reshape(stereo_IR.audio[:, 1], (-1, 1)) # calculate the scaling (multiplicative) factor such that the maximum gain of the IR filter across all frequencies is 0dB if align is None: H = fft(stereo_IR.audio, axis=0) align = 1.0 / np.max(np.abs(H)) - + # convolve mono input with left and right IR y_left = reverb(input, IR_left, align=align) y_right = reverb(input, IR_right, align=align) - + # combine into stereo output y = copy(input) y.name = "STEREO" y.num_channels = 2 y.audio = np.column_stack([y_left.audio, y_right.audio]) - + return y diff --git a/item_generation_scripts/__init__.py b/ivas_processing_scripts/generation/__init__.py old mode 100644 new mode 100755 similarity index 90% rename from item_generation_scripts/__init__.py rename to ivas_processing_scripts/generation/__init__.py index 8b3d8bae..2c7c9bf3 --- a/item_generation_scripts/__init__.py +++ b/ivas_processing_scripts/generation/__init__.py @@ -35,13 +35,13 @@ import os import yaml -from item_generation_scripts.constants import ( +from ivas_processing_scripts.constants import ( LOGGER_DATEFMT, LOGGER_FORMAT, LOGGER_SUFFIX, ) -from item_generation_scripts.processing import config, process_ism_items, process_stereo_items -from item_generation_scripts.utils import create_dir +from ivas_processing_scripts.generation import config, process_ism_items, process_stereo_items +from ivas_processing_scripts.utils import create_dir def logging_init(args, cfg): @@ -94,7 +94,9 @@ def main(args): fs=cfg.fs, preamble=cfg.preamble, postamble=cfg.postamble, - add_low_level_random_noise=cfg.add_low_level_random_noise, + add_low_level_random_noise=cfg.get("add_low_level_random_noise", False), + # TODO@VM dict.get() can provide a default value if the key is not found + # please check if this is a viable solution - I kept getting "AttributeError: 'TestConfig' object has no attribute 'add_low_level_random_noise'" ) elif cfg.format == "STEREO": # generate STEREO items according to scene description @@ -111,7 +113,7 @@ def main(args): preamble=cfg.preamble, postamble=cfg.postamble, ) - + # copy configuration to output directory with open(cfg.output_path.joinpath(f"{cfg.format}.yml"), "w") as f: yaml.safe_dump(cfg._yaml_dump, f) diff --git a/item_generation_scripts/__main__.py b/ivas_processing_scripts/generation/__main__.py old mode 100644 new mode 100755 similarity index 98% rename from item_generation_scripts/__main__.py rename to ivas_processing_scripts/generation/__main__.py index b49109d3..9ba00fd5 --- a/item_generation_scripts/__main__.py +++ b/ivas_processing_scripts/generation/__main__.py @@ -32,7 +32,7 @@ import argparse -from item_generation_scripts import main +from ivas_processing_scripts.generation import main if __name__ == "__main__": parser = argparse.ArgumentParser( diff --git a/item_generation_scripts/processing/config.py b/ivas_processing_scripts/generation/config.py similarity index 97% rename from item_generation_scripts/processing/config.py rename to 
ivas_processing_scripts/generation/config.py index 3e9aaaa5..ca9dbcc2 100644 --- a/item_generation_scripts/processing/config.py +++ b/ivas_processing_scripts/generation/config.py @@ -35,7 +35,7 @@ from pathlib import Path import yaml -from item_generation_scripts.constants import DEFAULT_CONFIG, REQUIRED_KEYS +from ivas_processing_scripts.generation.constants import DEFAULT_CONFIG, REQUIRED_KEYS def merge_dicts(base: dict, other: dict) -> None: @@ -122,4 +122,4 @@ class TestConfig: # Report missing keys to the user if MISSING_KEYS: - raise KeyError(f"The following key(s) must be specified : {MISSING_KEYS}") + raise KeyError(f"The following key(s) must be specified : {MISSING_KEYS}") \ No newline at end of file diff --git a/item_generation_scripts/constants.py b/ivas_processing_scripts/generation/constants.py similarity index 95% rename from item_generation_scripts/constants.py rename to ivas_processing_scripts/generation/constants.py index 6b0d0681..34001207 100644 --- a/item_generation_scripts/constants.py +++ b/ivas_processing_scripts/generation/constants.py @@ -33,7 +33,7 @@ from datetime import datetime from pathlib import Path -from item_generation_scripts.utils import get_binary_paths +from ivas_processing_scripts.utils import get_binary_paths LOGGER_SUFFIX = ".log" LOGGER_FORMAT = ( @@ -55,7 +55,7 @@ DEFAULT_CONFIG = { DEFAULT_CONFIG_BINARIES = { "binary_paths": get_binary_paths( - Path(__file__).parent.joinpath("binary_paths.yml") + Path(__file__).parent.parent.joinpath("binary_paths.yml") ), } @@ -64,4 +64,4 @@ REQUIRED_KEYS = [ "input_path", "output_path", "scenes", -] +] \ No newline at end of file diff --git a/item_generation_scripts/processing/process_ism_items.py b/ivas_processing_scripts/generation/process_ism_items.py similarity index 86% rename from item_generation_scripts/processing/process_ism_items.py rename to ivas_processing_scripts/generation/process_ism_items.py index fe62f048..810f770b 100644 --- a/item_generation_scripts/processing/process_ism_items.py +++ b/ivas_processing_scripts/generation/process_ism_items.py @@ -33,16 +33,18 @@ import csv import logging import os +from math import floor from pathlib import Path from typing import Optional + import numpy as np -from math import floor -from item_generation_scripts.audiotools import audio, audiofile -from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness +from ivas_processing_scripts.audiotools import audio, audiofile +from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness SEED_RANDOM_NOISE = 0 + # function for converting nd numpy array to strings with 2 decimal digits def csv_formatdata(data): for row in data: @@ -78,34 +80,33 @@ def generate_ism_items( else: y = audio.ChannelBasedAudio("MONO") y_meta = None - + # read the overlap length - if 'overlap' in scene.keys(): + if "overlap" in scene.keys(): source_overlap = float(scene["overlap"]) else: source_overlap = 0.0 - + # repeat for all source files for i in range(N_sources): - # parse parameters from the scene description source_file = np.atleast_1d(scene["source"])[i] source_azi = np.atleast_1d(scene["azimuth"])[i] source_ele = np.atleast_1d(scene["elevation"])[i] - + logger.info( f"Encoding {source_file} at position(s) {source_azi},{source_ele}" ) # read source file x = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs) - + ############### DEBUG ############33 # x.audio = x.audio[:-10] # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / x.fs * 50) - + # trim the source 
signal to align to 20ms boundary # N_trim = int(N_frames * x.fs / 50) # x.audio = x.audio[:N_trim] @@ -180,18 +181,18 @@ def generate_ism_items( # arrange all metadata fields column-wise into a matrix x_meta = np.column_stack((azi, ele, dist, spread, gain)) - + # shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap) if i > 0 and source_overlap != 0.0: # get the length of the first source file - N_delay = len(y.audio[:,0]) - + N_delay = len(y.audio[:, 0]) + # add the shift N_delay += int(source_overlap * x.fs) - + # ensure delay is a multiple of 20ms # N_delay = int(floor(source_shift * 50) / 50 * x.fs) - + # insert all-zero preamble pre = np.zeros((N_delay, x.audio.shape[1])) x.audio = np.concatenate([pre, x.audio]) @@ -199,14 +200,14 @@ # insert neutral position as a pre-amble pre = np.tile( [0.00, 0.00, 1.00, 0.00, 1.00], (N_delay, 1) - ) # !!!! TBD - check if we should insert neutral position or the first position of the metadata + ) # !!!! TBD - check if we should insert neutral position or the first position of the metadata x_meta = np.concatenate([pre, x_meta]) - - # pad with zeros to ensure that the signal length is a multiple of 20ms + + # pad with zeros to ensure that the signal length is a multiple of 20ms N_frame = x.fs / 50 if len(x.audio) % N_frame != 0: N_pad = int(N_frame - len(x.audio) % N_frame) - + # insert all-zero preamble pre = np.zeros((N_pad, x.audio.shape[1])) x.audio = np.concatenate([pre, x.audio]) @@ -214,7 +215,7 @@ # insert neutral position as a pre-amble pre = np.tile( [0.00, 0.00, 1.00, 0.00, 1.00], (N_pad, 1) - ) # !!!!
TBD - check if we should insert neutral position or the first position of the metadata + ) # !!!! TBD - check if we should insert neutral position or the first position of the metadata x_meta = np.concatenate([pre, x_meta]) # add source signal to the array of all source signals @@ -224,14 +225,28 @@ else: # pad with zeros to have the same length of all source signals if x.audio.shape[0] > y.audio.shape[0]: - y.audio = np.vstack((y.audio, np.zeros((x.audio.shape[0] - y.audio.shape[0], y.audio.shape[1])))) + y.audio = np.vstack( + ( + y.audio, + np.zeros( + (x.audio.shape[0] - y.audio.shape[0], y.audio.shape[1]) + ), + ) + ) elif y.audio.shape[0] > x.audio.shape[0]: - x.audio = np.vstack((x.audio, np.zeros((y.audio.shape[0] - x.audio.shape[0], x.audio.shape[1])))) + x.audio = np.vstack( + ( + x.audio, + np.zeros( + (y.audio.shape[0] - x.audio.shape[0], x.audio.shape[1]) + ), + ) + ) y.audio = np.hstack((y.audio, x.audio)) # add metadata to the array of all metadata # make sure x_meta is a 3d array - x_meta = x_meta[np.newaxis, :] + x_meta = x_meta[np.newaxis, :] if y_meta is None: y_meta = x_meta else: @@ -242,25 +257,19 @@ if x_meta.shape[1] > y_meta.shape[1]: N_delta = x_meta.shape[1] - y_meta.shape[1] # reshape to 2d array - y_meta = y_meta.reshape(y_meta.shape[1], -1) + y_meta = y_meta.reshape(y_meta.shape[1], -1) # repeat last row N_delta times and append to the array - y_meta = np.vstack( - (y_meta, np.tile(y_meta[-1, :], (N_delta, 1))) - ) + y_meta = np.vstack((y_meta, np.tile(y_meta[-1, :], (N_delta, 1)))) # reshape back to 3d array - y_meta = y_meta.reshape( - N_srcs, -1, N_meta_features - ) + y_meta = y_meta.reshape(N_srcs, -1, N_meta_features) elif y_meta.shape[1] > x_meta.shape[1]: N_delta = y_meta.shape[1] - x_meta.shape[1] # reshape to 2d array - x_meta = x_meta.reshape(x_meta.shape[1], -1) + x_meta = x_meta.reshape(x_meta.shape[1], -1) # repeat last row N_delta times and append to the array - x_meta = np.vstack( - (x_meta, np.tile(x_meta[-1, :], (N_delta, 1))) - ) + x_meta = np.vstack((x_meta, np.tile(x_meta[-1, :], (N_delta, 1)))) # reshape back to 3d array - x_meta = np.expand_dims(x_meta, axis=0) + x_meta = np.expand_dims(x_meta, axis=0) y_meta = np.concatenate([y_meta, x_meta]) @@ -268,7 +277,7 @@ if preamble != 0.0: # ensure that pre-amble is a multiple of 20ms N_pre = int(floor(preamble * 50) / 50 * y.fs) - + # insert all-zero preamble to all sources pre = np.zeros((N_pre, y.audio.shape[1])) y.audio = np.concatenate([pre, y.audio]) @@ -276,13 +285,13 @@ # insert neutral position as a pre-amble to all sources pre = np.tile( [0.00, 0.00, 1.00, 0.00, 1.00], (y_meta.shape[0], N_pre, 1) - ) # !!!! TBD - check if we should insert neutral position or the first position of the metadata + ) # !!!! TBD - check if we should insert neutral position or the first position of the metadata y_meta = np.concatenate([pre, y_meta], axis=1) - + if postamble != 0.0: # ensure that post-amble is a multiple of 20ms N_post = int(floor(postamble * 50) / 50 * y.fs) - + # append all-zero postamble to all sources post = np.zeros((N_post, y.audio.shape[1])) y.audio = np.concatenate([y.audio, post]) @@ -290,17 +299,17 @@ # append neutral position as a post-amble to all sources post = np.tile( [0.00, 0.00, 1.00, 0.00, 1.00], (y_meta.shape[0], N_post, 1) - ) # !!!!
TBD - check if we should insert neutral position or the last position of the metadata + ) # !!!! TBD - check if we should insert neutral position or the last position of the metadata y_meta = np.concatenate([y_meta, post], axis=1) - + # add random noise if add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 np.random.seed(SEED_RANDOM_NOISE) - noise = np.random.randint( - low=-4, high=5, size=y.audio.shape - ).astype("float") - + noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype( + "float" + ) + # superimpose y.audio += noise @@ -315,7 +324,12 @@ # generate .csv filename (should end with .0.csv, .1.csv, ...) csv_filename = os.path.normpath(f"{output_filename}.{i}.csv") - with open(os.path.join(output_path, csv_filename), 'w', newline='', encoding='utf-8') as f: + with open( + os.path.join(output_path, csv_filename), + "w", + newline="", + encoding="utf-8", + ) as f: # create csv writer writer = csv.writer(f) diff --git a/item_generation_scripts/processing/process_stereo_items.py b/ivas_processing_scripts/generation/process_stereo_items.py similarity index 81% rename from item_generation_scripts/processing/process_stereo_items.py rename to ivas_processing_scripts/generation/process_stereo_items.py index f8dcc43d..aecc1a57 100644 --- a/item_generation_scripts/processing/process_stereo_items.py +++ b/ivas_processing_scripts/generation/process_stereo_items.py @@ -34,16 +34,16 @@ import csv import logging import os -from pathlib import Path -from typing import Optional from copy import copy -import numpy as np from math import floor +from pathlib import Path +from typing import Optional +import numpy as np -from item_generation_scripts.audiotools import audio, audiofile -from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness -from item_generation_scripts.audiotools.wrappers.reverb import reverb_stereo +from ivas_processing_scripts.audiotools import audio, audiofile +from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness +from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_stereo # function for converting nd numpy array to strings with 2 decimal digits @@ -67,60 +67,57 @@ def generate_stereo_items( # get the number of scenes N_scenes = len(scenes) - + for scene_name, scene in scenes.items(): logger.info(f"Processing scene: {scene_name} out of {N_scenes} scenes") # extract the number of audio sources N_sources = len(np.atleast_1d(scene["source"])) - + # read the IR (check if stereo or two mono files were provided) source_IR = np.atleast_1d(scene["IR"]) y = audio.ChannelBasedAudio("STEREO") for i in range(N_sources): - # parse parameters from the scene description source_file = np.atleast_1d(scene["source"])[i] IR_file = np.atleast_1d(scene["IR"])[i] - if 'delay' in scene.keys(): + if "delay" in scene.keys(): source_delay = np.atleast_1d(scene["delay"])[i] else: source_delay = np.array([0]) - - logger.info( - f"Convolving {source_file} with {source_IR}" - ) + + logger.info(f"Convolving {source_file} with {source_IR}") # read source file x = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs) # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / x.fs * 50) - + # trim the source signal to align to 20ms boundary N_trim = int(N_frames * x.fs / 50) x.audio = x.audio[:N_trim] # read the IR file IR = audio.fromfile("STEREO", os.path.join(IR_path, IR_file), fs=IR_fs) - + # delay the source file if source_delay > 0: # ensure delay is a multiple of 20ms N_delay = int(floor(source_delay * 50) / 50 * x.fs) - + # insert all-zero preamble
pre = np.zeros((N_delay, x.audio.shape[1])) x.audio = np.concatenate([pre, x.audio]) - + # convolve with stereo IR x_rev = reverb_stereo(x, IR) - + # adjust the level of the stereo signal _, scale_factor = get_loudness(x_rev, target_level, "STEREO") x_rev.audio *= scale_factor - + # add source signal to the array of source signals y.fs = x.fs if y.audio is None: @@ -128,11 +125,31 @@ def generate_stereo_items( else: # append zeros to have equal length of all source signals if x_rev.audio.shape[0] > y.audio.shape[0]: - y.audio = np.vstack((y.audio, np.zeros((x_rev.audio.shape[0] - y.audio.shape[0], y.audio.shape[1])))) + y.audio = np.vstack( + ( + y.audio, + np.zeros( + ( + x_rev.audio.shape[0] - y.audio.shape[0], + y.audio.shape[1], + ) + ), + ) + ) elif y.audio.shape[0] > x_rev.audio.shape[0]: - x_rev.audio = np.vstack((x_rev.audio, np.zeros((y.audio.shape[0] - x_rev.audio.shape[0], x_rev.audio.shape[1])))) - - # superimpose + x_rev.audio = np.vstack( + ( + x_rev.audio, + np.zeros( + ( + y.audio.shape[0] - x_rev.audio.shape[0], + x_rev.audio.shape[1], + ) + ), + ) + ) + + # superimpose y.audio += x_rev.audio # write the reverberated audio into output file @@ -141,4 +158,4 @@ def generate_stereo_items( os.path.join(output_path, output_filename), y.audio, y.fs ) # !!!! TBD: replace all os.path.xxx operations with the Path object - return \ No newline at end of file + return -- GitLab From 3616e6dfb65f49f9d59d946ba3b91f6633d1889b Mon Sep 17 00:00:00 2001 From: Archit Tamarapu Date: Thu, 11 May 2023 16:00:37 +0200 Subject: [PATCH 14/27] [fix] get() -> getattr() --- ivas_processing_scripts/generation/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ivas_processing_scripts/generation/__init__.py b/ivas_processing_scripts/generation/__init__.py index 2c7c9bf3..094bfe35 100755 --- a/ivas_processing_scripts/generation/__init__.py +++ b/ivas_processing_scripts/generation/__init__.py @@ -94,7 +94,7 @@ def main(args): fs=cfg.fs, preamble=cfg.preamble, postamble=cfg.postamble, - add_low_level_random_noise=cfg.get("add_low_level_random_noise", False), + add_low_level_random_noise=getattr(cfg, "add_low_level_random_noise", False), # TODO@VM dict.get() can provide a default value if the key is not found # please check if this is a viable solution - I kept getting "AttributeError: 'TestConfig' object has no attribute 'add_low_level_random_noise'" ) -- GitLab From f41efcb89708f821a70ea28a292cb6e173ca8719 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Thu, 11 May 2023 16:11:44 +0200 Subject: [PATCH 15/27] support for +- overlap in STEREO items, expect trimmed sentences, support for low-level random noise addition --- item_generation_scripts/__init__.py | 1 + .../config/STEREO_CONFIG.yml | 129 +++++++++--------- .../processing/process_ism_items.py | 5 +- .../processing/process_stereo_items.py | 85 +++++++++--- 4 files changed, 135 insertions(+), 85 deletions(-) diff --git a/item_generation_scripts/__init__.py b/item_generation_scripts/__init__.py index 8b3d8bae..93516464 100644 --- a/item_generation_scripts/__init__.py +++ b/item_generation_scripts/__init__.py @@ -110,6 +110,7 @@ def main(args): IR_fs=cfg.IR_fs, preamble=cfg.preamble, postamble=cfg.postamble, + add_low_level_random_noise=cfg.add_low_level_random_noise, ) # copy configuration to output directory diff --git a/item_generation_scripts/config/STEREO_CONFIG.yml b/item_generation_scripts/config/STEREO_CONFIG.yml index 0933b1da..cb14747d 100644 --- a/item_generation_scripts/config/STEREO_CONFIG.yml +++ 
b/item_generation_scripts/config/STEREO_CONFIG.yml @@ -35,6 +35,13 @@ output_path: "./items_STEREO" ### Target loudness in LKFS; default = null (no loudness normalization applied) loudness: -26 +### Pre-amble and Post-amble length in seconds (default = 0.0) +preamble: 0.5 +postamble: 0.5 + +### Flag for adding low-level random background noise (amplitude +-4) instead of silence; default = false (silence) +add_low_level_random_noise: true + ################################################ ### Scene description @@ -43,7 +50,7 @@ loudness: -26 ### Each scene must start with the sceneN tag ### Specify the mono source filename (the program will search for it in the input_path folder) ### Specify the stereo IR source filename (the program will search for it in the input_path_IR folder) -### Specify the delay in seconds for each input source +### Specify the overlap length in seconds for each input source (negative value creates a gap) ### Note 1: use [val1, val2, ...] for multiple sources in a scene ### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames @@ -51,252 +58,252 @@ scenes: a1: name: "G1S1.wav" description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LEABP04.wav", "LEABP11.wav"] - delay: [0, 3] + overlap: 0.5 a2: name: "G6S2.wav" description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LEABP05.wav", "LEABP11.wav"] - delay: [0, 3] + overlap: 0.5 a3: name: "G5S3.wav" description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LEABP06.wav", "LEABP11.wav"] - delay: [0, 3] + overlap: 0.5 a4: name: "G4S4.wav" description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LEABP05.wav", "LEABP10.wav"] - delay: [0, 1.5] + overlap: -0.5 a5: name: "G3S5.wav" description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LEABP05.wav", "LEABP11.wav"] - delay: [0, 1.5] + overlap: -0.5 a6: name: "G2S6.wav" description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LEABP05.wav", "LEABP12.wav"] - delay: [0, 1.5] + overlap: -0.5 b1: name: "G2S1.wav" description: "Two speakers sitting at oval table side by side in a large anechoic conference room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LAABP05.wav", "LAABP06.wav"] - delay: [0, 35] + overlap: -0.5 b2: name: "G1S2.wav" description: "Two speakers sitting at oval table side by side in a large anechoic conference room." 
- source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LAABP07.wav", "LAABP08.wav"] - delay: [0, 3] + overlap: 0.5 b3: name: "G6S3.wav" description: "Two speakers sitting at oval table side by side in a large anechoic conference room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LAABP09.wav", "LAABP10.wav"] - delay: [0, 3] + overlap: 0.5 b4: name: "G5S4.wav" description: "Two speakers sitting at oval table side by side in a large anechoic conference room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LAABP11.wav", "LAABP12.wav"] - delay: [0, 1.5] + overlap: -0.5 b5: name: "G4S5.wav" description: "Two speakers sitting at oval table side by side in a large anechoic conference room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LAABP01.wav", "LAABP02.wav"] - delay: [0, 1.5] + overlap: -0.5 b6: name: "G3S6.wav" description: "Two speakers sitting at oval table side by side in a large anechoic conference room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LAABP03.wav", "LAABP04.wav"] - delay: [0, 1.5] + overlap: -0.5 c1: name: "G3S1.wav" description: "One talker sitting at table in a small anechoic conference room." source: ["test_single.wav"] IR: ["SAMSP01.wav"] - delay: [0] + overlap: -0.5 c2: name: "G2S2.wav" description: "One talker sitting at table in a small anechoic conference room." source: ["test_single.wav"] IR: ["SAMSP04.wav"] - delay: [0] + overlap: -0.5 c3: name: "G1S3.wav" description: "One talker sitting at table in a small anechoic conference room." source: ["test_single.wav"] IR: ["SAMSP07.wav"] - delay: [0] + overlap: -0.5 c4: name: "G6S4.wav" description: "One talker sitting at table in a small echoic conference room." source: ["test_single.wav"] IR: ["SEABP01.wav"] - delay: [0] + overlap: -0.5 c5: name: "G5S5.wav" description: "One talker sitting at table in a small echoic conference room." source: ["test_single.wav"] IR: ["SEABP03.wav"] - delay: [0] + overlap: -0.5 c6: name: "G4S6.wav" description: "One talker sitting at table in a small echoic conference room." source: ["test_single.wav"] IR: ["SEABP06.wav"] - delay: [0] + overlap: -0.5 d1: name: "G4S1.wav" description: "One talker sitting at table in a small anechoic conference room." source: ["test_single.wav"] IR: ["SEBIP01.wav"] - delay: [0] + overlap: -0.5 d2: name: "G3S2.wav" description: "One talker sitting at table in a small anechoic conference room." source: ["test_single.wav"] IR: ["SEBIP04.wav"] - delay: [0] + overlap: -0.5 d3: name: "G3S2.wav" description: "One talker sitting at table in a small anechoic conference room." source: ["test_single.wav"] IR: ["SEBIP07.wav"] - delay: [0] + overlap: -0.5 d4: name: "G1S4.wav" description: "One talker sitting at table in a small echoic conference room." source: ["test_single.wav"] IR: ["SEBIP07.wav"] - delay: [0] + overlap: -0.5 d5: name: "G6S5.wav" description: "One talker sitting at table in a small echoic conference room." source: ["test_single.wav"] IR: ["SEBIP07.wav"] - delay: [0] + overlap: -0.5 d6: name: "G5S6.wav" description: "One talker sitting at table in a small echoic conference room." 
source: ["test_single.wav"] IR: ["SEBIP07.wav"] - delay: [0] + overlap: -0.5 e1: name: "G5S1.wav" description: "Two talkers sitting in a room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEMSP01.wav", "SEMSP03.wav"] - delay: [0, 3] + overlap: 0.5 e2: name: "G4S2.wav" description: "Two talkers sitting in a room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEMSP01.wav", "SEMSP05.wav"] - delay: [0, 3] + overlap: 0.5 e3: name: "G3S3.wav" description: "Two talkers sitting in a room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEMSP01.wav", "SEMSP07.wav"] - delay: [0, 3] + overlap: 0.5 e4: name: "G2S4.wav" description: "Two talkers sitting in a room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEMSP03.wav", "SEMSP04.wav"] - delay: [0, 1.5] + overlap: -0.5 e5: name: "G1S5.wav" description: "Two talkers sitting in a room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEMSP05.wav", "SEMSP07.wav"] - delay: [0, 1.5] + overlap: -0.5 e6: name: "G6S6.wav" description: "Two talkers sitting in a room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEMSP06.wav", "SEMSP02.wav"] - delay: [0, 1.5] + overlap: -0.5 f1: name: "G6S1.wav" description: "Two talkers sitting in a room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEBIP05.wav", "SEBIP01.wav"] - delay: [0, 3] + overlap: 0.5 f2: name: "G5S2.wav" description: "Two talkers sitting in a room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEBIP07.wav", "SEBIP01.wav"] - delay: [0, 3] + overlap: 0.5 f3: name: "G4S3.wav" description: "Two talkers sitting in a room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEBIP04.wav", "SEBIP01.wav"] - delay: [0, 3] + overlap: 0.5 f4: name: "G3S4.wav" description: "Two talkers sitting in a room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEBIP02.wav", "SEBIP06.wav"] - delay: [0, 1.5] + overlap: -0.5 f5: name: "G2S5.wav" description: "Two talkers sitting in a room." - source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEBIP02.wav", "SEBIP06.wav"] - delay: [0, 1.5] + overlap: -0.5 f6: name: "G1S6.wav" description: "Two talkers sitting in a room." 
- source: ["test_single.wav", "test_single.wav"] + source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEBIP03.wav", "SEBIP04.wav"] - delay: [0, 1.5] + overlap: -0.5 \ No newline at end of file diff --git a/item_generation_scripts/processing/process_ism_items.py b/item_generation_scripts/processing/process_ism_items.py index fe62f048..b03468ec 100644 --- a/item_generation_scripts/processing/process_ism_items.py +++ b/item_generation_scripts/processing/process_ism_items.py @@ -99,10 +99,7 @@ def generate_ism_items( # read source file x = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs) - - ############### DEBUG ############33 - # x.audio = x.audio[:-10] - + # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / x.fs * 50) diff --git a/item_generation_scripts/processing/process_stereo_items.py b/item_generation_scripts/processing/process_stereo_items.py index f8dcc43d..a6ed6c8a 100644 --- a/item_generation_scripts/processing/process_stereo_items.py +++ b/item_generation_scripts/processing/process_stereo_items.py @@ -40,11 +40,12 @@ from copy import copy import numpy as np from math import floor - from item_generation_scripts.audiotools import audio, audiofile from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness from item_generation_scripts.audiotools.wrappers.reverb import reverb_stereo +SEED_RANDOM_NOISE = 0 + # function for converting nd numpy array to strings with 2 decimal digits def csv_formatdata(data): @@ -62,6 +63,9 @@ def generate_stereo_items( logger: logging.Logger, fs: Optional[int] = 48000, IR_fs: Optional[int] = 48000, + preamble: Optional[float] = 0.0, + postamble: Optional[float] = 0.0, + add_low_level_random_noise: Optional[bool] = False, ): """Generate STEREO items from mono items based on scene description""" @@ -76,6 +80,12 @@ def generate_stereo_items( # read the IR (check if stereo or two mono files were provided) source_IR = np.atleast_1d(scene["IR"]) + + # read the overlap length + if 'overlap' in scene.keys(): + source_overlap = float(scene["overlap"]) + else: + source_overlap = 0.0 y = audio.ChannelBasedAudio("STEREO") for i in range(N_sources): @@ -83,10 +93,6 @@ def generate_stereo_items( # parse parameters from the scene description source_file = np.atleast_1d(scene["source"])[i] IR_file = np.atleast_1d(scene["IR"])[i] - if 'delay' in scene.keys(): - source_delay = np.atleast_1d(scene["delay"])[i] - else: - source_delay = np.array([0]) logger.info( f"Convolving {source_file} with {source_IR}" @@ -98,22 +104,9 @@ def generate_stereo_items( # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / x.fs * 50) - # trim the source signal to align to 20ms boundary - N_trim = int(N_frames * x.fs / 50) - x.audio = x.audio[:N_trim] - # read the IR file IR = audio.fromfile("STEREO", os.path.join(IR_path, IR_file), fs=IR_fs) - - # delay the source file - if source_delay > 0: - # ensure delay is a multiple of 20ms - N_delay = int(floor(source_delay * 50) / 50 * x.fs) - - # insert all-zero preamble - pre = np.zeros((N_delay, x.audio.shape[1])) - x.audio = np.concatenate([pre, x.audio]) - + # convolve with stereo IR x_rev = reverb_stereo(x, IR) @@ -121,12 +114,36 @@ def generate_stereo_items( _, scale_factor = get_loudness(x_rev, target_level, "STEREO") x_rev.audio *= scale_factor + # shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap) + if i > 0 and source_overlap != 0.0: + # get the length of the first source file + 
N_delay = len(y.audio[:,0]) + + # add the shift + N_delay += int(source_overlap * x.fs) + + # ensure delay is a multiple of 20ms + # N_delay = int(floor(source_shift * 50) / 50 * x.fs) + + # insert all-zero preamble + pre = np.zeros((N_delay, x.audio.shape[1])) + x.audio = np.concatenate([pre, x.audio]) + + # pad with zeros to ensure that the signal length is a multiple of 20ms + N_frame = x.fs / 50 + if len(x.audio) % N_frame != 0: + N_pad = int(N_frame - len(x.audio) % N_frame) + + # insert all-zero preamble + pre = np.zeros((N_pad, x.audio.shape[1])) + x.audio = np.concatenate([pre, x.audio]) + # add source signal to the array of source signals y.fs = x.fs if y.audio is None: y.audio = x_rev.audio else: - # append zeros to have equal length of all source signals + # pad with zeros to have equal length of all source signals if x_rev.audio.shape[0] > y.audio.shape[0]: y.audio = np.vstack((y.audio, np.zeros((x_rev.audio.shape[0] - y.audio.shape[0], y.audio.shape[1])))) elif y.audio.shape[0] > x_rev.audio.shape[0]: @@ -135,6 +152,34 @@ def generate_stereo_items( # superimpose y.audio += x_rev.audio + # append pre-amble and post-amble to all sources + if preamble != 0.0: + # ensure that pre-amble is a multiple of 20ms + N_pre = int(floor(preamble * 50) / 50 * y.fs) + + # insert all-zero preamble to all sources + pre = np.zeros((N_pre, y.audio.shape[1])) + y.audio = np.concatenate([pre, y.audio]) + + if postamble != 0.0: + # ensure that post-amble is a multiple of 20ms + N_post = int(floor(postamble * 50) / 50 * y.fs) + + # append all-zero postamble to all sources + post = np.zeros((N_post, y.audio.shape[1])) + y.audio = np.concatenate([y.audio, post]) + + # add random noise + if add_low_level_random_noise: + # create uniformly distributed noise between -4 and 4 + np.random.seed(SEED_RANDOM_NOISE) + noise = np.random.randint( + low=-4, high=5, size=y.audio.shape + ).astype("float") + + # superimpose + y.audio += noise + # write the reverberated audio into output file output_filename = scene["name"] audiofile.write( -- GitLab From 7c0bac405bfe5a40f0aa29ce91663ae441976c67 Mon Sep 17 00:00:00 2001 From: Archit Tamarapu Date: Fri, 12 May 2023 10:20:09 +0200 Subject: [PATCH 16/27] [fix] audiofile.py: write out specified dtype for .wav output too (function still clips to int16 range!)
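
A condensed sketch of the resulting write() behaviour (illustrative only; it is abridged from the two hunks below and omits the format check that raises ValueError for unsupported extensions):

    import numpy as np
    from scipy.io import wavfile as wav

    def write_sketch(filename, x, fs=48000, dtype="int16"):
        # samples are clipped to the int16 range up front,
        # regardless of the dtype requested for the output file
        x = np.clip(x, np.iinfo(np.int16).min, np.iinfo(np.int16).max)
        if str(filename).endswith(".wav"):
            # the requested dtype is now honoured for .wav output too
            wav.write(filename, fs, x.astype(dtype))
        else:
            # the ".pcm"/".raw" branch already honoured the requested dtype
            x.astype(dtype).reshape(-1, 1).tofile(filename)

This is also why reverb.py can pass dtype=np.float32 for the impulse response file: IR samples within the int16 limits pass through the clipping stage unchanged and are written as the 32-bit floats that the reverb tool expects.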
--- ivas_processing_scripts/audiotools/audiofile.py | 2 +- ivas_processing_scripts/audiotools/wrappers/reverb.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ivas_processing_scripts/audiotools/audiofile.py b/ivas_processing_scripts/audiotools/audiofile.py index d5687a89..d6f39f65 100755 --- a/ivas_processing_scripts/audiotools/audiofile.py +++ b/ivas_processing_scripts/audiotools/audiofile.py @@ -141,7 +141,7 @@ def write( x = np.clip(x, np.iinfo(np.int16).min, np.iinfo(np.int16).max) if file_extension == ".wav": - x = x.astype(np.int16) + x = x.astype(dtype) wav.write(filename, fs, x) elif file_extension == ".pcm" or file_extension == ".raw": x = x.astype(dtype).reshape(-1, 1) diff --git a/ivas_processing_scripts/audiotools/wrappers/reverb.py b/ivas_processing_scripts/audiotools/wrappers/reverb.py index 46f4ee33..4f4de5dd 100644 --- a/ivas_processing_scripts/audiotools/wrappers/reverb.py +++ b/ivas_processing_scripts/audiotools/wrappers/reverb.py @@ -102,7 +102,7 @@ def reverb( # write IR to temporary file in .pcm format # note: the reverb tool expects 32b float format tmp_IR_file = tmp_dir.joinpath(f"tmp_IR.pcm") - write(tmp_IR_file, IR.audio.astype("float32"), IR.fs, dtype="float32") + write(tmp_IR_file, IR.audio.astype(np.float32), IR.fs, dtype=np.float32) # set up the 'reverb' command line cmd = [ -- GitLab From 18b3e256c665ca8bf85aa9cc894f67f22fc26e58 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Fri, 12 May 2023 12:43:24 +0200 Subject: [PATCH 17/27] stereo IR files of the ITU-T reverb package in int16 32kHz WAVE format --- .gitignore | 1 - .../generation/IR/LAABP01.wav | 3 + .../generation/IR/LAABP02.wav | 3 + .../generation/IR/LAABP03.wav | 3 + .../generation/IR/LAABP04.wav | 3 + .../generation/IR/LAABP05.wav | 3 + .../generation/IR/LAABP06.wav | 3 + .../generation/IR/LAABP07.wav | 3 + .../generation/IR/LAABP08.wav | 3 + .../generation/IR/LAABP09.wav | 3 + .../generation/IR/LAABP10.wav | 3 + .../generation/IR/LAABP11.wav | 3 + .../generation/IR/LAABP12.wav | 3 + .../generation/IR/LEABP01.wav | 3 + .../generation/IR/LEABP02.wav | 3 + .../generation/IR/LEABP03.wav | 3 + .../generation/IR/LEABP04.wav | 3 + .../generation/IR/LEABP05.wav | 3 + .../generation/IR/LEABP06.wav | 3 + .../generation/IR/LEABP07.wav | 3 + .../generation/IR/LEABP08.wav | 3 + .../generation/IR/LEABP09.wav | 3 + .../generation/IR/LEABP10.wav | 3 + .../generation/IR/LEABP11.wav | 3 + .../generation/IR/LEABP12.wav | 3 + .../generation/IR/README.TXT | 56 +++++++++++++++++++ .../generation/IR/SAABP01.wav | 3 + .../generation/IR/SAABP02.wav | 3 + .../generation/IR/SAABP03.wav | 3 + .../generation/IR/SAABP04.wav | 3 + .../generation/IR/SAABP05.wav | 3 + .../generation/IR/SAABP06.wav | 3 + .../generation/IR/SAABP07.wav | 3 + .../generation/IR/SAMSP01.wav | 3 + .../generation/IR/SAMSP02.wav | 3 + .../generation/IR/SAMSP03.wav | 3 + .../generation/IR/SAMSP04.wav | 3 + .../generation/IR/SAMSP05.wav | 3 + .../generation/IR/SAMSP06.wav | 3 + .../generation/IR/SAMSP07.wav | 3 + .../generation/IR/SEABP01.wav | 3 + .../generation/IR/SEABP02.wav | 3 + .../generation/IR/SEABP03.wav | 3 + .../generation/IR/SEABP04.wav | 3 + .../generation/IR/SEABP05.wav | 3 + .../generation/IR/SEABP06.wav | 3 + .../generation/IR/SEABP07.wav | 3 + .../generation/IR/SEBIP01.wav | 3 + .../generation/IR/SEBIP02.wav | 3 + .../generation/IR/SEBIP03.wav | 3 + .../generation/IR/SEBIP04.wav | 3 + .../generation/IR/SEBIP05.wav | 3 + .../generation/IR/SEBIP06.wav | 3 + .../generation/IR/SEBIP07.wav | 3 + 
.../generation/IR/SEMSP01.wav | 3 + .../generation/IR/SEMSP02.wav | 3 + .../generation/IR/SEMSP03.wav | 3 + .../generation/IR/SEMSP04.wav | 3 + .../generation/IR/SEMSP05.wav | 3 + .../generation/IR/SEMSP06.wav | 3 + .../generation/IR/SEMSP07.wav | 3 + 61 files changed, 233 insertions(+), 1 deletion(-) create mode 100644 ivas_processing_scripts/generation/IR/LAABP01.wav create mode 100644 ivas_processing_scripts/generation/IR/LAABP02.wav create mode 100644 ivas_processing_scripts/generation/IR/LAABP03.wav create mode 100644 ivas_processing_scripts/generation/IR/LAABP04.wav create mode 100644 ivas_processing_scripts/generation/IR/LAABP05.wav create mode 100644 ivas_processing_scripts/generation/IR/LAABP06.wav create mode 100644 ivas_processing_scripts/generation/IR/LAABP07.wav create mode 100644 ivas_processing_scripts/generation/IR/LAABP08.wav create mode 100644 ivas_processing_scripts/generation/IR/LAABP09.wav create mode 100644 ivas_processing_scripts/generation/IR/LAABP10.wav create mode 100644 ivas_processing_scripts/generation/IR/LAABP11.wav create mode 100644 ivas_processing_scripts/generation/IR/LAABP12.wav create mode 100644 ivas_processing_scripts/generation/IR/LEABP01.wav create mode 100644 ivas_processing_scripts/generation/IR/LEABP02.wav create mode 100644 ivas_processing_scripts/generation/IR/LEABP03.wav create mode 100644 ivas_processing_scripts/generation/IR/LEABP04.wav create mode 100644 ivas_processing_scripts/generation/IR/LEABP05.wav create mode 100644 ivas_processing_scripts/generation/IR/LEABP06.wav create mode 100644 ivas_processing_scripts/generation/IR/LEABP07.wav create mode 100644 ivas_processing_scripts/generation/IR/LEABP08.wav create mode 100644 ivas_processing_scripts/generation/IR/LEABP09.wav create mode 100644 ivas_processing_scripts/generation/IR/LEABP10.wav create mode 100644 ivas_processing_scripts/generation/IR/LEABP11.wav create mode 100644 ivas_processing_scripts/generation/IR/LEABP12.wav create mode 100644 ivas_processing_scripts/generation/IR/README.TXT create mode 100644 ivas_processing_scripts/generation/IR/SAABP01.wav create mode 100644 ivas_processing_scripts/generation/IR/SAABP02.wav create mode 100644 ivas_processing_scripts/generation/IR/SAABP03.wav create mode 100644 ivas_processing_scripts/generation/IR/SAABP04.wav create mode 100644 ivas_processing_scripts/generation/IR/SAABP05.wav create mode 100644 ivas_processing_scripts/generation/IR/SAABP06.wav create mode 100644 ivas_processing_scripts/generation/IR/SAABP07.wav create mode 100644 ivas_processing_scripts/generation/IR/SAMSP01.wav create mode 100644 ivas_processing_scripts/generation/IR/SAMSP02.wav create mode 100644 ivas_processing_scripts/generation/IR/SAMSP03.wav create mode 100644 ivas_processing_scripts/generation/IR/SAMSP04.wav create mode 100644 ivas_processing_scripts/generation/IR/SAMSP05.wav create mode 100644 ivas_processing_scripts/generation/IR/SAMSP06.wav create mode 100644 ivas_processing_scripts/generation/IR/SAMSP07.wav create mode 100644 ivas_processing_scripts/generation/IR/SEABP01.wav create mode 100644 ivas_processing_scripts/generation/IR/SEABP02.wav create mode 100644 ivas_processing_scripts/generation/IR/SEABP03.wav create mode 100644 ivas_processing_scripts/generation/IR/SEABP04.wav create mode 100644 ivas_processing_scripts/generation/IR/SEABP05.wav create mode 100644 ivas_processing_scripts/generation/IR/SEABP06.wav create mode 100644 ivas_processing_scripts/generation/IR/SEABP07.wav create mode 100644 ivas_processing_scripts/generation/IR/SEBIP01.wav create 
mode 100644 ivas_processing_scripts/generation/IR/SEBIP02.wav create mode 100644 ivas_processing_scripts/generation/IR/SEBIP03.wav create mode 100644 ivas_processing_scripts/generation/IR/SEBIP04.wav create mode 100644 ivas_processing_scripts/generation/IR/SEBIP05.wav create mode 100644 ivas_processing_scripts/generation/IR/SEBIP06.wav create mode 100644 ivas_processing_scripts/generation/IR/SEBIP07.wav create mode 100644 ivas_processing_scripts/generation/IR/SEMSP01.wav create mode 100644 ivas_processing_scripts/generation/IR/SEMSP02.wav create mode 100644 ivas_processing_scripts/generation/IR/SEMSP03.wav create mode 100644 ivas_processing_scripts/generation/IR/SEMSP04.wav create mode 100644 ivas_processing_scripts/generation/IR/SEMSP05.wav create mode 100644 ivas_processing_scripts/generation/IR/SEMSP06.wav create mode 100644 ivas_processing_scripts/generation/IR/SEMSP07.wav diff --git a/.gitignore b/.gitignore index 7855f81e..77abd26a 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,6 @@ venv/ .vscode/ .idea/ .DS_Store -*.wav !tests/data/**/*.wav *.pcm *.bs diff --git a/ivas_processing_scripts/generation/IR/LAABP01.wav b/ivas_processing_scripts/generation/IR/LAABP01.wav new file mode 100644 index 00000000..aeaa9eeb --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LAABP01.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a4e959d347d3f99468dbe75bce9853eb9d66af6cb22cf3ea9ad2dc4c9e84a2a +size 36804 diff --git a/ivas_processing_scripts/generation/IR/LAABP02.wav b/ivas_processing_scripts/generation/IR/LAABP02.wav new file mode 100644 index 00000000..41586c2f --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LAABP02.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2658ddec94aa86e2fa0ed365686daded586a6a46436dff1c6d8dba6d17d0182c +size 36804 diff --git a/ivas_processing_scripts/generation/IR/LAABP03.wav b/ivas_processing_scripts/generation/IR/LAABP03.wav new file mode 100644 index 00000000..c4ec38f9 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LAABP03.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5616c8bcf3959aeee246a96a9f2ce6793d4087bfce3dfd1d97e313e3717b5bd6 +size 36804 diff --git a/ivas_processing_scripts/generation/IR/LAABP04.wav b/ivas_processing_scripts/generation/IR/LAABP04.wav new file mode 100644 index 00000000..1c50022f --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LAABP04.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f433047f7fdba568183873d11c7f4423550a675b3e0677b6d846137227862bac +size 36804 diff --git a/ivas_processing_scripts/generation/IR/LAABP05.wav b/ivas_processing_scripts/generation/IR/LAABP05.wav new file mode 100644 index 00000000..e3bd1916 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LAABP05.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:791b69ca22d15226e5e2f6c5a39d3d40af04264523f3373d842a070ea4d40862 +size 36804 diff --git a/ivas_processing_scripts/generation/IR/LAABP06.wav b/ivas_processing_scripts/generation/IR/LAABP06.wav new file mode 100644 index 00000000..1c50022f --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LAABP06.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f433047f7fdba568183873d11c7f4423550a675b3e0677b6d846137227862bac +size 36804 diff --git a/ivas_processing_scripts/generation/IR/LAABP07.wav b/ivas_processing_scripts/generation/IR/LAABP07.wav new file mode 100644 index 00000000..c4ec38f9 --- /dev/null +++ 
b/ivas_processing_scripts/generation/IR/LAABP07.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5616c8bcf3959aeee246a96a9f2ce6793d4087bfce3dfd1d97e313e3717b5bd6 +size 36804 diff --git a/ivas_processing_scripts/generation/IR/LAABP08.wav b/ivas_processing_scripts/generation/IR/LAABP08.wav new file mode 100644 index 00000000..41586c2f --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LAABP08.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2658ddec94aa86e2fa0ed365686daded586a6a46436dff1c6d8dba6d17d0182c +size 36804 diff --git a/ivas_processing_scripts/generation/IR/LAABP09.wav b/ivas_processing_scripts/generation/IR/LAABP09.wav new file mode 100644 index 00000000..aeaa9eeb --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LAABP09.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a4e959d347d3f99468dbe75bce9853eb9d66af6cb22cf3ea9ad2dc4c9e84a2a +size 36804 diff --git a/ivas_processing_scripts/generation/IR/LAABP10.wav b/ivas_processing_scripts/generation/IR/LAABP10.wav new file mode 100644 index 00000000..37693eb5 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LAABP10.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9271410ecad011fbcf22fb8f7af5b0f19f02510ef0f198ef6c6d9e33e64d38da +size 36804 diff --git a/ivas_processing_scripts/generation/IR/LAABP11.wav b/ivas_processing_scripts/generation/IR/LAABP11.wav new file mode 100644 index 00000000..482a0e76 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LAABP11.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cda11409aae6b99f6ccb4d20db24b065b7b2bda004dddd7659607215568d90b6 +size 36804 diff --git a/ivas_processing_scripts/generation/IR/LAABP12.wav b/ivas_processing_scripts/generation/IR/LAABP12.wav new file mode 100644 index 00000000..37693eb5 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LAABP12.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9271410ecad011fbcf22fb8f7af5b0f19f02510ef0f198ef6c6d9e33e64d38da +size 36804 diff --git a/ivas_processing_scripts/generation/IR/LEABP01.wav b/ivas_processing_scripts/generation/IR/LEABP01.wav new file mode 100644 index 00000000..424ddfb5 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LEABP01.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d810da26d72e818444c6ee16a3a59a77eabf74df3aaebd2b021696fa7fdd610f +size 82068 diff --git a/ivas_processing_scripts/generation/IR/LEABP02.wav b/ivas_processing_scripts/generation/IR/LEABP02.wav new file mode 100644 index 00000000..784caa2d --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LEABP02.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c21239ff8bbf0e465a175f7ea5125c03f02568a8dbc9b4b63e064955529c489 +size 82068 diff --git a/ivas_processing_scripts/generation/IR/LEABP03.wav b/ivas_processing_scripts/generation/IR/LEABP03.wav new file mode 100644 index 00000000..c81bce1f --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LEABP03.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96e5b25de682dc8e0c1f036bbb0c193cfef574a48621069584d48cdd40f520ed +size 82068 diff --git a/ivas_processing_scripts/generation/IR/LEABP04.wav b/ivas_processing_scripts/generation/IR/LEABP04.wav new file mode 100644 index 00000000..87d97879 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LEABP04.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bfd86b594612a319e30676e4e3c0d177f01ee5626379864610df9796532e7024 +size 82068 diff --git a/ivas_processing_scripts/generation/IR/LEABP05.wav b/ivas_processing_scripts/generation/IR/LEABP05.wav new file mode 100644 index 00000000..5e01d3be --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LEABP05.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e31f9bf16791af9b3e01e75316d5bfe32115a5dec8a4b820d253e78e0b84edb +size 82068 diff --git a/ivas_processing_scripts/generation/IR/LEABP06.wav b/ivas_processing_scripts/generation/IR/LEABP06.wav new file mode 100644 index 00000000..a1027066 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LEABP06.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65be054317c4dfd5cb0f9bef1d9fc90f35df6ae841e223280946e435c7b6b0c7 +size 82068 diff --git a/ivas_processing_scripts/generation/IR/LEABP07.wav b/ivas_processing_scripts/generation/IR/LEABP07.wav new file mode 100644 index 00000000..3bfe1b97 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LEABP07.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78da36e2a0652cc9c7f77279ba1342d0f58b4a879ef4e3038da38580c9bfd07d +size 82068 diff --git a/ivas_processing_scripts/generation/IR/LEABP08.wav b/ivas_processing_scripts/generation/IR/LEABP08.wav new file mode 100644 index 00000000..7ac86eb1 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LEABP08.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa78fae31221631fd31d251ea6ad5f7369bbcc054c84e8b82dca7c8613f3867a +size 82068 diff --git a/ivas_processing_scripts/generation/IR/LEABP09.wav b/ivas_processing_scripts/generation/IR/LEABP09.wav new file mode 100644 index 00000000..010be6fb --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LEABP09.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd7a9ca0ff37a58455414d8e66efb9aa6d8f686af7459751e24f40eb3c2d6415 +size 82068 diff --git a/ivas_processing_scripts/generation/IR/LEABP10.wav b/ivas_processing_scripts/generation/IR/LEABP10.wav new file mode 100644 index 00000000..4fbadb40 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LEABP10.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7149eb3558db62f34e4f476c85a57733e0ca153a297aa183ebeb550878a5ab40 +size 82068 diff --git a/ivas_processing_scripts/generation/IR/LEABP11.wav b/ivas_processing_scripts/generation/IR/LEABP11.wav new file mode 100644 index 00000000..156d4156 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LEABP11.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2665ed857b1e3f095581c591e400e9ef532ff9e130a414bc2cc939c37b829c8a +size 82068 diff --git a/ivas_processing_scripts/generation/IR/LEABP12.wav b/ivas_processing_scripts/generation/IR/LEABP12.wav new file mode 100644 index 00000000..e84b30b8 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/LEABP12.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad942c2d19303a80ccadab2289172c514c83397096e8317476d6e8dd6463f0f4 +size 82068 diff --git a/ivas_processing_scripts/generation/IR/README.TXT b/ivas_processing_scripts/generation/IR/README.TXT new file mode 100644 index 00000000..ba5b2281 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/README.TXT @@ -0,0 +1,56 @@ +---------------------------------------------------------------------------------------------- + This set of stereo impulse responses for superwideband audio signals has been measured 
+ by France Telecom/Orange + Copyright (c) 2008-2023 + + Authors: Claude Marro, David Virette, France Telecom/Orange, France + + WARRANTIES: + This set of stereo impulse responses is made available by Orange in the hope they will be useful, + but without any warranty. + France Telecom/Orange is not liable for any consequence related to the use of the provided data. + ---------------------------------------------------------------------------------------------- + +The naming of stereo impulse responses is defined as +[Room][Reverb][Mic]P[Position].WAV + +where: +Room is S=Small or L=Large, +Reverb is E=Echoic or A=Anechoic, +Mic is AB or MS or BI=binaural, +Position is a two digit position number + + +---------------------------------------------------------------------------------------------------------- +|Scenario | Main Characteristics | Naming of Impulse response pair +| | | (with example positions): +---------------------------------------------------------------------------------------------------------- +|Scenario 1, Large conf. room, 12 positions, | Large, Anechoic, AB | LAABP12.WAV +|AB microphone, no reverb, anechoic. | | +---------------------------------------------------------------------------------------------------------- +|Scenario 1, Large conf. room, 12 positions, | Large, Echoic, AB | LEABP01.WAV +|AB microphone, including reverberation. | | +---------------------------------------------------------------------------------------------------------- +|Scenario 2, small conf room, 7 positions, | Small, Anechoic, AB | SAABP01.WAV +|AB microphone, no reverb, anechoic. | | +---------------------------------------------------------------------------------------------------------- +|Scenario 2, small conf room, 7 positions, | Small, Anechoic, MS | SAMSP05.WAV +|MS microphone, no reverb, anechoic. | | +---------------------------------------------------------------------------------------------------------- +|Scenario 2, small conf room, 7 positions, | Small, Echoic, AB | SEABP02.WAV +|AB microphone, including reverberation. | | +---------------------------------------------------------------------------------------------------------- +|Scenario 2, small conf room, 7 positions, | Small, Echoic, Binaural | SEBIP04.WAV +|Binaural microphone, including reverberation.| | +---------------------------------------------------------------------------------------------------------- +|Scenario 2, small conf room, 7 positions, | Small, Echoic, MS | SEMSP07.WAV +|MS microphone, including reverberation. | | +---------------------------------------------------------------------------------------------------------- + +Stereo impulse responses are stored in WAV format (16-bit integer, 32 kHz). +WARNING : All these impulse responses were measured with a sampling frequency of 32kHz. +They are for use with 32 kHz sampled speech files. 
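+
+The naming scheme above can be decoded mechanically. As an illustration only (this
+helper is not part of the IR package or of the processing scripts):
+
+    import re
+
+    IR_NAME = re.compile(
+        r"(?P<room>[SL])(?P<reverb>[EA])(?P<mic>AB|MS|BI)P(?P<pos>[0-9]{2})\.WAV",
+        re.IGNORECASE,
+    )
+    ROOM = {"S": "small", "L": "large"}
+    REVERB = {"E": "echoic", "A": "anechoic"}
+    MIC = {"AB": "AB", "MS": "MS", "BI": "binaural"}
+
+    def describe_ir(filename):
+        m = IR_NAME.fullmatch(filename)
+        if m is None:
+            raise ValueError(f"not a valid IR name: {filename}")
+        return (f"{ROOM[m['room'].upper()]} room, {REVERB[m['reverb'].upper()]}, "
+                f"{MIC[m['mic'].upper()]} microphone, position {int(m['pos'])}")
+
+    # describe_ir("LEABP04.wav") -> 'large room, echoic, AB microphone, position 4'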
+ +References: +[1] original description, http://ties.itu.int/u/tsg16/sg16/xchange/wp3/0809-Geneva/q10/AC-0809-Q10-22-Ericsson_STL_updates.doc +[2] original IRs, https://www.itu.int/u/tsg16/sg16/xchange/wp3/q23/g729.1_g718_swbst_qualification/impulse_resp/stereo/FT/ \ No newline at end of file diff --git a/ivas_processing_scripts/generation/IR/SAABP01.wav b/ivas_processing_scripts/generation/IR/SAABP01.wav new file mode 100644 index 00000000..180b682a --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SAABP01.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd68dd01200bbfd25bebec4dfc63b8f528a03c88d1307e75d7a6c91eeec8be6e +size 36764 diff --git a/ivas_processing_scripts/generation/IR/SAABP02.wav b/ivas_processing_scripts/generation/IR/SAABP02.wav new file mode 100644 index 00000000..f0acab78 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SAABP02.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f145f6f8eb8324c7f3e18c5af5047641e82952603a787e0b7e069d26d5c4ca6 +size 36764 diff --git a/ivas_processing_scripts/generation/IR/SAABP03.wav b/ivas_processing_scripts/generation/IR/SAABP03.wav new file mode 100644 index 00000000..1efea8d6 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SAABP03.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8493653f497915b35377984c6d79e04aa344ccf44e0d5b8e286fbec492c9c31 +size 36764 diff --git a/ivas_processing_scripts/generation/IR/SAABP04.wav b/ivas_processing_scripts/generation/IR/SAABP04.wav new file mode 100644 index 00000000..ec788896 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SAABP04.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36c04e66154b91979160d18faaf02dc226f6d2ed61f63d19227d777bb3459987 +size 36764 diff --git a/ivas_processing_scripts/generation/IR/SAABP05.wav b/ivas_processing_scripts/generation/IR/SAABP05.wav new file mode 100644 index 00000000..3098f0b4 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SAABP05.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dae758e6b7b3fd8ef3a8d76fa3210f5f412f3286056085e68a1f9ca7a13e9bab +size 36764 diff --git a/ivas_processing_scripts/generation/IR/SAABP06.wav b/ivas_processing_scripts/generation/IR/SAABP06.wav new file mode 100644 index 00000000..a4553381 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SAABP06.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5d52622ceb146c340c8a52689468c355c09bd3f71ef1f2f5dae9fb5d217b27e +size 36764 diff --git a/ivas_processing_scripts/generation/IR/SAABP07.wav b/ivas_processing_scripts/generation/IR/SAABP07.wav new file mode 100644 index 00000000..8e641a98 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SAABP07.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf6c77ccfa239f5a0cb44a071dcb0d0ca92da0bbc858e4cc060af814ab3ffe3e +size 36764 diff --git a/ivas_processing_scripts/generation/IR/SAMSP01.wav b/ivas_processing_scripts/generation/IR/SAMSP01.wav new file mode 100644 index 00000000..7d59592a --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SAMSP01.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba6e8d380d91e5492338ac98df45e444532b92ff84a71f569673610e59cde136 +size 36764 diff --git a/ivas_processing_scripts/generation/IR/SAMSP02.wav b/ivas_processing_scripts/generation/IR/SAMSP02.wav new file mode 100644 index 00000000..b8b62cef --- /dev/null +++ 
b/ivas_processing_scripts/generation/IR/SAMSP02.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd991ef690a9c86fa00064c56ad3df3ef726d9b6232efaf256b33cbc1ad3ac32 +size 36764 diff --git a/ivas_processing_scripts/generation/IR/SAMSP03.wav b/ivas_processing_scripts/generation/IR/SAMSP03.wav new file mode 100644 index 00000000..feab358d --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SAMSP03.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a996729d0c2573d4f219d72db60273b986280fac7ae0f5fe0a35524b83a0d95 +size 36764 diff --git a/ivas_processing_scripts/generation/IR/SAMSP04.wav b/ivas_processing_scripts/generation/IR/SAMSP04.wav new file mode 100644 index 00000000..0f29ec53 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SAMSP04.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dad01524476b6f8a5fc2d4d31f8c1b7589a836d9b98cc4d27201e42481931962 +size 36764 diff --git a/ivas_processing_scripts/generation/IR/SAMSP05.wav b/ivas_processing_scripts/generation/IR/SAMSP05.wav new file mode 100644 index 00000000..71293903 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SAMSP05.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f0b5f91b292924c4e1eb1e2d884059720ab5c3eaae05d22230d786f19de7879 +size 36764 diff --git a/ivas_processing_scripts/generation/IR/SAMSP06.wav b/ivas_processing_scripts/generation/IR/SAMSP06.wav new file mode 100644 index 00000000..0d51fc62 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SAMSP06.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06e0d7f97b4ce56065d143d19a45ad8c757ed21cf0fe3f8ed05cbedbd966084e +size 36764 diff --git a/ivas_processing_scripts/generation/IR/SAMSP07.wav b/ivas_processing_scripts/generation/IR/SAMSP07.wav new file mode 100644 index 00000000..a20ac5f9 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SAMSP07.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb825349bec07813ea7ccb936948783aed31683805a3daae867568445820f8ea +size 36764 diff --git a/ivas_processing_scripts/generation/IR/SEABP01.wav b/ivas_processing_scripts/generation/IR/SEABP01.wav new file mode 100644 index 00000000..6120c6a0 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEABP01.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a934da1fee82c8131c427680304c9102a3289179697318735b87536d2db6261e +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEABP02.wav b/ivas_processing_scripts/generation/IR/SEABP02.wav new file mode 100644 index 00000000..3dc413d8 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEABP02.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21bd1f242bf459bda18ea9e444eedbdf97db20e0956e3600c4e3c03870f1a877 +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEABP03.wav b/ivas_processing_scripts/generation/IR/SEABP03.wav new file mode 100644 index 00000000..27d2af1c --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEABP03.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bd27e370e9fff391ef37a9e45e3f1583cdcef6ce23cef6135368fb6964674f2 +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEABP04.wav b/ivas_processing_scripts/generation/IR/SEABP04.wav new file mode 100644 index 00000000..ed3c9918 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEABP04.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c4399629b729b0ceb8b30f3c994b736557bd8b35a968cb80cba486833b7c54d1 +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEABP05.wav b/ivas_processing_scripts/generation/IR/SEABP05.wav new file mode 100644 index 00000000..2e990d65 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEABP05.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5c7af3d46eea2d738cb1c6e25a351489f9daff2976c365251595cec719b7ebe +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEABP06.wav b/ivas_processing_scripts/generation/IR/SEABP06.wav new file mode 100644 index 00000000..3d1397a0 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEABP06.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb284bd97e306b890b9ccdd2e7649c602f6fd78774c1b2140b29051126a1fece +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEABP07.wav b/ivas_processing_scripts/generation/IR/SEABP07.wav new file mode 100644 index 00000000..075da1a1 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEABP07.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a4fce653f7d80f389f3114a1e07688c5ad292e1419a59c6c4630a3bb8f2bf74 +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEBIP01.wav b/ivas_processing_scripts/generation/IR/SEBIP01.wav new file mode 100644 index 00000000..a6068236 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEBIP01.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55a349cb20898415609ea49187f871ba2dc980d07a1fa36fb655efde96208b4c +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEBIP02.wav b/ivas_processing_scripts/generation/IR/SEBIP02.wav new file mode 100644 index 00000000..10f8a62c --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEBIP02.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b35e9171ceaeb3e4e00f1e73b337c39c6c933620c39394e3a5ff095535db657a +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEBIP03.wav b/ivas_processing_scripts/generation/IR/SEBIP03.wav new file mode 100644 index 00000000..fd0ec69f --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEBIP03.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:524e0505f83bc579774e5e36f730b40fcb62b9b10f3a7767cec4389f4689d87b +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEBIP04.wav b/ivas_processing_scripts/generation/IR/SEBIP04.wav new file mode 100644 index 00000000..30be4326 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEBIP04.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa2e8e18ef82a299d142fcbfc462b2370472fa202cbe361e1d661c20e21cd4c8 +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEBIP05.wav b/ivas_processing_scripts/generation/IR/SEBIP05.wav new file mode 100644 index 00000000..91e57937 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEBIP05.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88d9cb43b175c2cf94eb861780e48fdb56da0bc4a2dd4f6034b179fa17dd09ab +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEBIP06.wav b/ivas_processing_scripts/generation/IR/SEBIP06.wav new file mode 100644 index 00000000..eb589f49 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEBIP06.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca350c682655d3ba8b075e3744adde034783dc87036b8fa9aaf9ccb3500f9286 +size 42112 diff --git 
a/ivas_processing_scripts/generation/IR/SEBIP07.wav b/ivas_processing_scripts/generation/IR/SEBIP07.wav new file mode 100644 index 00000000..d8a20381 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEBIP07.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e4520ada475e1c37b8707da8062bbbfb26e617261e0130b7344b2bc1a937c5 +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEMSP01.wav b/ivas_processing_scripts/generation/IR/SEMSP01.wav new file mode 100644 index 00000000..4dab142a --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEMSP01.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66a37fc3855a0929cf4a4702301bf231fe346f1964b845b9cf464a5bfd3e29ad +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEMSP02.wav b/ivas_processing_scripts/generation/IR/SEMSP02.wav new file mode 100644 index 00000000..d59419c5 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEMSP02.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cf267a42add5770e08b756d5577e95459b3efc5e49076ac910bb00aabe879b1 +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEMSP03.wav b/ivas_processing_scripts/generation/IR/SEMSP03.wav new file mode 100644 index 00000000..0e2e8205 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEMSP03.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4947d0762a6d653690d164c1a0dc09acc9c2bf38e8c28f33b9661d899094cd7 +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEMSP04.wav b/ivas_processing_scripts/generation/IR/SEMSP04.wav new file mode 100644 index 00000000..dc665c65 --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEMSP04.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f8a703057836541f8ca3e1e788d95302adcf983e12e1f6481e0743548559eeb +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEMSP05.wav b/ivas_processing_scripts/generation/IR/SEMSP05.wav new file mode 100644 index 00000000..aec9c66f --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEMSP05.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12613e8b6f43d6a8df2a4b78961fcacc2956d3b0bd8e3321fdea487ab00679ab +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEMSP06.wav b/ivas_processing_scripts/generation/IR/SEMSP06.wav new file mode 100644 index 00000000..84f990ed --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEMSP06.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b107956649319df472cfe311e278f73735708957b44f7af6a6e444a33b7cb9d0 +size 42112 diff --git a/ivas_processing_scripts/generation/IR/SEMSP07.wav b/ivas_processing_scripts/generation/IR/SEMSP07.wav new file mode 100644 index 00000000..bf89445a --- /dev/null +++ b/ivas_processing_scripts/generation/IR/SEMSP07.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef5a76c1026510861b8cac697415e6e08810857252b9ce52e0157c6024400bf2 +size 42112 -- GitLab From 8947746678efb2bfac403b7c4e1a331e46dc056d Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Fri, 12 May 2023 14:03:13 +0200 Subject: [PATCH 18/27] simplification of the top-level functions generate_[ism|stereo]_items() --- item_gen_configs/ISM1_CONFIG.yml | 11 ++-- item_gen_configs/ISM2_CONFIG.yml | 6 -- item_gen_configs/STEREO_CONFIG.yml | 10 +-- .../generation/__init__.py | 30 +-------- .../generation/process_ism_items.py | 59 ++++++++++------- .../generation/process_stereo_items.py | 64 
++++++++++++------- 6 files changed, 83 insertions(+), 97 deletions(-) diff --git a/item_gen_configs/ISM1_CONFIG.yml b/item_gen_configs/ISM1_CONFIG.yml index 9ba070f7..0f26866a 100644 --- a/item_gen_configs/ISM1_CONFIG.yml +++ b/item_gen_configs/ISM1_CONFIG.yml @@ -6,12 +6,6 @@ ### Output format format: "ISM1" -### Date; default = YYYYMMDD_HH.MM.SS -# date: 2023.06.30 - -### Deletion of temporary directories containing intermediate processing files, bitstreams etc.; default = false -# delete_tmp: true - ### Output sampling rate in Hz needed for headerless audio files; default = 48000 fs: 48000 @@ -29,10 +23,13 @@ output_path: "./items_ISM1" ### Target loudness in LKFS; default = null (no loudness normalization applied) loudness: -26 -### Pre-amble and Post-amble length in seconds (default = None) +### Pre-amble and Post-amble length in seconds (default = 0.0) preamble: 0.5 postamble: 0.5 +### Flag for adding low-level random background noise (amplitude +-4) instead of silence; default = false (silence) +add_low_level_random_noise: true + ################################################ ### Scene description diff --git a/item_gen_configs/ISM2_CONFIG.yml b/item_gen_configs/ISM2_CONFIG.yml index 198571d2..cbbb8b60 100644 --- a/item_gen_configs/ISM2_CONFIG.yml +++ b/item_gen_configs/ISM2_CONFIG.yml @@ -6,12 +6,6 @@ ### Output format format: "ISM2" -### Date; default = YYYYMMDD_HH.MM.SS -# date: 2023.06.30 - -### Deletion of temporary directories containing intermediate processing files, bitstreams etc.; default = false -# delete_tmp: true - ### Output sampling rate in Hz needed for headerless audio files; default = 48000 fs: 48000 diff --git a/item_gen_configs/STEREO_CONFIG.yml b/item_gen_configs/STEREO_CONFIG.yml index cb14747d..8f6cccc3 100644 --- a/item_gen_configs/STEREO_CONFIG.yml +++ b/item_gen_configs/STEREO_CONFIG.yml @@ -6,12 +6,6 @@ ### Output format format: "STEREO" -### Date; default = YYYYMMDD_HH.MM.SS -# date: 2023.06.30 - -### Deletion of temporary directories containing intermediate processing files, bitstreams etc.; default = false -# delete_tmp: true - ### Output sampling rate in Hz needed for headerless audio files; default = 48000 fs: 48000 @@ -26,8 +20,8 @@ IR_fs: 32000 ### Input path to mono files input_path: "./items_mono" -### Input path to stereo impulse response files -IR_path: "./IR" +### Input path to stereo impulse response files, default = './ivas_processing_scripts/generation/IR' +# IR_path: "./IR" ### Output path for generated test items and metadata files output_path: "./items_STEREO" diff --git a/ivas_processing_scripts/generation/__init__.py b/ivas_processing_scripts/generation/__init__.py index 76d10610..98591883 100755 --- a/ivas_processing_scripts/generation/__init__.py +++ b/ivas_processing_scripts/generation/__init__.py @@ -84,36 +84,10 @@ def main(args): # generate input items if cfg.format.startswith("ISM"): # generate ISM items with metadata according to scene description - process_ism_items.generate_ism_items( - cfg.format, - cfg.loudness, - cfg.input_path, - cfg.output_path, - cfg.scenes, - logger, - fs=cfg.fs, - preamble=cfg.preamble, - postamble=cfg.postamble, - add_low_level_random_noise=getattr(cfg, "add_low_level_random_noise", False), - # TODO@VM dict.get() can provide a default value if the key is not found - # please check if this is a viable solution - I kept getting "AttributeError: 'TestConfig' object has no attribute 'add_low_level_random_noise'" - ) + process_ism_items.generate_ism_items(cfg, logger) elif cfg.format == "STEREO": # 
generate STEREO items according to scene description - process_stereo_items.generate_stereo_items( - cfg.format, - cfg.loudness, - cfg.input_path, - cfg.IR_path, - cfg.output_path, - cfg.scenes, - logger, - fs=cfg.fs, - IR_fs=cfg.IR_fs, - preamble=cfg.preamble, - postamble=cfg.postamble, - add_low_level_random_noise=cfg.add_low_level_random_noise, - ) + process_stereo_items.generate_stereo_items(cfg, logger) # copy configuration to output directory with open(cfg.output_path.joinpath(f"{cfg.format}.yml"), "w") as f: diff --git a/ivas_processing_scripts/generation/process_ism_items.py b/ivas_processing_scripts/generation/process_ism_items.py index d788da34..a8c7e228 100644 --- a/ivas_processing_scripts/generation/process_ism_items.py +++ b/ivas_processing_scripts/generation/process_ism_items.py @@ -33,12 +33,11 @@ import csv import logging import os +import numpy as np from math import floor from pathlib import Path -from typing import Optional - -import numpy as np +from ivas_processing_scripts.generation import config from ivas_processing_scripts.audiotools import audio, audiofile from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness @@ -52,23 +51,34 @@ def csv_formatdata(data): def generate_ism_items( - format: str, - target_level: int, - input_path: Path, - output_path: Path, - scenes: dict, + cfg : config.TestConfig, logger: logging.Logger, - fs: Optional[int] = 48000, - preamble: Optional[float] = 0.0, - postamble: Optional[float] = 0.0, - add_low_level_random_noise: Optional[bool] = False, ): """Generate ISM items with metadata from mono items based on scene description""" # get the number of scenes - N_scenes = len(scenes) - - for scene_name, scene in scenes.items(): + N_scenes = len(cfg.scenes) + + # set the target level + if "loudness" not in cfg.__dict__: + cfg.loudness = -26 + + # set the fs + if "fs" not in cfg.__dict__: + cfg.fs = 48000 + + # set the pre-amble and post-amble + if "preamble" not in cfg.__dict__: + cfg.preamble = 0.0 + + if "postamble" not in cfg.__dict__: + cfg.postamble = 0.0 + + # set the pre-amble and post-amble + if "add_low_level_random_noise" not in cfg.__dict__: + cfg.add_low_level_random_noise = False + + for scene_name, scene in cfg.scenes.items(): logger.info(f"Processing {scene_name} out of {N_scenes} scenes") # extract the number of audio sources @@ -89,6 +99,7 @@ def generate_ism_items( # repeat for all source files for i in range(N_sources): + # parse parameters from the scene description source_file = np.atleast_1d(scene["source"])[i] source_azi = np.atleast_1d(scene["azimuth"])[i] @@ -99,7 +110,7 @@ def generate_ism_items( ) # read source file - x = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs) + x = audio.fromfile("MONO", os.path.join(cfg.input_path, source_file), fs=cfg.fs) # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / x.fs * 50) @@ -109,7 +120,7 @@ def generate_ism_items( # x.audio = x.audio[:N_trim] # adjust the level of the source file - _, scale_factor = get_loudness(x, target_level, "MONO") + _, scale_factor = get_loudness(x, cfg.loudness, "MONO") x.audio *= scale_factor # read azimuth information and create array @@ -271,9 +282,9 @@ def generate_ism_items( y_meta = np.concatenate([y_meta, x_meta]) # append pre-amble and post-amble to all sources - if preamble != 0.0: + if cfg.preamble != 0.0: # ensure that pre-mable is a multiple of 20ms - N_pre = int(floor(preamble * 50) / 50 * y.fs) + N_pre = int(floor(cfg.preamble * 50) / 50 * y.fs) # insert all-zero preamble to 
all sources pre = np.zeros((N_pre, y.audio.shape[1])) @@ -285,9 +296,9 @@ def generate_ism_items( ) # !!!! TBD - check if we should insert netrual position or the first position of the metadata y_meta = np.concatenate([pre, y_meta], axis=1) - if postamble != 0.0: + if cfg.postamble != 0.0: # ensure that post-mable is a multiple of 20ms - N_post = int(floor(postamble * 50) / 50 * y.fs) + N_post = int(floor(cfg.postamble * 50) / 50 * y.fs) # append all-zero postamble to all sources post = np.zeros((N_post, y.audio.shape[1])) @@ -300,7 +311,7 @@ def generate_ism_items( y_meta = np.concatenate([y_meta, post], axis=1) # add random noise - if add_low_level_random_noise: + if cfg.add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 np.random.seed(SEED_RANDOM_NOISE) noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype( @@ -313,7 +324,7 @@ def generate_ism_items( # write individual ISM audio streams to the output file in an interleaved format output_filename = scene["name"] audiofile.write( - os.path.join(output_path, output_filename), y.audio, y.fs + os.path.join(cfg.output_path, output_filename), y.audio, y.fs ) # !!!! TBD: replace all os.path.xxx operations with the Path object # write individual ISM metadata to output files in .csv format @@ -322,7 +333,7 @@ def generate_ism_items( csv_filename = os.path.normpath(f"{output_filename}.{i}.csv") with open( - os.path.join(output_path, csv_filename), + os.path.join(cfg.output_path, csv_filename), "w", newline="", encoding="utf-8", diff --git a/ivas_processing_scripts/generation/process_stereo_items.py b/ivas_processing_scripts/generation/process_stereo_items.py index feae1b26..109d0b08 100644 --- a/ivas_processing_scripts/generation/process_stereo_items.py +++ b/ivas_processing_scripts/generation/process_stereo_items.py @@ -30,17 +30,15 @@ # the United Nations Convention on Contracts on the International Sales of Goods. 
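For reference: the `if "key" not in cfg.__dict__` blocks introduced above fill in defaults for optional configuration keys, and the TODO removed earlier in this patch asked whether a dict.get()-style lookup could do the same. A minimal sketch of that alternative; the apply_config_defaults helper and the SimpleNamespace stand-in for config.TestConfig are illustrative assumptions, not part of the patch:

from types import SimpleNamespace


def apply_config_defaults(cfg, defaults):
    # Fill in every optional key that the YAML config did not set;
    # getattr() with a fallback avoids the AttributeError mentioned in the TODO.
    for key, value in defaults.items():
        setattr(cfg, key, getattr(cfg, key, value))


cfg = SimpleNamespace(loudness=-26)  # stand-in for a parsed test config
apply_config_defaults(
    cfg,
    {
        "fs": 48000,
        "preamble": 0.0,
        "postamble": 0.0,
        "add_low_level_random_noise": False,
    },
)
assert cfg.loudness == -26 and cfg.fs == 48000  # explicit values win over defaults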
# - import csv import logging import os +import numpy as np from copy import copy from math import floor from pathlib import Path -from typing import Optional - -import numpy as np +from ivas_processing_scripts.generation import config from ivas_processing_scripts.audiotools import audio, audiofile from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_stereo @@ -55,30 +53,48 @@ def csv_formatdata(data): def generate_stereo_items( - format: str, - target_level: int, - input_path: Path, - IR_path: Path, - output_path: Path, - scenes: dict, + cfg : config.TestConfig, logger: logging.Logger, - fs: Optional[int] = 48000, - IR_fs: Optional[int] = 48000, - preamble: Optional[float] = 0.0, - postamble: Optional[float] = 0.0, - add_low_level_random_noise: Optional[bool] = False, ): """Generate STEREO items from mono items based on scene description""" # get the number of scenes - N_scenes = len(scenes) + N_scenes = len(cfg.scenes) + + # set the target level + if "loudness" not in cfg.__dict__: + cfg.loudness = -26 + + # set the fs + if "fs" not in cfg.__dict__: + cfg.fs = 48000 + + # set the IR fs + if "IR_fs" not in cfg.__dict__: + cfg.IR_fs = 48000 + + # set the pre-amble and post-amble + if "preamble" not in cfg.__dict__: + cfg.preamble = 0.0 + + if "postamble" not in cfg.__dict__: + cfg.postamble = 0.0 + + # set the IR path + if "IR_path" not in cfg.__dict__: + cfg.IR_path = os.path.join(os.path.dirname(__file__), "IR") + + # set the pre-amble and post-amble + if "add_low_level_random_noise" not in cfg.__dict__: + cfg.add_low_level_random_noise = False - for scene_name, scene in scenes.items(): + # repeat for all source files + for scene_name, scene in cfg.scenes.items(): logger.info(f"Processing scene: {scene_name} out of {N_scenes} scenes") # extract the number of audio sources N_sources = len(np.atleast_1d(scene["source"])) - + # read the IR (check if stereo or two mono files were provided) source_IR = np.atleast_1d(scene["IR"]) @@ -99,19 +115,19 @@ def generate_stereo_items( ) # read source file - x = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs) + x = audio.fromfile("MONO", os.path.join(cfg.input_path, source_file), fs=cfg.fs) # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / x.fs * 50) # read the IR file - IR = audio.fromfile("STEREO", os.path.join(IR_path, IR_file), fs=IR_fs) + IR = audio.fromfile("STEREO", os.path.join(cfg.IR_path, IR_file), fs=cfg.IR_fs) # convolve with stereo IR x_rev = reverb_stereo(x, IR) # adjust the level of the stereo signal - _, scale_factor = get_loudness(x_rev, target_level, "STEREO") + _, scale_factor = get_loudness(x_rev, cfg.loudness, "STEREO") x_rev.audio *= scale_factor # shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap) @@ -173,9 +189,9 @@ def generate_stereo_items( y.audio += x_rev.audio # append pre-amble and post-amble to all sources - if preamble != 0.0: + if cfg.preamble != 0.0: # ensure that pre-mable is a multiple of 20ms - N_pre = int(floor(preamble * 50) / 50 * y.fs) + N_pre = int(floor(cfg.preamble * 50) / 50 * y.fs) # insert all-zero preamble to all sources pre = np.zeros((N_pre, y.audio.shape[1])) @@ -203,7 +219,7 @@ def generate_stereo_items( # write the reverberated audio into output file output_filename = scene["name"] audiofile.write( - os.path.join(output_path, output_filename), y.audio, y.fs + os.path.join(cfg.output_path, output_filename), 
y.audio, y.fs ) # !!!! TBD: replace all os.path.xxx operations with the Path object return -- GitLab From 67085bed4e1045319c76c6417e53295a34d63688 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Fri, 12 May 2023 15:15:04 +0200 Subject: [PATCH 19/27] formatting --- .../generation/__init__.py | 6 +- ivas_processing_scripts/generation/config.py | 2 +- .../generation/constants.py | 2 +- .../generation/process_ism_items.py | 20 ++--- .../generation/process_stereo_items.py | 75 ++++++++++--------- 5 files changed, 57 insertions(+), 48 deletions(-) diff --git a/ivas_processing_scripts/generation/__init__.py b/ivas_processing_scripts/generation/__init__.py index 98591883..27ff9021 100755 --- a/ivas_processing_scripts/generation/__init__.py +++ b/ivas_processing_scripts/generation/__init__.py @@ -40,7 +40,11 @@ from ivas_processing_scripts.constants import ( LOGGER_FORMAT, LOGGER_SUFFIX, ) -from ivas_processing_scripts.generation import config, process_ism_items, process_stereo_items +from ivas_processing_scripts.generation import ( + config, + process_ism_items, + process_stereo_items, +) from ivas_processing_scripts.utils import create_dir diff --git a/ivas_processing_scripts/generation/config.py b/ivas_processing_scripts/generation/config.py index ca9dbcc2..a84b156c 100644 --- a/ivas_processing_scripts/generation/config.py +++ b/ivas_processing_scripts/generation/config.py @@ -122,4 +122,4 @@ class TestConfig: # Report missing keys to the user if MISSING_KEYS: - raise KeyError(f"The following key(s) must be specified : {MISSING_KEYS}") \ No newline at end of file + raise KeyError(f"The following key(s) must be specified : {MISSING_KEYS}") diff --git a/ivas_processing_scripts/generation/constants.py b/ivas_processing_scripts/generation/constants.py index 34001207..c1454730 100644 --- a/ivas_processing_scripts/generation/constants.py +++ b/ivas_processing_scripts/generation/constants.py @@ -64,4 +64,4 @@ REQUIRED_KEYS = [ "input_path", "output_path", "scenes", -] \ No newline at end of file +] diff --git a/ivas_processing_scripts/generation/process_ism_items.py b/ivas_processing_scripts/generation/process_ism_items.py index a8c7e228..b2e09151 100644 --- a/ivas_processing_scripts/generation/process_ism_items.py +++ b/ivas_processing_scripts/generation/process_ism_items.py @@ -33,13 +33,14 @@ import csv import logging import os -import numpy as np from math import floor from pathlib import Path -from ivas_processing_scripts.generation import config +import numpy as np + from ivas_processing_scripts.audiotools import audio, audiofile from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness +from ivas_processing_scripts.generation import config SEED_RANDOM_NOISE = 0 @@ -51,7 +52,7 @@ def csv_formatdata(data): def generate_ism_items( - cfg : config.TestConfig, + cfg: config.TestConfig, logger: logging.Logger, ): """Generate ISM items with metadata from mono items based on scene description""" @@ -70,14 +71,14 @@ def generate_ism_items( # set the pre-amble and post-amble if "preamble" not in cfg.__dict__: cfg.preamble = 0.0 - + if "postamble" not in cfg.__dict__: cfg.postamble = 0.0 - + # set the pre-amble and post-amble if "add_low_level_random_noise" not in cfg.__dict__: cfg.add_low_level_random_noise = False - + for scene_name, scene in cfg.scenes.items(): logger.info(f"Processing {scene_name} out of {N_scenes} scenes") @@ -99,7 +100,6 @@ def generate_ism_items( # repeat for all source files for i in range(N_sources): - # parse parameters from the scene description 
source_file = np.atleast_1d(scene["source"])[i] source_azi = np.atleast_1d(scene["azimuth"])[i] @@ -110,8 +110,10 @@ def generate_ism_items( ) # read source file - x = audio.fromfile("MONO", os.path.join(cfg.input_path, source_file), fs=cfg.fs) - + x = audio.fromfile( + "MONO", os.path.join(cfg.input_path, source_file), fs=cfg.fs + ) + # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / x.fs * 50) diff --git a/ivas_processing_scripts/generation/process_stereo_items.py b/ivas_processing_scripts/generation/process_stereo_items.py index 109d0b08..82ba54ca 100644 --- a/ivas_processing_scripts/generation/process_stereo_items.py +++ b/ivas_processing_scripts/generation/process_stereo_items.py @@ -33,15 +33,16 @@ import csv import logging import os -import numpy as np from copy import copy from math import floor from pathlib import Path -from ivas_processing_scripts.generation import config +import numpy as np + from ivas_processing_scripts.audiotools import audio, audiofile from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_stereo +from ivas_processing_scripts.generation import config SEED_RANDOM_NOISE = 0 @@ -53,14 +54,14 @@ def csv_formatdata(data): def generate_stereo_items( - cfg : config.TestConfig, + cfg: config.TestConfig, logger: logging.Logger, ): """Generate STEREO items from mono items based on scene description""" # get the number of scenes N_scenes = len(cfg.scenes) - + # set the target level if "loudness" not in cfg.__dict__: cfg.loudness = -26 @@ -76,14 +77,14 @@ def generate_stereo_items( # set the pre-amble and post-amble if "preamble" not in cfg.__dict__: cfg.preamble = 0.0 - + if "postamble" not in cfg.__dict__: cfg.postamble = 0.0 - + # set the IR path if "IR_path" not in cfg.__dict__: cfg.IR_path = os.path.join(os.path.dirname(__file__), "IR") - + # set the pre-amble and post-amble if "add_low_level_random_noise" not in cfg.__dict__: cfg.add_low_level_random_noise = False @@ -94,12 +95,12 @@ def generate_stereo_items( # extract the number of audio sources N_sources = len(np.atleast_1d(scene["source"])) - + # read the IR (check if stereo or two mono files were provided) source_IR = np.atleast_1d(scene["IR"]) - + # read the overlap length - if 'overlap' in scene.keys(): + if "overlap" in scene.keys(): source_overlap = float(scene["overlap"]) else: source_overlap = 0.0 @@ -109,51 +110,53 @@ def generate_stereo_items( # parse parameters from the scene description source_file = np.atleast_1d(scene["source"])[i] IR_file = np.atleast_1d(scene["IR"])[i] - - logger.info( - f"Convolving {source_file} with {source_IR}" - ) + + logger.info(f"Convolving {source_file} with {source_IR}") # read source file - x = audio.fromfile("MONO", os.path.join(cfg.input_path, source_file), fs=cfg.fs) + x = audio.fromfile( + "MONO", os.path.join(cfg.input_path, source_file), fs=cfg.fs + ) # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / x.fs * 50) - + # read the IR file - IR = audio.fromfile("STEREO", os.path.join(cfg.IR_path, IR_file), fs=cfg.IR_fs) - + IR = audio.fromfile( + "STEREO", os.path.join(cfg.IR_path, IR_file), fs=cfg.IR_fs + ) + # convolve with stereo IR x_rev = reverb_stereo(x, IR) # adjust the level of the stereo signal _, scale_factor = get_loudness(x_rev, cfg.loudness, "STEREO") x_rev.audio *= scale_factor - + # shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap) if i > 0 and 
source_overlap != 0.0: # get the length of the first source file - N_delay = len(y.audio[:,0]) - + N_delay = len(y.audio[:, 0]) + # add the shift N_delay += int(source_overlap * x.fs) - + # ensure delay is a multiple of 20ms # N_delay = int(floor(source_shift * 50) / 50 * x.fs) - + # insert all-zero preamble pre = np.zeros((N_delay, x.audio.shape[1])) x.audio = np.concatenate([pre, x.audio]) - - # pad with zeros to ensure that the signal length is a multiple of 20ms + + # pad with zeros to ensure that the signal length is a multiple of 20ms N_frame = x.fs / 50 if len(x.audio) % N_frame != 0: N_pad = int(N_frame - len(x.audio) % N_frame) - + # insert all-zero preamble pre = np.zeros((N_pad, x.audio.shape[1])) x.audio = np.concatenate([pre, x.audio]) - + # add source signal to the array of source signals y.fs = x.fs if y.audio is None: @@ -192,30 +195,30 @@ def generate_stereo_items( if cfg.preamble != 0.0: # ensure that pre-mable is a multiple of 20ms N_pre = int(floor(cfg.preamble * 50) / 50 * y.fs) - + # insert all-zero preamble to all sources pre = np.zeros((N_pre, y.audio.shape[1])) y.audio = np.concatenate([pre, y.audio]) - + if postamble != 0.0: # ensure that post-mable is a multiple of 20ms N_post = int(floor(postamble * 50) / 50 * y.fs) - + # append all-zero postamble to all sources post = np.zeros((N_post, y.audio.shape[1])) y.audio = np.concatenate([y.audio, post]) - + # add random noise if add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 np.random.seed(SEED_RANDOM_NOISE) - noise = np.random.randint( - low=-4, high=5, size=y.audio.shape - ).astype("float") - + noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype( + "float" + ) + # superimpose y.audio += noise - + # write the reverberated audio into output file output_filename = scene["name"] audiofile.write( -- GitLab From 1fa207776bc0b272f1b845dd6a3fa66afabff519 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Fri, 12 May 2023 15:19:57 +0200 Subject: [PATCH 20/27] moving legal notice of using ITU-T IR responses to the proper place --- .../IR/README.TXT => thirdPartyLegalNotices/REVERB_IR.TXT | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename ivas_processing_scripts/generation/IR/README.TXT => thirdPartyLegalNotices/REVERB_IR.TXT (100%) diff --git a/ivas_processing_scripts/generation/IR/README.TXT b/thirdPartyLegalNotices/REVERB_IR.TXT similarity index 100% rename from ivas_processing_scripts/generation/IR/README.TXT rename to thirdPartyLegalNotices/REVERB_IR.TXT -- GitLab From fa6708f39eb4c1f2e212d5638dbfc00500601a12 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Fri, 12 May 2023 15:29:10 +0200 Subject: [PATCH 21/27] cleanup + fix unknown variables --- .../audiotools/wrappers/reverb.py | 9 ++++----- ivas_processing_scripts/generation/config.py | 1 + .../generation/process_ism_items.py | 5 ----- .../generation/process_stereo_items.py | 15 ++++----------- 4 files changed, 9 insertions(+), 21 deletions(-) diff --git a/ivas_processing_scripts/audiotools/wrappers/reverb.py b/ivas_processing_scripts/audiotools/wrappers/reverb.py index 4f4de5dd..d0f04677 100644 --- a/ivas_processing_scripts/audiotools/wrappers/reverb.py +++ b/ivas_processing_scripts/audiotools/wrappers/reverb.py @@ -30,11 +30,10 @@ # the United Nations Convention on Contracts on the International Sales of Goods. 
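For reference: these generators keep every signal on a 20 ms grid (50 frames per second), so lengths are repeatedly padded or trimmed to a multiple of fs / 50 samples, as in the padding block above. A self-contained sketch of that arithmetic; the helper name is illustrative, and unlike some places in the patch (which prepend zeros) this version appends them:

import numpy as np


def pad_to_20ms_grid(x: np.ndarray, fs: int) -> np.ndarray:
    # One 20 ms frame is fs / 50 samples (960 samples at 48 kHz).
    frame_len = fs // 50
    remainder = len(x) % frame_len
    if remainder == 0:
        return x
    # Append zeros so the total length lands on the next frame boundary.
    pad = np.zeros((frame_len - remainder, x.shape[1]))
    return np.concatenate([x, pad])


x = np.ones((1000, 2))  # 1000 samples of stereo audio at 48 kHz
assert len(pad_to_20ms_grid(x, 48000)) == 1920  # rounded up to two frames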
# -import os.path from copy import copy from pathlib import Path from tempfile import TemporaryDirectory -from typing import Optional, Union +from typing import Optional import numpy as np from scipy.fft import fft @@ -91,7 +90,7 @@ def reverb( tmp_input.fs = IR.fs # write input audio signal to temporary file in .pcm format - tmp_input_file = tmp_dir.joinpath(f"tmp_reverbIn.pcm") + tmp_input_file = tmp_dir.joinpath("tmp_reverbIn.pcm") write(tmp_input_file, tmp_input.audio, tmp_input.fs) # down-scale IR to prevent saturation @@ -101,7 +100,7 @@ def reverb( # write IR to temporary file in .pcm format # note: the reverb tool expects 32b float format - tmp_IR_file = tmp_dir.joinpath(f"tmp_IR.pcm") + tmp_IR_file = tmp_dir.joinpath("tmp_IR.pcm") write(tmp_IR_file, IR.audio.astype(np.float32), IR.fs, dtype=np.float32) # set up the 'reverb' command line @@ -114,7 +113,7 @@ def reverb( cmd.extend(["-align", str(align)]) # append temporary filenames - tmp_output_file = tmp_dir.joinpath(f"tmp_reverbOut.pcm") + tmp_output_file = tmp_dir.joinpath("tmp_reverbOut.pcm") cmd.extend([tmp_input_file, tmp_IR_file, tmp_output_file]) # run the 'reverb' command diff --git a/ivas_processing_scripts/generation/config.py b/ivas_processing_scripts/generation/config.py index a84b156c..1947f8d5 100644 --- a/ivas_processing_scripts/generation/config.py +++ b/ivas_processing_scripts/generation/config.py @@ -123,3 +123,4 @@ class TestConfig: # Report missing keys to the user if MISSING_KEYS: raise KeyError(f"The following key(s) must be specified : {MISSING_KEYS}") + diff --git a/ivas_processing_scripts/generation/process_ism_items.py b/ivas_processing_scripts/generation/process_ism_items.py index b2e09151..bb2e6523 100644 --- a/ivas_processing_scripts/generation/process_ism_items.py +++ b/ivas_processing_scripts/generation/process_ism_items.py @@ -34,7 +34,6 @@ import csv import logging import os from math import floor -from pathlib import Path import numpy as np @@ -117,10 +116,6 @@ def generate_ism_items( # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / x.fs * 50) - # trim the source signal to align to 20ms boundary - # N_trim = int(N_frames * x.fs / 50) - # x.audio = x.audio[:N_trim] - # adjust the level of the source file _, scale_factor = get_loudness(x, cfg.loudness, "MONO") x.audio *= scale_factor diff --git a/ivas_processing_scripts/generation/process_stereo_items.py b/ivas_processing_scripts/generation/process_stereo_items.py index 82ba54ca..98b4129d 100644 --- a/ivas_processing_scripts/generation/process_stereo_items.py +++ b/ivas_processing_scripts/generation/process_stereo_items.py @@ -30,14 +30,10 @@ # the United Nations Convention on Contracts on the International Sales of Goods. 
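For reference: the reverb wrapper cleaned up above follows the same pattern as the other tool wrappers, namely write the input signal and the impulse response into a temporary directory as raw PCM, invoke the external binary, and read the result back. A stripped-down sketch of that flow; the run_reverb name, the argument order and the 16-bit input format are assumptions for illustration, and the optional -align argument shown in the patch is omitted:

import subprocess
from pathlib import Path
from tempfile import TemporaryDirectory

import numpy as np


def run_reverb(x: np.ndarray, ir: np.ndarray, binary: str = "reverb") -> np.ndarray:
    with TemporaryDirectory() as tmp:
        tmp = Path(tmp)
        in_file, ir_file, out_file = tmp / "in.pcm", tmp / "ir.pcm", tmp / "out.pcm"
        x.astype(np.int16).tofile(in_file)  # input as 16-bit raw PCM
        ir.astype(np.float32).tofile(ir_file)  # the tool expects 32-bit float IRs
        # Run the external tool; the real wrapper also rescales the IR first.
        subprocess.run([binary, str(in_file), str(ir_file), str(out_file)], check=True)
        return np.fromfile(out_file, dtype=np.int16)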
# -import csv import logging import os -from copy import copy -from math import floor -from pathlib import Path - import numpy as np +from math import floor from ivas_processing_scripts.audiotools import audio, audiofile from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness @@ -118,9 +114,6 @@ def generate_stereo_items( "MONO", os.path.join(cfg.input_path, source_file), fs=cfg.fs ) - # get the number of frames (multiple of 20ms) - N_frames = int(len(x.audio) / x.fs * 50) - # read the IR file IR = audio.fromfile( "STEREO", os.path.join(cfg.IR_path, IR_file), fs=cfg.IR_fs @@ -200,16 +193,16 @@ def generate_stereo_items( pre = np.zeros((N_pre, y.audio.shape[1])) y.audio = np.concatenate([pre, y.audio]) - if postamble != 0.0: + if cfg.postamble != 0.0: # ensure that post-mable is a multiple of 20ms - N_post = int(floor(postamble * 50) / 50 * y.fs) + N_post = int(floor(cfg.postamble * 50) / 50 * y.fs) # append all-zero postamble to all sources post = np.zeros((N_post, y.audio.shape[1])) y.audio = np.concatenate([y.audio, post]) # add random noise - if add_low_level_random_noise: + if cfg.add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 np.random.seed(SEED_RANDOM_NOISE) noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype( -- GitLab From a221268b0db9a0700c9b94f46bdcea19da87bc42 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Fri, 12 May 2023 16:08:50 +0200 Subject: [PATCH 22/27] formatting --- ivas_processing_scripts/generation/config.py | 1 - ivas_processing_scripts/generation/process_stereo_items.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ivas_processing_scripts/generation/config.py b/ivas_processing_scripts/generation/config.py index 1947f8d5..a84b156c 100644 --- a/ivas_processing_scripts/generation/config.py +++ b/ivas_processing_scripts/generation/config.py @@ -123,4 +123,3 @@ class TestConfig: # Report missing keys to the user if MISSING_KEYS: raise KeyError(f"The following key(s) must be specified : {MISSING_KEYS}") - diff --git a/ivas_processing_scripts/generation/process_stereo_items.py b/ivas_processing_scripts/generation/process_stereo_items.py index 98b4129d..11b19b43 100644 --- a/ivas_processing_scripts/generation/process_stereo_items.py +++ b/ivas_processing_scripts/generation/process_stereo_items.py @@ -32,9 +32,10 @@ import logging import os -import numpy as np from math import floor +import numpy as np + from ivas_processing_scripts.audiotools import audio, audiofile from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_stereo -- GitLab From 6a6e89e96f412af6c59c45c231157f85924b3400 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Mon, 15 May 2023 10:10:07 +0200 Subject: [PATCH 23/27] fix incorrect overlap handling --- .../generation/process_ism_items.py | 2 +- .../generation/process_stereo_items.py | 26 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/ivas_processing_scripts/generation/process_ism_items.py b/ivas_processing_scripts/generation/process_ism_items.py index bb2e6523..7bd682a7 100644 --- a/ivas_processing_scripts/generation/process_ism_items.py +++ b/ivas_processing_scripts/generation/process_ism_items.py @@ -193,7 +193,7 @@ def generate_ism_items( N_delay = len(y.audio[:, 0]) # add the shift - N_delay += int(source_overlap * x.fs) + N_delay += int(-source_overlap * x.fs) # ensure delay is a multiple of 20ms # N_delay = 
int(floor(source_shift * 50) / 50 * x.fs) diff --git a/ivas_processing_scripts/generation/process_stereo_items.py b/ivas_processing_scripts/generation/process_stereo_items.py index 11b19b43..9498e2cf 100644 --- a/ivas_processing_scripts/generation/process_stereo_items.py +++ b/ivas_processing_scripts/generation/process_stereo_items.py @@ -121,11 +121,11 @@ def generate_stereo_items( ) # convolve with stereo IR - x_rev = reverb_stereo(x, IR) + x = reverb_stereo(x, IR) # adjust the level of the stereo signal - _, scale_factor = get_loudness(x_rev, cfg.loudness, "STEREO") - x_rev.audio *= scale_factor + _, scale_factor = get_loudness(x, cfg.loudness, "STEREO") + x.audio *= scale_factor # shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap) if i > 0 and source_overlap != 0.0: @@ -133,7 +133,7 @@ def generate_stereo_items( N_delay = len(y.audio[:, 0]) # add the shift - N_delay += int(source_overlap * x.fs) + N_delay += int(-source_overlap * x.fs) # ensure delay is a multiple of 20ms # N_delay = int(floor(source_shift * 50) / 50 * x.fs) @@ -154,36 +154,36 @@ def generate_stereo_items( # add source signal to the array of source signals y.fs = x.fs if y.audio is None: - y.audio = x_rev.audio + y.audio = x.audio else: # pad with zeros to have equal length of all source signals - if x_rev.audio.shape[0] > y.audio.shape[0]: + if x.audio.shape[0] > y.audio.shape[0]: y.audio = np.vstack( ( y.audio, np.zeros( ( - x_rev.audio.shape[0] - y.audio.shape[0], + x.audio.shape[0] - y.audio.shape[0], y.audio.shape[1], ) ), ) ) - elif y.audio.shape[0] > x_rev.audio.shape[0]: - x_rev.audio = np.vstack( + elif y.audio.shape[0] > x.audio.shape[0]: + x.audio = np.vstack( ( - x_rev.audio, + x.audio, np.zeros( ( - y.audio.shape[0] - x_rev.audio.shape[0], - x_rev.audio.shape[1], + y.audio.shape[0] - x.audio.shape[0], + x.audio.shape[1], ) ), ) ) # superimpose - y.audio += x_rev.audio + y.audio += x.audio # append pre-amble and post-amble to all sources if cfg.preamble != 0.0: -- GitLab From eb7fbabcd0b0bb63c741c3a9d084466d50d9e656 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Tue, 16 May 2023 09:53:49 +0200 Subject: [PATCH 24/27] fix typo in the .yml file --- item_gen_configs/ISM2_CONFIG.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/item_gen_configs/ISM2_CONFIG.yml b/item_gen_configs/ISM2_CONFIG.yml index cbbb8b60..3f3c4fb8 100644 --- a/item_gen_configs/ISM2_CONFIG.yml +++ b/item_gen_configs/ISM2_CONFIG.yml @@ -175,7 +175,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [10, 110] elevation: [0, 60] - shift: [0, 1] + overlap: [0, 1] c5: name: "G5S5.wav" -- GitLab From 7bcde6443393bd1512cda75414dc455325c438f4 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Tue, 16 May 2023 09:56:24 +0200 Subject: [PATCH 25/27] change overlap/gap to -1.0/+1.0s --- item_gen_configs/ISM2_CONFIG.yml | 76 +++++++++++++++--------------- item_gen_configs/STEREO_CONFIG.yml | 76 +++++++++++++++--------------- 2 files changed, 76 insertions(+), 76 deletions(-) diff --git a/item_gen_configs/ISM2_CONFIG.yml b/item_gen_configs/ISM2_CONFIG.yml index 3f3c4fb8..c9b749a5 100644 --- a/item_gen_configs/ISM2_CONFIG.yml +++ b/item_gen_configs/ISM2_CONFIG.yml @@ -24,8 +24,8 @@ output_path: "./items_ISM2" loudness: -26 ### Pre-amble and Post-amble length in seconds (default = 0.0) -preamble: 0.5 -postamble: 0.5 +preamble: 1.0 +postamble: 1.0 ### Flag for adding low-level random background noise (amplitude +-4) instead of 
silence; default = false (silence) add_low_level_random_noise: true @@ -55,7 +55,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [0, 50] elevation: [0, 0] - overlap: -0.5 + overlap: -1.0 a2: name: "G6S2.wav" @@ -63,7 +63,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [50, 350] elevation: [0, 0] - overlap: -0.5 + overlap: -1.0 a3: name: "G5S3.wav" @@ -71,7 +71,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [40, 290] elevation: [0, 0] - overlap: -0.5 + overlap: -1.0 a4: name: "G4S4.wav" @@ -79,7 +79,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [30, 230] elevation: [15, 15] - overlap: -0.5 + overlap: -1.0 a5: name: "G3S5.wav" @@ -87,7 +87,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [20, 170] elevation: [15, 15] - overlap: -0.5 + overlap: -1.0 a6: name: "G2S6.wav" @@ -95,7 +95,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [10, 110] elevation: [15, 15] - overlap: -0.5 + overlap: -1.0 b1: name: "G2S1.wav" @@ -103,7 +103,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [20, 170] elevation: [30, 30] - overlap: 0.5 + overlap: 1.0 b2: name: "G1S2.wav" @@ -111,7 +111,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [10, 110] elevation: [30, 30] - overlap: 0.5 + overlap: 1.0 b3: name: "G6S3.wav" @@ -119,7 +119,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [0, 50] elevation: [30, 30] - overlap: 0.5 + overlap: 1.0 b4: name: "G5S4.wav" @@ -127,7 +127,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [50, 350] elevation: [60, 60] - overlap: 0.5 + overlap: 1.0 b5: name: "G4S5.wav" @@ -135,7 +135,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [40, 290] elevation: [60, 60] - overlap: 0.5 + overlap: 1.0 b6: name: "G3S6.wav" @@ -143,7 +143,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [30, 230] elevation: [60, 60] - overlap: 0.5 + overlap: 1.0 c1: name: "G3S1.wav" @@ -151,7 +151,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [40, 290] elevation: [0, 60] - overlap: -0.5 + overlap: -1.0 c2: name: "G2S2.wav" @@ -159,7 +159,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [30, 230] elevation: [0, 60] - overlap: -0.5 + overlap: -1.0 c3: name: "G1S3.wav" @@ -167,7 +167,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [20, 170] elevation: [0, 60] - overlap: -0.5 + overlap: -1.0 c4: name: "G6S4.wav" @@ -175,7 +175,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [10, 110] elevation: [0, 60] - overlap: [0, 1] + overlap: -1.0 c5: name: "G5S5.wav" @@ -183,7 +183,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [0, 50] elevation: [0, 60] - overlap: -0.5 + overlap: -1.0 c6: name: "G4S6.wav" @@ -191,7 +191,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [50, 350] elevation: [0, 60] - overlap: -0.5 + overlap: -1.0 d1: name: "G4S1.wav" @@ -199,7 +199,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [50, "180:1:120 + 360"] elevation: [0, 60] - overlap: 0.5 + overlap: 1.0 d2: name: "G3S2.wav" 
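For reference: the overlap values in these scenes are interpreted so that a positive overlap starts the next source before the previous one ends, while a negative overlap leaves a gap. With the sign fix from patch 23, the next source is delayed by the length of the previous one minus the overlap, rounded to the 20 ms grid as in patch 26. A worked sketch of that computation (the function name is illustrative):

from math import floor


def start_sample(prev_len_samples: int, overlap_s: float, fs: int) -> int:
    # Positive overlap moves the next source earlier, negative inserts a gap;
    # floor(-overlap * 50) / 50 keeps the shift on the 20 ms frame grid.
    return prev_len_samples + int(floor(-overlap_s * 50) / 50 * fs)


fs = 48000
assert start_sample(10 * fs, 1.0, fs) == 9 * fs  # 1.0 s overlap
assert start_sample(10 * fs, -1.0, fs) == 11 * fs  # 1.0 s gap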
@@ -207,7 +207,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [300, "-70:-1:-10 - 360"] elevation: [0, 60] - overlap: 0.5 + overlap: 1.0 d3: name: "G2S3.wav" @@ -215,7 +215,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [250, "-20:-1:-320"] elevation: [0, 60] - overlap: 0.5 + overlap: 1.0 d4: name: "G1S4.wav" @@ -223,7 +223,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [200, "30:-1:-270"] elevation: [0, 60] - overlap: 0.5 + overlap: 1.0 d5: name: "G6S5.wav" @@ -231,7 +231,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [150, "80:1:20 + 360"] elevation: [0, 60] - overlap: 0.5 + overlap: 1.0 d6: name: "G5S6.wav" @@ -239,7 +239,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: [100, "130:1:70 + 360"] elevation: [0, 60] - overlap: 0.5 + overlap: 1.0 e1: name: "G5S1.wav" @@ -247,7 +247,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["80:1:20 + 360", "80:1:20 + 360"] elevation: [10, 60] - overlap: 0.5 + overlap: 1.0 e2: name: "G4S2.wav" @@ -255,7 +255,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["130:1:70 + 360", "130:1:70 + 360"] elevation: [10, 60] - overlap: 0.5 + overlap: 1.0 e3: name: "G3S3.wav" @@ -263,7 +263,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["180:1:120 + 360", "180:1:120 + 360"] elevation: [10, 60] - overlap: 0.5 + overlap: 1.0 e4: name: "G2S4.wav" @@ -271,7 +271,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["-70:-1:-10 - 360", "-70:-1:-10 - 360"] elevation: [10, 60] - overlap: 0.5 + overlap: 1.0 e5: name: "G1S5.wav" @@ -279,7 +279,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["-20:-1:-320", "-20:-1:-320"] elevation: [10, 60] - overlap: 0.5 + overlap: 1.0 e6: name: "G6S6.wav" @@ -287,7 +287,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["30:-1:-270", "30:-1:-270"] elevation: [10, 60] - overlap: 0.5 + overlap: 1.0 f1: name: "G6S1.wav" @@ -295,7 +295,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["60:1:0 + 360", "60:-1:120 - 360"] elevation: [20, 50] - overlap: -0.5 + overlap: -1.0 f2: name: "G5S2.wav" @@ -303,7 +303,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["0:1:300", "0:-1:60 - 360"] elevation: [20, 50] - overlap: -0.5 + overlap: -1.0 f3: name: "G4S3.wav" @@ -311,7 +311,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["300:1:240 + 360", "300:-1:0"] elevation: [20, 50] - overlap: -0.5 + overlap: -1.0 f4: name: "G3S4.wav" @@ -319,7 +319,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["240:1:180 + 360", "240:-1:-60"] elevation: [20, 50] - overlap: -0.5 + overlap: -1.0 f5: name: "G2S5.wav" @@ -327,7 +327,7 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["180:1:120 + 360", "180:-1:-120"] elevation: [20, 50] - overlap: -0.5 + overlap: -1.0 f6: name: "G1S6.wav" @@ -335,5 +335,5 @@ scenes: source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] azimuth: ["120:1:60 + 360", "120:-1:180 - 360"] elevation: [20, 50] - overlap: -0.5 + overlap: -1.0 \ No newline at end of file diff --git a/item_gen_configs/STEREO_CONFIG.yml 
b/item_gen_configs/STEREO_CONFIG.yml index 8f6cccc3..7dd1a956 100644 --- a/item_gen_configs/STEREO_CONFIG.yml +++ b/item_gen_configs/STEREO_CONFIG.yml @@ -30,8 +30,8 @@ output_path: "./items_STEREO" loudness: -26 ### Pre-amble and Post-amble length in seconds (default = 0.0) -preamble: 0.5 -postamble: 0.5 +preamble: 1.0 +postamble: 1.0 ### Flag for adding low-level random background noise (amplitude +-4) instead of silence; default = false (silence) add_low_level_random_noise: true @@ -54,250 +54,250 @@ scenes: description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LEABP04.wav", "LEABP11.wav"] - overlap: 0.5 + overlap: 1.0 a2: name: "G6S2.wav" description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LEABP05.wav", "LEABP11.wav"] - overlap: 0.5 + overlap: 1.0 a3: name: "G5S3.wav" description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LEABP06.wav", "LEABP11.wav"] - overlap: 0.5 + overlap: 1.0 a4: name: "G4S4.wav" description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LEABP05.wav", "LEABP10.wav"] - overlap: -0.5 + overlap: -1.0 a5: name: "G3S5.wav" description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LEABP05.wav", "LEABP11.wav"] - overlap: -0.5 + overlap: -1.0 a6: name: "G2S6.wav" description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LEABP05.wav", "LEABP12.wav"] - overlap: -0.5 + overlap: -1.0 b1: name: "G2S1.wav" description: "Two speakers sitting at oval table side by side in a large anechoic conference room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LAABP05.wav", "LAABP06.wav"] - overlap: -0.5 + overlap: -1.0 b2: name: "G1S2.wav" description: "Two speakers sitting at oval table side by side in a large anechoic conference room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LAABP07.wav", "LAABP08.wav"] - overlap: 0.5 + overlap: 1.0 b3: name: "G6S3.wav" description: "Two speakers sitting at oval table side by side in a large anechoic conference room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LAABP09.wav", "LAABP10.wav"] - overlap: 0.5 + overlap: 1.0 b4: name: "G5S4.wav" description: "Two speakers sitting at oval table side by side in a large anechoic conference room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LAABP11.wav", "LAABP12.wav"] - overlap: -0.5 + overlap: -1.0 b5: name: "G4S5.wav" description: "Two speakers sitting at oval table side by side in a large anechoic conference room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LAABP01.wav", "LAABP02.wav"] - overlap: -0.5 + overlap: -1.0 b6: name: "G3S6.wav" description: "Two speakers sitting at oval table side by side in a large anechoic conference room." 
source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["LAABP03.wav", "LAABP04.wav"] - overlap: -0.5 + overlap: -1.0 c1: name: "G3S1.wav" description: "One talker sitting at table in a small anechoic conference room." source: ["test_single.wav"] IR: ["SAMSP01.wav"] - overlap: -0.5 + overlap: -1.0 c2: name: "G2S2.wav" description: "One talker sitting at table in a small anechoic conference room." source: ["test_single.wav"] IR: ["SAMSP04.wav"] - overlap: -0.5 + overlap: -1.0 c3: name: "G1S3.wav" description: "One talker sitting at table in a small anechoic conference room." source: ["test_single.wav"] IR: ["SAMSP07.wav"] - overlap: -0.5 + overlap: -1.0 c4: name: "G6S4.wav" description: "One talker sitting at table in a small echoic conference room." source: ["test_single.wav"] IR: ["SEABP01.wav"] - overlap: -0.5 + overlap: -1.0 c5: name: "G5S5.wav" description: "One talker sitting at table in a small echoic conference room." source: ["test_single.wav"] IR: ["SEABP03.wav"] - overlap: -0.5 + overlap: -1.0 c6: name: "G4S6.wav" description: "One talker sitting at table in a small echoic conference room." source: ["test_single.wav"] IR: ["SEABP06.wav"] - overlap: -0.5 + overlap: -1.0 d1: name: "G4S1.wav" description: "One talker sitting at table in a small anechoic conference room." source: ["test_single.wav"] IR: ["SEBIP01.wav"] - overlap: -0.5 + overlap: -1.0 d2: name: "G3S2.wav" description: "One talker sitting at table in a small anechoic conference room." source: ["test_single.wav"] IR: ["SEBIP04.wav"] - overlap: -0.5 + overlap: -1.0 d3: name: "G3S2.wav" description: "One talker sitting at table in a small anechoic conference room." source: ["test_single.wav"] IR: ["SEBIP07.wav"] - overlap: -0.5 + overlap: -1.0 d4: name: "G1S4.wav" description: "One talker sitting at table in a small echoic conference room." source: ["test_single.wav"] IR: ["SEBIP07.wav"] - overlap: -0.5 + overlap: -1.0 d5: name: "G6S5.wav" description: "One talker sitting at table in a small echoic conference room." source: ["test_single.wav"] IR: ["SEBIP07.wav"] - overlap: -0.5 + overlap: -1.0 d6: name: "G5S6.wav" description: "One talker sitting at table in a small echoic conference room." source: ["test_single.wav"] IR: ["SEBIP07.wav"] - overlap: -0.5 + overlap: -1.0 e1: name: "G5S1.wav" description: "Two talkers sitting in a room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEMSP01.wav", "SEMSP03.wav"] - overlap: 0.5 + overlap: 1.0 e2: name: "G4S2.wav" description: "Two talkers sitting in a room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEMSP01.wav", "SEMSP05.wav"] - overlap: 0.5 + overlap: 1.0 e3: name: "G3S3.wav" description: "Two talkers sitting in a room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEMSP01.wav", "SEMSP07.wav"] - overlap: 0.5 + overlap: 1.0 e4: name: "G2S4.wav" description: "Two talkers sitting in a room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEMSP03.wav", "SEMSP04.wav"] - overlap: -0.5 + overlap: -1.0 e5: name: "G1S5.wav" description: "Two talkers sitting in a room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEMSP05.wav", "SEMSP07.wav"] - overlap: -0.5 + overlap: -1.0 e6: name: "G6S6.wav" description: "Two talkers sitting in a room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEMSP06.wav", "SEMSP02.wav"] - overlap: -0.5 + overlap: -1.0 f1: name: "G6S1.wav" description: "Two talkers sitting in a room." 
source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEBIP05.wav", "SEBIP01.wav"] - overlap: 0.5 + overlap: 1.0 f2: name: "G5S2.wav" description: "Two talkers sitting in a room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEBIP07.wav", "SEBIP01.wav"] - overlap: 0.5 + overlap: 1.0 f3: name: "G4S3.wav" description: "Two talkers sitting in a room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEBIP04.wav", "SEBIP01.wav"] - overlap: 0.5 + overlap: 1.0 f4: name: "G3S4.wav" description: "Two talkers sitting in a room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEBIP02.wav", "SEBIP06.wav"] - overlap: -0.5 + overlap: -1.0 f5: name: "G2S5.wav" description: "Two talkers sitting in a room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEBIP02.wav", "SEBIP06.wav"] - overlap: -0.5 + overlap: -1.0 f6: name: "G1S6.wav" description: "Two talkers sitting in a room." source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"] IR: ["SEBIP03.wav", "SEBIP04.wav"] - overlap: -0.5 + overlap: -1.0 \ No newline at end of file -- GitLab From 3ea660b926e92d9aaece18db73a3925a17bacef3 Mon Sep 17 00:00:00 2001 From: Vladimir Malenovsky Date: Tue, 16 May 2023 11:01:05 +0200 Subject: [PATCH 26/27] fix incorrect length of .csv files; improving source code readability --- .../generation/process_ism_items.py | 47 ++++++++++--------- .../generation/process_stereo_items.py | 9 ++-- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/ivas_processing_scripts/generation/process_ism_items.py b/ivas_processing_scripts/generation/process_ism_items.py index 7bd682a7..54c7556a 100644 --- a/ivas_processing_scripts/generation/process_ism_items.py +++ b/ivas_processing_scripts/generation/process_ism_items.py @@ -115,10 +115,15 @@ def generate_ism_items( # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / x.fs * 50) + frame_len = int(x.fs / 50) + + # trim the samples from the end to ensure that the signal length is a multiple of 20ms + x.audio = x.audio[:N_frames * frame_len] # adjust the level of the source file _, scale_factor = get_loudness(x, cfg.loudness, "MONO") x.audio *= scale_factor + # read azimuth information and create array if isinstance(source_azi, str): @@ -192,36 +197,34 @@ def generate_ism_items( # get the length of the first source file N_delay = len(y.audio[:, 0]) - # add the shift - N_delay += int(-source_overlap * x.fs) + # add the shift value (ensure that the shift is a multiple of 20ms) + N_delay += int(floor(-source_overlap * 50) / 50 * x.fs) - # ensure delay is a multiple of 20ms - # N_delay = int(floor(source_shift * 50) / 50 * x.fs) - - # insert all-zero preamble + # insert all-zero signal pre = np.zeros((N_delay, x.audio.shape[1])) x.audio = np.concatenate([pre, x.audio]) # insert neutral position as a pre-amble + N_delay = int(N_delay / frame_len) pre = np.tile( [0.00, 0.00, 1.00, 0.00, 1.00], (N_delay, 1) ) # !!!! TBD - check if we should insert netrual position or the first position of the metadata x_meta = np.concatenate([pre, x_meta]) # pad with zeros to ensure that the signal length is a multiple of 20ms - N_frame = x.fs / 50 - if len(x.audio) % N_frame != 0: - N_pad = int(N_frame - len(x.audio) % N_frame) - - # insert all-zero preamble - pre = np.zeros((N_pad, x.audio.shape[1])) - x.audio = np.concatenate([pre, x.audio]) - - # insert neutral position as a pre-amble - pre = np.tile( - [0.00, 0.00, 1.00, 0.00, 1.00], (N_pad, 1) - ) # !!!! 
From 3ea660b926e92d9aaece18db73a3925a17bacef3 Mon Sep 17 00:00:00 2001
From: Vladimir Malenovsky
Date: Tue, 16 May 2023 11:01:05 +0200
Subject: [PATCH 26/27] fix incorrect length of .csv files; improving source code readability

---
 .../generation/process_ism_items.py    | 47 ++++++++++---------
 .../generation/process_stereo_items.py |  9 ++--
 2 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/ivas_processing_scripts/generation/process_ism_items.py b/ivas_processing_scripts/generation/process_ism_items.py
index 7bd682a7..54c7556a 100644
--- a/ivas_processing_scripts/generation/process_ism_items.py
+++ b/ivas_processing_scripts/generation/process_ism_items.py
@@ -115,10 +115,15 @@ def generate_ism_items(
 
         # get the number of frames (multiple of 20ms)
         N_frames = int(len(x.audio) / x.fs * 50)
+        frame_len = int(x.fs / 50)
+
+        # trim the samples from the end to ensure that the signal length is a multiple of 20ms
+        x.audio = x.audio[:N_frames * frame_len]
 
         # adjust the level of the source file
         _, scale_factor = get_loudness(x, cfg.loudness, "MONO")
         x.audio *= scale_factor
+
 
         # read azimuth information and create array
         if isinstance(source_azi, str):
@@ -192,36 +197,34 @@
         # get the length of the first source file
         N_delay = len(y.audio[:, 0])
 
-        # add the shift
-        N_delay += int(-source_overlap * x.fs)
+        # add the shift value (ensure that the shift is a multiple of 20ms)
+        N_delay += int(floor(-source_overlap * 50) / 50 * x.fs)
 
-        # ensure delay is a multiple of 20ms
-        # N_delay = int(floor(source_shift * 50) / 50 * x.fs)
-
-        # insert all-zero preamble
+        # insert all-zero signal
         pre = np.zeros((N_delay, x.audio.shape[1]))
         x.audio = np.concatenate([pre, x.audio])
 
         # insert neutral position as a pre-amble
+        N_delay = int(N_delay / frame_len)
         pre = np.tile(
             [0.00, 0.00, 1.00, 0.00, 1.00], (N_delay, 1)
         )  # !!!! TBD - check if we should insert netrual position or the first position of the metadata
         x_meta = np.concatenate([pre, x_meta])
 
         # pad with zeros to ensure that the signal length is a multiple of 20ms
-        N_frame = x.fs / 50
-        if len(x.audio) % N_frame != 0:
-            N_pad = int(N_frame - len(x.audio) % N_frame)
-
-            # insert all-zero preamble
-            pre = np.zeros((N_pad, x.audio.shape[1]))
-            x.audio = np.concatenate([pre, x.audio])
-
-            # insert neutral position as a pre-amble
-            pre = np.tile(
-                [0.00, 0.00, 1.00, 0.00, 1.00], (N_pad, 1)
-            )  # !!!! TBD - check if we should insert netrual position or the first position of the metadata
-            x_meta = np.concatenate([pre, x_meta])
+        if len(x.audio) % frame_len != 0:
+            # pad the source signal
+            N_pad = int(frame_len - len(x.audio) % frame_len)
+            post = np.zeros((N_pad, x.audio.shape[1]))
+            x.audio = np.concatenate([x.audio, post])
+
+        # pad the metadata
+        N_pad = int(len(x.audio) / frame_len) - len(x_meta)
+        if N_pad > 0:
+            post = np.tile(
+                [0.00, 0.00, 1.00, 0.00, 1.00], (N_pad, 1)
+            )  # !!!! TBD - check if we should insert netrual position or the first position of the metadata
+            x_meta = np.concatenate([x_meta, post])
 
         # add source signal to the array of all source signals
         y.fs = x.fs
@@ -280,7 +283,7 @@
 
         # append pre-amble and post-amble to all sources
         if cfg.preamble != 0.0:
-            # ensure that pre-mable is a multiple of 20ms
+            # ensure that pre-amble is a multiple of 20ms
             N_pre = int(floor(cfg.preamble * 50) / 50 * y.fs)
 
             # insert all-zero preamble to all sources
@@ -288,6 +291,7 @@
             y.audio = np.concatenate([pre, y.audio])
 
             # insert neutral position as a pre-amble to all sources
+            N_pre = int(N_pre / frame_len)
            pre = np.tile(
                 [0.00, 0.00, 1.00, 0.00, 1.00], (y_meta.shape[0], N_pre, 1)
             )  # !!!! TBD - check if we should insert netrual position or the first position of the metadata
@@ -302,6 +306,7 @@
             y.audio = np.concatenate([y.audio, post])
 
             # append neutral position as a post-amble to all sources
+            N_post = int(N_post / frame_len)
             post = np.tile(
                 [0.00, 0.00, 1.00, 0.00, 1.00], (y_meta.shape[0], N_post, 1)
             )  # !!!! TBD - check if we should insert netrual position or the last position of the metadata
@@ -319,7 +324,7 @@
         y.audio += noise
 
         # write individual ISM audio streams to the output file in an interleaved format
-        output_filename = scene["name"]
+        output_filename = scene_name
         audiofile.write(
             os.path.join(cfg.output_path, output_filename), y.audio, y.fs
         )  # !!!! TBD: replace all os.path.xxx operations with the Path object
diff --git a/ivas_processing_scripts/generation/process_stereo_items.py b/ivas_processing_scripts/generation/process_stereo_items.py
index 9498e2cf..bd7e5915 100644
--- a/ivas_processing_scripts/generation/process_stereo_items.py
+++ b/ivas_processing_scripts/generation/process_stereo_items.py
@@ -88,7 +88,7 @@ def generate_stereo_items(
 
     # repeat for all source files
     for scene_name, scene in cfg.scenes.items():
-        logger.info(f"Processing scene: {scene_name} out of {N_scenes} scenes")
+        logger.info(f"Processing scene: {scene_name} out of {N_scenes} scenes, name: {scene_name}")
 
         # extract the number of audio sources
         N_sources = len(np.atleast_1d(scene["source"]))
@@ -135,9 +135,6 @@
         # add the shift
         N_delay += int(-source_overlap * x.fs)
 
-        # ensure delay is a multiple of 20ms
-        # N_delay = int(floor(source_shift * 50) / 50 * x.fs)
-
         # insert all-zero preamble
         pre = np.zeros((N_delay, x.audio.shape[1]))
         x.audio = np.concatenate([pre, x.audio])
@@ -187,7 +184,7 @@
 
         # append pre-amble and post-amble to all sources
         if cfg.preamble != 0.0:
-            # ensure that pre-mable is a multiple of 20ms
+            # ensure that pre-amble is a multiple of 20ms
             N_pre = int(floor(cfg.preamble * 50) / 50 * y.fs)
 
             # insert all-zero preamble to all sources
@@ -214,7 +211,7 @@
         y.audio += noise
 
         # write the reverberated audio into output file
-        output_filename = scene["name"]
+        output_filename = scene_name
         audiofile.write(
             os.path.join(cfg.output_path, output_filename), y.audio, y.fs
         )  # !!!! TBD: replace all os.path.xxx operations with the Path object
--
GitLab
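
The fix above keeps the audio and the per-frame metadata in lock-step: the source signal is trimmed to a whole number of 20 ms frames, any remainder created by shifting is padded at the end rather than prepended, and the metadata is padded with neutral-position rows until it has exactly one row per frame, which is what corrects the length of the .csv files. A self-contained sketch of that alignment, assuming mono audio and a 2-D metadata array with one row per 20 ms frame; align_to_frames is an illustrative name, not a function from the repository:

    import numpy as np

    NEUTRAL_POS = [0.00, 0.00, 1.00, 0.00, 1.00]  # filler row for missing metadata frames


    def align_to_frames(audio, meta, fs):
        frame_len = fs // 50                # samples per 20 ms frame
        n_frames = len(audio) // frame_len

        # trim trailing samples that do not fill a complete frame
        audio = audio[: n_frames * frame_len]

        # pad the metadata at the end so that it has one row per audio frame
        n_pad = n_frames - len(meta)
        if n_pad > 0:
            meta = np.concatenate([meta, np.tile(NEUTRAL_POS, (n_pad, 1))])
        return audio, meta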
From bfa648d433dad4ec6c6e577f1ec19bdc5b421ddf Mon Sep 17 00:00:00 2001
From: Vladimir Malenovsky
Date: Tue, 16 May 2023 11:44:15 +0200
Subject: [PATCH 27/27] formatting

---
 ivas_processing_scripts/generation/process_ism_items.py    | 5 ++---
 ivas_processing_scripts/generation/process_stereo_items.py | 4 +++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/ivas_processing_scripts/generation/process_ism_items.py b/ivas_processing_scripts/generation/process_ism_items.py
index 54c7556a..800f12a1 100644
--- a/ivas_processing_scripts/generation/process_ism_items.py
+++ b/ivas_processing_scripts/generation/process_ism_items.py
@@ -116,14 +116,13 @@ def generate_ism_items(
         # get the number of frames (multiple of 20ms)
         N_frames = int(len(x.audio) / x.fs * 50)
         frame_len = int(x.fs / 50)
-
+
         # trim the samples from the end to ensure that the signal length is a multiple of 20ms
-        x.audio = x.audio[:N_frames * frame_len]
+        x.audio = x.audio[: N_frames * frame_len]
 
         # adjust the level of the source file
         _, scale_factor = get_loudness(x, cfg.loudness, "MONO")
         x.audio *= scale_factor
-
 
         # read azimuth information and create array
         if isinstance(source_azi, str):
diff --git a/ivas_processing_scripts/generation/process_stereo_items.py b/ivas_processing_scripts/generation/process_stereo_items.py
index bd7e5915..ff3ec592 100644
--- a/ivas_processing_scripts/generation/process_stereo_items.py
+++ b/ivas_processing_scripts/generation/process_stereo_items.py
@@ -88,7 +88,9 @@ def generate_stereo_items(
 
     # repeat for all source files
     for scene_name, scene in cfg.scenes.items():
-        logger.info(f"Processing scene: {scene_name} out of {N_scenes} scenes, name: {scene_name}")
+        logger.info(
+            f"Processing scene: {scene_name} out of {N_scenes} scenes, name: {scene_name}"
+        )
 
         # extract the number of audio sources
         N_sources = len(np.atleast_1d(scene["source"]))
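
Both generators now derive the output filename from the key of the scene in the scenes dictionary (scene_name) instead of the scene's name field. A likely benefit, though the patches do not state it: dictionary keys are unique by construction, while name values can repeat (d2 and d3 in the configuration above both carry "G3S2.wav"), and writing by a repeated name would silently overwrite one generated item with another. A tiny illustration with a hypothetical scenes dictionary:

    scenes = {"d2": {"name": "G3S2.wav"}, "d3": {"name": "G3S2.wav"}}

    # keyed by the "name" field, two scenes collapse onto a single output file
    print(len({s["name"] for s in scenes.values()}))  # -> 1

    # keyed by the dictionary key, each scene gets its own output file
    print(len(set(scenes)))  # -> 2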
--
GitLab