diff --git a/item_generation_scripts/config/ISM1_CONFIG.yml b/item_gen_configs/ISM1_CONFIG.yml similarity index 100% rename from item_generation_scripts/config/ISM1_CONFIG.yml rename to item_gen_configs/ISM1_CONFIG.yml diff --git a/item_generation_scripts/config/ISM2_CONFIG.yml b/item_gen_configs/ISM2_CONFIG.yml similarity index 100% rename from item_generation_scripts/config/ISM2_CONFIG.yml rename to item_gen_configs/ISM2_CONFIG.yml diff --git a/item_generation_scripts/config/STEREO_CONFIG.yml b/item_gen_configs/STEREO_CONFIG.yml similarity index 100% rename from item_generation_scripts/config/STEREO_CONFIG.yml rename to item_gen_configs/STEREO_CONFIG.yml diff --git a/item_generation_scripts/audiotools/EFAP.py b/item_generation_scripts/audiotools/EFAP.py deleted file mode 100644 index b83d57e6af92e819f5f5569f0b6f840c91bc1513..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/EFAP.py +++ /dev/null @@ -1,922 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -import argparse -from enum import Enum -from itertools import combinations -from pathlib import Path -from typing import Optional, Tuple, Union - -import numpy as np - - -def wrap_angles( - azi: float, - ele: float, - clip_ele: Optional[bool] = False, -) -> Tuple[float, float]: - """ - Wrap angles to (-180, 180] azimuth and [-90, 90] elevation - Takes into account hemisphere flips from large elevation changes unless clip_ele is specified - """ - if clip_ele: - ele = min(max(ele, -90), 90) - - if ele % 90 == 0 and ele % 180 != 0: - # if elevation is a multiple of 90, azimuth is irrelevant since we are at a pole - azi = 0 - while np.abs(ele) > 90: - ele -= 360 - else: - # wrap elevation value - while np.abs(ele) > 90: - # flip azimuth to other hemisphere - azi += 180 - - # compensate elevation accordingly - if ele > 90: - ele = 180 - ele - elif ele < -90: - ele = -180 - ele - - # wrap azimuth value - while azi > 180: - azi -= 360 - while azi <= -180: - azi += 360 - - return azi, ele - - -class EfapDmxType(Enum): - NONE = 0 - AMPLITUDE = 1 - INTENSITY = 2 - - -class EfapVertex: - """ - Vertex data structure for EFAP - - Initialises a vertex from the given spherical coordinate pair, - with a flag specifying if it is a ghost loudspeaker - - Parameters - ---------- - azi : float - Azimuth of vertex - ele : float - Elevation of vertex - is_ghost : bool - Whether the vertex is a ghost, default is False - dmx_type : EfapDmxType - Downmix type for ghost vertices - """ - - def __init__( - self, - azi: float, - ele: float, - is_ghost: Optional[bool] = False, - dmx_type: Optional[EfapDmxType] = EfapDmxType.INTENSITY, - ): - self.azi, self.ele = wrap_angles(azi, ele) - self.pos = np.array( - [ - np.cos(np.deg2rad(azi)) * np.cos(np.deg2rad(ele)), - np.sin(np.deg2rad(azi)) * np.cos(np.deg2rad(ele)), - np.sin(np.deg2rad(ele)), - ] - ) - - idx_azi = np.round(np.abs(90 - np.abs(self.azi))) - idx_ele = 90 - np.round(np.abs(self.ele)) - self.index = ( - idx_azi + 181 * idx_ele - 
) # vertices on the median plane have lowest index - - self.is_ghost = is_ghost - self.dmx_type = dmx_type - - def __str__(self): - str_ = f"a{self.azi}e{self.ele}" - if self.is_ghost: - str_ += "*" - return str_ - - def __lt__(self, other): - return self.index < other.index - - -class EFAP: - """ - EFAP data structure - - Initialise EFAP data for computing panning gains - - Parameters - ---------- - azimuths : np.ndarray - Azimuth positions of the loudspeaker array - elevations : np.ndarray - Elevation postions of the loudspeaker array - intensity_panning : bool - Whether intensity panning is enabled or not - - Examples - -------- - >>> from EFAP import EFAP - >>> panner = EFAP([30, -30, 0, 110, -110], [0, 0, 0, 0, 0], False) - >>> panner.pan(15, 45) - array([0.66742381, 0.19069252, 0.66742381, 0.19069252, 0.19069252]) - """ - - _EFAP_HULL_TOL = 1e-4 # tolerance for a point to be added to the convex hull - _EFAP_MAX_AZI_GAP = 160 # maximum allowed angular gap in the middle layer - _EFAP_POLAR_ELE = 90 # elevation of north / south poles (zenith / nadir) - _EFAP_THRESH_COPLANAR = 1e-3 # tolerance for points to be considered coplanar - _EFAP_THRESH_MID_LAYER = 45 # elevation threshold for loudspeakers to be considered as in the middle layer - _EFAP_THRESH_POLES = 1e-6 # tolerance for a vertex to be considered polar - _EFAP_THRESH_TRI = 1e-10 # tolerance for a point to be inside a triangle - - def __init__( - self, - azimuths: Union[list, np.ndarray], - elevations: Union[list, np.ndarray], - intensity_panning: Optional[bool] = False, - ): - # validation - azimuths = np.array(azimuths) - elevations = np.array(elevations) - if np.squeeze(azimuths).ndim > 1: - raise ValueError("Too many dimensions for loudspeaker azimuth array") - if np.squeeze(elevations).ndim > 1: - raise ValueError("Too many dimensions for loudspeaker elevations array") - if azimuths.shape != elevations.shape: - raise ValueError("Mismatch between loudspeaker azimuths and elevations") - - # set EFIP 
flag - self.intensity_panning = intensity_panning - - # initialise vertices and add ghost loudspeakers if needed - self.verts = np.array( - [EfapVertex(azi, ele) for azi, ele in zip(azimuths, elevations)] - ) - self._add_ghost_speakers() - - # formulate initial tetrahedron for the convex hull - self._init_simplex() - - # add the remaining vertices to the convex hull in order of their index - for i in np.argsort(self.verts): - if self.verts[i] not in self.verts[self.tris]: - self._add_vertex_to_hull(i) - - # compute downmix matrix with remapped ghost speakers - self._remap_ghost_speakers() - - # set vertices near poles to have NaN azimuth - for v in self.verts: - if ( - v.ele > self._EFAP_POLAR_ELE - self._EFAP_THRESH_POLES - or v.ele < self._EFAP_THRESH_POLES - self._EFAP_POLAR_ELE - ): - v.azi = np.nan - - # combine triangles into polygons - self._tri2poly() - - def _add_ghost_speakers(self) -> None: - """ - Add ghost loudspeakers at the poles, or to fill large horizontal gaps - """ - ele = [v.ele for v in self.verts] - - dmx_type = EfapDmxType.INTENSITY - - # add ghost loudspeakers at the poles if necessary - if max(ele) < self._EFAP_POLAR_ELE: - if self.intensity_panning: - if max(ele) > self._EFAP_THRESH_MID_LAYER: - dmx_type = EfapDmxType.NONE - else: - dmx_type = EfapDmxType.AMPLITUDE - - self.verts = np.append(self.verts, EfapVertex(0, 90, True, dmx_type)) - - if min(ele) > -self._EFAP_POLAR_ELE: - if self.intensity_panning: - if min(ele) < -self._EFAP_THRESH_MID_LAYER: - dmx_type = EfapDmxType.NONE - else: - dmx_type = EfapDmxType.AMPLITUDE - - self.verts = np.append(self.verts, EfapVertex(0, -90, True, dmx_type)) - - # check for large gaps in the middle horizontal layer - mid_spkrs = [ - v.azi for v in self.verts if np.abs(v.ele) < self._EFAP_THRESH_MID_LAYER - ] - - # no speakers in middle layer; add a triangle of ghost speakers - if not mid_spkrs: - self.verts = np.append( - self.verts, - [ - EfapVertex(0, 0, True), - EfapVertex(180, 0, True), - 
EfapVertex(240, 0, True), - ], - ) - # only one speaker in the threshold; add two ghost speakers to form a triangle - elif len(mid_spkrs) == 1: - self.verts = np.append( - self.verts, - [ - EfapVertex(mid_spkrs[0] + 120, 0, True), - EfapVertex(mid_spkrs[0] + 240, 0, True), - ], - ) - # search for and fill gaps greater than MAX_AZI_GAP - else: - mid_spkrs = np.sort(mid_spkrs) - angle_diff = np.diff(np.concatenate([mid_spkrs, [mid_spkrs[0] + 360]])) - sectors = np.ceil(angle_diff / self._EFAP_MAX_AZI_GAP) - - for i, s in enumerate(sectors): - if s > 1: - new_diff = angle_diff[i] / s - num_new = s - 1 - for k in range(int(num_new)): - new_azi = mid_spkrs[i] + (k + 1) * new_diff - self.verts = np.append(self.verts, EfapVertex(new_azi, 0, True)) - - def _init_simplex(self) -> None: - """ - Create an initial tetrahedron / simplex for the convex hull from 4 vertices - """ - # take the first vertex as seed - t = [0] - - # attempt to form an edge with non-zero length - for i, v in enumerate(self.verts): - if ( - v.azi != self.verts[t[0]].azi or v.ele != self.verts[t[0]].ele - ) and i not in t: - t.append(i) - break - else: - raise ValueError("Vertices are conincident!") - - # attempt to form a triangle with non-zero area - for i, v in enumerate(self.verts): - if ( - np.linalg.norm( - np.cross( - self.verts[t[1]].pos - self.verts[t[0]].pos, - v.pos - self.verts[t[0]].pos, - ), - 2, - ) - > self._EFAP_HULL_TOL - and i not in t - ): - t.append(i) - break - else: - raise ValueError("Vertices are colinear!") - - # attempt to form a tetrahedron with non-zero volume - for i, v in enumerate(self.verts): - if ( - np.abs( - np.dot( - np.cross( - self.verts[t[1]].pos - self.verts[t[0]].pos, - self.verts[t[2]].pos - self.verts[t[0]].pos, - ), - v.pos - self.verts[t[0]].pos, - ) - ) - ) > self._EFAP_HULL_TOL and i not in t: - t.append(i) - break - else: - raise ValueError("Vertices are coplanar!") - - # create a list of the triangles of the initial simplex / tetrahedron - t = 
np.array(t) - self.tris = np.array([t[[0, 1, 2]], t[[0, 1, 3]], t[[0, 2, 3]], t[[1, 2, 3]]]) - - # orient the triangle surface planes outwards from the centroid - self.centroid = np.mean([self.verts[i].pos for i in t], axis=0) - for i, tri in enumerate(self.tris): - self.tris[i, :] = self._flip_plane(tri) - - def _add_vertex_to_hull(self, idx_new_vert: int) -> None: - """ - Add a vertex to the convex hull and update the list of triangles in the hull - """ - # compute the centroid of the current convex hull - self.centroid = np.mean( - [self.verts[i].pos for i in np.unique(self.tris)], axis=0 - ) - - tris_new = [] - visible = [] - - # find which hull surfaces are visible from the new vertex - for i, tri in enumerate(self.tris): - if self._vertex_dist(tri, idx_new_vert) > -1e-6: - visible.append(i) - else: - tris_new.append(tri) - - tris_new = np.array(tris_new) - visible = np.array(visible, dtype=int) - - # find edges of the visible hull surfaces - max_vert = np.amax(self.tris[visible]) + 1 - counter = np.zeros([max_vert, max_vert]) - for i, tri in enumerate(self.tris[visible]): - surface = np.append(tri, tri[0]) - for n in range(3): - a = surface[n] - b = surface[n + 1] - counter[a, b] = counter[a, b] + 1 - - counter += counter.T - - edges = [] - for a in range(max_vert - 1): - for b in range(a + 1, max_vert): - if counter[a, b] == 1: - edges.append([a, b]) - edges = np.vstack(edges) - - # break the edges visible from the new vertex and add the new triangle - for e in edges: - tris_new = np.vstack( - [tris_new, self._flip_plane(np.append(e, idx_new_vert))] - ) - - # update the list of triangles in the convex hull - self.tris = tris_new - - def _remap_ghost_speakers(self) -> None: - """ - Remove unused ghost speakers and compute a downmix matrix for the rest - """ - # find ghosts that are not part of the convex hull - ghosts = [i for i, v in enumerate(self.verts) if v.is_ghost] - unused_ghosts = np.compress( - np.isin(ghosts, np.unique(self.tris), invert=True), 
ghosts - ) - - if unused_ghosts.size > 0: - # remove the unused ghosts from the triangle array and also adjust indices - self.tris[self.tris > unused_ghosts.min()] -= unused_ghosts.size - # delete them from the vertex array - self.verts = np.delete(self.verts, unused_ghosts) - - # generate initial sound energy distribution matrix - n_vtx = len(self.verts) - n_ghost = len(ghosts) - len(unused_ghosts) - - M = np.eye(n_vtx) - for i, v in enumerate(self.verts): - if v.is_ghost: - neighbours = self._get_neighbours(i) - M[:, i] = np.zeros(n_vtx) - M[neighbours, i] = np.ones(len(neighbours)) / len(neighbours) - - # re-distribute sound energy from ghosts - M2 = M.copy() - for i, v in enumerate(self.verts): - if v.is_ghost: - vec = M[:, i] - while np.sum(vec[-n_ghost:]) > 1e-4: - vec = M @ vec - M2[:, i] = vec - - self.dmx_mat = M2[:-n_ghost, :] - - # amplitude downmix for real loudspeakers - self.dmx_mat[:, :-n_ghost] = np.sqrt(self.dmx_mat[:, :-n_ghost]) - - # distribute ghosts according to downmix type - for i, v in enumerate(self.verts): - if v.is_ghost: - if v.dmx_type == EfapDmxType.NONE: - self.dmx_mat[:, i] = 0 - elif v.dmx_type == EfapDmxType.AMPLITUDE: - pass - else: - self.dmx_mat[:, i] = np.sqrt(self.dmx_mat[:, i]) - - def _tri2poly(self) -> None: - """ - Merge hull triangles into polygons if they are coplanar - """ - polys = [] - - for tri in self.tris: - # find all vertices coplanar with this triangle (including those already in the triangle) - new_poly = np.array( - [ - i - for i, _ in enumerate(self.verts) - if np.abs(self._vertex_dist(tri, i)) < self._EFAP_THRESH_COPLANAR - ] - ) - - # check if we already found this polygon as a complete subset - is_subset = [ - i for i, poly in enumerate(polys) if np.all(np.isin(new_poly, poly)) - ] - is_superset = [ - i for i, poly in enumerate(polys) if np.all(np.isin(poly, new_poly)) - ] - - if is_subset: - continue - elif is_superset: - # remove the other polygon since it will be replaced by the superset polygon - 
polys_new = [p for i, p in enumerate(polys) if i not in is_superset] - polys = polys_new - - # orient the polygon plane in the same direction as the triangle - P1 = self.verts[tri[0]].pos - P2 = self.verts[tri[1]].pos - P3 = self.verts[tri[2]].pos - - # first base vector - U = P2 - P1 - U = U / np.linalg.norm(U) - - # second base vector - V = P3 - P2 - V = V - np.dot(U, V) * U - V = V / np.linalg.norm(V) - - # center of the first triangle - M = np.mean([P1, P2, P3], axis=0) - - # sort vertices - azi = np.zeros_like(new_poly, dtype=float) - for i, idx_v in enumerate(new_poly): - P = self.verts[idx_v].pos - M - X = np.dot(P, U) - Y = np.dot(P, V) - azi[i] = np.arctan2(Y, X) - - idx = np.argsort(azi) - new_poly = new_poly[idx] - - # add the polygon to the main list - polys.append(new_poly) - - self.polys = polys - - def _pan_EFAP_poly( - self, azimuth: float, elevation: float, poly: np.ndarray, mod: int - ) -> np.ndarray: - """ - Compute panning gains for each vertex in the given polygon - - Parameters - ---------- - azimuth : float - Azimuth of requested panning position - elevation : float - Elevation of requested panning position - poly : np.ndarray - Array of vertices defining the polygon - - Returns - ------- - poly_gain: np.ndarray - Gains for each vertex in the polygon - """ - poly_gain = np.zeros_like(poly, dtype=float) - - P = np.array([azimuth, elevation]) - # search for the triangle of the polygon in which P belongs - for i in range(1, poly.size + 1): - A = np.array([self.verts[poly[i - 1]].azi, self.verts[poly[i - 1]].ele]) - for j in range(i, poly.size - 2 + i): - idx1 = 1 + (j % poly.size) - idx2 = 1 + (idx1 % poly.size) - B = np.array( - [self.verts[poly[idx1 - 1]].azi, self.verts[poly[idx1 - 1]].ele] - ) - C = np.array( - [self.verts[poly[idx2 - 1]].azi, self.verts[poly[idx2 - 1]].ele] - ) - - if mod: - if not np.isnan(A[0]): - A[0] %= mod - if not np.isnan(B[0]): - B[0] %= mod - if not np.isnan(C[0]): - C[0] %= mod - - if self._in_triangle(P, A, B, 
C): - N = np.transpose([B[1] - C[1], C[0] - B[0]]) - N = N / np.dot(N, B - A) - poly_gain[i - 1] = 1 - np.dot(P - A, N) - - """ DEBUGGING / TODO """ - # set gains <= -60dB to 0 - poly_gain[np.abs(poly_gain) < 1e-6] = 0 - - return poly_gain - - """ geometric / math helper functions """ - - def _get_neighbours(self, idx_vert: int) -> np.ndarray: - """ - Find triangles containing the given vertex index (neighbouring vertices) - """ - n = self.tris[np.any(np.isin(self.tris, idx_vert), axis=1)] - return np.unique(n[n != idx_vert]) - - def _get_azi_ele(self, idx_vert: int) -> Tuple[float, float]: - """ - Return a tuple of (azi, ele) for a vertex at the given index - """ - return self.verts[idx_vert].azi, self.verts[idx_vert].ele - - def _in_polygon( - self, azimuth: float, elevation: float, poly: np.ndarray - ) -> Tuple[bool, int]: - """ - Determine whether the panning position lies within the given polygon - by iteratively checking its triangles - - Parameters - ---------- - azimuth : float - Azimuth of requested panning position - elevation : float - Elevation of requested panning position - poly : np.ndarray - Array of vertices defining the polygon - - Returns - ------- - in_polygon, mod: Tuple[bool, int] - Flag indicating whether the point is inside the given polygon - Value of wrapping required if used - """ - azi = [self.verts[v].azi for v in poly] - - P = np.array([azimuth, elevation]) - - for tri in combinations(poly, 3): - A = np.array(self._get_azi_ele(tri[0])) - B = np.array(self._get_azi_ele(tri[1])) - C = np.array(self._get_azi_ele(tri[2])) - if self._in_triangle(P, A, B, C): - return True, None - - # if the azimuth difference is large, perform the 2D check again with azimuths wrapped to (-360, 0] and [0, 360) - if np.nanmax(azi) - np.nanmin(azi) > 180: - for tri in combinations(poly, 3): - A = np.array(self._get_azi_ele(tri[0])) - B = np.array(self._get_azi_ele(tri[1])) - C = np.array(self._get_azi_ele(tri[2])) - if not np.isnan(A[0]): - A[0] %= 360 - if 
not np.isnan(B[0]): - B[0] %= 360 - if not np.isnan(C[0]): - C[0] %= 360 - if self._in_triangle(P, A, B, C): - return True, 360 - - for tri in combinations(poly, 3): - A = np.array(self._get_azi_ele(tri[0])) - B = np.array(self._get_azi_ele(tri[1])) - C = np.array(self._get_azi_ele(tri[2])) - if not np.isnan(A[0]): - A[0] %= -360 - if not np.isnan(B[0]): - B[0] %= -360 - if not np.isnan(C[0]): - C[0] %= -360 - if self._in_triangle(P, A, B, C): - return True, -360 - - return False, None - - def _in_triangle( - self, P: np.ndarray, A: np.ndarray, B: np.ndarray, C: np.ndarray - ) -> bool: - """ - Determine whether the panning position lies within the given triangle - - Parameters - ---------- - P : float - Point under test - A : float - First vertex of the triangle - B : float - Second vertex of the triangle - C : float - Third vertex of the triangle - - Returns - ------- - bool - Flag indicating whether the point is inside the given triangle - """ - if np.isnan(A[0]): - A[0] = P[0] - - if np.isnan(B[0]): - B[0] = P[0] - - if np.isnan(C[0]): - C[0] = P[0] - - tmpMat = np.transpose([B - A, C - A]) - if (1 / np.linalg.cond(tmpMat)) < self._EFAP_THRESH_TRI: - return False - - Minv = np.linalg.inv(tmpMat) - S = Minv @ (P - A) - - if ( - S[0] < -self._EFAP_THRESH_TRI - or S[1] < -self._EFAP_THRESH_TRI - or S[0] + S[1] > 1 + self._EFAP_THRESH_TRI - ): - return False - - return True - - def _vertex_dist(self, surface: np.ndarray, idx_vert: int) -> float: - """ - Compute the distance of a vertex from a given plane - - Parameters - ---------- - surface : np.ndarray - Array of 3 ordered vertices defining the plane and its orientation - idx_vert: int - Index of the vertex to compute the distance for - - Returns - ------- - float - Distance of the vertex from the given plane - """ - return self._point_plane_dist( - self.verts[surface[0]].pos, - self.verts[surface[1]].pos, - self.verts[surface[2]].pos, - self.verts[idx_vert].pos, - ) - - def _point_plane_dist( - self, P1: 
np.ndarray, P2: np.ndarray, P3: np.ndarray, X: np.ndarray - ) -> float: - """ - Compute the distance of a vertex from a plane defined by three points - - Parameters - ---------- - P1 : np.ndarray - Cartesian coordinates of the first point - P2 : np.ndarray - Cartesian coordinates of the second point - P3 : np.ndarray - Cartesian coordinates of the third point - X: np.ndarray - Cartesian coordinates of the vertex - - Returns - ------- - float - Distance of the vertex from the given plane - """ - - if np.all(X == P1) or np.all(X == P2) or np.all(X == P3): - return 0 - else: - N = np.cross(P1 - P2, P1 - P3) - eps = np.finfo(float).eps - return np.dot(X - P1, N / (np.linalg.norm(N) + eps)) - - def _flip_plane(self, surface: np.ndarray) -> np.ndarray: - """ - Flip the orientation of a plane (invert normal vector) - - Parameters - ---------- - surface : np.ndarray - Array of 3 ordered vertices defining the plane and its orientation - - Returns - ------- - surface : np.ndarray - Reordered vertices with plane normal pointing outwards from the hull centroid - """ - if ( - self._point_plane_dist( - self.verts[surface[0]].pos, - self.verts[surface[1]].pos, - self.verts[surface[2]].pos, - self.centroid, - ) - > 0 - ): - surface = np.flip(surface.copy()) - - return surface - - def _compute_gains_point(self, azimuth: float, elevation: float) -> np.ndarray: - """ - Compute gains for the requested panning position - - Parameters - ---------- - azimuth : float - Azimuth of requested panning position - elevation : float - Elevation of requested panning position - - Returns - ------- - gains: np.ndarray - Panning gains for the loudspeaker layout - """ - if np.isnan(azimuth) or np.isnan(elevation): - raise ValueError(f"Angles cannot be NaNs : ({azimuth}, {elevation})") - - azimuth, elevation = wrap_angles(azimuth, elevation) - point_pos = [ - np.cos(np.deg2rad(azimuth)) * np.cos(np.deg2rad(elevation)), - np.sin(np.deg2rad(azimuth)) * np.cos(np.deg2rad(elevation)), - 
np.sin(np.deg2rad(elevation)), - ] - - # filter the polygon list with a quick 2d check - found_polys = [] - for poly in self.polys: - in_poly, mod = self._in_polygon(azimuth, elevation, poly) - if in_poly: - found_polys.append((poly, mod)) - - if not found_polys: - raise AssertionError("Unexpected error during panning") - - # find a visible polygon with the smallest distance - dist = [] - - for poly, mod in found_polys: - surface = self.verts[poly] - d = self._point_plane_dist( - surface[0].pos, - surface[1].pos, - surface[2].pos, - point_pos, - ) - if d >= 0: - dist.append(d) - else: - dist.append(np.inf) - - found_poly, mod = found_polys[np.argmin(dist)] - - # compute gains for the polygon vertices - poly_gain = self._pan_EFAP_poly(azimuth, elevation, found_poly, mod) - - # downmix ghost loudspeakers - gains = np.zeros(self.verts.size) - gains[found_poly] = poly_gain / np.linalg.norm(poly_gain) - gains = gains @ self.dmx_mat.T - gains = gains / np.linalg.norm(gains) - - if self.intensity_panning: - gains = np.sqrt(gains / np.sum(gains)) - - return gains - - """ public functions """ - - def pan( - self, - azimuths: float, - elevations: float, - intensity_panning: Optional[bool] = False, - ) -> np.ndarray: - """ - Compute gains for the requested panning position - - Parameters - ---------- - azimuths : float - Azimuth of requested panning position - elevations : float - Elevation of requested panning position - intensity_panning : bool - Flag whether to use intensity panning (Default is False == amplitude panning) - - Returns - ------- - gains: np.ndarray - Panning gains for the loudspeaker layout - """ - azimuths = np.array(azimuths) - elevations = np.array(elevations) - if azimuths.size == 1 and elevations.size == 1: - return self._compute_gains_point(azimuths, elevations) - elif np.squeeze(azimuths).ndim == 1 and np.squeeze(elevations).ndim == 1: - gains = [] - for a, e in zip(azimuths, elevations): - gains.append(self._compute_gains_point(a, e)) - return 
np.vstack(gains) - else: - raise ValueError( - "Azimuth and Elevation arrays cannot have more than one dimension and must be of equal size" - ) - - -def main(args): - """ - Parses a speaker layout text file and prints the panning gains - for the requested position - - Parameters - ---------- - args : Namespace - Command line arguments - """ - - speaker_positions = np.loadtxt(Path(args.input), delimiter=",", max_rows=2) - panner = EFAP(speaker_positions[0, :], speaker_positions[1, :], args.efip) - print(panner.pan(args.azimuth, args.elevation)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Edge-Fading Amplitude Panning") - parser.add_argument( - "-i", - "--input", - metavar="layout_file", - required=True, - type=str, - help="IVAS compatible loudspeaker layout file (Loudspeaker azimuths in first line, elevations in second, subsequent lines are ignored)", - ) - parser.add_argument( - "-efip", - "-intensity_panning", - default=False, - action="store_true", - help="Intensity panning mode (EFIP)", - ) - parser.add_argument( - "azimuth", - type=float, - help="Azimuth of direction to compute panning gains for (positive-left)", - ) - parser.add_argument( - "elevation", - type=float, - help="Elevation of direction to compute panning gains for (positive-up)", - ) - args = parser.parse_args() - main(args) diff --git a/item_generation_scripts/audiotools/__init__.py b/item_generation_scripts/audiotools/__init__.py deleted file mode 100644 index effc5a25cbaec6e5342720e3da9899f782d12de9..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/__init__.py +++ /dev/null @@ -1,286 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -import argparse -from itertools import repeat -from pathlib import Path - -from item_generation_scripts.audiotools.constants import AUDIO_FORMATS -from item_generation_scripts.audiotools.convert import convert_file -from item_generation_scripts.utils import apply_func_parallel - - -def add_processing_args(group, input=True): - # set up prefixes to avoid argument collision - if input: - p = "in" - ps = "i" - else: - p = "out" - ps = "o" - - group.add_argument( - f"-{ps}", - f"--{p}", - dest=f"{p}put", - required=True, - type=Path, - help="Path to *.{wav, pcm, raw} file or directory", - ) - group.add_argument( - f"-{ps}f", - f"--{p}_fmt", - required=input, - type=str, - help="Audio format (use -l, --list for a list / -L, --long for a detailed list)", - default=None, - ) - group.add_argument( - f"-{ps}s", - f"--{p}_fs", - type=int, - help="Sampling rate (Hz) (deduced for .wav input, same as input if output not specified, default = %(default)s)", - default=48000, - ) - group.add_argument( - f"-{ps}fc", - f"--{p}_cutoff", - type=int, - help="Cut-off frequency for low-pass filtering (default = %(default)s)", - default=None, - ) - group.add_argument( - f"-{ps}hp", - f"--{p}_hp50", - help="Apply 50 Hz high-pass filtering (default = %(default)s)", - action="store_true", - ) - group.add_argument( - f"-{ps}w", - f"--{p}_window", - type=float, - help="Window the start/end of the signal by this amount in milliseconds (default = %(default)s)", - default=None, - ) - group.add_argument( - f"-{ps}t", - f"--{p}_trim", - type=float, - nargs=2, - metavar=("PRE_TRIM", "POST_TRIM"), - help="Pre-/post-trim the signal by this amount in milliseconds (negative values pad silence), (default = %(default)s)", - ) - group.add_argument( - f"-{ps}pn", - f"--{p}_pad_noise", - help="Flag for padding with noise instead of zeros", - action="store_true", - ) - group.add_argument( - f"-{ps}d", - f"--{p}_delay", - type=float, - help="Delay the signal by this amount in milliseconds (negative values 
advance, default = %(default)s)", - default=None, - ) - group.add_argument( - f"-{ps}l", - f"--{p}_loudness", - type=float, - help="Normalize to given loudness with BS 1770-4 (default = %(default)s)", - default=None, - ) - group.add_argument( - f"-{ps}nf", - f"--{p}_loudness_fmt", - type=str, - help=f"Format used for loudness computation (only valid with with -{ps}l/--{p}_loudness, default = {p.upper()}_FMT)", - default=None, - ) - - -def get_args(): - parser = argparse.ArgumentParser( - description="Audiotools: Convert/Manipulate spatial audio files." - ) - - """ Input file arguments """ - input_parser = parser.add_argument_group("Input (pre-) processing options") - - # add common arguments - add_processing_args(input_parser) - - # input only arguments - input_parser.add_argument( - "-im", - "--in_meta", - type=str, - nargs="+", - help="list of input metadata files (only relevant for ISM and MASA input)", - default=None, - ) - - """ Output file arguments """ - output_parser = parser.add_argument_group("Output (post-) processing options") - - # add common arguments - add_processing_args(output_parser, False) - - # output only arguments - output_parser.add_argument( - "-lm", - "--limit", - help="Apply limiting to output (default = %(default)s)", - action="store_true", - ) - output_parser.add_argument( - "-t", - "--trajectory", - type=str, - help="Head-tracking trajectory file for binaural output (default = %(default)s)", - default=None, - ) - output_parser.add_argument( - "-bd", - "--bin_dataset", - type=str, - help="Use a custom binaural dataset (see README.md and audiotools/binaural_datasets/README.txt for further information)", - default=None, - ) - output_parser.add_argument( - "-bl", - "--bin_lfe_gain", - type=float, - help="Render LFE to binaural output with the specified gain (only valid for channel-based input, default = %(default)s)", - default=None, - ) - output_parser.add_argument( - "-mnru", - "--mnru_q", - type=float, - help="Flag for MNRU processing", 
- default=None, - ) - output_parser.add_argument( - "-esdru", - "--esdru_alpha", - type=float, - help="Flag for ESDRU processing", - default=None, - ) - - misc_parser = parser.add_argument_group("General options") - - """ Miscellaneous or meta arguments """ - misc_parser.add_argument( - "-l", - "--list", - help="list all supported audio formats and exit", - action="store_true", - ) - misc_parser.add_argument( - "-L", - "--long", - help="list all supported audio formats with long description and exit", - action="store_true", - ) - misc_parser.add_argument( - "-mp", - "--multiprocessing", - help="Enable multiprocessing (default = %(default)s)", - action="store_true", - ) - - return parser.parse_args() - - -def main(): - args = get_args() - - if args.list is True or args.long is True: - for fmt in AUDIO_FORMATS: - if args.long: - for f, d in fmt.items(): - print(f) - [print(f"\t{k}: {v}", end=None) for k, v in d.items()] - else: - print(", ".join(fmt.keys())) - exit() - - elif args.input is not None: - if not args.out_fs: - args.out_fs = args.in_fs - - if not args.out_fmt: - args.out_fmt = args.in_fmt - - if not args.out_loudness_fmt: - args.out_loudness_fmt = args.out_fmt - - # List input files - args.input = Path(args.input) - in_files = [] - if args.input.exists(): - if args.input.is_dir(): - in_files.extend(args.input.glob("*.wav")) - in_files.extend(args.input.glob("*.pcm")) - in_files.extend(args.input.glob("*.raw")) - else: - in_files = [args.input] - else: - raise ValueError(f"Input path {args.input} does not exist!") - - if len(in_files) == 0: - raise ValueError(f"Input directory {args.input} empty!") - - # Create output directory - args.output = Path(args.output) - - if len(in_files) == 1 and args.input.is_file(): - out_files = [args.output] - else: - args.output.mkdir(exist_ok=True) - out_files = [args.output.joinpath(i.name) for i in in_files] - - # Multiprocessing - enable_multiprocessing = args.multiprocessing - - # Remove unneeded keys to avoid passing 
to convert_file() - for k in ["list", "long", "multiprocessing", "input", "output"]: - args.__dict__.pop(k) - - apply_func_parallel( - convert_file, - zip(in_files, out_files), - repeat(args.__dict__), - "mp" if enable_multiprocessing else None, - ) diff --git a/item_generation_scripts/audiotools/__main__.py b/item_generation_scripts/audiotools/__main__.py deleted file mode 100644 index 9bdf64cda74530b84b707f96e789b1559b753986..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/__main__.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. 
-# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -from item_generation_scripts.audiotools import main - -if __name__ == "__main__": - main() diff --git a/item_generation_scripts/audiotools/audio.py b/item_generation_scripts/audiotools/audio.py deleted file mode 100644 index 1804f5dd3c9d8939ef370326f34dc2f240a8ab4f..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/audio.py +++ /dev/null @@ -1,428 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -import warnings -from abc import ABC, abstractmethod -from pathlib import Path -from typing import Optional, Union - -import numpy as np - -from item_generation_scripts.audiotools.audiofile import read -from item_generation_scripts.audiotools.constants import ( - BINAURAL_AUDIO_FORMATS, - CHANNEL_BASED_AUDIO_ALTNAMES, - CHANNEL_BASED_AUDIO_FORMATS, - IVAS_FRAME_LEN_MS, - METADATA_ASSISTED_SPATIAL_AUDIO_FORMATS, - OBJECT_BASED_AUDIO_FORMATS, - SCENE_BASED_AUDIO_FORMATS, -) - -from .EFAP import wrap_angles - - -class Audio(ABC): - """Base class for audio data""" - - def __init__(self, name: str): - self.name = name.upper() - self.audio = None - self.fs = None - self.num_channels = None - # self.logger = None # TODO needed? - - def __repr__(self): - return f"{self.__class__} : {self.__dict__}" - - @classmethod - @abstractmethod - def _from_file(cls, name: str, filename: Path, fs: Optional[int] = None) -> "Audio": - """Create an Audio object from a file""" - out_audio = cls(name) - - filename = Path(filename) - if filename.suffix in [".pcm", ".raw"]: - if fs is None: - raise ValueError( - "Sampling rate must be specified for headerless files!" - ) - out_audio.audio, out_audio.fs = read(filename, out_audio.num_channels, fs) - elif filename.suffix == ".wav": - out_audio.audio, out_audio.fs = read(filename) - else: - raise NotImplementedError(f"Filetype {filename.suffix} is unsupported!") - - return out_audio - - @classmethod - @abstractmethod - def _from_filelist( - cls, name, files: list[Path], fs: Optional[int] = None - ) -> "Audio": - """Create an Audio object from a list of files with channels""" - out_audio = cls(name) - - for f in files: - f = Path(f) - - if f.suffix in [".pcm", ".raw"]: - if fs is None: - raise ValueError( - "Sampling rate must be specified for headerless files!" 
- ) - channel, fs = read(f, out_audio.num_channels, fs) - elif f.suffix == ".wav": - channel, fs = read(f) - else: - raise NotImplementedError(f"Filetype {f.suffix} is unsupported!") - - if out_audio.audio is None: - out_audio.audio = channel - out_audio.fs = fs - else: - if fs != out_audio.fs: - raise ValueError( - f"Sampling rate mismatch between input audio files, expected {out_audio.fs}, encountered {fs} for {f}!" - ) - - if channel.shape[0] > out_audio.audio.shape[0]: - channel = channel[: out_audio.audio.shape[0], :] - elif channel.shape[0] < out_audio.audio.shape[0]: - out_audio.audio = out_audio.audio[: channel.shape[0], :] - out_audio.audio = np.column_stack([out_audio.audio, channel]) - - return out_audio - - def apply(self, func, **kwargs) -> None: - """Apply a function to the audio array""" - self.audio = func(self.audio, self.fs, **kwargs) - - -class BinauralAudio(Audio): - """Sub-class for binaural audio""" - - def __init__(self, name: str): - super().__init__(name) - try: - self.__dict__.update(BINAURAL_AUDIO_FORMATS[name.upper()]) - except KeyError: - raise ValueError(f"Unsupported binaural audio format {name}") - - @classmethod - def _from_file( - cls, name: str, filename: Path, fs: Optional[int] = None - ) -> "BinauralAudio": - return super()._from_file(name, filename, fs) - - @classmethod - def _from_filelist( - cls, name: str, filename: Path, fs: Optional[int] = None - ) -> "BinauralAudio": - return super()._from_filelist(name, filename, fs) - - -class ChannelBasedAudio(Audio): - """Sub-class for channel-based audio""" - - def __init__(self, name: str): - if Path(name).exists() and Path(name).suffix == ".txt": - self.parse_custom_layout(name) - else: - # remap configuration name to internal naming - if name.upper() in CHANNEL_BASED_AUDIO_ALTNAMES.keys(): - name = CHANNEL_BASED_AUDIO_ALTNAMES[name.upper()] - - super().__init__(name) - try: - self.__dict__.update(CHANNEL_BASED_AUDIO_FORMATS[name.upper()]) - except KeyError: - raise 
ValueError(f"Unsupported channel-based audio format {name}") - - self.is_planar = np.all([e == 0 for e in self.ls_ele]) - - def parse_custom_layout(self, layout_file: Union[Path, str]): - layout_file = Path(layout_file) - with open(layout_file) as f_ls: - self.ls_azi = [float(x.strip()) for x in f_ls.readline().strip().split(",")] - self.ls_ele = [float(x.strip()) for x in f_ls.readline().strip().split(",")] - try: - self.lfe_index = [ - int(x.strip()) for x in f_ls.readline().strip().split(",") - ] - except Exception: - self.lfe_index = [] - - if self.lfe_index: - [self.ls_azi.insert(i, 0.0) for i in self.lfe_index] - [self.ls_ele.insert(i, 0.0) for i in self.lfe_index] - - self.name = layout_file.stem - self.num_channels = len(self.ls_azi) - self.layout_file = layout_file - - @classmethod - def _from_file( - cls, name: str, filename: Path, fs: Optional[int] = None - ) -> "ChannelBasedAudio": - return super()._from_file(name, filename, fs) - - @classmethod - def _from_filelist( - cls, name: str, filename: Path, fs: Optional[int] = None - ) -> "ChannelBasedAudio": - return super()._from_filelist(name, filename, fs) - - -class MetadataAssistedSpatialAudio(Audio): - """Sub-class for metadata-assisted spatial audio""" - - def __init__(self, name: str): - super().__init__(name) - try: - self.__dict__.update(METADATA_ASSISTED_SPATIAL_AUDIO_FORMATS[name.upper()]) - except KeyError: - raise ValueError( - f"Unsupported metadata assisted spatial audio format {name}" - ) - self.metadata_files = [] - - @classmethod - def _from_file( - cls, - name: str, - filename: Path, - metadata_files: list[str], - fs: Optional[int] = None, - ) -> "MetadataAssistedSpatialAudio": - obj = super()._from_file(name, filename, fs) - obj.metadata_file = Path(metadata_files[0]) - return obj - - @classmethod - def _from_filelist( - cls, - name: str, - filename: Path, - metadata_files: list[str], - fs: Optional[int] = None, - ) -> "MetadataAssistedSpatialAudio": - obj = super()._from_file(name, 
filename, fs) - obj.metadata_file = Path(metadata_files[0]) - return obj - - -class ObjectBasedAudio(Audio): - """Sub-class for object-based audio""" - - def __init__(self, name: str): - super().__init__(name) - try: - self.__dict__.update(OBJECT_BASED_AUDIO_FORMATS[name.upper()]) - except KeyError: - raise ValueError(f"Unsupported object-based audio format {name}") - self.object_pos = [] - self.metadata_files = [] - - @classmethod - def _from_file( - cls, - name: str, - filename: Union[str, Path], - metadata_files: list[Union[str, Path]], - fs: Optional[int] = None, - ) -> "ObjectBasedAudio": - obj = super()._from_file(name, filename, fs) - if metadata_files is not None: - obj.metadata_files = [Path(f) for f in metadata_files] - else: - # search for metadata with naming scheme: name.(wav, pcm).(0-3).csv - for obj_idx in range(obj.num_channels): - file_name_meta = filename.with_suffix( - f"{filename.suffix}.{obj_idx}.csv" - ) - if file_name_meta.is_file(): - obj.metadata_files.append(file_name_meta) - else: - raise ValueError(f"Metadata file {file_name_meta} not found.") - warnings.warn( - f"No metadata files specified: The following files were found and used: \n {*obj.metadata_files,}" - ) - - obj.init_metadata() - return obj - - @classmethod - def _from_filelist( - cls, - name: str, - filename: Path, - metadata_files: list[Union[str, Path]], - fs: Optional[int] = None, - ) -> "ObjectBasedAudio": - obj = super()._from_filelist(name, filename, fs) - obj.metadata_files = [Path(f) for f in metadata_files] - obj.init_metadata() - return obj - - def init_metadata(self): - if self.audio.shape[1] != len(self.metadata_files): - raise ValueError( - f"Mismatch between number of channels in file [{self.audio.shape[1]}], and metadata [{len(self.metadata_files)}]" - ) - - self.object_pos = [] - for i, f in enumerate(self.metadata_files): - pos = np.genfromtxt(f, delimiter=",") - - # check if metadata has right number of columns - if pos.shape[1] < 5: - raise 
ValueError("Metadata incomplete. Columns are missing.") - elif pos.shape[1] > 5: - if pos.shape[1] == 7: - pos = pos[:, :5] - else: - raise ValueError( - "Too many columns in metadata (possibly old version with frame index used)" - ) - - # check if metadata is longer than file -> cut off - num_frames = int( - np.ceil(self.audio.shape[0] / (self.fs * IVAS_FRAME_LEN_MS / 1000)) - ) - if num_frames < pos.shape[0]: - pos = pos[:num_frames] - # check if metadata is shorter than file -> loop - elif num_frames > pos.shape[0]: - pos_loop = np.zeros((num_frames, pos.shape[1])) - pos_loop[: pos.shape[0]] = pos - for idx in range(pos.shape[0], num_frames): - pos_loop[idx, :2] = pos[idx % pos.shape[0], :2] - pos = pos_loop - - # wrap metadata to target value range - for j in range(num_frames): - pos[j, 0], pos[j, 1] = wrap_angles(pos[j, 0], pos[j, 1], clip_ele=True) - - self.object_pos.append(pos) - - -class SceneBasedAudio(Audio): - """Sub-class for scene-based audio""" - - def __init__(self, name: str): - if name == "SBA1": - name = "FOA" - elif name == "SBA2": - name = "HOA2" - elif name == "SBA3": - name = "HOA3" - - super().__init__(name) - try: - self.__dict__.update(SCENE_BASED_AUDIO_FORMATS[name.upper()]) - except KeyError: - raise ValueError(f"Unsupported scene-based audio format {name}") - - # self.ambi_order = ambi_order_from_nchan(self.num_channels) - self.ambi_order = int(np.sqrt(self.num_channels) - 1) - - @classmethod - def _from_file( - cls, name: str, filename: Path, fs: Optional[int] = None - ) -> "SceneBasedAudio": - return super()._from_file(name, filename, fs) - - @classmethod - def _from_filelist( - cls, name: str, filename: Path, fs: Optional[int] = None - ) -> "SceneBasedAudio": - return super()._from_filelist(name, filename, fs) - - -def _get_audio_class(fmt) -> Audio: - """Return a child audio class corresponding to the specifed format""" - if fmt in BINAURAL_AUDIO_FORMATS.keys(): - return BinauralAudio - elif fmt in 
METADATA_ASSISTED_SPATIAL_AUDIO_FORMATS.keys(): - return MetadataAssistedSpatialAudio - elif fmt in OBJECT_BASED_AUDIO_FORMATS.keys(): - return ObjectBasedAudio - elif fmt in SCENE_BASED_AUDIO_FORMATS.keys(): - return SceneBasedAudio - elif ( - fmt in CHANNEL_BASED_AUDIO_FORMATS.keys() or CHANNEL_BASED_AUDIO_ALTNAMES.keys() - ): - return ChannelBasedAudio - elif Path(fmt).suffix == ".txt": - return ChannelBasedAudio - else: - raise ValueError(f"Unknown audio format {fmt}!") - - -def fromtype(fmt: str) -> Audio: - return _get_audio_class(fmt)(fmt) - - -def fromarray(fmt: str, x: np.ndarray, fs: int) -> Audio: - """Wrap the given array into an audio format""" - if x is None or not fs: - return ValueError("Both array and sampling rate must be specified!") - - output = _get_audio_class(fmt)(fmt) - - output.audio = x - output.fs = fs - - return output - - -def fromfile( - fmt: str, - filename: Union[str, Path], - fs: Optional[int] = None, - in_meta: Optional[list[Union[str, Path]]] = None, -) -> Audio: - """Create an Audio object of the specified format from the given file""" - filename = Path(filename) - fmt_cls = _get_audio_class(fmt) - if fmt_cls is ObjectBasedAudio or fmt_cls is MetadataAssistedSpatialAudio: - return fmt_cls._from_file(fmt, filename, in_meta, fs) - else: - return fmt_cls._from_file(fmt, filename, fs) - - -def fromfilelist( - fmt: str, files: list[Union[str, Path]], fs: Optional[int] = None -) -> Audio: - """Create an Audio object of the specified format from the given list of files""" - return _get_audio_class(fmt)._from_filelist(fmt, files, fs) diff --git a/item_generation_scripts/audiotools/audioarray.py b/item_generation_scripts/audiotools/audioarray.py deleted file mode 100644 index c0909c4c2c09e4830426f9e4d7a265233a8d47d8..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/audioarray.py +++ /dev/null @@ -1,690 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions 
copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. 
-# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -import logging -import warnings -from typing import Iterator, Optional, Tuple, Union - -import numpy as np -import scipy.signal as sig - -from .constants import DELAY_COMPENSATION_FOR_FILTERING, SEED_PADDING - -logger = logging.getLogger("__main__") -logger.setLevel(logging.DEBUG) - - -"""Functions used in this module""" - - -def trim( - x: np.ndarray, - fs: Optional[int] = 48000, - limits: Optional[Tuple[int, int]] = None, - pad_noise: Optional[bool] = False, - samples: Optional[bool] = False, -) -> np.ndarray: - """ - Trim an audio array - - Parameters - ---------- - x: np.ndarray - Input array - fs: Optional[int] - Input sampling rate in Hz, default = 48000 - limits: Optional[Tuple[int, int]] - Pre- and post-trim duration in milliseconds (negative values pad) - pad_noise: Optional[bool] - If true noise will be padded otherwise zeros will be padded - samples: Optional[bool] - If true limits are interpreted as samples, otherwise as ms - - Returns - ------- - y : np.ndarray - Output trimmed array - """ - - if not limits: - return x - - if not samples: - pre_trim = int(limits[0] * fs // 1000) - post_trim = int(limits[1] * fs // 1000) - else: - pre_trim = limits[0] - post_trim = limits[1] - - if pre_trim < 0: - if pad_noise: - # pad with uniformly distributed noise between -4 and 4 - np.random.seed(SEED_PADDING) - noise = np.random.randint( - low=-4, high=5, size=(np.abs(pre_trim), np.shape(x)[1]) - ).astype("float") - x = np.concatenate((noise, x), axis=0) - else: - x = np.pad(x, [[np.abs(pre_trim), 0], [0, 0]]) - elif pre_trim > 0: - x = x[pre_trim:, :] - - if post_trim < 0: - 
if pad_noise: - # pad with uniformly distributed noise between -4 and 4 - np.random.seed(SEED_PADDING) - noise = np.random.randint( - low=-4, high=5, size=(np.abs(post_trim), np.shape(x)[1]) - ).astype("float") - x = np.concatenate((x, noise), axis=0) - else: - x = np.pad(x, [[0, np.abs(post_trim)], [0, 0]]) - elif post_trim > 0: - x = x[:-post_trim, :] - - return x - - -def window( - x: np.ndarray, - fs: Optional[int] = 48000, - len_ms: Optional[float] = 100, -) -> np.ndarray: - """ - Apply windowing to the start and end - of an audio array - - - Parameters - ---------- - x: np.ndarray - Input audio array - fs: Optional[int] - Input sampling rate in Hz, default = 48000 - len_ms: Optional[float] - Window length used at start and end of array in milliseconds, default = 100 ms - - Returns - ------- - y: np.ndarray - Output windowed array - """ - - wlen_smp = int(len_ms * fs // 1000) - - # if requested window length is larger than the signal, simply window the signal - if wlen_smp > x.shape[0]: - wlen_smp = x.shape[0] // 2 - - window = sig.windows.hann(2 * wlen_smp) - - # we only need half of the window - window = window[:wlen_smp, np.newaxis] - - x[:wlen_smp, :] *= window - x[-wlen_smp:, :] *= window[::-1, :] - - return x - - -def delay_compensation( - x: np.ndarray, - flt_type: str, - fs: Optional[int] = 48000, - up: Optional[bool] = False, - down: Optional[bool] = False, -) -> np.ndarray: - """ - Compensation for a delayed signal - - Parameters - ---------- - x: np.ndarray - Input array - flt_type: str - Name of filter type used for filtering - fs: Optional[int] - Input sampling rate - up: Optional[bool] - Flag for up-sampling - down: Optional[bool] - Flag for down-sampling - - Returns - ------- - x: np.ndarray - Delay compensated test array - """ - - # Get the delay in number of samples - if flt_type == "SHQ2" and up: - d_samples = DELAY_COMPENSATION_FOR_FILTERING["SHQ2"]["up"] - elif flt_type == "SHQ2" and down: - d_samples = 
DELAY_COMPENSATION_FOR_FILTERING["SHQ2"]["down"] - elif flt_type == "SHQ3" and up: - d_samples = DELAY_COMPENSATION_FOR_FILTERING["SHQ3"]["up"] - elif flt_type == "SHQ3" and down: - d_samples = DELAY_COMPENSATION_FOR_FILTERING["SHQ3"]["down"] - else: - d_samples = DELAY_COMPENSATION_FOR_FILTERING[flt_type] - # Delay compensation - x = delay(x, fs, -d_samples, samples=True) - - return x - - -def delay( - x: np.ndarray, - fs: Optional[int] = 48000, - delay: Optional[float] = 0, - samples: Optional[bool] = False, -) -> np.ndarray: - """ - Delay a signal by a specified duration (ms) or number of samples - - Parameters - ---------- - x: np.ndarray - Input array - fs: Optional[int] - Sampling rate - delay: Optional[float] - Delay in milliseconds or samples (negative values advance file) - samples: Optional[bool] - If true delay is interpreted as samples, if false as milliseconds - - Returns - ------- - x: np.ndarray - Delayed audio signal - """ - - if not samples: - delay = int(delay * fs / 1000) - - delay_abs = np.abs(delay) - - x = np.roll(x, delay, axis=0) - - if delay < 0: - x[-delay_abs:, :] = 0 - elif delay > 0: - x[:delay_abs, :] = 0 - - return x - - -def limiter( - x: np.ndarray, - fs: int, -) -> np.ndarray: - """ - Apply limiting to an audio signal - - Parameters - ---------- - x: np.ndarray - Input reference array - fs: int - Input sampling frequency - - Returns - ------- - x: np.ndarray - Limited audio signal - """ - - limiter_threshold = 32729 # -0.01dB FS - limiter_attack_seconds = 0.005 - attack_constant = 0.01 ** (1.0 / (limiter_attack_seconds * fs)) - release_heuristics_mem = 0.0 - gain = 1.0 - strong_saturation_cnt = 0 - limited = False - - if x.ndim == 1: - n_samples_x = x.shape - n_chan_x = 1 - else: - n_samples_x, n_chan_x = x.shape - # framing - framesize = fs // 50 - nframes = n_samples_x // framesize - for fr in range(nframes): - apply_limiting = True - fr_sig = x[fr * framesize : ((fr + 1) * framesize), :] - sig_max = np.amax(np.absolute(fr_sig)) 
- release_heuristic = release_heuristics_mem - if sig_max > limiter_threshold: - frame_gain = limiter_threshold / sig_max - release_heuristic = min(1.0, release_heuristic + (4.0 * framesize / fs)) - else: - release_heuristic = max(0.0, release_heuristic - (framesize / fs)) - if gain >= 1.0 - 1e-10: - apply_limiting = False - - frame_gain = 1.0 - - if sig_max > 3 * limiter_threshold and strong_saturation_cnt > 0: - apply_strong_limiting = True - elif sig_max > 10 * limiter_threshold: - strong_saturation_cnt += 20 - apply_strong_limiting = True - else: - strong_saturation_cnt -= 1 - if strong_saturation_cnt < 0: - strong_saturation_cnt = 0 - apply_strong_limiting = False - - if apply_strong_limiting is True: - if frame_gain < 0.3: - frame_gain /= 3.0 - else: - apply_strong_limiting = False - - if frame_gain < 0.1 and apply_strong_limiting is False: - frame_gain = 0.1 - - if apply_limiting is True: - if frame_gain < gain: - fac = attack_constant ** (np.arange(1, framesize + 1, dtype=np.float32)) - else: - release_constant = 0.01 ** ( - 1.0 / (0.005 * (200.0**release_heuristic) * fs) - ) - fac = release_constant ** ( - np.arange(1, framesize + 1, dtype=np.float32) - ) - - fr_gain = np.tile(gain * fac + frame_gain * (1.0 - fac), (n_chan_x, 1)).T - fr_sig *= fr_gain - gain = fr_gain[-1, 0] - limited = True - else: - gain = 1.0 - - release_heuristics_mem = release_heuristic - # hard limiting for everything that still sticks out - if (fr_sig > 32767).any() or (fr_sig < -32768).any(): - limited = True - idx_max = np.where(fr_sig > 32767) - fr_sig[idx_max] = 32767 - idx_min = np.where(fr_sig < -32768) - fr_sig[idx_min] = -32768 - - if limited: - warnings.warn("Limiting had to be applied") - return x - - -def get_framewise( - x: np.ndarray, - chunk_size: int, - zero_pad: Optional[bool] = False, -) -> Iterator: - """ - Generator to yield a signal frame by frame - If array size is not a multiple of chunk_size, last frame contains the remainder - - Parameters - ---------- - x: 
np.ndarray - Input reference array - chunk_size: int - Size of frames to yield - zero_pad: Optional[bool] - Whether to zero pad the last chunk if there are not enough samples - - Yields - ------- - frame : np.ndarray - One frame of the input audio signal - """ - - n_frames = x.shape[0] // chunk_size - for i in range(n_frames): - yield x[i * chunk_size : (i + 1) * chunk_size, :] - if x.shape[0] % chunk_size: - last_chunk = x[n_frames * chunk_size :, :] - if zero_pad: - yield np.pad( - last_chunk, [[0, chunk_size - (x.shape[0] % chunk_size)], [0, 0]] - ) - else: - yield last_chunk - - -def framewise_io( - i: np.ndarray, o: np.ndarray, chunk_size: int, zero_pad: Optional[bool] = False -) -> Iterator: - """ - Return an iterator over frame_index, input_frame and output_frame - - Parameters - ---------- - i: np.ndarray - Input array - o: np.ndarray - Output array - chunk_size: int - Size of frames to yield - zero_pad: Optional[bool] - Whether to zero pad the last chunk if there are not enough samples - - Yields - ------- - frame : Iterator - Frame index, one frame of the input and output audio signal - """ - - return enumerate( - zip( - get_framewise(i, chunk_size, zero_pad), - get_framewise(o, chunk_size, zero_pad), - ) - ) - - -"""Deprecated functions (partly replaced by ITU binaries)""" - - -def resample( - x: np.ndarray, - in_freq: int, - out_freq: int, -) -> np.ndarray: - """ - Resample a multi-channel audio array - - Parameters - ---------- - x: np.ndarray - Input array - in_freq: int - Input sampling rate - out_freq: int - Output sampling rate - - Returns - ------- - y: np.ndarray - Output resampled array - """ - - if in_freq == out_freq or out_freq is None: - y = x - else: - datatype = x.dtype - if datatype.name.startswith("int"): - # cast necessary due to bug in resample_poly() with input of type int - x = x.astype("float") - - y = sig.resample_poly(x, out_freq, in_freq) - - if datatype.name.startswith("int"): - y = x.astype(datatype) - - return y - - -def 
lpfilter( - x: np.ndarray, - fc: int, - fs: int, -) -> np.ndarray: - """ - Low-pass filter a multi-channel audio array - - Parameters - ---------- - x: np.ndarray - Input array - fc: int - Cut-off frequency in Hz - fs: int - Sampling rate in Hz - - Returns - ------- - y: np.ndarray - Output low-pass filtered array - """ - - if (fc + 500) < (fs / 2.0): - # Design a Chebychev Type II filter, band_pass-band_stop = 500 Hz - N, Wn = sig.cheb2ord(fc / (fs / 2), (fc + 500) / (fs / 2), 3, 60) - b, a = sig.cheby2(N, 60, Wn, "low") - - # Apply the Butterworth filter for each channels, across time axis - # y = sig.lfilter(b, a, axis=0) # non zero-phase filter - y = sig.filtfilt(b, a, x, axis=0) # zero-phase filer, batch processing - else: - y = x - - return y - - -def cut( - x: np.ndarray, - limits: Optional[Tuple[int, int]], -) -> np.ndarray: - """ - Cut an audio array - - Parameters - ---------- - x: np.ndarray - Input array - limits: Tuple[int, int] - first and last samples to extract - - Returns - ------- - y: np.ndarray - Output cut array - """ - - in_samples, in_channels = x.shape - first_sample = limits[0] - last_sample = limits[1] - - if first_sample == 0 and (last_sample == -1 or last_sample == in_samples): - y = x - else: - if last_sample == -1: - last_sample = in_samples - - signal_start = first_sample - signal_end = last_sample - insert_start = 0 - insert_end = last_sample - first_sample - total_samples = last_sample - first_sample - if first_sample < 0: - samples_to_pad_begin = -first_sample - insert_start = samples_to_pad_begin - insert_end += samples_to_pad_begin - if last_sample > in_samples: - signal_end = in_samples - insert_end = insert_end - last_sample + in_samples - y = np.zeros([total_samples, in_channels], dtype=x.dtype) - y[insert_start:insert_end, :] = x[signal_start:signal_end, :] - - return y - - -def compare( - ref: np.ndarray, - test: np.ndarray, - fs: int, - per_frame: bool = False, -) -> dict: - """ - Compare two audio arrays - - Parameters - 
---------- - ref: np.ndarray - Input reference array - test: np.ndarray - Input test array - fs: int - Input sampling rate in Hz - - Returns - ------- - result: dict - Comparison results - """ - - framesize = fs // 50 - diff = abs(test - ref) - max_diff = int(diff.max()) - result = { - "bitexact": True, - "max_abs_diff": 0, - "max_abs_diff_pos_sample": 0, - "max_abs_diff_pos_channel": 0, - "nsamples_diff": 0, - "nsamples_diff_percentage": 0.0, - "first_diff_pos_sample": -1, - "first_diff_pos_channel": -1, - "first_diff_pos_frame": -1, - } - if per_frame: - result["max_abs_diff_pos_frame"] = 0 - result["nframes_diff"] = 0 - result["nframes_diff_percentage"] = 0.0 - - if max_diff != 0: - if diff.ndim == 1: - nsamples_total = diff.shape - nchannels = 1 - else: - nsamples_total, nchannels = diff.shape - max_diff_pos = np.nonzero(diff == max_diff) - max_diff_pos = [ - max_diff_pos[0][0], - max_diff_pos[0][0] // framesize, - max_diff_pos[1][0], - ] - - first_diff_pos = np.nonzero(diff) - first_diff_pos = [ - first_diff_pos[0][0], - first_diff_pos[0][0] // framesize, - first_diff_pos[1][0], - ] - - nsamples_diff = np.nonzero(diff)[0].size - nsamples_diff_percentage = nsamples_diff / (nsamples_total * nchannels) * 100.0 - nframes = nsamples_total // framesize - nframes_diff = 0 - - result = { - "bitexact": False, - "max_abs_diff": max_diff, - "max_abs_diff_pos_sample": max_diff_pos[0], - "max_abs_diff_pos_channel": max_diff_pos[2], - "nsamples_diff": nsamples_diff, - "nsamples_diff_percentage": nsamples_diff_percentage, - "first_diff_pos_sample": first_diff_pos[0], - "first_diff_pos_channel": first_diff_pos[2], - "first_diff_pos_frame": first_diff_pos[1], - } - - if per_frame: - for fr in range(nframes): - diff_fr = diff[fr * framesize : ((fr + 1) * framesize), :] - nframes_diff += 1 if diff_fr.nonzero()[0].size > 0 else 0 - nframes_diff_percentage = nframes_diff / nframes * 100.0 - result["max_abs_diff_pos_frame"] = max_diff_pos[1] - result["nframes_diff"] = nframes_diff 
- result["nframes_diff_percentage"] = nframes_diff_percentage - - return result - - -def getdelay( - x: np.ndarray, - y: np.ndarray, -) -> int: - """ - Get the delay between two audio signals - - Parameters - ---------- - x: np.ndarray - Input reference array - y: np.ndarray - Input test array - - Returns - ------- - result: int - Delay of y in samples with respect to x (median of individual channel delays) - """ - - if x.ndim == 1: - n_samples_x = x.shape - n_chan_x = 1 - else: - n_samples_x, n_chan_x = x.shape - if y.ndim == 1: - n_samples_y = y.shape - n_chan_y = 1 - else: - n_samples_y, n_chan_y = y.shape - if n_chan_x != n_chan_y: - raise ValueError - lags = np.arange(-n_samples_x + 1, n_samples_y) - lag = np.zeros([n_chan_x, 1], dtype=int) - for chan in range(n_chan_x): - correlation = sig.correlate(y[:, chan], x[:, chan], mode="full") - lag[chan] = lags[np.argmax(correlation)] - return int(np.median(lag)) - - -def mono_downmix(x: np.ndarray) -> np.ndarray: - """ - Creates a passive mono downmix for a multi-channel audio signal - """ - return np.sum(x, axis=1) - - -def mute_channels( - x: np.ndarray, mute: Optional[Union[list, np.ndarray]] = None -) -> np.ndarray: - """ - Mute audio channels in signal - """ - x[:, mute] = 0 - return x diff --git a/item_generation_scripts/audiotools/audiofile.py b/item_generation_scripts/audiotools/audiofile.py deleted file mode 100644 index d5687a89d919bcc734e89e69a1e92cd24a33d10d..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/audiofile.py +++ /dev/null @@ -1,436 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -import logging -import struct -from pathlib import Path -from typing import Optional, Tuple, Union - -import numpy as np -import scipy.io.wavfile as wav - -from .audioarray import trim, window - -logger = logging.getLogger("__main__") -logger.setLevel(logging.DEBUG) - - -def read( - filename: Union[str, Path], - nchannels: Optional[int] = 1, - fs: Optional[int] = 48000, - outdtype: Optional[str] = "float", -) -> Tuple[np.ndarray, int]: - """ - Read audio file (.pcm, .wav or .raw) - - Parameters - ---------- - filename: str - Input file path - nchannels: Optional[int] - Number of input channels, required for .pcm otherwise default = 1 - fs: Optional[int] - Input sampling rate, required for .pcm input file, otherwise default = 48000 (Hz) - outdtype: Optional[str] - Data type of output array, python builtin or np.dtype - - Returns - ------- - x: np.ndarray - audio signal array - fs: int - signal sampling frequency - """ - - file_extension = Path(filename).suffix - - if file_extension == ".wav": - fs, data = wav.read(filename) - if data.dtype == np.int32: - data = np.interp( - data, - (np.iinfo(np.int32).min, np.iinfo(np.int32).max), - (np.iinfo(np.int16).min, np.iinfo(np.int16).max), - ) - elif data.dtype == np.float32: - data = np.interp( - data, - (-1, 1), - (np.iinfo(np.int16).min, np.iinfo(np.int16).max), - ) - x = np.array(data, dtype=outdtype) - file_len = x.shape[0] - if x.ndim == 1: - # force to be a mtx - x = np.reshape(x, (file_len, 1)) - elif file_extension in [".pcm", ".raw"]: - x = np.fromfile(filename, dtype=np.int16).astype(outdtype) - signal_len = len(x) // nchannels - try: - x = x.reshape(signal_len, nchannels) - except ValueError: - raise ValueError("Wrong number of channels") - else: - raise ValueError("Wrong input format. 
Use wav, pcm or raw") - - return x, fs - - -def write( - filename: Union[str, Path], - x: np.ndarray, - fs: Optional[int] = 48000, - dtype: Optional[str] = "int16", -) -> None: - """ - Write audio file (.pcm, .wav or .raw) - - Parameters - ---------- - filename: str - Output file path (.pcm, .wav or .raw) - x: np.ndarray - Numpy 2D array of dimension: number of channels x number of samples - fs: Optional[int] - Sampling rate, required for .pcm or .raw input file, default = 48000 (Hz) - dtype: Optional[str] - Data type format required for .pcm or .raw input file, default = 'int16' - - Returns - ------- - None - """ - - file_extension = Path(filename).suffix - - clipped_samples = np.sum( - np.logical_or(x < np.iinfo(np.int16).min, x > np.iinfo(np.int16).max) - ) - if clipped_samples > 0: - logger.warning(f" Warning: {clipped_samples} samples clipped") - x = np.clip(x, np.iinfo(np.int16).min, np.iinfo(np.int16).max) - - if file_extension == ".wav": - x = x.astype(np.int16) - wav.write(filename, fs, x) - elif file_extension == ".pcm" or file_extension == ".raw": - x = x.astype(dtype).reshape(-1, 1) - x.tofile(filename) - else: - raise ValueError("Wrong input format. 
Use wav, pcm or raw") - - -def concat( - in_filenames: list, - out_file: str, - silence_pre: Optional[int] = 0, - silence_post: Optional[int] = 0, - in_fs: Optional[int] = 48000, - num_channels: Optional[int] = None, - pad_noise: Optional[bool] = False, - preamble: Optional[int] = None, - pad_noise_preamble: Optional[bool] = False, -) -> list: - """ - Horizontally concatenates audio files into one long file - - Parameters - __________ - in_filenames: list - Input list of filenmames (.pcm, .raw or .wav) - out_file: str - Output multi-channel audio file name (.pcm, .raw or .wav) - silence_pre: int - Padded zeros before signal in samples - silence_post: int - Padded zeros after signal in samples - in_fs: Optional[int] - Input sampling rate, default 48000 Hz - pad_noise: Optional[bool] - If true noise will be padded otherwise zeros will be padded - - Returns - ------- - splits - List of sample indices to split the resulting file at - """ - - y = None - fs_compare = 0 - - # create a list of splits - splits = [0] - - # Read input files - for in_file in in_filenames: - x, fs = read(in_file, fs=in_fs, nchannels=num_channels) - if fs_compare and fs_compare != fs: - raise ValueError("Sampling rates of files to concatenate don't match") - else: - fs_compare = fs - - # pad with very low amplitude noise - x = trim( - x, in_fs, (-silence_pre, -silence_post), samples=True, pad_noise=pad_noise - ) - - # add the length to our splits list - splits.append(splits[-1] + x.shape[0]) - - # concatenate - y = np.concatenate([y, x]) if y is not None else x - - # add preamble - if preamble: - y = trim(y, in_fs, (-preamble, 0), pad_noise_preamble) - - write(out_file, y, fs=in_fs) - - return splits[1:] - - -def split( - in_filename: Union[str, Path], - out_folder: Union[str, Path], - split_filenames: list[Union[str, Path]], - splits: list[int], - in_fs: Optional[int] = 48000, - preamble: Optional[int] = 0, - loudness: Optional[float] = None, -) -> list[Union[str, Path]]: - """ - Horizontally 
splits audio files into multiple shorter files and applies windowing and scaling - - Parameters - __________ - in_filename: Union[str, Path] - Input filenmame (.pcm, .raw or .wav) - out_folder: Union[str, Path] - Output folder where to put the splits - split_filenames: list[Union[str, Path]] - List of names for the split files - splits: list[int] - List of sample indices where to cut the signal - in_fs: Optional[int] - Input sampling rate, default 48000 Hz - loudness: Optional[float] - Desired loudness of individual files - """ - - # create a list of output files - out_paths = [] - - # Read input file - x, fs = read(in_filename, fs=in_fs) - - # remove preamble - if preamble: - x = trim(x, fs, (preamble, 0)) - - split_old = 0 - for idx, split in enumerate(splits): - out_file = Path(out_folder) / Path(split_filenames[idx]).with_suffix( - in_filename.suffix - ) - - # add the path to our list - out_paths.append(out_file) - - # split - y = x[split_old:split, :] - - # windowing - y = window(y) - - # write file - write(out_file, y, fs=in_fs) - - split_old = split - - return out_paths - - -def combine( - in_filenames: list, - out_file: str, - in_fs: Optional[int] = 48000, -) -> None: - """ - Combines audio files into one multi-channel file - - Parameters - ---------- - in_filenames: list - Input list of filenmames (.pcm, .raw or .wav) - out_file: str - Output multi-channel audio file name (.pcm, .raw or .wav) - in_fs: Optional[int] - Input sampling rate, required for .pcm and .raw input file, default 48000 Hz - - Returns - ------- - None - """ - - y = None - fs_compare = 0 - - # Read input files - for in_file in in_filenames: - # assign correct channel - x, fs = read(in_file, fs=in_fs) - if fs_compare and fs_compare != in_fs: - raise ValueError("Sampling rates of files to combine don't match") - else: - fs_compare = fs - if y is None: - y = x - else: - if x.shape[0] > y.shape[0]: - x = x[: y.shape[0], :] - elif y.shape[0] > x.shape[0]: - y = y[: x.shape[0], :] - y = 
np.column_stack([y, x]) - - write(out_file, y, fs=in_fs) - - -def split_channels( - in_file: str, - out_filenames: list, - in_nchans: int, - in_fs: Optional[int] = 48000, -) -> None: - """ - Split multi-channel audio files into individual mono files - - Parameters - ---------- - in_file: str - Input file name (.pcm, .raw or .wav) - out_filenames: list - List of output file names (.pcm, .raw or .wav) - in_nchans: int - Input number of channels - in_fs: Optional[int] = 48000 - Input sampling rate, default 48000 Hz - - Returns - ------- - None - """ - - # validation - if in_nchans is None: - raise ValueError("Number of channels to split must be specified!") - if in_nchans != len(out_filenames): - print( - "Split: Mismatch between number of channels and output filenames length. Truncating output filenames list." - ) - out_filenames = out_filenames[:in_nchans] - - x, in_fs = read(in_file, nchannels=in_nchans, fs=in_fs) - - # Write output files - for idx, out_file in enumerate(out_filenames): - # extract correct channel - y = x[:, idx] - - write(out_file, y, fs=in_fs) - - -def parse_wave_header( - filename: str, -) -> dict: - """ - Get the format information from a WAV file. - Return a dictionary with the format information - - Parameters - ---------- - filename : string or open file handle - Input WAV file. 
- - Returns - ------- - Dictionary - """ - - with open(filename, "rb") as fid: - riff = fid.read(4) - - if riff == b"RIFF": - binary_format = "<" - elif riff == b"RIFX": - binary_format = ">" - else: - raise IOError("No RIFF chunk found!") - - wav_size = struct.unpack(f"{binary_format}I", fid.read(4))[0] - - wav_identifier = fid.read(4) - if wav_identifier != b"WAVE": - raise IOError("No WAVE chunk found!") - - fmt_chunk_id = fid.read(4) - - if fmt_chunk_id == b"fmt ": - fmt_size = struct.unpack(f"{binary_format}I", fid.read(4))[0] - wav_format = struct.unpack(f"{binary_format}H", fid.read(2))[0] - channels = struct.unpack(f"{binary_format}H", fid.read(2))[0] - fs = struct.unpack(f"{binary_format}I", fid.read(4))[0] - bytes_per_second = struct.unpack(f"{binary_format}I", fid.read(4))[0] - block_align = struct.unpack(f"{binary_format}H", fid.read(2))[0] - bit_depth = struct.unpack(f"{binary_format}H", fid.read(2))[0] - rem_bytes = fmt_size - 16 - ext_param_size = 0 - ext_param = None - if rem_bytes: - ext_param_size = struct.unpack(f"{binary_format}H", fid.read(2))[0] - - if ext_param_size: - ext_param = fid.read(ext_param_size) - else: - raise IOError("Missing or corrupt fmt chunk!") - - return { - "size": wav_size, - "format_tag": wav_format, - "channels": channels, - "fs": fs, - "bytes_per_second": bytes_per_second, - "block_align": block_align, - "bit_depth": bit_depth, - "ext_param_size": ext_param_size, - "ext_param": ext_param, - } diff --git a/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_FULL.mat b/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_FULL.mat deleted file mode 100644 index 42e702db0e30fa828427b5f5dc28f3615bf3dbe6..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_FULL.mat +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:a3ddecef64dfcf8887904b5cc370c0d9723bd8fd1637e32232205cdcd739b80d -size 12623190 diff --git a/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_LS.mat b/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_LS.mat deleted file mode 100644 index 1d590edb9369826d028846a346bb1b53abf9c64e..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/binaural_datasets/BRIR_IISofficialMPEG222UC_LS.mat +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e2c964b96d802532c0ecf1076092c7d246a54293a3a0c4c72995953c66bfec71 -size 6348499 diff --git a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA1.mat b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA1.mat deleted file mode 100644 index 4f59a8a9147c1fd346bc980ff67a7a35eea952b7..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA1.mat +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3a9ad5d8d874ac2fb851f5d2b0b303494f1d115612e9f6cab40e5eb33591b05c -size 4630 diff --git a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA2.mat b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA2.mat deleted file mode 100644 index 1ad2162acb5de9b451f1537d08a543e975c2abd8..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA2.mat +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6fc2a15579b80493597a8096bd815e8b847fe1880bdba760d4405122878b0b0a -size 10323 diff --git a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA3.mat b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA3.mat deleted file mode 100644 index 
0e7c3ef463fc067bc04b6bed4ba2c7d338066d67..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_Dolby_SBA3.mat +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:83822cfa090c345a6ece14d1ec1a92023626f467e2f8d982cf099c071dfc1080 -size 18229 diff --git a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_FULL.mat b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_FULL.mat deleted file mode 100644 index a2ab24e5125ad3e01323ae8f3e86f8b9419b5225..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_FULL.mat +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf86a03f0b13932c5c138af22584f864b75c5733df1b01ac3fdf7750a1bdbe5f -size 14335913 diff --git a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_LS.mat b/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_LS.mat deleted file mode 100644 index 65c2684c94cc6a51bce4ae0a25f528b959606672..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/binaural_datasets/HRIR_ORANGE53_LS.mat +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2e25ef101e9e72c5d70a55bc1451a07d041d29f96a803d7d3f968f20fe403316 -size 20190 diff --git a/item_generation_scripts/audiotools/binaural_datasets/README.txt b/item_generation_scripts/audiotools/binaural_datasets/README.txt deleted file mode 100644 index 9fd37c966abf95f652245ae9ff1ae8573754b570..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/binaural_datasets/README.txt +++ /dev/null @@ -1,34 +0,0 @@ -Files in this directory should contain impulse responses for use in rendering in Matlab .mat format -Samplingrate of 48kHz is assumed - -Files should adhere to the following naming scheme: - -{HRIR|BRIR}_{DATASETNAME}_{FULL|LS|SBA(1-3)}.mat - -- HRIR or 
BRIR - specifies the type of impulse response which will be used - for either BINAURAL or BINAURAL_ROOM output respectively -- DATASETNAME - specifies the name used with the binaural_dataset commandline argument - or YAML key to enable selection of this dataset -- FULL or LS or SBA3 - specifies the subset of impulse responses in the file: - FULL: all available measurements on the sphere - LS: superset of supported loudspeaker layouts - (see audiotools.constants.CHANNEL_BASED_AUDIO_FORMATS["LS""]) - SBA(1-3): impulse responses transformed to ambisonics by external conversion - if available SBA1 is used for FOA, SBA2 for HOA2 and SBA3 for HOA3 - if not available SBA3 is used and truncated for all Ambisonic formats - -Each Matlab file should contain the following variables: -- IR - Impulse responses with dimensions [ir_length x n_ears x n_channels] -- SourcePosition - array of {azimuth, elevation, radius} of dimensions [n_channels x 3] - required for FULL, optional otherwise -- latency_s - latency of the dataset in samples - optional, will be estimated if not provided - -LICENSES: -Please see HRIR.txt and BRIR.txt for license info \ No newline at end of file diff --git a/item_generation_scripts/audiotools/binaural_datasets/__init__.py b/item_generation_scripts/audiotools/binaural_datasets/__init__.py deleted file mode 100644 index aea270d8d1752e772ab716bc33be0bf7b8a0cf35..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/binaural_datasets/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# diff --git a/item_generation_scripts/audiotools/binaural_datasets/binaural_dataset.py b/item_generation_scripts/audiotools/binaural_datasets/binaural_dataset.py deleted file mode 100644 index e6c4dbe73458e5f7a129a289953d15ab909a69b4..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/binaural_datasets/binaural_dataset.py +++ /dev/null @@ -1,288 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. 
It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -import warnings -from pathlib import Path -from typing import Optional, Tuple, Union - -import numpy as np -from scipy.io import loadmat - -from item_generation_scripts.audiotools.audio import fromtype -from item_generation_scripts.audiotools.constants import ( - CHANNEL_BASED_AUDIO_FORMATS, - OBJECT_BASED_AUDIO_FORMATS, - SCENE_BASED_AUDIO_FORMATS, -) -from item_generation_scripts.audiotools.EFAP import wrap_angles - - -def load_hrtf( - filename: Union[str, Path], -) -> Tuple[np.ndarray, np.ndarray, int]: - """ - Read HRTFs from Matlab dictionary file mat - - Parameters - ---------- - filename: str - HRTFs file name (.mat) - - Returns - ------- - IR: np.ndarray - Array of impulse responses - SourcePosition: np.ndarray - Array of source positions corresponding to the impulse responses - latency_s: int - Latency in samples - """ - - if not filename.exists(): - raise FileNotFoundError( - f"File {filename.name} was not found in dataset folder!" 
- ) - - mat_contents = loadmat(filename) - - try: - IR = mat_contents["IR"] - except KeyError: - raise KeyError(f"Key 'IR' not found in .mat file: {filename} !") - - SourcePosition = mat_contents.get("SourcePosition") - latency_s = mat_contents.get("latency_s") - if latency_s is not None: - latency_s = latency_s.astype(np.int32)[0, 0] - - return IR, SourcePosition, latency_s - - -def load_ir( - in_fmt: str, - out_fmt: str, - dataset: Optional[str] = None, -) -> Tuple[np.ndarray, np.ndarray, int]: - """ - Load IRs for a specified rendering format - - Parameters - ---------- - in_fmt: str - Input format - out_fmt: str - Output format - dataset: Optional[str] - Name of desired dataset without prefix and suffix - - Returns - ------- - IR: np.ndarray - Array of impulse responses - SourcePosition: np.ndarray - Array of source positions corresponding to the impulse responses - latency_smp: int - Latency in samples - """ - - dataset_prefix = None - dataset_suffix = None - - if out_fmt.startswith("BINAURAL") and "ROOM" in out_fmt: - dataset_prefix = "BRIR" - if dataset is None: - dataset = "IISofficialMPEG222UC" - - if in_fmt.startswith("MOZART"): - dataset_suffix = "FULL" - elif in_fmt in CHANNEL_BASED_AUDIO_FORMATS.keys(): - dataset_suffix = "LS" - - elif out_fmt.startswith("BINAURAL"): - dataset_prefix = "HRIR" - if dataset is None: - dataset = "ORANGE53" - - if in_fmt in OBJECT_BASED_AUDIO_FORMATS.keys() or in_fmt.startswith( - "CUSTOM_LS" - ): - dataset_suffix = "FULL" - elif in_fmt in CHANNEL_BASED_AUDIO_FORMATS.keys() and in_fmt != "MONO": - dataset_suffix = "LS" - elif in_fmt in SCENE_BASED_AUDIO_FORMATS.keys(): - dataset = "ORANGE53_Dolby" - if in_fmt == "SBA1" or in_fmt == "FOA": - dataset_suffix = "SBA1" - # Use truncated SBA3 dataset if no SBA1 or 2 dataset exists - if not ( - Path(__file__).parent.joinpath( - f"{dataset_prefix}_{dataset}_{dataset_suffix}.mat" - ) - ).is_file(): - dataset_suffix = "SBA3" - warnings.warn("No SBA1 dataset found -> use truncated 
SBA3 dataset") - elif in_fmt.endswith("2"): - dataset_suffix = "SBA2" - # Use truncated SBA3 dataset if no SBA1 or 2 dataset exists - if not ( - Path(__file__).parent.joinpath( - f"{dataset_prefix}_{dataset}_{dataset_suffix}.mat" - ) - ).is_file(): - dataset_suffix = "SBA3" - warnings.warn("No SBA2 dataset found -> use truncated SBA3 dataset") - else: - dataset_suffix = "SBA3" - - path_dataset = Path(__file__).parent.joinpath( - f"{dataset_prefix}_{dataset}_{dataset_suffix}.mat" - ) - IR, SourcePosition, latency_s = load_hrtf(path_dataset) - - if latency_s is not None: - latency_smp = latency_s - else: - latency_smp = int(np.min(np.argmax(np.sum(np.abs(IR), axis=1), axis=0))) - warnings.warn( - f"No latency of HRTF dataset specified in {path_dataset} file -> computed latency: {latency_smp} sample(s)" - ) - - if in_fmt.startswith("STEREO"): - IR = IR[:, :, :2] # use L and R channels. - elif ( - in_fmt in CHANNEL_BASED_AUDIO_FORMATS.keys() - and not in_fmt.startswith("CUSTOM_LS") - and not in_fmt.startswith("MOZART") - ): - # extract positions from the loudspeaker file - in_fmt = fromtype(in_fmt) - tmp_fmt = fromtype("LS") - - IR_tmp = IR.copy() - IR = np.zeros([IR_tmp.shape[0], IR_tmp.shape[1], in_fmt.num_channels]) - - ir_index = 0 - for i in range(tmp_fmt.num_channels): - for j in range(in_fmt.num_channels): - if ( - tmp_fmt.ls_azi[i] == in_fmt.ls_azi[j] - and tmp_fmt.ls_ele[i] == in_fmt.ls_ele[j] - ): - if j != in_fmt.lfe_index[0]: - IR[:, :, ir_index] = IR_tmp[:, :, i] - ir_index += 1 - - return IR, SourcePosition, latency_smp - - -def find_ir( - SourcePosition: np.ndarray, - azi: float, - ele: float, - num_filter: Optional[int] = None, -) -> Tuple[np.ndarray, np.ndarray]: - """ - Find HRTF measurement closest to the selected direction - - Parameters - ---------- - SourcePosition: np.ndarray - Source IR positions - azi: float - Desired response azimuth - ele: float - Desired response elevation - num_filter: Optional[int] - Number of filters to return, if None 
return all - - Returns - ------- - i_dir: np.ndarray - Indices of nearest SourcePositions - dist_sort: np.ndarray - Distances corresponding to the indices - """ - - dist = dist_on_sphere(SourcePosition, azi, ele) - - if num_filter is None: - i_dir = np.argsort(dist) - dist_sort = np.sort(dist) - else: - i_dir = np.argsort(dist)[:num_filter] - dist_sort = np.sort(dist)[:num_filter] - - return i_dir, dist_sort - - -def dist_on_sphere( - positions: np.ndarray, - azi: float, - ele: float, -) -> np.ndarray: - """ - Compute great-circle distance - - Parameters - ---------- - positions: np.ndarray - Source IR positions - azi: float - Desired response azimuth - ele: float - Desired response elevation - - Returns - ------- - dist: np.ndarray - Distances from desired point - """ - - azi, ele = wrap_angles(azi, ele) - - delta_azi = np.deg2rad(np.abs(azi - positions[:, 0])) - - # compute great circle distance - a = np.sin(np.deg2rad(positions[:, 1])) * np.sin(np.deg2rad(ele)) + np.cos( - np.deg2rad(positions[:, 1]) - ) * np.cos(np.deg2rad(ele)) * np.cos(delta_azi) - if np.max(a) > 1.001 or np.min(a) < -1.001: - raise ValueError( - f"Absolute distance value larger than one! Min: {np.min(a)}, Max: {np.max(a)}" - ) - - # limiting to prevent errors in arccos due to numerical inaccuracies - a[a > 1] = 1 - a[a < -1] = -1 - dist = np.arccos(a) - - return dist diff --git a/item_generation_scripts/audiotools/binauralobjectrenderer.py b/item_generation_scripts/audiotools/binauralobjectrenderer.py deleted file mode 100644 index 548c49217863d329fa1f0647e74c2e5137ef6025..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/binauralobjectrenderer.py +++ /dev/null @@ -1,652 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -import itertools -from itertools import repeat -from typing import Optional, Tuple - -import numpy as np -from scipy.signal import convolve - -from item_generation_scripts.audiotools.binaural_datasets.binaural_dataset import ( - find_ir, -) -from item_generation_scripts.audiotools.constants import IVAS_FRAME_LEN_MS -from item_generation_scripts.audiotools.EFAP import wrap_angles -from item_generation_scripts.utils import apply_func_parallel - - -def barycentric_weights( - azi_deg: np.ndarray, - ele_deg: np.ndarray, - pos_in: np.ndarray, - interp_1d: Optional[bool] = False, -) -> Tuple[int, int, int]: - """ - Computation of spherical Barycentric weights - Implementation based on paper "Spherical Barycentric Coordinates" - from T. Langer, A. Belyaev und H. Seidel - - Parameters - ---------- - azi_deg: np.ndarray - Azimuthal coordinates of three points that form a triangle in degrees - ele_deg: np.ndarray - Elevation coordinates of three points that form a triangle in degrees - pos_in: np.ndarray - Azimuthal and elevation coordinates in degrees for point to compute weights - interp_1d: bool - 1d interpolation between two points - - Returns - ------- - W_1, W_2, W_3: scalar values - Barycentric weights for corresponding vertices - """ - - # check if point is equal to vertex - for k in range(3): - if azi_deg[k] == pos_in[0] and ele_deg[k] == pos_in[1]: - output = np.zeros(3) - output[k] = 1 - return tuple(output) - - pos = np.copy(pos_in) - - pos[0], pos[1] = wrap_angles(pos[0], pos[1]) - - # convert rad - ele = ( - -np.deg2rad(ele_deg, dtype="float64") + np.pi / 2 - ) # different definition of elevation in metadata - azi = np.deg2rad(azi_deg, dtype="float64") - pos[0] = np.deg2rad(pos[0]) - pos[1] = -np.deg2rad(pos[1]) + np.pi / 2 - - """ spherical barycentric coordinates """ - - # convert to cartesian coordinates - x = np.sin(ele) * np.cos(azi) - y = np.sin(ele) * np.sin(azi) - z = np.cos(ele) - pos_x = np.sin(pos[1]) * np.cos(pos[0]) - pos_y = np.sin(pos[1]) * 
np.sin(pos[0]) - pos_z = np.cos(pos[1]) - - pos_cart = np.array([pos_x, pos_y, pos_z]) - v_1 = np.array([x[0], y[0], z[0]]) - v_2 = np.array([x[1], y[1], z[1]]) - v_3 = np.array([x[2], y[2], z[2]]) - - # rotate coordinate system - unit = np.array([0, 0, 1]) - a = np.cross(pos_cart, unit) - b = np.dot(pos_cart, unit) - a_matrix = np.array([[0, -a[2], a[1]], [a[2], 0, -a[0]], [-a[1], a[0], 0]]) - if b == -1: - rot_matrix = np.eye(3, 3) # a and b point to opposite directions - else: - rot_matrix = np.eye(3, 3) + a_matrix + np.dot(a_matrix, a_matrix) / (1 + b) - - v_1 = rot_matrix @ v_1 - v_2 = rot_matrix @ v_2 - v_3 = rot_matrix @ v_3 - # test_vec = rot_matrix @ pos_cart # should be [0, 0, 1] - - # scale verticies to tangent plane - v_1_plane = v_1 / v_1[2] - v_2_plane = v_2 / v_2[2] - v_3_plane = v_3 / v_3[2] - eps = 10**-10 - - # compute planar barycentric coordinates - denom = (v_2_plane[1] - v_3_plane[1]) * (v_1_plane[0] - v_3_plane[0]) + ( - v_3_plane[0] - v_2_plane[0] - ) * (v_1_plane[1] - v_3_plane[1]) - # denom is proportional to area of triangle -> when area is zero, use linear 1d interpolation - if abs(denom) <= 10**-15: - interp_1d = True - - if not interp_1d: - W_1_plane = ( - (v_2_plane[1] - v_3_plane[1]) * (0 - v_3_plane[0]) - + (v_3_plane[0] - v_2_plane[0]) * (0 - v_3_plane[1]) - ) / (denom + eps) - W_2_plane = ( - (v_3_plane[1] - v_1_plane[1]) * (0 - v_3_plane[0]) - + (v_1_plane[0] - v_3_plane[0]) * (0 - v_3_plane[1]) - ) / (denom + eps) - W_3_plane = 1 - W_1_plane - W_2_plane - else: - v_diff = np.array( - [v_1_plane[:-1], v_2_plane[:-1], v_3_plane[:-1]] - ) # z entry always one - dist_all = np.linalg.norm(v_diff, axis=1) - v_diff_norm = np.divide(v_diff, dist_all[:, None]) - dot_v_ind = np.array( - [[0, 1], [1, 2], [2, 0]] - ) # the three possible combinations of points - # compute dot product between all vertices to find pairs that lie in opposite directions w.r.t. 
the point - # in this case the dot product is -1 (due to normalization) - dot = np.empty(3) - k = 0 - for ind_i, ind_j in dot_v_ind: - dot[k] = np.dot(v_diff_norm[ind_i], v_diff_norm[ind_j]) - k += 1 - - margin = 10**-5 - indices_minus_one = np.array(np.abs(dot + 1) < margin) - if indices_minus_one.any(): # test if one entry is -1 - v_ind = dot_v_ind[indices_minus_one] - # use vertex pair with smalles distance from origin (current position) - if np.shape(v_ind)[0] >= 2: - used_vertices = v_ind[ - np.argmin( - np.array([sum(dist_all[v_ind[0]]), sum(dist_all[v_ind[1]])]) - ) - ] - else: - used_vertices = v_ind[0] - dist = dist_all[used_vertices[0]] / sum(dist_all[used_vertices]) - if 0 in used_vertices and 1 in used_vertices: - W_1_plane = 1 - dist - W_2_plane = dist - W_3_plane = 0 - elif 1 in used_vertices and 2 in used_vertices: - W_1_plane = 0 - W_2_plane = 1 - dist - W_3_plane = dist - elif 2 in used_vertices and 0 in used_vertices: - W_1_plane = dist - W_2_plane = 0 - W_3_plane = 1 - dist - else: - raise ValueError("problem in 1d interpolation") - else: - # point does not lie on line spanned by two of the points - W_1_plane = -1 - W_2_plane = -1 - W_3_plane = -1 - - # compute spherical weights from planar weights - W_1 = W_1_plane * np.dot(v_1, v_1_plane) - W_2 = W_2_plane * np.dot(v_2, v_2_plane) - W_3 = W_3_plane * np.dot(v_3, v_3_plane) - - # avoid rejection of triangles due to numerical errors since point lies on edge of tiangle - threshold_error = -1 * 10**-8 - if threshold_error < W_1 < 0: - W_1 = 0 - if threshold_error < W_2 < 0: - W_2 = 0 - if threshold_error < W_3 < 0: - W_3 = 0 - - return W_1, W_2, W_3 - - -def get_tri_weights( - pos: np.ndarray, - SourcePosition: np.ndarray, -) -> Tuple[np.ndarray, np.ndarray]: - """ - Finds suitable triangle of data points on surface in which the defined point lies - - Parameters - ---------- - pos: np.ndarray - Point of interest given as [azimutahal, elevation] - SourcePosition: np.ndarray - Positions of the source 
in the measurements in IR - - Returns - ------- - combination_vertices: np.ndarray - Indices of the three vertices in SourcePosition - W: np.ndarray - Barycentric weights of point in triangle; - if negative, no suitable triangle was found - """ - - W_1, W_2, W_3 = -1, -1, -1 - index_triangle = 3 - # get indices of source positions sorted by distance on the plane from pos - index_vertices, _ = find_ir(SourcePosition, pos[0], pos[1]) - pos = np.array(wrap_angles(pos[0], pos[1])) - combination_vertices = None - while W_1 < 0 or W_2 < 0 or W_3 < 0: - if ( - SourcePosition[index_vertices[0], 0] == pos[0] - and SourcePosition[index_vertices[0], 1] == pos[1] - ): - # if position is position in data set take first triangle that incudes the point - combination_vertices = index_vertices[:3] - W_1, W_2, W_3 = (1, 0, 0) - break - index_HRIR = index_vertices[:index_triangle] # get nearest positions - y_ele_all = SourcePosition[index_HRIR, 1] - if pos[1] > np.max(y_ele_all) or pos[1] < np.min(y_ele_all): - # no need to compute weights since all possible triangles lie completely above or below point - # attention: this can be problematic if no point is available at [0, +-90] - pass - else: - # test all triangle combinations with new point - for combination_vertices_tmp in itertools.combinations(index_HRIR[:-1], 2): - combination_vertices = np.concatenate( - (index_HRIR[-1, None], combination_vertices_tmp), axis=0 - ) - - x_azi = SourcePosition[combination_vertices, 0] - y_ele = SourcePosition[combination_vertices, 1] - W_1, W_2, W_3 = barycentric_weights(x_azi, y_ele, pos) - if W_1 >= 0 and W_2 >= 0 and W_3 >= 0: - # found suitable triangle - break - index_triangle += 1 - if index_triangle > 30: - # stop after too many iterations - return np.array(combination_vertices), np.array([-1, -1, -1]) - - W = np.array([W_1, W_2, W_3]) - return np.array(combination_vertices), W - - -def interpolate_2d( - azi_in: np.ndarray, - ele_in: np.ndarray, - values: np.ndarray, - pos: np.ndarray, - 
interp_1d: Optional[bool] = False, - weights: Optional[np.ndarray] = None, - ghost: Optional[list[bool]] = None, - SourcePosition: Optional[np.ndarray] = None, - IR: Optional[np.ndarray] = None, - phase: Optional[bool] = False, -) -> np.ndarray: - """ - Compute HRIR for point on surface spanned by three points via barycentric coordinates - - Parameters - ---------- - azi_in: np.ndarray - Azimuthal coordinates of three points that form a triangle in degrees - ele_in: np.ndarray - Elevation coordinates of three points that form a triangle in degrees - values: np.ndarray - Values to interpolate, here either HRIRs or magnitude or phase of HRTFs - pos: np.ndarray - Position of desired interpolation value - interp_1d: bool - 1d interpolation between two points - weights: tuple - If barycentric weights are already known these values are used - ghost: list of bool - If north and/or south pole is ghost source - SourcePosition: np.ndarray - Only necessary if at least one element in ghost is true - IR: np.ndarray - Only necessary if at least one element in ghost is true - phase: bool - If interpolated values are phases and should be wrapped - - Returns - ------- - HRIR: np.ndarray - Interpolated value at point pos - """ - - if ghost is None: - ghost = [False, False] - - if weights is None: - W_1, W_2, W_3 = barycentric_weights( - azi_in, ele_in, pos, interp_1d - ) # compute barycentric weights - else: - (W_1, W_2, W_3) = weights - - if ( - W_1 + W_2 + W_3 > 1.5 - ): # on sphere sum of weights is not necessarily equal to one! - raise ValueError( - f"Sum of positive barycentric weights larger than expected: {W_1 +W_2 +W_3}" - ) - - threshold_error = -1 * 10**-10 - if W_1 < threshold_error or W_2 < threshold_error or W_3 < threshold_error: - raise ValueError("Point lies outside of triangle! 
No interpolation possible") - - # do some phase unwrapping - if phase: - values = np.unwrap(values, axis=1) - - # treat potential ghost sources at the north and south pole - if (ghost[0] and 90 in ele_in) or (ghost[1] and -90 in ele_in): - if SourcePosition is None or IR is None: - raise ValueError( - "Source positions and IRs are required in interpolation if ghost source is used" - ) - ele_ghost = [] - additional_term = 0 - weights_copy = np.copy(weights) - if ghost[0] and 90 in ele_in: - ele_ghost.append(90) - if ghost[1] and -90 in ele_in: - ele_ghost.append(-90) - for ele_g in ele_ghost: - ind_dist, dist = find_ir(SourcePosition[: -len(ele_ghost)], 0, ele_g) - ind_dist = ind_dist[dist == dist[0]] - weight_spread = weights_copy[ele_in == ele_g] / len(ind_dist) - weights_copy[ele_in == ele_g] = 0 - additional_term += np.sum(IR[:, ind_dist], axis=1) * weight_spread - - HRIR = ( - values[:, 0] * W_1 - + values[:, 1] * W_2 - + values[:, 2] * W_3 - + additional_term - ) - - else: - HRIR = ( - values[:, 0] * W_1 + values[:, 1] * W_2 + values[:, 2] * W_3 - ) # apply weights - - return HRIR - - -def add_ghost_speaker_bary( - SourcePosition: np.ndarray, - IR: np.ndarray, -) -> Tuple[list[bool], np.ndarray, np.ndarray]: - """ - Adds a ghost speaker at the poles if necessary and indicates result by bool values - - Parameters - ---------- - SourcePosition: np.ndarray - All source positions - IR: np.ndarray - IRs at corresponding source positions - - Returns - ------- - ghost_pos: list of bool - If entry is True a ghost speaker is introduced at the north or south pole, respectively - SourcePosition: np.ndarray - All source positions plus poles if ghost_pos is True - IR: np.ndarray - IRs at corresponding source positions - """ - - ghost_pos = [False, False] - if 90 not in SourcePosition[:, 1]: - # if north pole is not in dataset add it - ghost_pos[0] = True - pole = np.array([0, 90, 1]) - SourcePosition = np.concatenate((SourcePosition, pole[None, :]), axis=0) - IR = 
np.concatenate((IR, np.zeros((*np.shape(IR)[:2], 1))), axis=2) - if -90 not in SourcePosition[:, 1]: - # if south pole is not in dataset add it - ghost_pos[1] = True - pole = np.array([0, -90, 1]) - SourcePosition = np.concatenate((SourcePosition, pole[None, :]), axis=0) - IR = np.concatenate((IR, np.zeros((*np.shape(IR)[:2], 1))), axis=2) - - return ghost_pos, SourcePosition, IR - - -def binaural_fftconv_framewise( - x: np.ndarray, - IR: np.ndarray, - SourcePosition: np.ndarray, - azi: Optional[np.ndarray] = None, - ele: Optional[np.ndarray] = None, - frame_len: Optional[int] = (IVAS_FRAME_LEN_MS // 4) * 48, -) -> np.ndarray: - """ - Binauralization using fft convolution with frame-wise processing - supports rotation on trajectories with interpolation between measured Source - positions, reimplemented roughly along the lines of ConvBinauralRenderer.m - - Parameters - ---------- - x: np.ndarray - Input multi-channel array - IR: np.ndarray - HRIRs array - SourcePosition: np.ndarray - Positions of the source in the measurements in IR - azi: np.ndarray - Azimuth angles for all frames - ele: np.ndarray - Elevation angles for all frames - frame_len: int - Frame length, optional, default = (IVAS_FRAME_LEN_MS // 4) * 48000 - - Returns - ------- - y: np.ndarray - Output binaural signal array - """ - - sig_len = x.shape[0] - N_frames = int( - sig_len / frame_len - ) # TODO add ceil function for non-integer frame length multiples - num_points_interp = 3 # interpolation in triangle - - N_HRIR_taps = IR.shape[0] - - if azi is None or ele is None: - azi = np.repeat([0.0], N_frames) - ele = np.repeat([0.0], N_frames) - elif len(azi) < N_frames or len(ele) < N_frames: - azi = np.concatenate( - [np.repeat(azi, N_frames // len(azi)), azi[: N_frames % len(azi)]] - ) - ele = np.concatenate( - [np.repeat(ele, N_frames // len(ele)), ele[: N_frames % len(ele)]] - ) - - indices_HRIR = np.empty([N_frames, num_points_interp], dtype=int) - IR_2d = np.empty((N_frames, N_HRIR_taps, 2, 
num_points_interp)) - Bary_weights = np.empty((N_frames, 3)) - - # find three points to form a triangle for interpolation - # test if point lies within triangle spanned by these points by checking the signas of barycentric coordinates - # if all weights are >= 0 the point lies within the triangle - for index in range(np.shape(SourcePosition)[0]): - SourcePosition[index, 0:2] = np.array( - wrap_angles(SourcePosition[index, 0], SourcePosition[index, 1]) - ) - - # add ghost speaker to poles if necessary - ghost_pos, SourcePosition, IR = add_ghost_speaker_bary(SourcePosition, IR) - for i_frame in range(N_frames): - if ( - i_frame - and azi[i_frame] == azi[i_frame - 1] - and ele[i_frame] == ele[i_frame - 1] - ): - IR_2d[i_frame] = IR_2d[i_frame - 1] - indices_HRIR[i_frame] = indices_HRIR[i_frame - 1] - Bary_weights[i_frame] = Bary_weights[i_frame - 1] - continue - pos = np.array([azi[i_frame], ele[i_frame]]) - combination_vertices, W = get_tri_weights(pos, SourcePosition) - if (W < 0).all(): - raise ValueError("No suitable triangle found in frame " + str(i_frame)) - IR_2d[i_frame] = IR[:, :, np.array(combination_vertices)] - indices_HRIR[i_frame] = combination_vertices - Bary_weights[i_frame] = W - - T_rev = frame_len + N_HRIR_taps - 1 - N_rev = int(np.ceil(T_rev / frame_len)) - - fade_in = np.arange(frame_len) / (frame_len - 1) - fade_in = fade_in[:, np.newaxis] - fade_out = 1.0 - fade_in - - # compute both ears in parallel - i_ear = list(range(2)) - result = apply_func_parallel( - render_ear, - zip( - i_ear, - repeat(frame_len), - repeat(N_frames), - repeat(N_rev), - repeat(T_rev), - repeat(fade_in), - repeat(fade_out), - repeat(x), - repeat(sig_len), - repeat(N_HRIR_taps), - repeat(azi), - repeat(ele), - repeat(SourcePosition), - repeat(IR_2d), - repeat(Bary_weights), - repeat(ghost_pos), - repeat(IR), - repeat(indices_HRIR), - ), - None, - "mp", - False, - ) - - y = np.stack(result, axis=1) - - return y[0:sig_len] - - -def render_ear( - i_ear, - frame_len, - 
N_frames, - N_rev, - T_rev, - fade_in, - fade_out, - x, - sig_len, - N_HRIR_taps, - azi, - ele, - SourcePosition, - IR_2d, - Bary_weights, - ghost_pos, - IR, - indices_HRIR, -) -> np.ndarray: - # function to process one ear used in multiprocessing - G = np.empty((N_frames, N_HRIR_taps)) - - for frame in range(N_frames): - pos = np.array([azi[frame], ele[frame]]) - # Interpolation of time-domain signals - G[frame] = interpolate_2d( - SourcePosition[indices_HRIR[frame], 0], - SourcePosition[indices_HRIR[frame], 1], - IR_2d[frame, :, i_ear], - pos, - weights=Bary_weights[frame], - ghost=ghost_pos, - SourcePosition=SourcePosition, - IR=IR[:, i_ear], - ) - - # frame wise parallel computation slow (many frames, small computational load per frame) - i_frame = list(range(N_frames)) - result = apply_func_parallel( - convolve_frame, - zip( - i_frame, - repeat(frame_len), - repeat(N_frames), - repeat(N_rev), - repeat(T_rev), - repeat(i_ear), - repeat(fade_in), - repeat(fade_out), - repeat(G), - repeat(x), - repeat(sig_len), - repeat(N_HRIR_taps), - ), - None, - "mt", - False, - ) - - return np.hstack(result) - - -def convolve_frame( - i_frame, - frame_len, - N_frames, - N_rev, - T_rev, - i_ear, - fade_in, - fade_out, - G, - x, - sig_len, - N_HRIR_taps, -) -> np.ndarray: - # function to process one frame used in multiprocessing - i1 = i_frame * frame_len - i2 = (i_frame + 1) * frame_len - - y0 = np.zeros([2, sig_len + N_HRIR_taps - 1, 2]) - - G0 = G[i_frame] - G1 = G[min(i_frame + 1, N_frames - 1)] - - for j_frame in range(max(0, i_frame - N_rev), min(i_frame + 1, N_frames)): - j1 = j_frame * frame_len - j2 = (j_frame + 1) * frame_len - j2p = j1 + T_rev - - y0[0, j1:j2p, i_ear] += convolve(np.squeeze(x[j1:j2]), G0) - y0[1, j1:j2p, i_ear] += convolve(np.squeeze(x[j1:j2]), G1) - - y_frame = ( - np.squeeze(fade_out) * y0[0, i1:i2, i_ear] - + np.squeeze(fade_in) * y0[1, i1:i2, i_ear] - ) - return y_frame diff --git a/item_generation_scripts/audiotools/constants.py 
b/item_generation_scripts/audiotools/constants.py deleted file mode 100644 index c3af9d29952ea4069cd51060e1d5e9c2191353a0..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/constants.py +++ /dev/null @@ -1,704 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. 
All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -import numpy as np - -BINAURAL_AUDIO_FORMATS = { - "BINAURAL": { - "num_channels": 2, - }, - "BINAURAL_ROOM": { - "num_channels": 2, - }, -} - -BINAURAL_LFE_GAIN = 10 ** (5.5 / 20) - -LFE_INDEX_DEFAULT = 3 - -LS_AZI_MONO = [0] -LS_ELE_MONO = [0] - -LS_AZI_STEREO = [30, -30] -LS_ELE_STEREO = [0, 0] - -LS_AZI_CICP6 = [30, -30, 0, 0, 110, -110] -LS_ELE_CICP6 = [0, 0, 0, 0, 0, 0] - -LS_AZI_CICP12 = [30, -30, 0, 0, 110, -110, 135, -135] -LS_ELE_CICP12 = [0, 0, 0, 0, 0, 0, 0, 0] - -LS_AZI_CICP14 = [30, -30, 0, 0, 110, -110, 30, -30] -LS_ELE_CICP14 = [0, 0, 0, 0, 0, 0, 35, 35] - -LS_AZI_CICP16 = [30, -30, 0, 0, 110, -110, 30, -30, 110, -110] -LS_ELE_CICP16 = [0, 0, 0, 0, 0, 0, 35, 35, 35, 35] - -LS_AZI_CICP19 = [30, -30, 0, 0, 135, -135, 90, -90, 30, -30, 135, -135] -LS_ELE_CICP19 = [0, 0, 0, 0, 0, 0, 0, 0, 35, 35, 35, 35] - - -CHANNEL_BASED_AUDIO_FORMATS = { - "MONO": { - "num_channels": 1, - "ls_azi": LS_AZI_MONO, - "ls_ele": LS_ELE_MONO, - "lfe_index": [], - }, - "STEREO": { - "num_channels": 2, - "ls_azi": LS_AZI_STEREO, - "ls_ele": LS_ELE_STEREO, - "lfe_index": [], - }, - "5_1": { - "num_channels": 6, - "ls_azi": LS_AZI_CICP6, - "ls_ele": LS_ELE_CICP6, - "lfe_index": [LFE_INDEX_DEFAULT], - }, - "5_1_2": { - "num_channels": 8, - "ls_azi": LS_AZI_CICP14, - "ls_ele": LS_ELE_CICP14, - "lfe_index": [LFE_INDEX_DEFAULT], - }, - "5_1_4": { - "num_channels": 10, - "ls_azi": LS_AZI_CICP16, - "ls_ele": LS_ELE_CICP16, - "lfe_index": [LFE_INDEX_DEFAULT], - 
}, - "7_1": { - "num_channels": 8, - "ls_azi": LS_AZI_CICP12, - "ls_ele": LS_ELE_CICP12, - "lfe_index": [LFE_INDEX_DEFAULT], - }, - "7_1_4": { - "num_channels": 12, - "ls_azi": LS_AZI_CICP19, - "ls_ele": LS_ELE_CICP19, - "lfe_index": [LFE_INDEX_DEFAULT], - }, - "LS": { - "num_channels": 15, - "ls_azi": [ - 30, - -30, - 0, - 135, - -135, - 110, - -110, - 90, - -90, - 30, - -30, - 110, - -110, - 135, - -135, - ], - "ls_ele": [0, 0, 0, 0, 0, 0, 0, 0, 0, 35, 35, 35, 35, 35, 35], - "lfe_index": [], - }, - "MOZART": { - "num_channels": 30, - "ls_azi": [ - 0, - 0, - 135, - -135, - 30, - -30, - 180, - 0, - 90, - -90, - 45, - -45, - 0, - 0, - 135, - -135, - 90, - -90, - 180, - 0, - 45, - -45, - 60, - -60, - 110, - -110, - 30, - -30, - 110, - -110, - ], - "ls_ele": [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 35, - 35, - 35, - 90, - 35, - 35, - 35, - 35, - 35, - -15, - -15, - -15, - 0, - 0, - 0, - 0, - 35, - 35, - 35, - 35, - ], - "lfe_index": [1, 7], - }, - "CUSTOM_LS": { - "num_channels": -1, - "ls_azi": None, - "ls_ele": None, - "lfe_index": None, - }, -} - -# Support a variety of names for multichannel configs -CHANNEL_BASED_AUDIO_ALTNAMES = { - # 5_1 - 51: "5_1", # YAML by default will interpret underscore delimited numbers as integers, similar to python - "5d1": "5_1", - "5.1": "5_1", - "CICP6": "5_1", - # 7_1 - 71: "7_1", - "7d1": "7_1", - "7.1": "7_1", - "CICP12": "7_1", - # 5_1_2 - 512: "5_1_2", - "5d1p2": "5_1_2", - "5.1+2": "5_1_2", - "5.1.2": "5_1_2", - "CICP14": "5_1_2", - # 5_1_4 - 514: "5_1_4", - "5d1p4": "5_1_4", - "5.1+4": "5_1_4", - "5.1.4": "5_1_4", - "CICP16": "5_1_4", - # 7_1_4 - 714: "7_1_4", - "7d1p4": "7_1_4", - "7.1+4": "7_1_4", - "7.1.4": "7_1_4", - "CICP19": "7_1_4", -} - -METADATA_ASSISTED_SPATIAL_AUDIO_FORMATS = { - "MASA1": { - "num_channels": 1, - }, - "MASA2": { - "num_channels": 2, - }, -} -OBJECT_BASED_AUDIO_FORMATS = { - "ISM1": { - "num_channels": 1, - }, - "ISM2": { - "num_channels": 2, - }, - "ISM3": { - "num_channels": 3, - }, 
- "ISM4": { - "num_channels": 4, - }, -} - - -SCENE_BASED_AUDIO_FORMATS = { - "FOA": { - "num_channels": 4, - "is_planar": False, - }, - "HOA2": { - "num_channels": 9, - "is_planar": False, - }, - "HOA3": { - "num_channels": 16, - "is_planar": False, - }, - "PLANARFOA": { - "num_channels": 4, - "is_planar": True, - }, - "PLANARHOA2": { - "num_channels": 9, - "is_planar": True, - }, - "PLANARHOA3": { - "num_channels": 16, - "is_planar": True, - }, - "SBA1": { - "num_channels": 4, - "is_planar": False, - }, - "SBA2": { - "num_channels": 9, - "is_planar": False, - }, - "SBA3": { - "num_channels": 16, - "is_planar": False, - }, -} - -SCENE_METADATA_FORMATS = {"META"} - -AUDIO_FORMATS = [ - BINAURAL_AUDIO_FORMATS, - CHANNEL_BASED_AUDIO_FORMATS, - METADATA_ASSISTED_SPATIAL_AUDIO_FORMATS, - OBJECT_BASED_AUDIO_FORMATS, - SCENE_BASED_AUDIO_FORMATS, -] - - -IVAS_FRAME_LEN_MS = 20 - -IVAS_CICPX_TO_MONO = np.array( - [ - [ - 1, - 1, - 1, - 1, - 0.79999995, - 0.79999995, - 0.79999995, - 0.79999995, - 0.849999964, - 0.849999964, - 0.849999964, - 0.849999964, - ] - ] -).T - -IVAS_CICPX_TO_STEREO = np.array( - [ - [1, 0], - [0, 1], - [np.sqrt(0.5), np.sqrt(0.5)], - [np.sqrt(0.5), np.sqrt(0.5)], - [0.79999995, 0], - [0, 0.79999995], - [0.79999995, 0], - [0, 0.79999995], - [0.849999964, 0], - [0, 0.849999964], - [0.849999964, 0], - [0, 0.849999964], - ] -) - -# downmix matrices -IVAS_CICP12_TO_6 = np.zeros(8 * 6) -IVAS_CICP12_TO_6[[0, 7, 14, 21, 28, 35, 40, 47]] = 1 -IVAS_CICP12_TO_6 = IVAS_CICP12_TO_6.reshape(8, 6) - -IVAS_CICP14_TO_6 = np.zeros(8 * 6) -IVAS_CICP14_TO_6[[0, 7, 14, 21, 28, 35]] = 1 -IVAS_CICP14_TO_6[[36, 43]] = 0.849999964 -IVAS_CICP14_TO_6 = IVAS_CICP14_TO_6.reshape(8, 6) - -IVAS_CICP16_TO_6 = np.zeros(10 * 6) -IVAS_CICP16_TO_6[[0, 7, 14, 21, 28, 35]] = 1 -IVAS_CICP16_TO_6[[36, 43, 52, 59]] = 0.849999964 -IVAS_CICP16_TO_6 = IVAS_CICP16_TO_6.reshape(10, 6) - -IVAS_CICP16_TO_12 = np.zeros(10 * 8) -IVAS_CICP16_TO_12[[0, 9, 18, 27, 36, 45]] = 1 -IVAS_CICP16_TO_12[[48, 
57, 68, 77]] = 0.849999964 -IVAS_CICP16_TO_12 = IVAS_CICP16_TO_12.reshape(10, 8) - -IVAS_CICP16_TO_14 = np.zeros(10 * 8) -IVAS_CICP16_TO_14[[0, 9, 18, 27, 36, 45, 54, 63]] = 1 -IVAS_CICP16_TO_14[[68, 77]] = 0.849999964 -IVAS_CICP16_TO_14 = IVAS_CICP16_TO_14.reshape(10, 8) - -IVAS_CICP19_TO_6 = np.zeros(12 * 6) -IVAS_CICP19_TO_6[[0, 7, 14, 21, 28, 35]] = 1 -IVAS_CICP19_TO_6[[36, 43]] = 0.367322683 -IVAS_CICP19_TO_6[[48, 55, 64, 71]] = 0.849999964 -IVAS_CICP19_TO_6[[40, 47]] = 0.930093586 -IVAS_CICP19_TO_6 = IVAS_CICP19_TO_6.reshape(12, 6) - -IVAS_CICP19_TO_12 = np.zeros(12 * 8) -IVAS_CICP19_TO_12[[0, 9, 18, 27, 38, 47]] = 1 -IVAS_CICP19_TO_12[[48, 57]] = 0.367322683 -IVAS_CICP19_TO_12[[64, 73, 84, 93]] = 0.849999964 -IVAS_CICP19_TO_12[[52, 61]] = 0.930093586 -IVAS_CICP19_TO_12 = IVAS_CICP19_TO_12.reshape(12, 8) - -IVAS_CICP19_TO_14 = np.zeros(12 * 8) -IVAS_CICP19_TO_14[[0, 9, 18, 27, 36, 45, 70, 79]] = 1 -IVAS_CICP19_TO_14[[48, 57]] = 0.367322683 -IVAS_CICP19_TO_14[[84, 93]] = 0.849999964 -IVAS_CICP19_TO_14[[52, 61]] = 0.930093586 -IVAS_CICP19_TO_14 = IVAS_CICP19_TO_14.reshape(12, 8) - -IVAS_CICP19_TO_16 = np.zeros(12 * 10) -IVAS_CICP19_TO_16[[0, 11, 22, 33, 44, 55, 86, 97, 108, 119]] = 1 -IVAS_CICP19_TO_16[[60, 71]] = 0.367322683 -IVAS_CICP19_TO_16[[64, 75]] = 0.930093586 -IVAS_CICP19_TO_16 = IVAS_CICP19_TO_16.reshape(12, 10) - -# upmix matrices -IVAS_MONO_TO_CICPX = np.zeros([1, 12]) -IVAS_MONO_TO_CICPX[0, 2] = 1 - -IVAS_STEREO_TO_CICPX = np.zeros([2, 12]) -IVAS_STEREO_TO_CICPX[0, 0] = 1 -IVAS_STEREO_TO_CICPX[1, 1] = 1 - -IVAS_CICP12_TO_14 = np.zeros(8 * 8) -IVAS_CICP12_TO_14[[0, 9, 18, 27, 36, 45, 52, 61]] = 1 -IVAS_CICP12_TO_14 = IVAS_CICP12_TO_14.reshape(8, 8) - -IVAS_CICP12_TO_16 = np.zeros(8 * 10) -IVAS_CICP12_TO_16[[0, 11, 22, 33, 44, 55, 64, 75]] = 1 -IVAS_CICP12_TO_16 = IVAS_CICP12_TO_16.reshape(8, 10) - -IVAS_CICP12_TO_19 = np.zeros(8 * 12) -IVAS_CICP12_TO_19[[0, 13, 26, 39, 54, 67, 76, 89]] = 1 -IVAS_CICP12_TO_19 = IVAS_CICP12_TO_19.reshape(8, 12) - 
-IVAS_CICP14_TO_19 = np.zeros(8 * 12) -IVAS_CICP14_TO_19[[0, 13, 26, 39, 52, 65, 80, 93]] = 1 -IVAS_CICP14_TO_19 = IVAS_CICP14_TO_19.reshape(8, 12) - -IVAS_CICP16_TO_19 = np.zeros(10 * 12) -IVAS_CICP16_TO_19[[0, 13, 26, 39, 52, 65, 80, 93, 106, 119]] = 1 -IVAS_CICP16_TO_19 = IVAS_CICP16_TO_19.reshape(10, 12) - -# mapping dict -IVAS_MC_CONVERSION = { - "MONO": { - # upmix - "5_1": IVAS_MONO_TO_CICPX[:, :6], - "7_1": IVAS_MONO_TO_CICPX[:, :8], - "5_1_2": IVAS_MONO_TO_CICPX[:, :8], - "5_1_4": IVAS_MONO_TO_CICPX[:, :10], - "7_1_4": IVAS_MONO_TO_CICPX[:, :12], - }, - "STEREO": { - # upmix - "5_1": IVAS_STEREO_TO_CICPX[:, :6], - "7_1": IVAS_STEREO_TO_CICPX[:, :8], - "5_1_2": IVAS_STEREO_TO_CICPX[:, :8], - "5_1_4": IVAS_STEREO_TO_CICPX[:, :10], - "7_1_4": IVAS_STEREO_TO_CICPX[:, :12], - }, - "5_1": { - # downmix - "MONO": IVAS_CICPX_TO_MONO[:6, :], - "STEREO": IVAS_CICPX_TO_STEREO[:6, :], - # upmix - "7_1": np.pad(np.eye(6), [[0, 0], [0, 2]]), - "5_1_2": np.pad(np.eye(6), [[0, 0], [0, 2]]), - "5_1_4": np.pad(np.eye(6), [[0, 0], [0, 4]]), - "7_1_4": np.pad(np.eye(6), [[0, 0], [0, 6]]), - }, - "7_1": { - # downmix - "MONO": IVAS_CICPX_TO_MONO[:8, :], - "STEREO": IVAS_CICPX_TO_STEREO[:8, :], - "5_1": IVAS_CICP12_TO_6, - # upmix - "5_1_2": IVAS_CICP12_TO_14, - "5_1_4": IVAS_CICP12_TO_16, - "7_1_4": IVAS_CICP12_TO_19, - }, - "5_1_2": { - # downmix - "MONO": np.vstack([IVAS_CICPX_TO_MONO[:6, :], IVAS_CICPX_TO_MONO[-2:, :]]), - "STEREO": np.vstack( - [IVAS_CICPX_TO_STEREO[:6, :], IVAS_CICPX_TO_STEREO[-2:, :]] - ), - "5_1": IVAS_CICP14_TO_6, - "7_1": np.pad(IVAS_CICP14_TO_6, [[0, 0], [0, 2]]), - # upmix - "5_1_4": np.pad(np.eye(8), [[0, 0], [0, 2]]), - "7_1_4": IVAS_CICP14_TO_19, - }, - "5_1_4": { - # downmix - "MONO": np.vstack([IVAS_CICPX_TO_MONO[:6, :], IVAS_CICPX_TO_MONO[-4:, :]]), - "STEREO": np.vstack( - [IVAS_CICPX_TO_STEREO[:6, :], IVAS_CICPX_TO_STEREO[-4:, :]] - ), - "5_1": IVAS_CICP16_TO_6, - "7_1": IVAS_CICP16_TO_12, - "5_1_2": IVAS_CICP16_TO_14, - # upmix - "7_1_4": 
IVAS_CICP16_TO_19, - }, - "7_1_4": { - # downmix - "MONO": IVAS_CICPX_TO_MONO, - "STEREO": IVAS_CICPX_TO_STEREO, - "5_1": IVAS_CICP19_TO_6, - "7_1": IVAS_CICP19_TO_12, - "5_1_2": IVAS_CICP19_TO_14, - "5_1_4": IVAS_CICP19_TO_16, - }, -} - -# LFE 120 Hz LPF filter coefficients -IVAS_LPF_4_BUTTER_48K_SOS = np.array( - [ - [ - 5.12617881476274e-09, - 1.02523584294987e-08, - 5.12617879059970e-09, - 1, - -1.96875982668433, - 0.969044914826862, - ], - [ - 1, - 1.99999984394358, - 1.00000000471366, - 1, - -1.98677297369091, - 0.987060670205863, - ], - ] -) - -T_DESIGN_11_AZI = np.array( - [ - 132.927291884332, - -83.9349499672527, - 8.47410038634525, - -113.340833834572, - -103.265909909537, - -33.2370360923825, - 21.8564347471830, - -156.539486489880, - -64.2647531387317, - 165.779530068738, - -25.2028339893249, - -97.0037973959711, - 27.8546391256925, - 153.214218975132, - -155.061608694663, - -11.8421354925543, - 80.5387312016125, - -42.0561606270165, - -31.2233262205060, - 38.8379041944063, - 93.7606877469492, - -84.7560200078398, - 7.75536818082863, - -122.276883381108, - 46.8012705252113, - -24.7686335284573, - 99.8904719062334, - -134.783996960185, - -83.0880230164493, - 60.1281736000420, - 152.644656278084, - 29.7576658909417, - 40.7793187974476, - 110.183927562412, - 165.652065916454, - -12.9926632105736, - 79.7359893585681, - -50.5245271190884, - 118.923930267733, - 47.2202861862577, - 171.925276523721, - -62.5145800558502, - -11.1156697680531, - 132.018041099963, - -135.355486412425, - 102.370921576708, - 112.739282398012, - -178.304963670831, - -122.319932198534, - 59.0763464570905, - 151.704200334501, - 21.3763364190503, - -169.005476417779, - 118.980811786769, - -116.089295979010, - 9.64767870353308, - 60.8933243657771, - -156.021526862757, - -63.4602993325163, - 174.929787427393, - -175.288768596346, - -105.951907934032, - -50.1928304519800, - 131.358266702971, - -136.296815007542, - 93.5644603506407, - -97.0840116473627, - -169.158278888619, - 
-44.1323835471345, - 81.4795403841382, - ] -) - -T_DESIGN_11_ELE = np.array( - [ - 7.69254738757899, - -23.7300652200871, - 23.5127556185301, - 70.4225940747938, - -9.89694439538752, - -70.7513316063095, - -26.4618527647561, - 47.7764936689044, - -7.72047049524459, - 44.5343602375216, - 26.3897904767450, - -44.6578850137166, - 9.76703456924600, - -47.7053318175498, - 7.45302934155972, - -23.5901209534773, - 23.7194484034707, - 70.4382693912270, - -9.83541588740259, - -70.4980825105727, - -26.2949218109204, - 47.6148028805222, - -7.51718499746626, - 44.2862347125773, - 26.6442619674660, - -44.5693707254340, - 9.91271928508000, - -47.9599550372574, - 7.29679922953795, - -23.3445981426306, - 23.6415261666079, - 70.6843143997832, - -9.58140351749889, - -70.3934534122902, - -26.4258159091605, - 47.7510668062369, - -7.30853603036844, - 44.2632768570349, - 26.7140614474957, - -44.3149733480527, - 9.75899721561506, - -48.0361913333593, - 7.43965099805872, - -23.3326075548841, - 23.3868959687598, - 70.8219078016791, - -9.48596399169388, - -70.5801867828491, - -26.6740262349265, - 47.9978414043199, - -7.38276167631068, - 44.4970603752708, - 26.5024990214418, - -44.2461913308458, - 9.51845076548334, - -47.8281351088411, - 7.68427447425834, - -23.5706842106942, - 23.3074499244045, - 70.6586472132300, - -9.68088860263008, - -70.8026785673948, - -26.6963451935976, - 48.0136296461397, - -7.63734823159200, - 44.6651234222196, - 26.3023490002159, - -44.4576351865647, - 9.52341455917443, - -47.6242211091394, - ] -) -PLANAR_HOA_CHANNELS_ACN = np.array([0, 1, 3, 4, 8, 9, 15]) -VERT_HOA_CHANNELS_ACN = np.array([2, 5, 6, 7, 10, 11, 12, 13, 14]) - -SEED_PADDING = 0 - -# delay in number of samples -DELAY_COMPENSATION_FOR_FILTERING = { - "SHQ2": { - "up": 436, - "down": 218, - }, - "SHQ3": { - "up": 436, - "down": 145, - }, - "MSIN": 92, - "LP1p5": 322, - "LP35": 232, - "LP7": 117, - "LP10": 82, - "LP12": 164, - "LP14": 234, - "LP20": 161, - "HP50_32KHZ": 559, - "HP50_48KHZ": 839, -} diff 
--git a/item_generation_scripts/audiotools/convert/__init__.py b/item_generation_scripts/audiotools/convert/__init__.py deleted file mode 100644 index 4ec23739ae7924d6af872b11abd27fca9e570c44..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/convert/__init__.py +++ /dev/null @@ -1,323 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. 
import logging
from pathlib import Path, PurePath
from typing import Optional, Union

from item_generation_scripts.audiotools import audio, audioarray, metadata
from item_generation_scripts.audiotools.audiofile import write
from item_generation_scripts.audiotools.convert.channelbased import convert_channelbased
from item_generation_scripts.audiotools.convert.masa import convert_masa
from item_generation_scripts.audiotools.convert.objectbased import convert_objectbased
from item_generation_scripts.audiotools.convert.scenebased import convert_scenebased
from item_generation_scripts.audiotools.wrappers.bs1770 import loudness_norm
from item_generation_scripts.audiotools.wrappers.esdru import esdru
from item_generation_scripts.audiotools.wrappers.filter import (
    hp50filter_itu,
    lpfilter_itu,
    resample_itu,
)
from item_generation_scripts.audiotools.wrappers.p50fbmnru import p50fbmnru

from ..metadata import write_ISM_metadata_in_file


def convert_file(
    in_file: Union[str, Path],
    out_file: Union[str, Path],
    in_fs: int,
    in_fmt: Union[str, Path],
    out_fmt: Optional[Union[str, Path]] = None,
    out_fs: Optional[int] = None,
    in_meta: Optional[list] = None,
    logger: Optional[logging.Logger] = None,
    **kwargs,
) -> None:
    """
    Conversion function for one audio file

    Parameters
    ----------
    in_file: Union[str, Path]
        Input file; loaded as a scene description when in_fmt is a string
        starting with "META", otherwise loaded via audio.fromfile()
    out_file: Union[str, Path]
        Output audio file to write
    in_fs: int
        Input sampling rate; falls back to the rate of the loaded input if falsy
    in_fmt: Union[str, Path]
        Input format (mandatory)
    out_fmt: Optional[Union[str, Path]]
        Output format; defaults to the input format for regular audio input
    out_fs: Optional[int]
        Output sampling rate; defaults to the rate of the loaded input
    in_meta: Optional[list]
        Metadata passed through to audio.fromfile()
    logger: Optional[logging.Logger]
        Logger used for debug messages
    kwargs
        Forwarded unchanged to convert()

    Raises
    ------
    ValueError
        If in_fmt is missing, if out_fmt is missing for a scene description
        input, or if an object-based output is requested for an input that
        carries no object metadata
    """

    if not in_fmt:
        raise ValueError("Input audio format must be specified!")

    # get audio class object - can be either a regular single audio or scene description .txt
    if not isinstance(in_fmt, PurePath) and in_fmt.startswith("META"):
        source = metadata.Metadata(in_file)
    else:
        source = audio.fromfile(in_fmt, in_file, in_fs, in_meta)

    # try to set reasonable defaults if missing
    if not in_fs:
        in_fs = source.fs
    if not out_fs:
        out_fs = source.fs

    if not out_fmt:
        if isinstance(source, metadata.Metadata):
            raise ValueError(
                "Output format must be specified for scene description files!"
            )
        out_fmt = source.name

    output = audio.fromtype(out_fmt)
    if isinstance(output, audio.ObjectBasedAudio):
        # object-based output is pass-through only: it needs the input's object metadata
        try:
            output.object_pos = source.object_pos
            output.metadata_files = source.metadata_files
        except AttributeError as err:
            raise ValueError(
                "ISM is not supported as an output for rendering! Only usable as pass-through"
            ) from err

    if isinstance(source, metadata.Metadata):
        if logger:
            logger.debug(f"Converting metadata to {out_fmt} : {in_file} -> {out_file}")

        # every instance is resampled to out_fs by convert(), so the mix is at out_fs
        output.fs = out_fs

        # render each audio instance separately and sum the results
        for audio_in in source.audio:
            tmp = audio.fromtype(out_fmt)
            tmp.fs = in_fs  # resampling not yet applied
            convert(audio_in, tmp, in_fs=in_fs, out_fs=out_fs, logger=logger, **kwargs)
            if output.audio is not None:
                output.audio += tmp.audio
            else:
                output.audio = tmp.audio
    else:
        if logger:
            logger.debug(f"Converting {in_fmt} to {out_fmt} : {in_file} -> {out_file}")
        # run main conversion method
        output.fs = in_fs  # resampling not yet applied
        convert(source, output, in_fs=in_fs, out_fs=out_fs, logger=logger, **kwargs)

    # write output
    write(out_file, output.audio, output.fs)
    if isinstance(output, audio.ObjectBasedAudio):
        write_ISM_metadata_in_file(output.object_pos, [out_file], automatic_naming=True)


def convert(
    input: audio.Audio,
    output: audio.Audio,
    in_trim: Optional[list] = None,
    in_pad_noise: Optional[bool] = False,
    in_delay: Optional[float] = None,
    in_fs: Optional[int] = None,
    in_cutoff: Optional[int] = None,
    in_hp50: Optional[bool] = None,
    in_window: Optional[list] = None,
    in_loudness: Optional[float] = None,
    in_loudness_fmt: Optional[str] = None,
    out_trim: Optional[list] = None,
    out_pad_noise: Optional[bool] = False,
    out_delay: Optional[float] = None,
    out_fs: Optional[int] = None,
    out_cutoff: Optional[int] = None,
    out_hp50: Optional[bool] = None,
    out_window: Optional[list] = None,
    out_loudness: Optional[float] = None,
    out_loudness_fmt: Optional[str] = None,
    limit: Optional[bool] = False,
    mnru_q: Optional[float] = None,
    esdru_alpha: Optional[float] = None,
    logger: Optional[logging.Logger] = None,
    **kwargs,
) -> None:
    """
    Perform pre-processing, conversion and post-processing

    The in_* options are applied to the input via process_audio(), then
    format_conversion() renders input into output, then the out_* options
    (plus limit, mnru_q and esdru_alpha) are applied to the output.
    Both audio objects are modified in place; remaining kwargs are forwarded
    to format_conversion().
    """

    # pre-processing
    process_audio(
        x=input,
        trim=in_trim,
        pad_noise=in_pad_noise,
        delay=in_delay,
        fs=in_fs,
        fc=in_cutoff,
        hp50=in_hp50,
        window=in_window,
        loudness=in_loudness,
        loudness_fmt=in_loudness_fmt,
        logger=logger,
    )

    # format conversion
    format_conversion(input, output, logger=logger, **kwargs)

    # post-processing
    process_audio(
        x=output,
        trim=out_trim,
        pad_noise=out_pad_noise,
        delay=out_delay,
        fs=out_fs,
        fc=out_cutoff,
        hp50=out_hp50,
        window=out_window,
        loudness=out_loudness,
        loudness_fmt=out_loudness_fmt,
        limit=limit,
        mnru_q=mnru_q,
        esdru_alpha=esdru_alpha,
        logger=logger,
    )


def process_audio(
    x: audio.Audio,
    trim: Optional[list] = None,
    pad_noise: Optional[bool] = False,
    delay: Optional[float] = None,
    fs: Optional[int] = None,
    fc: Optional[int] = None,
    hp50: Optional[bool] = False,
    window: Optional[float] = None,
    loudness: Optional[float] = None,
    loudness_fmt: Optional[str] = None,
    limit: Optional[bool] = False,
    mnru_q: Optional[float] = None,
    esdru_alpha: Optional[float] = None,
    logger: Optional[logging.Logger] = None,
) -> None:
    """
    Perform (pre-/post-) processing of audio

    Each step runs only if its option is set, in this fixed order:
    delay, trim/pad, windowing, 50 Hz high-pass, resampling to fs,
    loudness normalization, low-pass at fc, MNRU, ESDRU, limiting.
    The audio object x is modified in place.
    """

    if fs is None:
        fs = x.fs

    # delay audio
    if delay is not None:
        if logger:
            logger.debug(f"Delaying audio by {delay} ms")
        x.audio = audioarray.delay(x.audio, x.fs, delay)

    # trim or pad audio
    if trim is not None:
        if isinstance(x, audio.ObjectBasedAudio):
            # metadata concatenation necessary for ISM
            metadata.trim_meta(x, tuple(trim), pad_noise)
        else:
            x.audio = audioarray.trim(x.audio, x.fs, tuple(trim), pad_noise)

    # windowing
    if window is not None:
        if logger:
            logger.debug(f"Windowing audio with {window} ms Hann window")
        x.audio = audioarray.window(x.audio, x.fs, window)

    # high-pass (50 Hz) filtering
    if hp50:
        if logger:
            logger.debug("Applying 50 Hz high-pass filter using ITU STL filter")
        x.audio = hp50filter_itu(x)

    # resampling
    if x.fs != fs:
        if logger:
            logger.debug(f"Resampling from {x.fs} to {fs} using ITU STL filter")
        x.audio = resample_itu(x, fs)
        x.fs = fs

    # loudness normalization
    if loudness is not None:
        if logger:
            logger.debug(
                f"Applying loudness adjustment to {loudness} LKFS for format {loudness_fmt} using ITU STL bs1770demo"
            )
        x.audio = loudness_norm(x, loudness, loudness_fmt)

    # low-pass filtering
    if fc is not None:
        if logger:
            logger.debug(
                f"Applying low-pass filter with cutoff {fc} Hz using ITU STL filter"
            )
        x.audio = lpfilter_itu(x, fc)

    # MNRU
    if mnru_q is not None:
        if logger:
            logger.debug("Applying P.50 Fullband MNRU")
        x.audio = p50fbmnru(x, mnru_q)

    # ESDRU
    if esdru_alpha is not None:
        if logger:
            logger.debug("Applying ESDRU Recommendation ITU-T P.811")
        x.audio = esdru(x, esdru_alpha)

    # limiting
    if limit:
        if logger:
            logger.debug("Applying limiter")
        # NOTE(review): return value is not captured - presumably the limiter
        # works in place on x.audio; confirm against audioarray.limiter
        audioarray.limiter(x.audio, x.fs)


def format_conversion(
    input: audio.Audio,
    output: audio.Audio,
    logger: Optional[logging.Logger] = None,
    **kwargs,
) -> None:
    """
    Convert one audio format to another

    Identical formats (and any BINAURAL* -> BINAURAL* pair) are passed
    through by sharing the input audio with the output; everything else is
    dispatched on the input type. Remaining kwargs go to the renderer.

    Raises
    ------
    NotImplementedError
        For MASA output, for ISM output from a different input format, for
        binaural input to a non-binaural output and for unknown input types
    """

    # validation
    if isinstance(output, audio.MetadataAssistedSpatialAudio):
        raise NotImplementedError("MASA is not supported as an output for rendering!")

    if isinstance(output, audio.ObjectBasedAudio) and input.name != output.name:
        raise NotImplementedError(
            "ISM is not supported as an output for rendering! Only usable as pass-through"
        )

    if logger:
        logger.debug(f"Format conversion: {input.name} -> {output.name}")

    if input.name == output.name or (
        input.name.startswith("BINAURAL") and output.name.startswith("BINAURAL")
    ):
        output.audio = input.audio
        return

    if isinstance(input, audio.BinauralAudio):
        raise NotImplementedError(
            f"{input.name} is not supported as an input for rendering!"
        )
    elif isinstance(input, audio.ChannelBasedAudio):
        convert_channelbased(input, output, **kwargs)
    elif isinstance(input, audio.MetadataAssistedSpatialAudio):
        convert_masa(input, output, **kwargs)
    elif isinstance(input, audio.ObjectBasedAudio):
        convert_objectbased(input, output, **kwargs)
    elif isinstance(input, audio.SceneBasedAudio):
        convert_scenebased(input, output, **kwargs)
    else:
        raise NotImplementedError(
            f"Unknown or unsupported audio format {input.name}"
        )
-# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
from typing import Optional

import numpy as np
from scipy.signal import fftconvolve


def NS2SA(
    fs: float,
    x: float,
) -> int:
    """
    Converts from nanoseconds to number of samples

    Parameters
    ----------
    fs: float
        Sampling rate
    x: float
        Duration in nano seconds

    Returns
    -------
    Number of samples
    """

    # fs is reduced to whole multiples of 100 Hz before scaling;
    # the final result is truncated towards zero
    return int(int(fs / 100) * (x / 100) / 100000)


def binaural_fftconv(
    x: np.ndarray,
    IR: np.ndarray,
    nchannels: int,
    lfe_index: Optional[list[int]] = None,
) -> np.ndarray:
    """
    Binauralization using fft convolution

    Parameters
    ----------
    x: np.ndarray
        Input multi-channel array, indexed as [sample, channel]
    IR: np.ndarray
        HRIRs array, indexed as [tap, ear, channel]
    nchannels: int
        Maximum number of channels to process
    lfe_index: Optional[list[int]]
        List of LFE channel indices; these channels are skipped

    Returns
    -------
    y: np.ndarray
        Output convolved signal array with two columns (one per ear),
        truncated to the length of the input
    """

    # set lookup instead of scanning the list once per channel
    skipped = frozenset(lfe_index) if lfe_index is not None else frozenset()

    n_samples = x.shape[0]
    y = np.zeros([n_samples, 2])

    for chan_idx in range(min(x.shape[1], nchannels)):
        if chan_idx in skipped:
            # LFE channels are not convolved with the HRIRs
            continue

        # cast the channel once and reuse it for both ears
        chan = x[:, chan_idx].astype(float)
        for ear in range(2):
            # the full convolution is longer than the input: keep the first n_samples
            y[:, ear] += fftconvolve(chan, IR[:, ear, chan_idx])[:n_samples]

    return y
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
from pathlib import Path
from typing import Optional, Tuple, Union

import numpy as np

from item_generation_scripts.audiotools import audio
from item_generation_scripts.audiotools.audioarray import delay, framewise_io
from item_generation_scripts.audiotools.binaural_datasets.binaural_dataset import (
    load_ir,
)
from item_generation_scripts.audiotools.constants import (
    BINAURAL_LFE_GAIN,
    IVAS_FRAME_LEN_MS,
    IVAS_MC_CONVERSION,
)
from item_generation_scripts.audiotools.convert import scenebased
from item_generation_scripts.audiotools.convert.binaural import binaural_fftconv
from item_generation_scripts.audiotools.EFAP import EFAP
from item_generation_scripts.audiotools.rotation import Quat2RotMat, rotateAziEle
from item_generation_scripts.audiotools.wrappers.filter import resample_itu

""" ChannelBasedAudio functions """


def convert_channelbased(
    cba: audio.ChannelBasedAudio,
    out: audio.Audio,
    **kwargs,
) -> audio.Audio:
    """
    Convert channel-based audio to the requested output format

    Dispatches on the type of out; kwargs are only forwarded to the binaural
    renderer. Returns out after its audio has been filled in.

    Raises
    ------
    NotImplementedError
        If out is neither binaural, channel-based nor scene-based
    """

    # CBA -> Binaural
    if isinstance(out, audio.BinauralAudio):
        render_cba_to_binaural(cba, out, **kwargs)

    # CBA -> CBA
    elif isinstance(out, audio.ChannelBasedAudio):
        render_cba_to_cba(cba, out)

    # CBA -> SBA
    elif isinstance(out, audio.SceneBasedAudio):
        render_cba_to_sba(cba, out)

    else:
        raise NotImplementedError(
            f"Conversion from {cba.name} to {out.name} is unsupported!"
        )

    return out


def render_cba_to_binaural(
    cba: audio.ChannelBasedAudio,
    bin: audio.BinauralAudio,
    trajectory: Optional[Union[str, Path]] = None,
    bin_dataset: Optional[str] = None,
    bin_lfe_gain: Optional[float] = None,
    **kwargs,
) -> None:
    """
    Binauralization of channel-based audio

    Rendering is done at 48 kHz: the input is resampled to 48 kHz (cba.audio
    and cba.fs are modified) and the result is resampled back to the original
    input rate.

    Parameters
    ----------
    cba: audio.ChannelBasedAudio
        Channel-based input audio
    bin: audio.BinauralAudio
        Binaural output audio
    trajectory: Optional[Union[str, Path]]
        Head rotation trajectory path
    bin_dataset: Optional[str]
        Name of binaural dataset without prefix or suffix
    bin_lfe_gain: Optional[float]
        LFE gain for binaural rendering; the LFE is not rendered if None
    """

    if cba.name == "MONO":
        # no binauralization possible for mono -> render to stereo and assume binaural signal
        cba_stereo = audio.fromtype("STEREO")
        cba_stereo.fs = bin.fs
        render_cba_to_cba(cba, cba_stereo)
        bin.audio = cba_stereo.audio
        return

    cba.audio = resample_itu(cba, 48000)
    old_fs = cba.fs
    cba.fs = 48000
    bin.fs = 48000

    if trajectory is not None:
        cba.audio = rotate_cba(cba, trajectory)

    IR, _, latency_smp = load_ir(cba.name, bin.name, bin_dataset)

    # render LFE
    bin_lfe = None
    if bin_lfe_gain is not None:
        bin_lfe, _ = render_lfe_to_binaural(
            cba.audio, cba.fs, cba.lfe_index, bin_lfe_gain
        )

    # render rest of the signal
    bin.audio = binaural_fftconv(cba.audio, IR, cba.num_channels, cba.lfe_index)
    # compensate delay from binaural dataset
    bin.audio = delay(bin.audio, bin.fs, -latency_smp, samples=True)

    # add LFE and rest
    if bin_lfe is not None:
        bin.audio += bin_lfe

    bin.audio = resample_itu(bin, old_fs)
    # keep the declared rate in sync with the samples, otherwise a later
    # "x.fs != fs" check would resample from the wrong rate
    bin.fs = old_fs


def render_custom_ls_binaural(
    custom_ls: audio.ChannelBasedAudio,
    output: audio.BinauralAudio,
    IR: np.ndarray,
    SourcePosition: np.ndarray,
    trajectory: str,
):
    """Placeholder for binaural rendering of custom loudspeaker layouts (not implemented)"""
    # TODO rework impl. (with EFAP)
    # logger.info("  Processing channels on custom LS layout")
    # azis = ", ".join([f"{a:7.2f}" for a in ls_azi_all])
    # eles = ", ".join([f"{e:7.2f}" for e in ls_ele_all])
    # logger.info(f"  azi: {azis}")
    # logger.info(f"  ele: {eles}")
    # logger.info(f"  lfe_index: {lfe_index_all}")

    # if output.name == "BINAURAL_ROOM":
    #     tmp = get_audio_type("MOZART")
    #     convert_channel_based(custom_ls, tmp)
    #     logger.info(f"  {custom_ls.name} -> {tmp.name} -> {output.name}")
    #     custom_ls.audio = tmp.audio
    # else:
    #     tmp = custom_ls
    #
    # ls_azi_all = tmp.ls_azi
    # ls_ele_all = tmp.ls_ele
    # lfe_index_all = tmp.lfe_index
    #
    # frame_len = (IVAS_FRAME_LEN_MS // 4) * (fs // 1000)
    # sig_len = custom_ls.audio.shape[0]
    # N_frames = int(sig_len / frame_len)
    #
    # i_ls = 0
    # y = np.zeros([sig_len, 2])
    # for i_chan in range(custom_ls.audio.shape[1]):
    #
    #     # skip LFE
    #     if i_chan in lfe_index_all:
    #         continue
    #
    #     # skip silent (or very low volume) channels
    #     if np.allclose(custom_ls.audio[:, i_chan], 0.0, atol=32.0):
    #         continue
    #
    #     ls_azi = np.repeat(ls_azi_all[i_ls], N_frames)
    #     ls_ele = np.repeat(ls_ele_all[i_ls], N_frames)
    #
    #     azi, ele = rotateISM(ls_azi, ls_ele, trajectory=trajectory)
    #
    #     y += binaural_fftconv_framewise(
    #         custom_ls.audio[:, i_chan],
    #         IR,
    #         SourcePosition,
    #         frame_len=frame_len,
    #         azi=azi,
    #         ele=ele,
    #     )
    #     i_ls += 1
    #
    # return y
    return


def render_cba_to_cba(
    cba_in: audio.ChannelBasedAudio, cba_out: audio.ChannelBasedAudio
) -> None:
    """
    Rendering of channel-based input signal to channel-based output

    Uses a fixed 0.5/0.5 downmix for STEREO -> MONO, the IVAS_MC_CONVERSION
    table where it has an entry, and EFAP panning otherwise.

    Parameters
    ----------
    cba_in: audio.ChannelBasedAudio
        Channel-based input audio
    cba_out: audio.ChannelBasedAudio
        Channel-based output audio
    """

    # Stereo to Mono
    if cba_in.name == "STEREO" and cba_out.name == "MONO":
        render_mtx = np.vstack([[0.5], [0.5]])
    else:
        try:
            render_mtx = IVAS_MC_CONVERSION[cba_in.name][cba_out.name]
        except KeyError:
            # Use EFAP panning if no matrix was found
            panner = EFAP(
                np.delete(cba_out.ls_azi, cba_out.lfe_index).astype(float),
                np.delete(cba_out.ls_ele, cba_out.lfe_index).astype(float),
            )

            # one row of panning gains per non-LFE input loudspeaker
            render_mtx = np.vstack(
                [
                    panner.pan(a, e).T
                    for i, (a, e) in enumerate(zip(cba_in.ls_azi, cba_in.ls_ele))
                    if i not in cba_in.lfe_index
                ]
            )

            # pass-through for LFE
            for index in np.sort(cba_in.lfe_index):
                render_mtx = np.insert(render_mtx, index, 0, axis=0)
            render_mtx = np.insert(render_mtx, cba_out.lfe_index, 0, axis=1)
            render_mtx[cba_in.lfe_index, cba_out.lfe_index] = 1

            # NOTE(review): kept inside the EFAP fallback so that the shared
            # IVAS_MC_CONVERSION matrices are never modified in place - confirm
            # against the original indentation
            if cba_out.num_channels <= 2:
                render_mtx[cba_in.lfe_index, :] = 0

    cba_out.audio = cba_in.audio @ render_mtx


def render_cba_to_sba(cba: audio.ChannelBasedAudio, sba: audio.SceneBasedAudio) -> None:
    """
    Rendering of channel-based input signal to SBA output

    Parameters
    ----------
    cba: audio.ChannelBasedAudio
        Channel-based input audio
    sba: audio.SceneBasedAudio
        SBA output audio

    Raises
    ------
    ValueError
        If the input is MONO
    """

    if cba.name == "MONO":
        raise ValueError(f"Rendering from MONO to {sba.name} is not supported.")

    # SH response for loudspeaker positions
    render_mtx = np.hstack(
        [
            scenebased.getRSH(np.array([a]), np.array([e]), sba.ambi_order)
            for a, e in zip(cba.ls_azi, cba.ls_ele)
        ]
    ).T
    # do not add LFE to output
    render_mtx[cba.lfe_index] = 0

    sba.audio = cba.audio @ render_mtx
    if sba.is_planar:
        scenebased.zero_vert_channels(sba)


def _crossfade_ramps(n_samples: int) -> Tuple[np.ndarray, np.ndarray]:
    """Return linear fade-in / fade-out column vectors of the given length"""
    # max() guards the one-sample frame, where n_samples - 1 would be zero
    fade_in = np.arange(n_samples) / max(n_samples - 1, 1)
    fade_in = fade_in[:, np.newaxis]
    return fade_in, 1.0 - fade_in


def rotate_cba(
    cba: audio.ChannelBasedAudio,
    trajectory: str,
) -> np.ndarray:
    """
    Rotate MC signal by applying a rotation matrix calculated from the current quaternion
    in each subframe

    Parameters:
    ----------
    cba: audio.ChannelBasedAudio
        Input multichannel audio (processed in subframes of
        (IVAS_FRAME_LEN_MS // 4) * 48 samples)
    trajectory: str
        Path to trajectory file (comma separated, one row per subframe;
        rows are reused cyclically)

    Returns:
    ----------
    y: np.ndarray
        Rotated multichannel signal
    """

    # genfromtxt returns a 1-D array for a single-row file: force [row, column]
    trj_data = np.atleast_2d(np.genfromtxt(trajectory, delimiter=","))
    trj_frames = trj_data.shape[0]

    sig_len = cba.audio.shape[0]
    sig_dim = cba.audio.shape[1]
    frame_len = (IVAS_FRAME_LEN_MS // 4) * 48

    out = np.zeros([sig_len, sig_dim])

    panner = EFAP(cba.ls_azi, cba.ls_ele)

    fade_in, fade_out = _crossfade_ramps(frame_len)

    R_old = np.eye(cba.num_channels)

    for i, (frame_in, frame_out) in framewise_io(cba.audio, out, frame_len):
        # update the crossfade if we have a smaller last frame
        if frame_out.shape[0] != frame_len:
            fade_in, fade_out = _crossfade_ramps(frame_out.shape[0])

        # same rotation for every loudspeaker of this subframe
        rot_mat = Quat2RotMat(trj_data[i % trj_frames, :])
        rotated_pos = np.array(
            [rotateAziEle(a, e, rot_mat) for a, e in zip(cba.ls_azi, cba.ls_ele)]
        )
        R = panner.pan(rotated_pos[:, 0], rotated_pos[:, 1])
        # LFE is neither rotated nor fed by other channels: pure pass-through
        R[:, cba.lfe_index] = 0
        R[cba.lfe_index, :] = 0
        R[cba.lfe_index, cba.lfe_index] = 1

        # crossfade from the previous to the current rotation within the subframe
        frame_out[:, :] = (fade_in * frame_in @ R) + (fade_out * frame_in @ R_old)

        R_old = R.copy()

    return out


""" Helper functions """


def render_lfe_to_binaural(
    x: np.ndarray,
    fs: Optional[int] = 48000,
    lfe_index: Optional[list] = None,
    LFE_gain: Optional[float] = BINAURAL_LFE_GAIN,
) -> Tuple[np.ndarray, int]:
    """
    Extract LFE from the given input and render
    it binaurally, accounting for delay

    Parameters
    ----------
    x: np.ndarray
        Input multichannel signal, indexed as [sample, channel]
    fs: Optional[int]
        Sampling rate (currently unused, the low-pass filtering is disabled)
    lfe_index: Optional[list]
        Indices of the LFE channels; None or empty gives a silent output
    LFE_gain: Optional[float]
        Gain applied to the LFE; None selects BINAURAL_LFE_GAIN

    Returns
    -------
    lfe: np.ndarray
        Two identical columns holding the scaled sum of all LFE channels
    lfe_delay_ns: int
        Delay of the LFE path in nanoseconds (always 0 at the moment)
    """

    if lfe_index is None:
        lfe_index = []
    if LFE_gain is None:
        LFE_gain = BINAURAL_LFE_GAIN

    # sum all LFE channels into one column; keepdims gives shape (samples, 1)
    # for zero, one or several LFE channels alike
    lfe = x[:, np.asarray(lfe_index, dtype=int)].sum(axis=1, keepdims=True)

    # Disabled processing, kept for reference:
    #
    # # 120 Hz low-pass filtering for LFE using IVAS filter coefficients
    # if fs == 48000:
    #     lfe = sig.sosfilt(IVAS_LPF_4_BUTTER_48K_SOS, lfe, axis=0)
    # else:
    #     raise NotImplementedError("Only 48 kHz supported at the moment!")
    #
    # # 3.5ms LP filter delay from IVAS ROM
    # lfe_delay_ns = 0.0035 * 1e9
    # lfe_delay_smp = round(lfe_delay_ns * fs / 1e9)
    #
    # # Delay LFE by the same amount as the HRTF delay
    # lfe = np.roll(lfe, round(latency_smp), axis=0)
    # lfe[0 : round(latency_smp), :] = 0
    lfe_delay_ns = 0

    # apply gain (not in place, so integer input is promoted instead of raising)
    lfe = lfe * LFE_gain

    # duplicate for each binaural channel
    lfe = np.hstack([lfe, lfe])

    return lfe, lfe_delay_ns
from pathlib import Path
from typing import Optional, Union
from warnings import warn

from item_generation_scripts.audiotools import audio
from item_generation_scripts.audiotools.convert import channelbased
from item_generation_scripts.audiotools.wrappers.masaRenderer import masaRenderer

""" MetadataAssistedSpatialAudio functions """


def convert_masa(
    masa: audio.MetadataAssistedSpatialAudio,
    out: audio.Audio,
    **kwargs,
) -> audio.Audio:
    """
    Convert Metadata Assisted Spatial audio to the requested output format

    Dispatches on the type of out; kwargs are only forwarded to the binaural
    renderer. Returns out after its audio has been filled in.

    Raises
    ------
    NotImplementedError
        If out is neither binaural, channel-based nor scene-based
    """

    # MASA -> Binaural
    if isinstance(out, audio.BinauralAudio):
        render_masa_to_binaural(masa, out, **kwargs)

    # MASA -> CBA
    elif isinstance(out, audio.ChannelBasedAudio):
        render_masa_to_cba(masa, out)

    # MASA -> SBA
    elif isinstance(out, audio.SceneBasedAudio):
        render_masa_to_sba(masa, out)

    else:
        raise NotImplementedError(
            f"Conversion from {masa.name} to {out.name} is unsupported!"
        )

    return out


def _render_masa_to_7_1_4(
    masa: audio.MetadataAssistedSpatialAudio,
) -> audio.ChannelBasedAudio:
    """Render MASA with the MasaRenderer to a temporary 7_1_4 audio object"""
    cba_tmp = audio.fromtype("7_1_4")
    cba_tmp.fs = masa.fs
    cba_tmp.audio = masaRenderer(masa, cba_tmp.name)
    return cba_tmp


def render_masa_to_binaural(
    masa: audio.MetadataAssistedSpatialAudio,
    bin: audio.BinauralAudio,
    trajectory: Optional[Union[str, Path]] = None,
    bin_dataset: Optional[str] = None,
    **kwargs,
) -> None:
    """
    Binauralization of MASA audio

    Outputs with "ROOM" in their name are rendered via a 7_1_4 intermediate
    format and the channel-based binaural renderer; all others are rendered
    directly by the MasaRenderer.

    Parameters
    ----------
    masa: audio.MetadataAssistedSpatialAudio
        MASA input audio
    bin: audio.BinauralAudio
        Output binaural audio
    trajectory: Optional[Union[str, Path]]
        Head rotation trajectory path
    bin_dataset: Optional[str]
        Name of binaural dataset without prefix or suffix
    """

    if "ROOM" in bin.name:
        cba_tmp = audio.fromtype("7_1_4")
        cba_tmp.fs = masa.fs

        render_masa_to_cba(masa, cba_tmp)

        # forward the dataset selection and the remaining options as well,
        # they were silently dropped before
        channelbased.render_cba_to_binaural(
            cba_tmp,
            bin,
            trajectory=trajectory,
            bin_dataset=bin_dataset,
            **kwargs,
        )
    else:
        if trajectory is not None:
            warn(
                f"Head-rotation not supported by MasaRenderer! Trajectory {trajectory} will be ignored!"
            )
        if bin_dataset is not None:
            warn(
                "Binaural dataset selection not supported by MasaRenderer - please copy the required hrir.bin manually!"
            )

        bin.audio = masaRenderer(masa, "BINAURAL")


def render_masa_to_cba(
    masa: audio.MetadataAssistedSpatialAudio,
    cba: audio.ChannelBasedAudio,
) -> None:
    """
    Rendering of MASA input signal to Channel-based format

    5_1 and 7_1_4 are rendered directly by the MasaRenderer, every other
    layout goes through a 7_1_4 intermediate format (with a warning).

    Parameters
    ----------
    masa: audio.MetadataAssistedSpatialAudio
        MASA input audio
    cba: audio.ChannelBasedAudio
        Channel-based output audio
    """

    if cba.name in ["5_1", "7_1_4"]:
        cba.audio = masaRenderer(masa, cba.name)
        return

    warn(
        f"MasaRenderer does not support {cba.name} natively. Using 7_1_4 as an intermediate format."
    )

    channelbased.render_cba_to_cba(_render_masa_to_7_1_4(masa), cba)


def render_masa_to_sba(
    masa: audio.MetadataAssistedSpatialAudio,
    sba: audio.SceneBasedAudio,
) -> None:
    """
    Rendering of MASA input signal to SBA format

    Always goes through a 7_1_4 intermediate format (with a warning).

    Parameters
    ----------
    masa: audio.MetadataAssistedSpatialAudio
        MASA input audio
    sba: audio.SceneBasedAudio
        SBA output audio
    """

    warn(
        f"MasaRenderer does not support {sba.name} natively. Using 7_1_4 as an intermediate format."
    )

    channelbased.render_cba_to_sba(_render_masa_to_7_1_4(masa), sba)
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -from itertools import repeat -from pathlib import Path -from typing import Optional, Tuple, Union - -import numpy as np - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.audioarray import delay, framewise_io -from item_generation_scripts.audiotools.binaural_datasets.binaural_dataset import ( - load_ir, -) -from item_generation_scripts.audiotools.binauralobjectrenderer import ( - binaural_fftconv_framewise, -) -from item_generation_scripts.audiotools.constants import IVAS_FRAME_LEN_MS -from item_generation_scripts.audiotools.convert.channelbased import ( - render_cba_to_binaural, -) -from item_generation_scripts.audiotools.convert.scenebased import getRSH -from item_generation_scripts.audiotools.EFAP import EFAP, wrap_angles -from item_generation_scripts.audiotools.rotation import Quat2RotMat, rotateAziEle -from item_generation_scripts.audiotools.wrappers.filter import resample_itu -from item_generation_scripts.utils import apply_func_parallel - -""" ObjectBasedAudio functions """ - - -def convert_objectbased( - oba: audio.ObjectBasedAudio, - out: audio.Audio, - **kwargs, -) -> audio.Audio: - """Convert an ISM signal to the requested output format""" - - # OBA -> Binaural - if isinstance(out, audio.BinauralAudio): - render_oba_to_binaural(oba, out, **kwargs) - - # OBA -> CBA - elif isinstance(out, audio.ChannelBasedAudio): - render_oba_to_cba(oba, out) - - # OBA -> SBA - elif isinstance(out, audio.SceneBasedAudio): - render_oba_to_sba(oba, out) - else: - raise NotImplementedError( - f"Conversion from {oba.name} to {out.name} is unsupported!" 
- ) - - return out - - -def render_oba_to_binaural( - oba: audio.ObjectBasedAudio, - bin: audio.BinauralAudio, - trajectory: Optional[Union[str, Path]] = None, - bin_dataset: Optional[str] = None, - **kwargs, -) -> None: - """ - Binauralization of ISM input signal - - Parameters - ---------- - oba: audio.ObjectBasedAudio - Object based input audio - bin: audio.BinauralAudio - Binaural output audio - trajectory: Optional[Union[str, Path]] - Head rotation trajectory - bin_dataset: Optional[str] - Name of binaural dataset, if None default dataset is used - """ - - # bin.audio = np.zeros([oba.audio.shape[0], bin.num_channels]) - - if "ROOM" in bin.name: - cba_tmp = audio.fromtype("7_1_4") - cba_tmp.fs = oba.fs - - render_oba_to_cba(oba, cba_tmp) - - render_cba_to_binaural(cba_tmp, bin, trajectory) - else: - IR, SourcePosition, latency_smp = load_ir(oba.name, bin.name, bin_dataset) - - oba.audio = resample_itu(oba, 48000) - fs_old = oba.fs - oba.fs = 48000 - - # apply processing for every object in parallel - obj_pos = oba.object_pos - obj_idx = list(range(oba.num_channels)) - result = apply_func_parallel( - render_object, - zip( - obj_idx, - obj_pos, - repeat(oba), - repeat(trajectory), - repeat(IR), - repeat(SourcePosition), - ), - None, - "mt", - False, - ) - - # sum results over all objects - bin.audio = np.sum(np.stack(result, axis=2), axis=2) - - # compensate delay from binaural dataset - bin.audio = delay(bin.audio, bin.fs, -latency_smp, samples=True) - - bin.audio = resample_itu(bin, fs_old) - bin.fs = fs_old - - -def render_oba_to_cba( - oba: audio.ObjectBasedAudio, - cba: audio.ChannelBasedAudio, -) -> None: - """ - Rendering of ISM input signal to channel-based format - - Parameters - ---------- - oba: audio.ObjectBasedAudio - Object based input audio - cba: audio.ChannelBasedAudio - Channel-based output audio - """ - - cba.audio = np.zeros([oba.audio.shape[0], cba.num_channels]) - - for obj_idx, obj_pos in enumerate(oba.object_pos): - obj_audio = 
oba.audio[:, [obj_idx]] - pos_frames = obj_pos.shape[0] - - frame_len = IVAS_FRAME_LEN_MS * (oba.fs // 1000) - - fade_in = np.arange(frame_len) / (frame_len - 1) - fade_in = fade_in[:, np.newaxis] - fade_out = 1.0 - fade_in - - # use EFAP for rendering - panner = EFAP( - np.delete(cba.ls_azi, cba.lfe_index), np.delete(cba.ls_ele, cba.lfe_index) - ) - gains_old = None - - for i, (frame_in, frame_out) in framewise_io(obj_audio, cba.audio, frame_len): - # update the crossfade if we have a smaller last frame - if frame_out.shape[0] != frame_len: - frame_size = frame_out.shape[0] - fade_in = np.arange(frame_size) / (frame_size - 1) - fade_in = fade_in[:, np.newaxis] - fade_out = 1.0 - fade_in - - azi, ele = wrap_angles(*obj_pos[i % pos_frames, :2], clip_ele=True) - gains = panner.pan(azi, ele) - for lfe in np.sort(cba.lfe_index): - gains = np.insert(gains, lfe, 0) - gains = gains[np.newaxis, :] - - if gains_old is None: - gains_old = gains.copy() - - frame_out[:] += (fade_in * frame_in @ gains) + ( - fade_out * frame_in @ gains_old - ) - - gains_old = gains.copy() - - -def render_oba_to_sba( - oba: audio.ObjectBasedAudio, - sba: audio.SceneBasedAudio, -) -> None: - """ - Rendering of ISM input signal to SBA format - - Parameters - ---------- - oba: audio.ObjectBasedAudio - Object based input audio - sba: audio.SceneBasedAudio - SBA output audio - """ - - sba.audio = np.zeros([oba.audio.shape[0], sba.num_channels]) - - for obj_idx, obj_pos in enumerate(oba.object_pos): - obj_audio = oba.audio[:, [obj_idx]] - pos_frames = obj_pos.shape[0] - - frame_len = IVAS_FRAME_LEN_MS * (oba.fs // 1000) - - fade_in = np.arange(frame_len) / (frame_len - 1) - fade_in = fade_in[:, np.newaxis] - fade_out = 1.0 - fade_in - - gains_old = None - - for i, (frame_in, frame_out) in framewise_io(obj_audio, sba.audio, frame_len): - # update the crossfade if we have a smaller last frame - if frame_out.shape[0] != frame_len: - frame_size = frame_out.shape[0] - fade_in = np.arange(frame_size) / 
(frame_size - 1) - fade_in = fade_in[:, np.newaxis] - fade_out = 1.0 - fade_in - - pos = obj_pos[i % pos_frames, :] - gains = getRSH(np.array([pos[0]]), np.array([pos[1]]), sba.ambi_order) - - if gains_old is None: - gains_old = gains.copy() - - frame_out[:] += (fade_in * frame_in @ gains.T) + ( - fade_out * frame_in @ gains_old.T - ) - - gains_old = gains.copy() - - -def rotate_oba( - azi: np.ndarray, - ele: np.ndarray, - trajectory: Optional[str] = None, -) -> Tuple[np.ndarray, np.ndarray]: - """ - Application of head tracking trajectory - - Parameters: - ---------- - azi: np.ndarray - Azimuth coordinates of objects - ele: np.ndarray - Elevation coordinates of objects - trajectory: str - Head-tracking trajectory path - - Returns: - ---------- - azi_rot: np.ndarray - Azimuth coordinates after application of trajectory - ele_rot: np.ndarray - Elevation coordinates after application of trajectory - """ - - if trajectory is None: - return azi, ele - - trj_data = np.genfromtxt(trajectory, delimiter=",") - trj_frames = trj_data.shape[0] - - N_frames = azi.shape[0] - if ele.shape[0] != azi.shape[0]: - raise ValueError("Inconsistent input in azi and ele") - - azi_rot = np.zeros([N_frames]) - ele_rot = np.zeros([N_frames]) - - for i_frame in range(N_frames): - q = trj_data[i_frame % trj_frames, :] - azi_rot[i_frame], ele_rot[i_frame] = rotateAziEle( - azi[i_frame], ele[i_frame], Quat2RotMat(q) - ) - - return azi_rot, ele_rot - - -def render_object( - obj_idx: int, - obj_pos: np.ndarray, - oba: audio.ObjectBasedAudio, - trajectory: str, - IR: np.ndarray, - SourcePosition: np.ndarray, -) -> np.ndarray: - """ - Binaural rendering for one ISM object - - Parameters: - ---------- - obj_idx: int - Index of object in list of all objects - obj_pos: np.ndarray - Position of object - oba: audio.ObjectBasedAudio - Input ISM audio object - trajectory: str - Head-tracking trajectory path - IR: np.ndarray - HRIRs for binauralization - SourcePosition: np.ndarray - Positions of HRIR 
measurements - - Returns: - ---------- - result_audio: np.ndarray - Binaurally rendered object - """ - - # repeat each value four times since head rotation data is on sub-frame basis - azi = np.repeat(obj_pos[:, 0], 4) - ele = np.repeat(obj_pos[:, 1], 4) - # apply head-rotation trajectory - obj_audio = oba.audio[:, [obj_idx]] - azi, ele = rotate_oba(azi, ele, trajectory) - # convolve signal with HRIRs - result_audio = binaural_fftconv_framewise( - obj_audio, - IR, - SourcePosition, - azi, - ele, - ) - return result_audio diff --git a/item_generation_scripts/audiotools/convert/scenebased.py b/item_generation_scripts/audiotools/convert/scenebased.py deleted file mode 100644 index a7e89b4f0fb66fe270f4791230bf5463dbadaf11..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/convert/scenebased.py +++ /dev/null @@ -1,429 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. 
This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -from pathlib import Path -from typing import Optional, Union -from warnings import warn - -import numpy as np -from scipy.special import lpmv - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.audioarray import delay, framewise_io -from item_generation_scripts.audiotools.binaural_datasets.binaural_dataset import ( - load_ir, -) -from item_generation_scripts.audiotools.constants import ( - IVAS_FRAME_LEN_MS, - T_DESIGN_11_AZI, - T_DESIGN_11_ELE, - VERT_HOA_CHANNELS_ACN, -) -from item_generation_scripts.audiotools.convert import channelbased -from item_generation_scripts.audiotools.convert.binaural import binaural_fftconv -from item_generation_scripts.audiotools.EFAP import EFAP -from item_generation_scripts.audiotools.rotation import Quat2RotMat, SHrotmatgen -from item_generation_scripts.audiotools.wrappers.filter import resample_itu - -""" SceneBasedAudio functions """ - - -def convert_scenebased( - sba: audio.SceneBasedAudio, - out: audio.Audio, - **kwargs, -) -> audio.Audio: - """Convert scene-based audio to the requested output format""" - - # SBA -> Binaural - if isinstance(out, audio.BinauralAudio): - render_sba_to_binaural(sba, out, **kwargs) - - # SBA -> CBA - elif isinstance(out, audio.ChannelBasedAudio): - render_sba_to_cba(sba, out) - - # SBA -> SBA - elif isinstance(out, audio.SceneBasedAudio): - render_sba_to_sba(sba, out) - else: - raise NotImplementedError( - f"Conversion from {sba.name} to {out.name} is unsupported!" 
- ) - - return out - - -def render_sba_to_binaural( - sba: audio.SceneBasedAudio, - bin: audio.BinauralAudio, - trajectory: Optional[Union[str, Path]] = None, - bin_dataset: Optional[str] = None, - **kwargs, -) -> None: - """ - Binauralization of scene-based audio - - Parameters - ---------- - sba: audio.SceneBasedAudio - Input SBA audio - bin: audio.BinauralAudio - Output binaural audio - trajectory: Optional[Union[str, Path]] - Head rotation trajectory path - bin_dataset: Optional[str] - Name of binaural dataset without prefix or suffix - """ - - if trajectory is not None: - sba.audio = rotate_sba(sba, trajectory) - - if "ROOM" in bin.name: - cba_tmp = audio.fromtype("7_1_4") - cba_tmp.fs = sba.fs - - render_sba_to_cba(sba, cba_tmp) - - channelbased.render_cba_to_binaural(cba_tmp, bin, trajectory) - else: - IR, _, latency_smp = load_ir(sba.name, bin.name, bin_dataset) - - sba.audio = resample_itu(sba, 48000) - fs_old = sba.fs - sba.fs = 48000 - - bin.audio = binaural_fftconv(sba.audio, IR, sba.num_channels) - - # compensate delay from binaural dataset - bin.audio = delay(bin.audio, bin.fs, -latency_smp, samples=True) - - bin.audio = resample_itu(bin, fs_old) - bin.fs = fs_old - - -def render_sba_to_cba( - sba: audio.SceneBasedAudio, - cba: audio.ChannelBasedAudio, -) -> None: - """ - Rendering of SBA input signal to channel-based format - - Parameters - ---------- - sba: audio.SceneBasedAudio - Scene-based input audio - cba: audio.ChannelBasedAudio - Channel-based output audio - """ - - render_mtx = get_allrad_mtx(sba.ambi_order, cba) - cba.audio = sba.audio @ render_mtx.T - - -def render_sba_to_sba( - sba_in: audio.SceneBasedAudio, - sba_out: audio.SceneBasedAudio, -) -> None: - """ - Rendering of SBA input signal to SBA output format - - Parameters - ---------- - sba_in: audio.SceneBasedAudio - Scene-based input audio - sba_out: audio.SceneBasedAudio - Scene-based output audio - """ - - if sba_out.ambi_order > sba_in.ambi_order: - sba_out.audio = np.pad( - 
sba_in.audio, [[0, 0], [0, sba_out.num_channels - sba_in.num_channels]] - ) - elif sba_out.ambi_order < sba_in.ambi_order: - sba_out.audio = sba_in.audio[:, : sba_out.num_channels] - - if sba_out.is_planar: - zero_vert_channels(sba_out) - - -def rotate_sba( - sba: audio.SceneBasedAudio, - trajectory: str, -) -> np.ndarray: - """ - Rotate HOA signal by applying a rotation matrix calculated from the current quaternion - in each subframe - - Parameters: - ---------- - x: np.ndarray - Input signal upto HOA3 - trajectory: str - Path to trajectory file - - Returns: - ---------- - y: np.ndarray - Rotated HOA signal - """ - - trj_data = np.genfromtxt(trajectory, delimiter=",") - trj_frames = trj_data.shape[0] - - sig_len = sba.audio.shape[0] - sig_dim = sba.audio.shape[1] - frame_len = (IVAS_FRAME_LEN_MS // 4) * 48 - - if sig_dim not in [4, 9, 16]: - raise ValueError("rotate_sba can only handle FOA, HOA2 or HOA3 signals!") - - out = np.zeros([sig_len, sig_dim]) - - fade_in = np.arange(frame_len) / (frame_len - 1) - fade_in = fade_in[:, np.newaxis] - fade_out = 1.0 - fade_in - - R = np.eye(sig_dim) - R_old = np.eye(sig_dim) - for i, (frame_in, frame_out) in framewise_io(sba.audio, out, frame_len): - # update the crossfade if we have a smaller last frame - if frame_out.shape[0] != frame_len: - frame_size = frame_out.shape[0] - fade_in = np.arange(frame_size) / (frame_size - 1) - fade_in = fade_in[:, np.newaxis] - fade_out = 1.0 - fade_in - - R_r = Quat2RotMat(trj_data[i % trj_frames, :]) - R[:, :] = SHrotmatgen(R_r, order=ambi_order_from_nchan(sig_dim)) - - frame_out[:, :] = (fade_in * frame_in @ R.T) + (fade_out * frame_in @ R_old.T) - - R_old[:, :] = R.copy() - - return out - - -""" Helper functions """ - - -def zero_vert_channels(sba: audio.SceneBasedAudio) -> None: - """Remove all ambisonics parts with vertical components""" - sba.audio[:, VERT_HOA_CHANNELS_ACN[VERT_HOA_CHANNELS_ACN < sba.num_channels]] = 0 - - -def nchan_from_ambi_order(ambi_order: int) -> int: - 
"""Compute number of channels based on ambisonics order""" - return (ambi_order + 1) ** 2 - - -def ambi_order_from_nchan(nchan: int) -> int: - """Compute ambisonics order based on number of channels""" - return int(np.sqrt(nchan) - 1) - - -def rE_weight(order: int) -> np.ndarray: - """Compute max-rE weighting matrix""" - return np.array( - [ - lpmv(0, l, np.cos(np.deg2rad(137.9) / (order + 1.51))) - for l in range(order + 1) - for _ in range(-l, l + 1) - ] - ).T - - -def n2sn(order: int) -> np.ndarray: - """Compute conversion matrix for N3D to SN3D normalization""" - return np.array( - [1.0 / np.sqrt(2 * l + 1) for l in range(order + 1) for _ in range(-l, l + 1)] - ) - - -def sn2n(order: int) -> np.ndarray: - """Compute conversion matrix for SN3D to N3D normalization""" - return np.array( - [np.sqrt(2 * l + 1) for l in range(order + 1) for _ in range(-l, l + 1)] - ) - - -def getRSH( - azi: np.ndarray, - ele: np.ndarray, - ambi_order: int, - norm: Optional[str] = "sn3d", - degrees: Optional[bool] = True, -) -> np.ndarray: - """ - Returns real spherical harmonic response for the given position(s) - - Parameters: - ---------- - azi: np.ndarray - Azimuth angles - ele: np.ndarray - Elevation angles - ambi_order: int - Ambisonics order - norm: Optional[str] - Normalization of ambisonic bases. 
- Possible values: "sn3d", "n3d", everything else is interpreted as orthogonal - degrees: Optional[bool] - If true azi and ele are interpreted as angles in degrees, otherwise as radians - - Returns: - ---------- - response: np.ndarray - Real spherical harmonic response - """ - - if degrees: - azi = np.deg2rad(azi) - ele = np.deg2rad(ele) - - azi = azi.astype("float64") - ele = ele.astype("float64") - - LM = np.array([(l, m) for l in range(ambi_order + 1) for m in range(-l, l + 1)]) - - response = np.zeros([LM.shape[0], azi.shape[0]]) - - # trig_term * legendre * uncondon - for i, (l, m) in enumerate(LM): - # N3D norm - response[i, :] = np.sqrt( - ((2 * l + 1) * float(np.math.factorial(l - np.abs(m)))) - / (4 * np.pi * float(np.math.factorial(l + np.abs(m)))) - ) - - # trig term - if m < 0: - response[i, :] *= np.sqrt(2) * np.sin(azi * np.abs(m)) - elif m == 0: - pass # response[i,:] *= 1 - else: - response[i, :] *= np.sqrt(2) * np.cos(azi * m) - - # legendre polynomial - a = lpmv(np.abs(m), l, np.sin(ele)) * ((-1) ** np.abs(m)) - if np.inf in a or -np.inf in a: - a[a == np.inf] = np.finfo(np.float64).max - a[a == -np.inf] = np.finfo(np.float64).min - warn( - "Warning: order too large -> leads to overflow. Inf values are discarded!" - ) - response[i, :] *= a - - if norm == "sn3d": - response *= np.sqrt(4 * np.pi) - response[:] = np.diag(n2sn(ambi_order)) @ response - elif norm == "n3d": - response *= np.sqrt(4 * np.pi) - else: - pass # ortho - - return response - - -def get_allrad_mtx( - ambi_order: int, - cba: audio.ChannelBasedAudio, - norm: Optional[str] = "sn3d", - rE_weight_bool: Optional[bool] = False, - intensity_panning: Optional[bool] = True, -) -> np.ndarray: - """ - Returns ALLRAD matrix - - Parameters: - ---------- - ambi_order: int - Ambisonics order - cba: audio.ChannelBasedAudio - Channel-based audio object - norm: Optional[str] - Normalization of ambisonic bases. 
- Possible values: "sn3d", "ortho", everything else is interpreted as n3d - re_weight_bool: Optional[bool] - Flag for max-rE weighting - intensity_panning: Optional[bool] - Flag for intensity panning - - Returns: - ---------- - hoa_dec: np.ndarray - ALLRAD matrix - """ - - n_harm = nchan_from_ambi_order(ambi_order) - - if cba.name == "MONO": - hoa_dec = np.zeros([1, n_harm]) - hoa_dec[0, 0] = 1 - elif cba.name == "STEREO": - hoa_dec = np.zeros([2, n_harm]) - # Cardioids +/- 90 degrees - hoa_dec[0, 0] = 0.5 - hoa_dec[0, 1] = 0.5 - hoa_dec[1, 0] = 0.5 - hoa_dec[1, 1] = -0.5 - else: - Y_td = getRSH( - T_DESIGN_11_AZI, - T_DESIGN_11_ELE, - ambi_order, - norm="ortho", - ) - Y_td *= np.sqrt(4 * np.pi) - - n_ls_woLFE = cba.num_channels - len(cba.lfe_index) - ls_azi_woLFE = np.delete(cba.ls_azi, cba.lfe_index).astype(float) - ls_ele_woLFE = np.delete(cba.ls_ele, cba.lfe_index).astype(float) - - panner = EFAP(ls_azi_woLFE, ls_ele_woLFE, intensity_panning) - G_td = panner.pan(T_DESIGN_11_AZI, T_DESIGN_11_ELE) - - hoa_dec = (G_td.T @ Y_td.T) / T_DESIGN_11_AZI.size - - if norm == "sn3d": - hoa_dec = hoa_dec @ np.diag(sn2n(ambi_order)) - elif norm == "ortho": - hoa_dec *= np.sqrt(4 * np.pi) - - if rE_weight_bool: - a_n = rE_weight(ambi_order) - nrg_pre = np.sqrt(len(n_ls_woLFE) / np.sum(a_n**2)) - hoa_dec = hoa_dec @ np.diag(a_n) * nrg_pre - - hoa_dec = np.insert(hoa_dec, cba.lfe_index, np.zeros(n_harm), axis=0) - - return hoa_dec diff --git a/item_generation_scripts/audiotools/metadata.py b/item_generation_scripts/audiotools/metadata.py deleted file mode 100644 index 0a4631ae97b1383bf480cdfb911c646d24142423..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/metadata.py +++ /dev/null @@ -1,571 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -import csv -from pathlib import Path -from typing import Optional, TextIO, Tuple, Union - -import numpy as np - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.audio import fromtype -from item_generation_scripts.audiotools.audioarray import trim -from item_generation_scripts.audiotools.audiofile import read -from item_generation_scripts.audiotools.constants import IVAS_FRAME_LEN_MS - - -class Metadata: - def __init__(self, meta_file: Union[str, Path]): - self.meta_file = Path(meta_file) - - if not self.meta_file.exists(): - raise FileNotFoundError( - f"Scene description file {self.meta_file} does not exist!" - ) - - with open(self.meta_file) as f: - audio_file = self.meta_file.parent.joinpath(f.readline().strip()).absolute() - - if audio_file.suffix != ".wav": - raise ValueError( - "Scene description files can only be used with WAVE input!" - ) - - self.audio_array, self.fs = read(audio_file) - self.audio = [] - - num_audio = int(f.readline().strip()) - for _ in range(num_audio): - in_fmt = f.readline().strip().upper() - - if in_fmt == "ISM": - self.parse_ism_input(f) - elif in_fmt == "MASA": - self.parse_masa_input(f) - elif in_fmt == "MC": - self.parse_mc_input(f) - elif in_fmt == "SBA": - self.parse_sba_input(f) - else: - raise KeyError(f"Unknown input type in metadata file {in_fmt}") - - def parse_ism_input(self, f: TextIO): - start = int(f.readline().strip()) - 1 - - ism = fromtype("ISM1") - ism.audio = self.audio_array[:, start : start + 1] - ism.fs = self.fs - - line = f.readline().strip() - tmp_path = self.meta_file.parent.joinpath(line).absolute() - if tmp_path.exists(): - # csv metadata - ism.metadata_files = [tmp_path] - ism.init_metadata() - else: - # manually specified metadata - positions = [f.readline().strip() for _ in range(int(line))] - positions = np.genfromtxt( - positions, delimiter="," - ) # TODO can use ndmin = 2 with numpy > 1.23.0; check support - if positions.ndim == 1: - positions = 
positions[np.newaxis, :] - - obj_pos = [] - # repeat based on first column - for p in positions: - repeats = int(p[0]) - obj_pos.append(np.tile(p[1:], [repeats, 1])) - obj_pos = np.vstack(obj_pos) - - ism.object_pos = [obj_pos] - - self.audio.append(ism) - - def parse_masa_input(self, f: TextIO): - start = int(f.readline().strip()) - 1 - masa_tc = int(f.readline().strip()) - - masa = fromtype(f"MASA{masa_tc}") - masa.audio = self.audio_array[:, start : start + masa_tc] - masa.fs = self.fs - masa.metadata_files = [ - self.meta_file.parent.joinpath(f.readline().strip()).absolute() - ] - masa.init_metadata() - - self.audio.append(masa) - - def parse_mc_input(self, f: TextIO): - start = int(f.readline().strip()) - 1 - mc_fmt = f.readline().strip() - - mc = fromtype(mc_fmt) - mc.audio = self.audio_array[:, start : start + mc.num_channels] - mc.fs = self.fs - - self.audio.append(mc) - - def parse_sba_input(self, f: TextIO): - start = int(f.readline().strip()) - 1 - sba_order = int(f.readline().strip()) - - sba = fromtype(f"SBA{sba_order}") - sba.audio = self.audio_array[:, start : start + sba.num_channels] - sba.fs = self.fs - - self.audio.append(sba) - - def parse_optional_values(self, f: TextIO): - raise NotImplementedError( - "Additional configuration keys in metadata currently unsupported!" 
- ) - - # opts = {} - # original_pos = f.tell() - # key_value = f.readline().strip() - - # try to parse a key, otherwise reset read pointer - # for key in OPT_KEYS: - # if key_value.startswith(key): - # opts[key] = key_value.replace(key, "").replace(":", "") - # original_pos = f.tell() - # key_value = f.readline.strip() - # else: - # f.seek(original_pos) - # - - -def write_ISM_metadata_in_file( - metadata: list[np.ndarray], - file_name: list[Union[str, Path]], - automatic_naming: Optional[bool] = False, -) -> list[str, Path]: - """ - Write ISM metadata into csv file(s) - - Parameters - ---------- - metadata: list[np.ndarray] - List of metadata arrays - file_name: list[Union[str, Path]] - List of file names for csv files - automatic_naming: Optional[bool] - If true files are named automatically name.0.csv, name.1.csv, ... with name as the first entry of file_name - - Returns - ---------- - file_names: list[str, Path] - List of acutally used file names - """ - - if not automatic_naming and len(metadata) != len(file_name): - raise ValueError("Number of metadata objects and file names has to match") - number_objects = len(metadata) - - if automatic_naming: - file_names = [] - for m_object in range(number_objects): - file_names.append(f"{file_name[0]}.{m_object}.csv") - else: - file_names = file_name - - for i, csv_file in enumerate(file_names): - number_frames = metadata[i].shape[0] - with open(csv_file, "w", newline="") as file: - writer = csv.writer(file) - for k in range(number_frames): - row_list = [ - "%+07.2f" % np.round(metadata[i][k, 0], 2), - "%+06.2f" % np.round(metadata[i][k, 1], 2), - "01.00", - "000.00", - "1.00", - ] - writer.writerow(row_list) - - return file_names - - -def trim_meta( - x: audio.ObjectBasedAudio, - limits: Optional[Tuple[int, int]] = None, - pad_noise: Optional[bool] = False, - samples: Optional[bool] = False, -) -> None: - """ - Trim or pad ISM including metadata - positive limits trim negative limits pad - - Parameters - ---------- - 
x: audio.ObjectBasedAudio - ISM audio object - limits: Optional[Tuple[int, int]] - Number of samples to trim or pad at beginning and end - pad_noise: Optional[bool] - Flag for padding noise instead of silence - samples: Optional[bool] - Flag for interpreting limits as samples, otherwise milliseconds - """ - - if not limits: - return - - frame_length = int(IVAS_FRAME_LEN_MS * x.fs // 1000) - - # check if trim values are multiples of the frame length - if not samples: - pre_trim = int(limits[0] * x.fs // 1000) - post_trim = int(limits[1] * x.fs // 1000) - else: - pre_trim = limits[0] - post_trim = limits[1] - - if pre_trim % frame_length != 0 or post_trim % frame_length != 0: - raise ValueError( - f"ISM metadata padding and trimming only possible if pad/trim length is multiple of frame length. " - f"Frame length: {IVAS_FRAME_LEN_MS}ms" - ) - - # check if audio is multiple of frame length - if np.shape(x.audio)[0] % frame_length != 0: - raise ValueError( - f"ISM metadata padding and trimming only possible if audio length is multiple of frame length. " - f"Frame length: {IVAS_FRAME_LEN_MS}ms" - ) - - # check if metadata length fits exactly to audio length - for meta in x.object_pos: - if np.shape(meta)[0] * frame_length != np.shape(x.audio)[0]: - raise ValueError( - f"ISM metadata padding and trimming only possible if audio length is multiple of frame " - f"length and audio and metadata length match. 
Frame length: {IVAS_FRAME_LEN_MS}ms" - ) - - # trim audio - x.audio = trim(x.audio, x.fs, limits, pad_noise, samples) - - # trim metadata - trim_frames_pre = int(pre_trim / frame_length) - trim_frames_post = int(post_trim / frame_length) - for i in range(len(x.object_pos)): - x.object_pos[i] = trim( - x.object_pos[i], - limits=(trim_frames_pre, trim_frames_post), - pad_noise=False, - samples=True, - ) - - # add radius 1 - if trim_frames_pre < 0: - x.object_pos[i][: abs(trim_frames_pre), 2] = 1 - if trim_frames_post < 0: - x.object_pos[i][abs(trim_frames_post) :, 2] = 1 - - return - - -def concat_meta_from_file( - audio_files: list[str], - meta_files: list[list[str]], - out_file: list[str], - input_fmt: str, - silence_pre: Optional[int] = 0, - silence_post: Optional[int] = 0, - preamble: Optional[int] = None, -) -> None: - """ - Concatenate ISM metadata from files - - Parameters - ---------- - audio_files: list[str] - List of audio file names - meta_files: list[list[str]] - List of corresponding metadata file names - out_file: list[str] - Name of concatenated output file - input_fmt: str - Input audio format - silence_pre: Optional[int] - Silence inserted before each item - silence_post: Optional[int] - Silence inserted after each item - preamble: Optional[int] - Length of preamble in milliseconds - """ - - # create audio objects - audio_objects = [] - fs = None - for i, audio_file in enumerate(audio_files): - # metadata is cut/looped to signal length in init of audio object - audio_object = audio.fromfile(input_fmt, audio_file, in_meta=meta_files[i]) - audio_objects.append(audio_object) - if fs: - if audio_object.fs != fs: - raise ValueError("Sampling rates of files to concatenate don't match") - else: - fs = audio_object.fs - - frame_length = int(IVAS_FRAME_LEN_MS * audio_objects[0].fs // 1000) - - # pad and concatenate - concat_meta_all_obj = [None] * audio_objects[0].num_channels - - for audio_item in audio_objects: - # check if audio is multiple of frame length 
- if np.shape(audio_item.audio)[0] % frame_length != 0: - raise ValueError( - f"ISM metadata padding and trimming only possible if audio length is multiple of frame length. " - f"Frame length: {IVAS_FRAME_LEN_MS}ms" - ) - - # check if metadata length fits exactly to audio length - for meta in audio_item.object_pos: - if np.shape(meta)[0] * frame_length != np.shape(audio_item.audio)[0]: - raise ValueError( - f"ISM metadata padding and trimming only possible if audio length is multiple of frame " - f"length and audio and metadata length match. Frame length: {IVAS_FRAME_LEN_MS}ms" - ) - - # pad - trim_meta( - audio_item, (-silence_pre, -silence_post) - ) # use negative value since we want to pad, not trim - - # concatenate - for idx, obj_pos in enumerate(audio_item.object_pos): - concat_meta_all_obj[idx] = ( - np.concatenate([concat_meta_all_obj[idx], obj_pos]) - if concat_meta_all_obj[idx] is not None - else obj_pos - ) - - # add preamble - if preamble: - concat_meta_all_obj = add_remove_preamble(concat_meta_all_obj, preamble) - - write_ISM_metadata_in_file(concat_meta_all_obj, out_file) - - return - - -def split_meta_in_file( - in_filename: Union[str, Path], - out_folder: Union[str, Path], - split_filenames: list[Union[str, Path]], - splits: list[int], - input_fmt: str, - meta_files: Optional[list[Union[str, Path]]] = None, - in_fs: Optional[int] = 48000, - preamble: Optional[int] = 0, -): - """ - Splits ISM metadata files into multiple shorter files - - Parameters - __________ - in_filename: Union[str, Path] - Input filenmame (.pcm, .raw or .wav) - out_folder: Union[str, Path] - Output folder where to put the splits - split_filenames: list[Union[str, Path]] - List of names for the split files - splits: list[int] - List of sample indices where to cut the signal - in_fs: Optional[int] - Input sampling rate, default 48000 Hz - """ - - # create a list of output files - out_paths = [] - - # Read input file by creating ISM audio object - audio_object = 
audio.fromfile(input_fmt, in_filename, in_meta=meta_files, fs=in_fs) - - split_old = 0 - for idx, split in enumerate(splits): - out_paths_obj = [] - for obj in range(audio_object.num_channels): - out_file = ( - Path(out_folder) - / f"{Path(split_filenames[idx]).with_suffix(in_filename.suffix)}.{obj}.csv" - ) - - # add the path to our list - out_paths_obj.append(out_file) - - # remove preamble - if preamble: - preamble_frames = int(preamble / IVAS_FRAME_LEN_MS) - y = trim( - audio_object.object_pos[obj], - audio_object.fs, - (preamble_frames, 0), - samples=True, - ) - else: - y = audio_object.object_pos[obj] - - # split - split_start = int(split_old / IVAS_FRAME_LEN_MS / audio_object.fs * 1000) - split_end = int(split / IVAS_FRAME_LEN_MS / audio_object.fs * 1000) - y = y[split_start:split_end, :] - - # write file - write_ISM_metadata_in_file([y], [out_file]) - - out_paths.append(out_paths_obj) - - split_old = split - - return out_paths - - -def check_ISM_metadata( - in_meta: dict, - num_objects: int, - num_items: int, - item_names: Optional[list] = None, -) -> list: - """Find ISM metadata""" - - list_meta = [] - if in_meta is None: - for item in item_names: - list_item = metadata_search(Path(item).parent, [item], num_objects) - list_meta.append(list_item) - else: - if len(in_meta) == 1 and num_items != 1: - # automatic search for metadata files in folder for all items and objects - try: - path_meta = in_meta["all_items"] - except KeyError: - raise ValueError( - 'Only one metadata path is given but not with key "all_items".' 
- ) - - list_meta = metadata_search(path_meta, item_names, num_objects) - - elif num_items == len(in_meta): - # search for every item individually - for item_idx in range(num_items): - # try to use item_names as keys - try: - if item_names: - current_item = in_meta[item_names[item_idx].name] - else: - raise KeyError - except KeyError: - current_item = in_meta[f"item{item_idx + 1}"] - - if len(current_item) == 1: - # automatic search in folder - list_item = metadata_search( - current_item[0], [item_names[item_idx]], num_objects - ) - - elif len(current_item) == num_objects: - # just read out - list_item = current_item - else: - raise ValueError("Number of objects and metadata does not match.") - list_meta.append(list_item) - else: - raise ValueError("Number of metadata inputs does not match number of items") - - # return list of lists of metadata files - return list_meta - - -def metadata_search( - in_meta_path: Union[str, Path], - item_names: list[Union[str, Path]], - num_objects: int, -) -> list[list[Union[Path, str]]]: - """Search for ISM metadata with structure item_name.{0-3}.csv in in_meta folder""" - - if not item_names: - raise ValueError("Item names not provided, can't search for metadata") - - list_meta = [] - for item in item_names: - list_item = [] - for obj_idx in range(num_objects): - file_name_meta = in_meta_path / Path(item.stem).with_suffix( - f"{item.suffix}.{obj_idx}.csv" - ) - # check if file exists and add to list - if file_name_meta.is_file(): - list_item.append(file_name_meta) - else: - raise ValueError(f"Metadata file {file_name_meta} not found.") - if len(item_names) == 1: - list_meta = list_item - else: - list_meta.append(list_item) - - return list_meta - - -def add_remove_preamble( - metadata, - preamble, - add: Optional[bool] = True, -): - preamble_frames = preamble / IVAS_FRAME_LEN_MS - if not preamble_frames.is_integer(): - raise ValueError( - f"Application of preamble for ISM metadata is only possible if preamble length is multiple of 
frame length. " - f"Frame length: {IVAS_FRAME_LEN_MS}ms" - ) - for obj_idx in range(len(metadata)): - if metadata is not None and metadata[obj_idx] is not None: - if add: - metadata[obj_idx] = trim( - metadata[obj_idx], - limits=(-int(preamble_frames), 0), - samples=True, - ) - - # add radius 1 - metadata[obj_idx][: int(preamble_frames), 2] = 1 - else: - metadata[obj_idx] = trim( - metadata[obj_idx], - limits=(int(preamble_frames), 0), - samples=True, - ) - - return metadata diff --git a/item_generation_scripts/audiotools/rotation.py b/item_generation_scripts/audiotools/rotation.py deleted file mode 100644 index 742548a8d7f7154a1521f651754d4fe3fc8e51bf..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/rotation.py +++ /dev/null @@ -1,379 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. 
This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -from typing import Optional, Tuple - -import numpy as np - -""" -Helper functions used by Ruedenberg, -an implementation of the algorithm in -Ivanic, J. & Ruedenberg, K., J. Phys. Chem. 
100, 6342 (1996) -translated from ivas_rotation.c -""" - - -def SHrot_p( - i: int, - l: int, - a: int, - b: int, - SHrotmat: np.ndarray, - R_lm1: np.ndarray, -) -> float: - """Helper function to calculate the ps""" - - ri1 = SHrotmat[i + 1 + 1][1 + 1 + 1] - rim1 = SHrotmat[i + 1 + 1][-1 + 1 + 1] - ri0 = SHrotmat[i + 1 + 1][0 + 1 + 1] - - if b == -l: - R_lm1_1 = R_lm1[a + l - 1][0] - R_lm1_2 = R_lm1[a + l - 1][2 * l - 2] - p = ri1 * R_lm1_1 + rim1 * R_lm1_2 - else: - if b == l: - R_lm1_1 = R_lm1[a + l - 1][2 * l - 2] - R_lm1_2 = R_lm1[a + l - 1][0] - p = ri1 * R_lm1_1 - rim1 * R_lm1_2 - else: - R_lm1_1 = R_lm1[a + l - 1][b + l - 1] - p = ri0 * R_lm1_1 - - return p - - -def SHrot_u( - l: int, - m: int, - n: int, - SHrotmat: np.ndarray, - R_lm1: np.ndarray, -) -> float: - """Helper function to calculate the us""" - - return SHrot_p(0, l, m, n, SHrotmat, R_lm1) - - -def SHrot_v( - l: int, - m: int, - n: int, - SHrotmat: np.ndarray, - R_lm1: np.ndarray, -) -> float: - """Helper function to calculate the vs""" - - if m == 0: - p0 = SHrot_p(1, l, 1, n, SHrotmat, R_lm1) - p1 = SHrot_p(-1, l, -1, n, SHrotmat, R_lm1) - return p0 + p1 - else: - if m > 0: - d = 1.0 if (m == 1) else 0.0 - p0 = SHrot_p(1, l, m - 1, n, SHrotmat, R_lm1) - p1 = SHrot_p(-1, l, -m + 1, n, SHrotmat, R_lm1) - return p0 * np.sqrt(1.0 + d) - p1 * (1.0 - d) - else: - d = 1.0 if (m == -1) else 0.0 - p0 = SHrot_p(1, l, m + 1, n, SHrotmat, R_lm1) - p1 = SHrot_p(-1, l, -m - 1, n, SHrotmat, R_lm1) - return p0 * (1.0 - d) + p1 * np.sqrt(1.0 + d) - - -def SHrot_w( - l: int, - m: int, - n: int, - SHrotmat: np.ndarray, - R_lm1: np.ndarray, -) -> float: - """Helper function to calculate the w""" - - if m == 0: - raise ValueError("ERROR should not be called\n") - else: - if m > 0: - p0 = SHrot_p(1, l, m + 1, n, SHrotmat, R_lm1) - p1 = SHrot_p(-1, l, -m - 1, n, SHrotmat, R_lm1) - return p0 + p1 - else: - p0 = SHrot_p(1, l, m - 1, n, SHrotmat, R_lm1) - p1 = SHrot_p(-1, l, -m + 1, n, SHrotmat, R_lm1) - return p0 - p1 - 
- -def SHrotmatgen( - R: np.ndarray, - order: Optional[int] = 3, -) -> np.ndarray: - """ - Calculate SHD rotation matrix from that in real space - translated from ivas_rotation.c - - Parameters: - ---------- - R: np.ndarray - real-space rotation matrix - order: Optional[int] - Ambisonics order, default = 3 - - Returns: - ---------- - SHrotmat: np.ndarray - SHD rotation matrix - """ - - dim = (order + 1) * (order + 1) - - SHrotmat = np.zeros([dim, dim]) - R_lm1 = np.zeros([dim, dim]) - R_l = np.zeros([dim, dim]) - - SHrotmat[0][0] = 1.0 - - SHrotmat[1][1] = R[1][1] - SHrotmat[1][2] = R[1][2] - SHrotmat[1][3] = R[1][0] - - SHrotmat[2][1] = R[2][1] - SHrotmat[2][2] = R[2][2] - SHrotmat[2][3] = R[2][0] - - SHrotmat[3][1] = R[0][1] - SHrotmat[3][2] = R[0][2] - SHrotmat[3][3] = R[0][0] - - for i in range(2 * 1 + 1): - for j in range(2 * 1 + 1): - R_lm1[i][j] = SHrotmat[i + 1][j + 1] - - band_idx = 4 - for l in range(2, order + 1): - R_l[:, :] = 0.0 - - for m in range(-l, l + 1): - d = 1 if (m == 0) else 0 - absm = abs(m) - sql2mm2 = np.sqrt((l * l - m * m)) - sqdabsm = np.sqrt(((1 + d) * (l + absm - 1) * (l + absm))) - sqlabsm = np.sqrt(((l - absm - 1) * (l - absm))) - - for n in range(-l, l + 1): - if abs(n) == l: - sqdenom = np.sqrt((2 * l) * (2 * l - 1)) - else: - sqdenom = np.sqrt(l * l - n * n) - - u = sql2mm2 / sqdenom - v = sqdabsm / sqdenom * (1 - 2 * d) * 0.5 - w = sqlabsm / sqdenom * (1 - d) * (-0.5) - - if u != 0: - u = u * SHrot_u(l, m, n, SHrotmat, R_lm1) - if v != 0: - v = v * SHrot_v(l, m, n, SHrotmat, R_lm1) - if w != 0: - w = w * SHrot_w(l, m, n, SHrotmat, R_lm1) - R_l[m + l][n + l] = u + v + w - - for i in range(2 * l + 1): - for j in range(2 * l + 1): - SHrotmat[band_idx + i][band_idx + j] = R_l[i][j] - - for i in range(2 * l + 1): - for j in range(2 * l + 1): - R_lm1[i][j] = R_l[i][j] - - band_idx += 2 * l + 1 - - return SHrotmat - - -def Quat2Euler( - quat: np.ndarray, - degrees: bool = True, -) -> np.ndarray: - """Convert Quaternion to Euler 
angles""" - - sinr = +2.0 * (quat[..., 0] * quat[..., 1] + quat[..., 2] * quat[..., 3]) - cosr = +1.0 - 2.0 * (quat[..., 1] * quat[..., 1] + quat[..., 2] * quat[..., 2]) - roll = np.arctan2(sinr, cosr) - - sinp = +2.0 * (quat[..., 0] * quat[..., 2] - quat[..., 3] * quat[..., 1]) - pitch = np.where(np.fabs(sinp) >= 1, np.copysign(np.pi / 2, sinp), np.arcsin(sinp)) - - siny = +2.0 * (quat[..., 0] * quat[..., 3] + quat[..., 1] * quat[..., 2]) - cosy = +1.0 - 2.0 * (quat[..., 2] * quat[..., 2] + quat[..., 3] * quat[..., 3]) - yaw = np.arctan2(siny, cosy) - - ypr = np.array([yaw, pitch, roll]).T - - if degrees: - ypr = np.rad2deg(ypr) - - return ypr - - -def Euler2Quat( - ypr: np.ndarray, - degrees: bool = True, -) -> np.ndarray: - """Convert Euler angles to Quaternion""" - - if degrees: - ypr = np.deg2rad(ypr) - - if len(ypr.shape) == 2: - N_quat = ypr.shape[0] - quat = np.zeros([N_quat, 4]) - yaw = ypr[:, 0] - pitch = ypr[:, 1] - roll = ypr[:, 2] - else: - quat = np.zeros([4]) - yaw = ypr[0] - pitch = ypr[1] - roll = ypr[2] - - c1 = np.cos(0.5 * yaw) - c2 = np.cos(0.5 * pitch) - c3 = np.cos(0.5 * roll) - - s1 = np.sin(0.5 * yaw) - s2 = np.sin(0.5 * pitch) - s3 = np.sin(0.5 * roll) - - quat[..., 0] = c3 * c2 * c1 + s3 * s2 * s1 - quat[..., 1] = s3 * c2 * c1 - c3 * s2 * s1 - quat[..., 2] = s3 * c2 * s1 + c3 * s2 * c1 - quat[..., 3] = c3 * c2 * s1 - s3 * s2 * c1 - - return quat - - -def Quat2RotMat( - quat: np.ndarray, -) -> np.ndarray: - """Convert quaternion to rotation matrix""" - - R = np.zeros([3, 3]) - - if quat[0] != -3: - # Quaternions - # formula taken from ivas_rotation.c - - R[0, 0] = ( - quat[0] * quat[0] - + quat[1] * quat[1] - - quat[2] * quat[2] - - quat[3] * quat[3] - ) - R[0, 1] = 2.0 * (quat[1] * quat[2] - quat[0] * quat[3]) - R[0, 2] = 2.0 * (quat[1] * quat[3] + quat[0] * quat[2]) - - R[1, 0] = 2.0 * (quat[1] * quat[2] + quat[0] * quat[3]) - R[1, 1] = ( - quat[0] * quat[0] - - quat[1] * quat[1] - + quat[2] * quat[2] - - quat[3] * quat[3] - ) - R[1, 2] 
= 2.0 * (quat[2] * quat[3] - quat[0] * quat[1]) - - R[2, 0] = 2.0 * (quat[1] * quat[3] - quat[0] * quat[2]) - R[2, 1] = 2.0 * (quat[2] * quat[3] + quat[0] * quat[1]) - R[2, 2] = ( - quat[0] * quat[0] - - quat[1] * quat[1] - - quat[2] * quat[2] - + quat[3] * quat[3] - ) - - else: - # Euler angles in R_X(roll)*R_Y(pitch)*R_Z(yaw) convention - # - # yaw: rotate scene counter-clockwise in the horizontal plane - # pitch: rotate scene in the median plane, increase elevation with positive values - # roll: rotate scene from the right ear to the top - # - # formula taken from ivas_rotation.c - - c1 = np.cos(quat[3] / 180.0 * np.pi) - c2 = np.cos(quat[2] / 180.0 * np.pi) - c3 = np.cos(quat[1] / 180.0 * np.pi) - - s1 = np.sin(quat[3] / 180.0 * np.pi) - s2 = np.sin(-quat[2] / 180.0 * np.pi) - s3 = np.sin(quat[1] / 180.0 * np.pi) - - R[0, 0] = c2 * c3 - R[0, 1] = -c2 * s3 - R[0, 2] = s2 - - R[1, 0] = c1 * s3 + c3 * s1 * s2 - R[1, 1] = c1 * c3 - s1 * s2 * s3 - R[1, 2] = -c2 * s1 - - R[2, 0] = s1 * s3 - c1 * c3 * s2 - R[2, 1] = c3 * s1 + c1 * s2 * s3 - R[2, 2] = c1 * c2 - - return R - - -def rotateAziEle( - azi: float, - ele: float, - R: np.ndarray, - is_planar: bool = False, -) -> Tuple[float, float]: - """Rotate azimuth and elevation angles with rotation matrix""" - - w = np.cos(np.deg2rad(ele)) - dv = np.array( - [ - w * np.cos(np.deg2rad(azi)), - w * np.sin(np.deg2rad(azi)), - np.sin(np.deg2rad(ele)), - ] - ) - - dv_rot = R @ dv - - azi = np.rad2deg(np.arctan2(dv_rot[1], dv_rot[0])) - if is_planar: - ele = 0 - else: - ele = np.rad2deg(np.arctan2(dv_rot[2], np.sqrt(np.sum(dv_rot[:2] ** 2)))) - - return azi, ele diff --git a/item_generation_scripts/audiotools/utils.py b/item_generation_scripts/audiotools/utils.py deleted file mode 100644 index 6aaf5fa9ee27f8e6f8c9ca1d0b8d7fb7cd3ca607..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/utils.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public 
Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. 
-# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -from pathlib import Path - -import numpy as np - -from item_generation_scripts.audiotools.rotation import Euler2Quat, Quat2Euler - - -def read_trajectory(trj_file: Path, return_quat=True): - trj = np.genfromtext(trj_file, delimiter=",") - - if np.all(trj[:, 0] == -3): - # Euler - if return_quat: - return Euler2Quat(trj[:, 1:]) - else: - return trj[:, 1:] - else: - # Quat - if return_quat: - return trj - else: - return Quat2Euler(trj) - - -def write_trajectory(trj, out_file, write_quat=True): - if trj.shape[1] == 3: - # Euler - if write_quat: - trj = Euler2Quat(trj) - else: - trj = np.insert(trj, 0, -3.0, axis=1) - elif not write_quat: - trj = Quat2Euler(trj) - trj = np.insert(trj, 0, -3.0, axis=1) - - with open(out_file, "w") as f: - for pos in trj: - f.write(", ".join([f"{q:.6f}" for q in pos])) - f.write("\n") diff --git a/item_generation_scripts/audiotools/wrappers/__init__.py b/item_generation_scripts/audiotools/wrappers/__init__.py deleted file mode 100644 index aea270d8d1752e772ab716bc33be0bf7b8a0cf35..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/wrappers/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# diff --git a/item_generation_scripts/audiotools/wrappers/bs1770.py b/item_generation_scripts/audiotools/wrappers/bs1770.py deleted file mode 100644 index d238bec3ded6c31b471ccfe1a1a03ea31cffcb15..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/wrappers/bs1770.py +++ /dev/null @@ -1,291 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. 
All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -import copy -import logging -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Optional, Tuple, Union -from warnings import warn - -import numpy as np - -from item_generation_scripts.audiotools import audio, convert -from item_generation_scripts.audiotools.audiofile import write -from item_generation_scripts.audiotools.wrappers.filter import resample_itu -from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES -from item_generation_scripts.utils import find_binary, get_devnull, run - -logger = logging.getLogger("__main__") -logger.setLevel(logging.DEBUG) - - -def bs1770demo( - input: audio.Audio, - target_loudness: Optional[float] = -26, -) -> Tuple[float, float]: - """ - Wrapper for ITU-R BS.1770-4, requires bs1770demo binary - - Parameters - ---------- - input: Audio - Input audio - target_loudness: Optional[float] - Desired loudness in LKFS - - Returns - ------- - measured_loudness : float - Measured loudness of input - scale_factor: float - Scale factor to achieve desired loudness - """ - - null_file = get_devnull() - - if "bs1770demo" in DEFAULT_CONFIG_BINARIES["binary_paths"]: - binary = find_binary( - DEFAULT_CONFIG_BINARIES["binary_paths"]["bs1770demo"].name, - binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["bs1770demo"].parent, - ) - else: - binary = find_binary("bs1770demo") - - if not isinstance(input, audio.BinauralAudio) and not isinstance( - input, audio.ChannelBasedAudio - 
): - raise NotImplementedError(f"{input.name} is unsupported in ITU-R BS.1770-4.") - - if input.fs != 48000: - warn( - "ITU-R BS.1770-4 only supports 48kHz sampling rate. Temporarily resampling signal for measurement." - ) - tmp_sig = resample_itu(input, 48000) - else: - tmp_sig = input.audio - - with TemporaryDirectory() as tmp_dir: - tmp_dir = Path(tmp_dir) - tmp_file = tmp_dir.joinpath("tmp_loudness.pcm") - - """ - ITU-R BS.1770-4 - """ - - cmd = [ - str(binary), - "-nchan", - str(input.num_channels), # input nchan - "-lev", - str(target_loudness), # level - "-conf", - "", # config string - str(tmp_file), - null_file, - ] - - if isinstance(input, audio.BinauralAudio): - cmd[6] = "00" # -conf - elif isinstance(input, audio.ChannelBasedAudio): - # if loudspeaker position fulfills the criteria, set the config string to 1 for that index - conf_str = [ - str(int(abs(e) < 30 and (60 <= abs(a) <= 120))) - for a, e in zip(input.ls_azi, input.ls_ele) - ] - for lfe in input.lfe_index: - conf_str[lfe] = "L" - - cmd[6] = "".join(conf_str) - - # write temporary file - write(tmp_file, tmp_sig, 48000) - - # run command - result = run(cmd, logger=logger) - - # parse output - measured_loudness = float(result.stdout.splitlines()[3].split(":")[1]) - scale_factor = float(result.stdout.splitlines()[-3].split(":")[1]) - - return measured_loudness, scale_factor - - -def get_loudness( - input: audio.Audio, - target_loudness: Optional[float] = -26, - loudness_format: Optional[str] = None, -) -> Tuple[float, float]: - """ - Loudness measurement using ITU-R BS.1770-4 - - Parameters - ---------- - input : Audio - Input audio - target_loudness: float - Desired loudness in LKFS - loudness_format: str - Loudness format to render to for loudness computation (default input format if possible) - - Returns - ------- - measured_loudness : float - Measured loudness (after conversion to loudness_format if specified) - scale_factor: float - Scale factor to acheive desired loudness - """ - - if 
target_loudness > 0: - raise ValueError("Desired loudness is too high!") - - if loudness_format is None: - # for some formats rendering is necessary prior to loudness measurement - if isinstance(input, audio.SceneBasedAudio) or isinstance( - input, audio.MetadataAssistedSpatialAudio - ): - loudness_format = "7_1_4" - elif isinstance(input, audio.ObjectBasedAudio): - loudness_format = "BINAURAL" - elif hasattr(input, "layout_file"): - loudness_format = input.layout_file - else: - # default use input format - loudness_format = input.name - - # configure intermediate format - tmp = audio.fromtype(loudness_format) - tmp.fs = input.fs - - if input.name != loudness_format: - convert.format_conversion(input, tmp) - else: - tmp.audio = input.audio - - return bs1770demo(tmp, target_loudness) - - -def loudness_norm( - input: audio.Audio, - target_loudness: Optional[float] = -26, - loudness_format: Optional[str] = None, -) -> np.ndarray: - """ - Iterative loudness normalization using ITU-R BS.1770-4 - Signal is iteratively scaled after rendering to the specified format - until loudness converges to the target value - - Parameters - ---------- - input : Audio - Input audio - target_loudness: Optional[float] - Desired loudness level in LKFS - loudness_format: Optional[str] - Loudness format to render to for loudness computation (default input format) - - Returns - ------- - norm : Audio - Normalized audio - """ - - # repeat until convergence of loudness - measured_loudness = np.inf - scale_factor = 1 - num_iter = 1 - - while np.abs(measured_loudness - target_loudness) > 0.5 and num_iter < 10: - measured_loudness, scale_factor_new = get_loudness( - input, target_loudness, loudness_format - ) - - # scale input - input.audio *= scale_factor_new - - # update scale factor - scale_factor *= scale_factor_new - - num_iter += 1 - - if num_iter >= 10: - warn( - f"Loudness did not converge to desired value, stopping at: {measured_loudness:.2f}" - ) - - return input.audio - - -def 
scale_files( - file_list: list[list[Union[Path, str]]], - fmt: str, - loudness: float, - fs: Optional[int] = 48000, - in_meta: Optional[list] = None, -) -> None: - """ - Scales audio files to desired loudness - - Parameters - ---------- - file_list : list[list[Union[Path, str]]] - List of file paths in a list of the condition folders - fmt: str - Audio format of files in list - loudness: float - Desired loudness level in LKFS/dBov - fs: Optional[int] - Sampling rate - in_meta: Optional[list] - Metadata for ISM with same structure as file_list but one layer more - for the list of metadata for one file - """ - - if fmt.startswith("ISM") and in_meta: - meta_bool = True - else: - in_meta = copy.copy(file_list) - meta_bool = False - - for folder, meta_folder in zip(file_list, in_meta): - for file, meta in zip(folder, meta_folder): - # create audio object - if meta_bool: - audio_obj = audio.fromfile(fmt, file, fs, meta) - else: - audio_obj = audio.fromfile(fmt, file, fs) - - # adjust loudness - scaled_audio = loudness_norm(audio_obj, loudness) - - # write into file - write(file, scaled_audio, audio_obj.fs) diff --git a/item_generation_scripts/audiotools/wrappers/eid_xor.py b/item_generation_scripts/audiotools/wrappers/eid_xor.py deleted file mode 100644 index 0b807d940576c0cedfaaf802d82c4986d732dd47..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/wrappers/eid_xor.py +++ /dev/null @@ -1,193 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. 
-# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -import os.path -from pathlib import Path -from typing import Optional, Union - -from item_generation_scripts.audiotools.wrappers.gen_patt import create_error_pattern -from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES -from item_generation_scripts.utils import find_binary, run - - -def eid_xor( - error_pattern: Union[str, Path], - in_bitstream: Union[str, Path], - out_bitstream: Union[str, Path], -) -> None: - """ - Wrapper for eid-xor binary to apply error patterns for the bitstream processing - - Parameters - ---------- - error_pattern: Union[str, Path] - Path to error pattern file - in_bitstream: Union[str, Path] - Path to input bitstream file - out_bitstream: Union[str, Path] - Output path for modified bitstream - """ - - # find binary - if "eid-xor" in DEFAULT_CONFIG_BINARIES["binary_paths"]: - binary = find_binary( - DEFAULT_CONFIG_BINARIES["binary_paths"]["eid-xor"].name, - binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["eid-xor"].parent, - ) - else: - binary = find_binary("eid-xor") - - # check for valid inputs - if not Path(in_bitstream).is_file(): - raise ValueError( - f"Input bitstream file {in_bitstream} for bitstream processing does not exist" - ) - elif not Path(error_pattern).is_file(): - raise ValueError( - f"Error pattern file {error_pattern} for bitstream processing does not exist" - ) - - # set up command line - cmd = [ - str(binary), - "-vbr", # Enables variable bit rate operation - "-fer", # Error pattern is a frame erasure pattern - in_bitstream, - error_pattern, - out_bitstream, - ] - - # run command - run(cmd) - - return - - -def create_and_apply_error_pattern( - in_bitstream: Union[Path, str], - out_bitstream: Union[Path, str], - len_sig: int, - error_pattern: Optional[Union[Path, str]] = None, - error_rate: Optional[float] = None, - preamble: Optional[int] = 0, - master_seed: Optional[int] = 0, - prerun_seed: Optional[int] = 0, -) -> None: - """ - Function to create (or use existing) frame error pattern for 
bitstream processing - - Parameters - ---------- - in_bitstream: Union[Path, str] - Path of input bitstream - out_bitstream: Union[Path, str] - Path of output bitstream - len_sig: int - Length of signal in frames - error_pattern: Optional[Union[Path, str]] - Path to existing error pattern - error_rate: float - Error rate in percent - preamble: Optional[int] - Length of preamble in frames - master_seed: Optional[int] - Master seed for error pattern generation - prerun_seed: Optional[int] - Number of preruns in seed generation - """ - - if error_pattern is None: - # create error pattern - if error_rate is not None: - error_pattern = in_bitstream.parent.joinpath("error_pattern").with_suffix( - ".192" - ) - create_error_pattern( - len_sig, error_pattern, error_rate, preamble, master_seed, prerun_seed - ) - else: - raise ValueError( - "Either error pattern or error rate has to be specified for bitstream processing" - ) - elif error_rate is not None: - raise ValueError( - "Error pattern and error rate are specified for bitstream processing. Can't use both" - ) - - # apply error pattern - eid_xor(error_pattern, in_bitstream, out_bitstream) - - return - - -def validate_error_pattern_application( - error_pattern: Optional[Union[Path, str]] = None, - error_rate: Optional[int] = None, -) -> None: - """ - Validate settings for the network simulator - - Parameters - ---------- - error_pattern: Optional[Union[Path, str]] - Path to existing error pattern - error_rate: Optional[int] - Frame error rate - """ - - if find_binary("gen-patt") is None: - raise FileNotFoundError( - "The binary gen-patt for error pattern generation was not found! Please check the configuration." - ) - if find_binary("eid-xor") is None: - raise FileNotFoundError( - "The binary eid-xor for error patter application was not found! Please check the configuration." 
- ) - if error_pattern is not None: - if not os.path.exists(os.path.realpath(error_pattern)): - raise FileNotFoundError( - f"The frame error profile file {error_pattern} was not found! Please check the configuration." - ) - if error_rate is not None: - raise ValueError( - "Frame error pattern and error rate are specified for bitstream processing. Can't use both! Please check the configuration." - ) - else: - if error_rate is None: - raise ValueError( - "Either error rate or error pattern has to be specified for FER bitstream processing." - ) - elif error_rate < 0 or error_rate > 100: - raise ValueError( - f"Specified error rate of {error_rate}% is either too large or too small." - ) - return diff --git a/item_generation_scripts/audiotools/wrappers/esdru.py b/item_generation_scripts/audiotools/wrappers/esdru.py deleted file mode 100644 index 7785a586735679526c524a97cfcc8c2e1f69fa61..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/wrappers/esdru.py +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Optional - -import numpy as np - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.audiofile import read, write -from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES -from item_generation_scripts.utils import find_binary, run - - -def esdru( - input: audio.Audio, - alpha: float, - sf: Optional[int] = 48000, - e_step: Optional[float] = 0.5, - seed: Optional[int] = 1, -) -> np.ndarray: - """ - Wrapper for ESDRU (Ericsson spatial distortion reference unit) Recommendation ITU-T P.811, requires esdru binary - - Parameters - ---------- - input : Audio - Input audio (16 bit Stereo PCM) - alpha: float - Alpha value [0.0 ... 1.0] - sf: Optional[int] - Sampling frequency FS Hz (Default: 48000 Hz) - e_step: Optional[float] - Max step S during high energy [0.0 ... 1.0] (Default: 0.5) - seed: Optional[int] - Set random seed I [unsigned int] (Default: 1) - - Returns - ------- - output: np.ndarray - Output array (16 bit Stereo PCM) - """ - if "esdru" in DEFAULT_CONFIG_BINARIES["binary_paths"]: - binary = find_binary( - DEFAULT_CONFIG_BINARIES["binary_paths"]["esdru"].name, - binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["esdru"].parent, - ) - else: - binary = find_binary("esdru") - - if not isinstance(input, audio.BinauralAudio) and not input.name == "STEREO": - raise Exception( - "ESDRU condition only available for STEREO or BINAURAL output format" - ) - - if alpha < 0.0 or alpha > 1.0: - raise Exception( - "Alpha value is out of bounds. Please choose a value between 0.0 and 1.0." - ) - - if e_step < 0.0 or e_step > 1.0: - raise Exception( - "Step value is out of bounds. Please choose a value between 0.0 and 1.0." 
- ) - - tmp_input_signal = input.audio - tmp_output_signal = np.ones((48000, 2)) - - with TemporaryDirectory() as tmp_dir: - tmp_dir = Path(tmp_dir) - tmp_input_file = tmp_dir.joinpath("tmp_input_signal.raw") - tmp_output_file = tmp_dir.joinpath("tmp_output_signal.raw") - - """ - ITU-T Recommendation P.811, ESDRU - """ - - cmd = [ - str(binary), - "-sf", - str(sf), - "-e_step", - str(e_step), - "-seed", - str(seed), - str(alpha), - str(tmp_input_file), - str(tmp_output_file), - ] - - # write temporary file - write(tmp_input_file, tmp_input_signal, sf) - write(tmp_output_file, tmp_output_signal, sf) - - # run command - run(cmd) - - tmp_output_signal, out_fs = read(tmp_output_file, 2, sf) - - return tmp_output_signal diff --git a/item_generation_scripts/audiotools/wrappers/filter.py b/item_generation_scripts/audiotools/wrappers/filter.py deleted file mode 100644 index 4c7b61b4ae8d1837f04b51baf7e29d79dab38759..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/wrappers/filter.py +++ /dev/null @@ -1,366 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. 
LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -import re -from copy import copy -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Optional -from warnings import warn - -import numpy as np - -from item_generation_scripts.audiotools.audio import Audio, ChannelBasedAudio -from item_generation_scripts.audiotools.audioarray import delay_compensation -from item_generation_scripts.audiotools.audiofile import read, write -from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES -from item_generation_scripts.utils import find_binary, run - -FILTER_TYPES_REGEX = r"[\n][\s]{3}[A-Z0-9]\w+\s+" - - -def filter_itu( - input: Audio, - flt_type: str, - block_size: Optional[int] = None, - mod: Optional[bool] = False, - up: Optional[bool] = False, - down: Optional[bool] = False, - is_async: Optional[bool] = False, - delay: Optional[int] = None, - skip_channel: Optional[list[int]] = None, -) -> np.ndarray: - """ - Low-pass filter a multi-channel audio array - - Parameters - ---------- - input: Audio - Input array - flt_type: str - Name of filter type used for filtering - block_size: Optional[int] - Processing block size in number of samples (default 256 samples) - mod: Optional[bool] - Flag for using the modified IRS characteristic - up: Optional[bool] - Flag for up-sampling - down: Optional[bool] - Flag for down-sampling - is_async: Optional[bool] - Flag for asynchronization operation - delay: Optional[int] - Delay in number of samples - skip_channel: Optional[list[int]] - List of channel indices which should not be filtered - - Returns - ------- - output: np.ndarray - Output filtered array - """ - - if "filter" in DEFAULT_CONFIG_BINARIES["binary_paths"]: - binary = find_binary( - DEFAULT_CONFIG_BINARIES["binary_paths"]["filter"].name, - binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["filter"].parent, - ) - else: - binary = find_binary("filter") - - # check if filter type is supported - tmp = run([binary], check=False) - - FILTER_TYPES = [ - f.group().strip() for f in 
re.finditer(FILTER_TYPES_REGEX, tmp.stdout) - ] - - if flt_type not in FILTER_TYPES: - raise ValueError( - f"Filter type {flt_type} does not seem to be supported by the binary: {FILTER_TYPES}" - ) - - # create command line - cmd = [ - binary, - "-q", - ] - - if mod: - cmd.append("-mod") - if up and down: - raise ValueError("Either up-sampling or down-sampling has to be chosen") - if up: - cmd.append("-up") - elif down: - cmd.append("-down") - if is_async: - cmd.append("-async") - if delay: - cmd.extend(["-delay", str(delay)]) - - cmd.append(str(flt_type)) - - # create output array with according size - if up: - # upsampling -> size increases - if flt_type == "SHQ2": - output = np.zeros((np.shape(input.audio)[0] * 2, np.shape(input.audio)[1])) - elif flt_type == "SHQ3": - output = np.zeros((np.shape(input.audio)[0] * 3, np.shape(input.audio)[1])) - else: - raise ValueError(f"No upsampling with {flt_type} possible") - elif down: - # downsampling -> size decreases - if flt_type == "SHQ2": - output = np.zeros( - (int(np.ceil(np.shape(input.audio)[0] / 2)), np.shape(input.audio)[1]) - ) - elif flt_type == "SHQ3": - output = np.zeros( - (int(np.ceil(np.shape(input.audio)[0] / 3)), np.shape(input.audio)[1]) - ) - else: - raise ValueError(f"No downsampling with {flt_type} possible") - else: - # normal filtering -> size remains - output = np.zeros_like(input.audio) - - with TemporaryDirectory() as tmp_dir: - tmp_dir = Path(tmp_dir) - - # process channels separately - for channel in range(input.num_channels): - if skip_channel and channel in skip_channel: - continue - - cmd_in_out = cmd.copy() - - tmp_in = tmp_dir.joinpath(f"tmp_filterIn{channel}.pcm") - tmp_out = tmp_dir.joinpath(f"tmp_filterOut{channel}.pcm") - - cmd_in_out.append(str(tmp_in)) - cmd_in_out.append(str(tmp_out)) - - if block_size: - cmd_in_out.append(str(block_size)) - - write(tmp_in, input.audio[:, channel], input.fs) - - run(cmd_in_out) - - a, _ = read(tmp_out, nchannels=1, fs=input.fs) - output[:, 
channel][:, None] = a - - return output - - -def lpfilter_itu( - x: Audio, - fc: int, -) -> np.ndarray: - """ - Low-pass filter a multi-channel audio array - - Parameters - ---------- - x: Audio - Input audio - fc: int - Cut-off frequency in Hz - - Returns - ------- - y: np.ndarray - Output low-pass filtered array - """ - - # find right filter type for cut-off frequency - flt_types = ["LP1p5", "LP35", "LP7", "LP10", "LP12", "LP14", "LP20"] - flt_vals = [1500, 3500, 7000, 10000, 12000, 14000, 20000] - try: - flt_type = flt_types[flt_vals.index(fc)] - except Exception: - raise ValueError(f"LP cut-off frequency {fc}Hz not supported.") - - # resample if samplingrate is not supported - old_fs = None - tmp = copy(x) - if x.fs != 48000: - warn( - f"Filter type {flt_type} only supported for 48kHz samplingrate, not for {x.fs}Hz -> resampling" - ) - old_fs = x.fs - tmp.audio = resample_itu(tmp, 48000) - tmp.fs = 48000 - - # apply filter - y = filter_itu(tmp, flt_type=flt_type, block_size=960) - - # delay compensation - y = delay_compensation(y, flt_type=flt_type, fs=tmp.fs) - - # reverse resampling - if old_fs: - tmp.audio = y - y = resample_itu(tmp, old_fs) - - return y - - -def hp50filter_itu( - x: Audio, -) -> np.ndarray: - """ - High-pass 50Hz filter a multi-channel audio array - - Parameters - ---------- - x: Audio - Input audio - - Returns - ------- - y: np.ndarray - Output high-pass filtered array - """ - - # set filter type and check if sampling rate is supported - old_fs = None - tmp = copy(x) - if x.fs == 48000: - flt_type = "HP50_48KHZ" - elif x.fs == 32000: - flt_type = "HP50_32KHZ" - else: - # resample if samplingrate is not supported - warn( - f"Filter type HP50 only supported for 48kHz and 32kHz samlingrate, not for {x.fs}Hz -> resampling" - ) - flt_type = "HP50_48KHZ" - old_fs = x.fs - tmp.audio = resample_itu(tmp, 48000) - tmp.fs = 48000 - - # don't apply high-pass filtering to LFE channel - if isinstance(x, ChannelBasedAudio): - skip_channel = x.lfe_index - 
else: - skip_channel = None - - # apply filter - y = filter_itu(tmp, flt_type=flt_type, skip_channel=skip_channel) - - # delay compensation - y = delay_compensation(y, flt_type=flt_type, fs=tmp.fs) - - # reverse resampling - if old_fs: - tmp.audio = y - y = resample_itu(tmp, old_fs) - - return y - - -def resample_itu( - x: Audio, - fs_new: int, -) -> np.ndarray: - """ - Resampling of multi-channel audio array - - Parameters - ---------- - x: Audio - Input audio - fs_new: int - Target sampling rate in Hz - - Returns - ------- - y: np.ndarray - Output resampled array - """ - - fs_old = x.fs - - # if samplingrate is the same do nothing - if fs_new == fs_old: - return x.audio - - ratio_fs = fs_new / fs_old - up = [False] - down = [False] - - # select suitable processing to achieve target samplingrate - if ratio_fs == 2: - flt_type = ["SHQ2"] - up = [True] - elif ratio_fs == 0.5: - flt_type = ["SHQ2"] - down = [True] - elif ratio_fs == 3: - flt_type = ["SHQ3"] - up = [True] - elif ratio_fs == 1 / 3: - flt_type = ["SHQ3"] - down = [True] - elif ratio_fs == 2 / 3: - flt_type = ["SHQ2", "SHQ3"] - up = [True, False] - down = [False, True] - elif ratio_fs == ratio_fs == 3 / 2: - flt_type = ["SHQ3", "SHQ2"] - up = [True, False] - down = [False, True] - else: - raise ValueError("Ratio of input and output sampling frequency not supported") - - # apply filter - y = copy(x) - for i, flt in enumerate(flt_type): - y.audio = filter_itu(y, flt_type=flt, up=up[i], down=down[i]) - y.audio = delay_compensation( - y.audio, flt_type=flt, fs=y.fs, up=up[i], down=down[i] - ) - # if up[i]: - # if flt == "SHQ2": - # y.fs = y.fs * 2 - # elif flt == "SHQ3": - # y.fs = y.fs * 3 - # elif down[i]: - # if flt == "SHQ2": - # y.fs = int(y.fs / 2) - # elif flt == "SHQ3": - # y.fs = int(y.fs / 3) - - return y.audio diff --git a/item_generation_scripts/audiotools/wrappers/gen_patt.py b/item_generation_scripts/audiotools/wrappers/gen_patt.py deleted file mode 100644 index 
a68706a7556f610ed719f32288977fcc6fb20050..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/wrappers/gen_patt.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. 
-# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -from os import getcwd -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Optional, Union - -from item_generation_scripts.audiotools.wrappers.random_seed import random_seed -from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES -from item_generation_scripts.utils import find_binary, run - -ERROR_PATTERNS_DIR = Path(__file__).parent.parent.parent.joinpath("error_patterns") - - -def gen_patt( - len_sig: int, - path_pattern: Union[Path, str], - error_rate: float, - start: Optional[int] = 0, - working_dir: Optional[Union[Path, str]] = None, -) -> None: - """ - Wrapper for gen-patt binary to create error patterns for the bitstream processing - - Parameters - ---------- - len_sig: int - Length of signal in frames - path_pattern: Union[Path, str] - Path of output pattern - error_rate: float - Error rate in percent - start: Optional[int] - Start frame of error pattern (length preamble) - working_dir: Optional[Union[Path, str]] - Directory where binary should be called (sta file has to be in this dir if desired) - """ - - # find binary - if "gen-patt" in DEFAULT_CONFIG_BINARIES["binary_paths"]: - binary = find_binary( - DEFAULT_CONFIG_BINARIES["binary_paths"]["gen-patt"].name, - binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["gen-patt"].parent, - ) - else: - binary = find_binary("gen-patt") - - if working_dir is None: - working_dir = getcwd() - - # set up command line - cmd = [ - str(binary), - "-tailstat", # Statistics performed on the tail - "-fer", # Frame erasure mode using Gilbert model - "-g192", # Save error pattern in 
16-bit G.192 format - "-gamma", # Correlation for BER|FER modes - str(0), - "-rate", - str(error_rate / 100), - "-tol", # Max deviation of specified BER/FER/BFER - str(0.001), - "-reset", # Reset EID state in between iteractions - "-n", - str(int(len_sig)), - "-start", - str(int(start) + 1), - path_pattern, - ] - - # run command - run(cmd, cwd=working_dir) - - return - - -def create_error_pattern( - len_sig: int, - path_pattern: Union[Path, str], - frame_error_rate: float, - preamble: Optional[int] = 0, - master_seed: Optional[int] = 0, - prerun_seed: Optional[int] = 0, -) -> None: - """ - Creates error pattern with desired frame error rate for bitstream processing - - Parameters - ---------- - len_sig: int - Length of signal in frames - path_pattern: Union[Path, str] - Path of output pattern - frame_error_rate: float - Error rate in percent - preamble: Optional[int] - Length of preamble in frames - master_seed: Optional[int] - Master seed for error pattern generation - prerun_seed: optional[int] - Number of preruns in seed generation - """ - - with TemporaryDirectory() as tmp_dir: - tmp_dir = Path(tmp_dir) - - sta_file = ERROR_PATTERNS_DIR.joinpath("sta_template") - tmp_sta_file = tmp_dir.joinpath("sta") - - # compute seed - seed = random_seed((0, 99999999), master_seed, prerun_seed) - - # open file and modify - lines = [] - with open(sta_file, "r") as sta_file_txt: - lines.append(sta_file_txt.readline()) # not changed - lines.append(f"{sta_file_txt.readline()[:-2]}{frame_error_rate/100}\n") - lines.append(sta_file_txt.readline()) # not changed - lines.append(f"{sta_file_txt.readline()[:-2]}{seed}\n") - lines.append(sta_file_txt.readline()) # not changed - lines.append( - f"{sta_file_txt.readline()[:-2]}{1-(frame_error_rate/100*2)}\n" - ) - lines.append(sta_file_txt.readline()) # not changed - lines.append( - f"{sta_file_txt.readline()[:-2]}{1-(frame_error_rate/100*2)}\n" - ) - lines.append(sta_file_txt.readline()) # not changed - - with open(tmp_sta_file, "w") as 
tmp_sta_file_txt: - tmp_sta_file_txt.write("".join(lines)) - - gen_patt( - len_sig=len_sig, - error_rate=frame_error_rate, - path_pattern=path_pattern, - start=preamble, - working_dir=tmp_dir, - ) - - return diff --git a/item_generation_scripts/audiotools/wrappers/masaRenderer.py b/item_generation_scripts/audiotools/wrappers/masaRenderer.py deleted file mode 100644 index a5987b1e0544c72e1aeaa7a482fc6771bc1ea970..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/wrappers/masaRenderer.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. 
-# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -from pathlib import Path -from tempfile import TemporaryDirectory - -import numpy as np - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.audiofile import read, write -from item_generation_scripts.audiotools.wrappers.filter import resample_itu -from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES -from item_generation_scripts.utils import find_binary, run - - -def masaRenderer( - masa: audio.MetadataAssistedSpatialAudio, - out_fmt: str, -) -> np.ndarray: - """ - Wrapper for masaRenderer (from MASA reference software) - - Parameters - ---------- - masa : MetadataAssistedSpatialAudio - Input MASA audio - out_fmt: str - Desired output format (only 5_1, 7_1_4 and BINAURAL supported) - - Returns - ------- - output : np.ndarray - MASA rendered to out_fmt - """ - - if "masaRenderer" in DEFAULT_CONFIG_BINARIES["binary_paths"]: - binary = find_binary( - DEFAULT_CONFIG_BINARIES["binary_paths"]["masaRenderer"].name, - binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["masaRenderer"].parent, - ) - else: - binary = find_binary("masaRenderer") - - if out_fmt not in ["5_1", "7_1_4", "BINAURAL"]: - raise ValueError(f"Output format {out_fmt} is not 
supported by MasaRenderer!") - - if out_fmt == "5_1": - output_mode = "-LS51" - num_channels = 6 - elif out_fmt == "7_1_4": - output_mode = "-LS714" - num_channels = 12 - else: - output_mode = "-BINAURAL" - num_channels = 2 - - cmd = [ - str(binary), - output_mode, - "", # 2 -> inputPcm - str(masa.metadata_files.resolve()), - "", # 4 -> outputPcm - ] - - with TemporaryDirectory() as tmp_dir: - tmp_dir = Path(tmp_dir) - tmp_in = tmp_dir.joinpath("tmp_masaRendIn.pcm") - tmp_out = tmp_dir.joinpath("tmp_masaRendOut.pcm") - - cmd[2] = str(tmp_in) - cmd[4] = str(tmp_out) - - tmp_audio = resample_itu(masa, 48000) - old_fs = masa.fs - - write(tmp_in, tmp_audio, 48000) - - # we need to run in the masaRenderer directory to use the .bin files it requires - run(cmd, cwd=binary.resolve().parent) - - output, _ = read(tmp_out, num_channels) - - output_audio = audio.fromtype(out_fmt) - output_audio.audio = output - output_audio.fs = 48000 - output = resample_itu(output_audio, old_fs) - - return output diff --git a/item_generation_scripts/audiotools/wrappers/networkSimulator.py b/item_generation_scripts/audiotools/wrappers/networkSimulator.py deleted file mode 100644 index 4e74c3ceb70e8fe0a59b56f732ce080a07f69343..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/wrappers/networkSimulator.py +++ /dev/null @@ -1,224 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. 
-# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -import logging -import os.path -from pathlib import Path -from typing import Optional, Union - -from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES -from item_generation_scripts.utils import find_binary, run - -LIST_JBM_PROFILES = range(12) -ERROR_PATTERNS_DIR = Path(__file__).parent.parent.parent.joinpath("dly_error_profiles") - - -def validate_network_simulator( - error_pattern: Optional[Union[Path, str]] = None, - error_profile: Optional[int] = None, - n_frames_per_packet: Optional[int] = None, -) -> None: - """ - Validate settings for the network simulator - - Parameters - ---------- - error_pattern: Optional[Union[Path, str]] - Path to existing error pattern - error_profile: Optional[int] - Index of existing error pattern - n_frames_per_packet: Optional[int] - Number of frames per paket - """ - - if "networkSimulator_g192" in DEFAULT_CONFIG_BINARIES["binary_paths"]: - binary = find_binary( - DEFAULT_CONFIG_BINARIES["binary_paths"]["networkSimulator_g192"].name, - binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"][ - "networkSimulator_g192" - ].parent, - ) - else: - binary = find_binary("networkSimulator_g192") - - if binary is None: - raise FileNotFoundError( - "The network simulator binary was not found! Please check the configuration." - ) - if error_pattern is not None: - if not os.path.exists(os.path.realpath(error_pattern)): - raise FileNotFoundError( - f"The network simulator error profile file {error_pattern} was not found! Please check the configuration." - ) - if error_profile is not None: - raise ValueError( - "JBM pattern and JBM profile number are specified for bitstream processing. Can't use both! Please check the configuration." 
- ) - elif error_profile is not None: - if error_profile not in LIST_JBM_PROFILES: - raise ValueError( - f"JBM profile number {error_profile} does not exist, should be between {LIST_JBM_PROFILES[0]} and {LIST_JBM_PROFILES[-1]}" - ) - if n_frames_per_packet is not None and n_frames_per_packet not in [1, 2]: - raise ValueError( - f"n_frames_per_paket is {n_frames_per_packet}. Should be 1 or 2. Please check your configuration." - ) - - return - - -def network_simulator( - error_pattern: Union[str, Path], - in_bitstream: Union[str, Path], - out_bitstream: Union[str, Path], - n_frames_per_packet: int, - offset: int, - logger: Optional[logging.Logger] = None, -) -> None: - """ - Wrapper for networkSimulator_g192 binary to apply error patterns for the bitstream processing - - Parameters - ---------- - error_pattern: Union[str, Path] - Path to error pattern file - in_bitstream: Union[str, Path] - Path to input bitstream file - out_bitstream: Union[str, Path] - Output path for modified bitstream - n_frames_per_packet: int, - Number of frames per paket [1,2] - offset: Optional[int] - delay offset - logger: Optional[logging.Logger] - logger - """ - - # find binary - if "networkSimulator_g192" in DEFAULT_CONFIG_BINARIES["binary_paths"]: - binary = find_binary( - DEFAULT_CONFIG_BINARIES["binary_paths"]["networkSimulator_g192"].name, - binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"][ - "networkSimulator_g192" - ].parent, - ) - else: - binary = find_binary("networkSimulator_g192") - - # check for valid inputs - if not Path(in_bitstream).is_file(): - raise ValueError( - f"Input bitstream file {in_bitstream} for bitstream processing does not exist" - ) - elif not Path(error_pattern).is_file(): - raise ValueError( - f"Error pattern file {error_pattern} for bitstream processing does not exist" - ) - - # set up command line - cmd = [ - str(binary), - error_pattern, - in_bitstream, - out_bitstream, - f"{out_bitstream}_tracefile_sim", - str(n_frames_per_packet), - str(offset), - ] - 
- # run command - run(cmd, logger=logger) - - return - - -def apply_network_simulator( - in_bitstream: Union[Path, str], - out_bitstream: Union[Path, str], - error_pattern: Optional[Union[Path, str]] = None, - error_profile: Optional[int] = None, - n_frames_per_packet: Optional[int] = None, - offset: Optional[int] = 0, - logger: Optional[logging.Logger] = None, -) -> None: - """ - Function to apply a network simulator profile to a bitstreaam - - Parameters - ---------- - in_bitstream: Union[Path, str] - Path of input bitstream - out_bitstream: Union[Path, str] - Path of output bitstream - error_pattern: Optional[Union[Path, str]] - Path to existing error pattern - error_profile: Optional[int] - Index of existing error pattern - n_frames_per_packet: Optional[int] - Number of frames per paket - offset: Optional[int] - delay offset - logger: Optional[logging.Logger] - logger - """ - - if error_pattern is None: - # create error pattern - if error_profile is not None: - if error_profile in LIST_JBM_PROFILES: - error_pattern = ERROR_PATTERNS_DIR.joinpath( - f"dly_error_profile_{error_profile}.dat" - ) - else: - raise ValueError( - f"JBM profile number {error_profile} does not exist, should be between {LIST_JBM_PROFILES[0]} and {LIST_JBM_PROFILES[-1]}" - ) - else: - raise ValueError( - "Either error pattern or error profile number has to be specified for network simulator bitstream processing" - ) - elif error_profile is not None: - raise ValueError( - "JBM pattern and JBM profile number are specified for bitstream processing. 
Can't use both" - ) - - if n_frames_per_packet is None: - n_frames_per_packet = 1 - if error_profile is not None and error_profile == 5: - n_frames_per_packet = 2 - - # apply error pattern - network_simulator( - error_pattern, in_bitstream, out_bitstream, n_frames_per_packet, offset, logger - ) - - return diff --git a/item_generation_scripts/audiotools/wrappers/p50fbmnru.py b/item_generation_scripts/audiotools/wrappers/p50fbmnru.py deleted file mode 100644 index 2f4c19ef03d4c134a6729a8b3dbceb3769c19e86..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/wrappers/p50fbmnru.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. 
-# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -from pathlib import Path -from tempfile import TemporaryDirectory -from warnings import warn - -import numpy as np - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.audiofile import read, write -from item_generation_scripts.audiotools.wrappers.filter import resample_itu -from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES -from item_generation_scripts.utils import find_binary, run - - -def p50fbmnru( - input: audio.Audio, - q_db: float, -) -> np.ndarray: - """ - Wrapper for P.50 Fullband MNRU (Modulated Noise Reference Unit), requires p50fbmnru binary - The mode is M (Modulated Noise) as specified in section 5.2.1 of S4-141392 - EVS-7c Processing functions for characterization phase v110.doc - - Parameters - ---------- - input : Audio - Input audio - q_db: float - The ratio, in dB, of speech power to modulated noise power - - Returns - ------- - output: np.ndarray - Output array - """ - - if "p50fbmnru" in DEFAULT_CONFIG_BINARIES["binary_paths"]: - binary = find_binary( - 
DEFAULT_CONFIG_BINARIES["binary_paths"]["p50fbmnru"].name, - binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["p50fbmnru"].parent, - ) - else: - binary = find_binary("p50fbmnru") - - if input.fs != 48000: - warn("P.50 Fullband MNRU requires a sampling rate of 48kHz.") - tmp_sig = resample_itu(input, 48000) - else: - tmp_sig = input.audio - - tmp_input_signal = tmp_sig - tmp_output_signal = np.ones((48000, input.num_channels)) - - with TemporaryDirectory() as tmp_dir: - tmp_dir = Path(tmp_dir) - tmp_input_file = tmp_dir.joinpath("tmp_input_signal.raw") - tmp_output_file = tmp_dir.joinpath("tmp_output_signal.raw") - - """ - P.50 Fullband MNRU - """ - - cmd = [ - str(binary), - str(tmp_input_file), - str(tmp_output_file), - str(q_db), - "M", - ] - - # write temporary file - write(tmp_input_file, tmp_input_signal) - write(tmp_output_file, tmp_output_signal) - - # run command - run(cmd) - - tmp_output_signal, out_fs = read(tmp_output_file, input.num_channels) - - return tmp_output_signal diff --git a/item_generation_scripts/audiotools/wrappers/random_seed.py b/item_generation_scripts/audiotools/wrappers/random_seed.py deleted file mode 100644 index 01cf08704764c70063aadc90a03d419316b26896..0000000000000000000000000000000000000000 --- a/item_generation_scripts/audiotools/wrappers/random_seed.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. 
-# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -from typing import Optional, Tuple - -from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES -from item_generation_scripts.utils import find_binary, run - - -def random_seed( - range: Tuple[int, int], - master_seed: Optional[int] = 0, - prerun_seed: Optional[int] = 0, - hexa: Optional[bool] = True, -) -> int: - """ - - Parameters - ---------- - master_seed: Optional[int] - Master seed for error pattern generation - prerun_seed: Optional[int] - Number of preruns in seed generation - hexa: Optonal[bool] - Flag if output should be in hexadecimal or decimal format - - Returns - ------- - result: int - One random value - """ - - # find binary - if "random" in DEFAULT_CONFIG_BINARIES["binary_paths"]: - binary = find_binary( - DEFAULT_CONFIG_BINARIES["binary_paths"]["random"].name, - binary_path=DEFAULT_CONFIG_BINARIES["binary_paths"]["random"].parent, - ) - else: - binary = find_binary("random") - - # set up command line - cmd = [ - str(binary), - "-n", # Number of items - str(1), - "-s", - str(master_seed), - "-d", - str(prerun_seed), - "-r", # value range for results - str(range[0]), - str(range[1]), - ] - - # run command - result = run(cmd) - result = int(result.stdout[:-1]) - - if hexa: - result = hex(result) - - return result diff --git a/item_generation_scripts/binary_paths.yml b/item_generation_scripts/binary_paths.yml deleted file mode 100644 index bafcacfca471b1d3086d69665824f9fc161cdf26..0000000000000000000000000000000000000000 --- a/item_generation_scripts/binary_paths.yml +++ /dev/null @@ -1,30 +0,0 @@ ---- -################################################ -# Binary paths -################################################ -### Custom binary paths and names can be specified here. -### If not defined here, the binaries in item_generation_scripts/bin would be used -### If binaries are neither specified here nor found in the bin folder, the scripts would look for them in $PATH -### DO NOT change the location of this file. 
-### DO NOT USE relative paths. The paths have to be absolute. -### DO NOT change the default keys. -### For example, if the user has renamed the 'filter' binary to 'foo' then use --> filter: path/to/binary/foo - -# ### Binary for resampling and filtering -# filter: "path/to/binary/filter_new" -# ### Binary for loudness adjustment -# bs1770demo: "path/to/binary/bs1880" -# ### Binary for MNRU -# p50fbmnru: "path/to/binary/p50fbmnru" -# ### Binary for ESDRU -# esdru: "path/to/binary/esdru" -# ### Binary for frame error pattern application -# eid-xor: "path/to/binary/eid-xor" -# ### Binary for error pattern generation -# gen-patt: "path/to/binary/gen-patt" -# ### Binary for random offset/seed generation -# random: "path/to/binary/random" -# ### Binary for JBM network similulator -# networkSimulator_g192: "path/to/binary/networkSimulator_g192" -# ### Binary for MASA rendering -# masaRenderer: "path/to/binary/masaRenderer" \ No newline at end of file diff --git a/item_generation_scripts/processing/__init__.py b/item_generation_scripts/processing/__init__.py deleted file mode 100644 index aea270d8d1752e772ab716bc33be0bf7b8a0cf35..0000000000000000000000000000000000000000 --- a/item_generation_scripts/processing/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. 
-# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# diff --git a/item_generation_scripts/processing/preprocessing_2.py b/item_generation_scripts/processing/preprocessing_2.py deleted file mode 100644 index 1152ccc7ec63db8c3d07cb3fed9542bb8f2bbfe7..0000000000000000000000000000000000000000 --- a/item_generation_scripts/processing/preprocessing_2.py +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. 
All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -import logging -from pathlib import Path -from warnings import warn - -import numpy as np - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.audioarray import delay, trim -from item_generation_scripts.audiotools.audiofile import write -from item_generation_scripts.audiotools.metadata import ( - add_remove_preamble, - write_ISM_metadata_in_file, -) -from item_generation_scripts.audiotools.wrappers.bs1770 import ( - get_loudness, - loudness_norm, -) -from item_generation_scripts.audiotools.wrappers.random_seed import random_seed -from item_generation_scripts.processing.processing import Processing - - -class Preprocessing2(Processing): - def __init__(self, attrs: dict): - super().__init__(attrs) - self.name = "pre_2" - - def process(self, in_file: Path, out_file: Path, in_meta, logger: logging.Logger): - logger.debug(f"Preprocessing2 configuration : {self.__dict__}") - logger.debug(f"Preprocessing2 {in_file.absolute()} -> {out_file.absolute()}") - - # load in file - audio_object = audio.fromfile( - self.in_fmt, in_file, fs=self.in_fs, in_meta=in_meta - ) - - # add preamble - if self.preamble: - # also apply preamble to ISM metadata - if self.in_fmt.startswith("ISM"): - # read out old - metadata = [] - for meta in in_meta: - metadata.append(np.genfromtxt(meta, delimiter=",")) - - # modify metadata - metadata = add_remove_preamble(metadata, self.preamble) - meta_files = 
write_ISM_metadata_in_file(metadata, [out_file], True) - - # modify audio object - audio_object.metadata_files = meta_files - audio_object.obect_pos = metadata - - # add preamble to actual signal - audio_object.audio = trim( - audio_object.audio, - audio_object.fs, - (-self.preamble, 0), - self.pad_noise_preamble, - ) - - # add background noise - if self.background_noise: - audio_object.audio = self.add_background_noise(audio_object, in_meta) - - # save file - write(out_file, audio_object.audio, fs=audio_object.fs) - - return - - def add_background_noise(self, audio_object: audio.Audio, in_meta) -> np.ndarray: - # range for random delay - range_delay = (1, 2400000) - - # load background noise - noise_object = audio.fromfile( - self.in_fmt, - self.background_noise["background_noise_path"], - fs=self.in_fs, - in_meta=in_meta, - ) - - # if noise is too short raise error - if len(noise_object.audio) < len(audio_object.audio): - raise ValueError("Background noise too short for audio signal") - if len(noise_object.audio) - range_delay[1] < len(audio_object.audio): - warn( - "Background noise may be to short for audio signal when considering the random delay" - ) - - # measure loudness of audio signal based on output format - tmp_object = audio.fromtype(self.out_fmt) - if ( - isinstance(tmp_object, audio.ObjectBasedAudio) - or isinstance(tmp_object, audio.SceneBasedAudio) - or isinstance(tmp_object, audio.MetadataAssistedSpatialAudio) - ): - out_format = None - else: - out_format = self.out_fmt - - loudness_signal, _ = get_loudness(audio_object, loudness_format=out_format) - - # compute desired loudness of background noise - loudness_noise = loudness_signal - self.background_noise["snr"] - - # apply random delay and cut signal - rand_delay = random_seed( - range=range_delay, - master_seed=self.background_noise["master_seed"], - prerun_seed=self.background_noise["seed_delay"], - hexa=False, - ) - noise_object.audio = delay( - noise_object.audio, delay=-rand_delay, 
samples=True, fs=noise_object.fs - )[: len(audio_object.audio)] - - # scale background noise to desired loudness based on output format - noise_object.audio = loudness_norm(noise_object, loudness_noise, out_format) - - # add array to signal - audio_object.audio = noise_object.audio + audio_object.audio - - return audio_object.audio diff --git a/item_generation_scripts/processing/processing.py b/item_generation_scripts/processing/processing.py deleted file mode 100644 index ad2cf272c94a13bbd774d4cd5e25031c71270872..0000000000000000000000000000000000000000 --- a/item_generation_scripts/processing/processing.py +++ /dev/null @@ -1,455 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. -# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. 
-# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. -# - -import logging -from abc import ABC, abstractmethod -from itertools import repeat -from pathlib import Path -from shutil import copyfile -from typing import Iterable, Union -from warnings import warn - -import numpy as np - -from item_generation_scripts.audiotools import audio -from item_generation_scripts.audiotools.audiofile import ( - concat, - read, - split, - trim, - write, -) -from item_generation_scripts.audiotools.metadata import ( - add_remove_preamble, - concat_meta_from_file, - metadata_search, - split_meta_in_file, - write_ISM_metadata_in_file, -) -from item_generation_scripts.audiotools.wrappers.bs1770 import scale_files -from item_generation_scripts.constants import LOGGER_DATEFMT, LOGGER_FORMAT -from item_generation_scripts.processing.config import TestConfig -from item_generation_scripts.utils import apply_func_parallel, list_audio, pairwise - - -class Processing(ABC): - def __init__(self, attrs: dict): - self.__dict__.update(attrs) - - @abstractmethod - def process( - self, in_file: Path, out_file: Path, in_meta, logger: logging.Logger - ) -> None: - pass - 
- -def reorder_items_list(items_list: list, concatenation_order: list) -> list: - name_to_full = {Path(full_file).name: full_file for full_file in items_list} - ordered_full_files = [ - name_to_full[name] for name in concatenation_order if name in name_to_full - ] - return ordered_full_files - - -def concat_setup(cfg: TestConfig, chain, logger: logging.Logger): - n_items_list = len(cfg.items_list) - cfg_pre2 = chain[0] - - # check for text files - if any([i for i in cfg.items_list if i.suffix == ".txt"]): - raise SystemExit("Concatenation for text files is unsupported") - - # apply concatenation order - if cfg_pre2.concatenation_order is not None: - n_concatenation_order = len(cfg_pre2.concatenation_order) - if n_concatenation_order != n_items_list: - warn( - f"Warning: Mismatch in specified concatenation order and number of items to process!\n" - f"Number of items specified in concatenation order: {n_concatenation_order}\n" - f"Number of items in the directory: {n_items_list}\n" - f"Concatenation will use the following order:\n{cfg_pre2.concatenation_order}" - ) - - logger.info(f"Concatenating input files in directory {cfg.input_path}") - - # concatenate ISM metadata - if cfg.input["fmt"].startswith("ISM"): - cfg.concat_meta = [] - for obj_idx in range(len(cfg.metadata_path[0])): - cfg.concat_meta.append( - cfg.tmp_dirs[0].joinpath( - f"{cfg.input_path.name}_concatenated.wav.{obj_idx}.csv" - ) - ) - concat_meta_from_file( - cfg.items_list, - cfg.metadata_path, - cfg.concat_meta, - cfg.input["fmt"], - ) - - # set input to the concatenated file we have just written to the output dir - cfg.metadata_path = [cfg.concat_meta] - - # concatenate audio - cfg.concat_file = cfg.tmp_dirs[0].joinpath( - f"{cfg.input_path.name}_concatenated.wav" - ) - - # determine number of channels for pcm and raw files - tmp_audio = audio.fromtype(cfg_pre2.in_fmt) - tmp_num_chans = tmp_audio.num_channels - - cfg.splits = concat( - cfg.items_list, - cfg.concat_file, - 
in_fs=cfg.input.get("fs", 48000), - num_channels=tmp_num_chans, - ) - - # save item naming for splits naming in the end - cfg.split_names = [] - for name in cfg.items_list: - cfg.split_names.append(Path(name).stem.split(".")[0]) - # set input to the concatenated file we have just written to the output dir - cfg.items_list = [cfg.concat_file] - - # write out splits - with open(cfg.concat_file.with_suffix(".splits.log"), "w") as f: - print(", ".join([str(s) for s in cfg.splits]), file=f) - print(", ".join([str(sn) for sn in cfg.split_names]), file=f) - print(", ".join([str(i.stem) for i in cfg.items_list]), file=f) - - logger.info(f"Splits written to file {cfg.concat_file.with_suffix('.splits.log')}") - - -def concat_teardown(cfg: TestConfig, logger: logging.Logger): - if not cfg.splits: - raise ValueError("Splitting not possible without split marker") - - output_format = cfg.postprocessing["fmt"] - - out_files = [] - out_meta = [] - - logger.info(f"Splitting output file in directory {cfg.output_path}") - - for odir in cfg.out_dirs: - path_input = odir / cfg.items_list[0].name - out_paths = split( - path_input, - odir, - cfg.split_names, - cfg.splits, - in_fs=cfg.postprocessing["fs"], - ) - - logger.debug( - f"Resulting split files condition {odir.name}: {', '.join([str(op) for op in out_paths])}" - ) - out_files.append(out_paths) - - # split ISM metadata - if output_format.startswith("ISM"): - for odir in cfg.out_dirs: - path_input = odir / cfg.items_list[0].name - out_meta_paths = split_meta_in_file( - path_input, - odir, - cfg.split_names, - cfg.splits, - output_format, - meta_files=cfg.metadata_path[0], - ) - out_meta.append(out_meta_paths) - - # remove concatenated file - if cfg.delete_tmp: - cfg.concat_file.unlink(missing_ok=True) - - return out_files, out_meta - - -def preprocess(cfg, logger): - preprocessing = cfg.proc_chains[0] - chain = preprocessing["processes"] - - logger.info(f" Generating condition: {preprocessing['name']}") - - # run preprocessing - 
apply_func_parallel( - process_item, - zip( - cfg.items_list, - repeat(cfg.tmp_dirs[0]), - repeat(cfg.out_dirs[0]), - repeat(chain), - repeat(logger), - cfg.metadata_path, - ), - None, - "mp" if cfg.multiprocessing else None, - ) - - # update the configuration to use preprocessing outputs as new inputs - cfg.items_list = list_audio( - cfg.out_dirs[0], absolute=False, select_list=getattr(cfg, "input_select", None) - ) - - # Re-ordering items based on concatenation order - if ( - hasattr(cfg, "preprocessing_2") - and cfg.preprocessing_2.get("concatenate_input", False) - and cfg.preprocessing_2.get("concatenation_order", None) is not None - ): - cfg.items_list = reorder_items_list(cfg.items_list, cfg.concatenation_order) - - if cfg.metadata_path[0] is not None: - for item_idx in range(len(cfg.metadata_path)): - for obj_idx in range(len(cfg.metadata_path[item_idx])): - if cfg.metadata_path[item_idx][obj_idx]: - cfg.metadata_path[item_idx][obj_idx] = cfg.out_dirs[0] / Path( - f"{cfg.items_list[item_idx].stem}.wav.{obj_idx}.csv" - ) - # remove already applied processing stage - cfg.proc_chains = cfg.proc_chains[1:] - cfg.tmp_dirs = cfg.tmp_dirs[1:] - cfg.out_dirs = cfg.out_dirs[1:] - - -def preprocess_2(cfg, logger): - preprocessing_2 = cfg.proc_chains[0] - chain = preprocessing_2["processes"] - - logger.info(f" Generating condition: {preprocessing_2['name']}") - - # concatenate items if required - if chain[0].concatenate_input: - concat_setup(cfg, chain, logger) - - # run preprocessing 2 - apply_func_parallel( - process_item, - zip( - cfg.items_list, - repeat(cfg.tmp_dirs[0]), - repeat(cfg.out_dirs[0]), - repeat(chain), - repeat(logger), - cfg.metadata_path, - ), - None, - "mp" if cfg.multiprocessing else None, - ) - - # update the configuration to use preprocessing 2 outputs as new inputs - cfg.items_list = list_audio( - cfg.out_dirs[0], absolute=False, select_list=getattr(cfg, "input_select", None) - ) - - # Re-ordering items based on concatenation order - if ( - 
hasattr(cfg, "preprocessing_2") - and cfg.preprocessing_2.get("concatenate_input", False) - and cfg.preprocessing_2.get("concatenation_order", None) is not None - ): - cfg.items_list = reorder_items_list(cfg.items_list, cfg.concatenation_order) - - if cfg.metadata_path[0] is not None: - for item_idx in range(len(cfg.metadata_path)): - for obj_idx in range(len(cfg.metadata_path[item_idx])): - if cfg.metadata_path[item_idx][obj_idx]: - cfg.metadata_path[item_idx][obj_idx] = cfg.out_dirs[0] / Path( - f"{cfg.items_list[item_idx].stem}.wav.{obj_idx}.csv" - ) - # remove already applied processing stage - cfg.proc_chains = cfg.proc_chains[1:] - cfg.tmp_dirs = cfg.tmp_dirs[1:] - cfg.out_dirs = cfg.out_dirs[1:] - - return - - -def reverse_process_2(cfg, logger): - # remove preamble - if cfg.pre2.preamble: - remove_preamble(cfg) - - # reverse concatenation - if cfg.pre2.concatenate_input: - # write out the splits, optionally remove file - out_paths_splits, out_meta_splits = concat_teardown(cfg, logger) - else: - # if no concatenation read files from folder - out_paths_splits = [] - for out_dir in cfg.out_dirs: - list_audio_dir = list_audio(out_dir, absolute=True) - out_paths_splits.append(list_audio_dir) - if cfg.postprocessing["fmt"].startswith("ISM"): - out_meta_splits = [] - for i, condition in enumerate(out_paths_splits): - meta_condition = metadata_search( - cfg.out_dirs[i], - condition, - num_objects=int(cfg.postprocessing["fmt"][-1]), - ) - out_meta_splits.append(meta_condition) - else: - out_meta_splits = None - - # scale individual files - if cfg.postprocessing.get("loudness", False): - scale_files( - out_paths_splits, - cfg.postprocessing["fmt"], - cfg.postprocessing["loudness"], - cfg.postprocessing["fs"], - out_meta_splits, - ) - return - - -def process_item( - in_file: Union[Path, str], - tmp_dir: Union[Path, str], - out_dir: Union[Path, str], - chain: Iterable, - logger: logging.Logger, - in_meta, -) -> None: - tmp_file = tmp_dir.joinpath(in_file.name) - 
tmp_file_meta = [] - if in_meta: - for im in in_meta: - tmp_file_meta.append(tmp_dir.joinpath(Path(im).name)) - - # assemble a list of files to be used during the processing chain - out_dir_wav = False - processing_paths = [in_file] - processing_paths_meta = [in_meta] - for p in chain: - if Path(in_file.name).suffix == ".txt" and p.out_fmt is not None: - processing_paths.append(tmp_file.with_suffix(f".{p.name}.wav")) - out_dir_wav = True - else: - processing_paths.append(tmp_file.with_suffix(f".{p.name}{tmp_file.suffix}")) - try: - out_format = p.out_fmt - except AttributeError: - # EVS has no attribute out_fmt - out_format = p.in_fmt - try: - bool_ism = out_format.startswith("ISM") - except Exception: - bool_ism = out_format.name.startswith("ISM") - - if bool_ism: - list_meta_step = [] - for idx, tfm in enumerate(tmp_file_meta): - list_meta_step.append( - tfm.parent - / f"{in_file.stem.split('.')[0]}.{p.name}.wav.{idx}.csv" - ) - processing_paths_meta.append(list_meta_step) - else: - processing_paths_meta.append(None) - # TODO: support txt file writing for META pass-through - - if out_dir_wav: - out_file = out_dir.joinpath(in_file.name).with_suffix(".wav") - else: - out_file = out_dir.joinpath(in_file.name) - - out_meta = [] - if in_meta: - for im in range(len(in_meta)): - out_meta.append(out_dir.joinpath(f"{Path(out_file).stem}.wav.{im}.csv")) - - # execute each process sequentially, feed output into input of next process - for p, (input, output), input_meta in zip( - chain, pairwise(processing_paths), processing_paths_meta[:-1] - ): - # setup logging for the output - item_logger = logger.getChild(output.stem) - fh = logging.FileHandler(output.with_suffix(".log"), mode="w") - fh.setLevel(logging.DEBUG) - fh.setFormatter(logging.Formatter(LOGGER_FORMAT, datefmt=LOGGER_DATEFMT)) - item_logger.addHandler(fh) - - p.process(input, output, input_meta, item_logger) - - # copy output and metadata from final process to output file - copyfile(processing_paths[-1], 
out_file) - if processing_paths_meta[-1]: - for idx, ppm in enumerate(processing_paths_meta[-1]): - copyfile(ppm, out_meta[idx]) - - -def remove_preamble(cfg): - # get number of channels from output format - num_channels = audio.fromtype(cfg.postprocessing["fmt"]).num_channels - for odir in cfg.out_dirs: - for item in cfg.items_list: - path_input = odir / item.name - - # remove preamble for ISM metadata - if cfg.postprocessing["fmt"].startswith("ISM"): - # search for metadata - meta_item = metadata_search( - odir, [Path(item.name)], num_objects=num_channels - ) - metadata_array = [] - for meta_i in meta_item: - metadata_array.append(np.genfromtxt(meta_i, delimiter=",")) - - # remove preamble - metadata_array = add_remove_preamble( - metadata_array, cfg.pre2.preamble, add=False - ) - - # write csv files - write_ISM_metadata_in_file( - metadata_array, [path_input], automatic_naming=True - ) - - # read file - x, fs = read( - path_input, nchannels=num_channels, fs=cfg.postprocessing["fs"] - ) - - # remove preamble - x = trim(x, fs, (cfg.pre2.preamble, 0)) - - # write file - write(path_input, x, fs) - - return diff --git a/item_generation_scripts/utils.py b/item_generation_scripts/utils.py deleted file mode 100644 index 1e21b0dba973cdf2dd73405e782743bf8700f6e7..0000000000000000000000000000000000000000 --- a/item_generation_scripts/utils.py +++ /dev/null @@ -1,297 +0,0 @@ -#!/usr/bin/env python3 - -# -# (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository. All Rights Reserved. -# -# This software is protected by copyright law and by international treaties. 
-# The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, -# Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., -# Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, -# Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other -# contributors to this repository retain full ownership rights in their respective contributions in -# the software. This notice grants no license of any kind, including but not limited to patent -# license, nor is any license granted by implication, estoppel or otherwise. -# -# Contributors are required to enter into the IVAS codec Public Collaboration agreement before making -# contributions. -# -# This software is provided "AS IS", without any express or implied warranties. The software is in the -# development stage. It is intended exclusively for experts who have experience with such software and -# solely for the purpose of inspection. All implied warranties of non-infringement, merchantability -# and fitness for a particular purpose are hereby disclaimed and excluded. -# -# Any dispute, controversy or claim arising under or in relation to providing this software shall be -# submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in -# accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and -# the United Nations Convention on Contracts on the International Sales of Goods. 
-# - -import logging -import shutil -import subprocess as sp -import sys -from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor -from itertools import repeat, tee -from os import devnull -from pathlib import Path -from shutil import which -from typing import Callable, Iterable, Optional, Union - -import yaml - -ALLOWED_INPUT_EXT = (".wav", ".pcm", ".txt", ".raw") -BIN_DIR = Path(__file__).parent.joinpath("bin") - - -""" -Directory/path handling -""" - - -def create_dir(p: str) -> None: - p = Path(p) - p.mkdir(exist_ok=True, parents=True) - - -def delete_dir(p: str) -> None: - p = Path(p) - if p.exists() and p.is_dir(): - shutil.rmtree(p) - - -class DirManager: - """ - Context manager that creates directories if not already present and - automatically cleans up (i.e. deletes) all specified paths - """ - - def __init__( - self, create_paths: Union[str, list], delete_paths: Union[str, list] = list() - ): - self.create_paths = ( - create_paths if isinstance(create_paths, list) else [create_paths] - ) - self.delete_paths = ( - delete_paths if isinstance(create_paths, list) else [delete_paths] - ) - - def __enter__(self): - for path in self.create_paths: - create_dir(path) - - def __exit__(self, exc_type, exc_value, exc_traceback): - for path in self.delete_paths: - if path in self.create_paths: - delete_dir(path) - else: - print( - f"Tmp dir '{path}' was not present in creation paths - skipping deletion." - ) - - -def list_audio(path: str, absolute: bool = False, select_list: list = None) -> list: - """ - Return list with all files with ALLOWED_INPUT_EXT found under the given path. - - If path is a directory, all files in it are included, if it is a file, just the file - will be in the list. If a select list is provided, files are filtered accordingly. 
- """ - path = Path(path) - audio_list = [] - - if path.exists(): - if path.is_dir(): - if absolute: - [audio_list.extend(list(path.glob(ext))) for ext in ALLOWED_INPUT_EXT] - audio_list = [ - path.joinpath(f) - for f in path.iterdir() - if f.suffix in ALLOWED_INPUT_EXT - ] - else: - audio_list = [ - f for f in path.iterdir() if f.suffix in ALLOWED_INPUT_EXT - ] - else: - if not absolute: - path = path.name - ext = path.suffix - if ext in ALLOWED_INPUT_EXT: - audio_list.append(path) - - # filter according to select list - if select_list: - select_set = set([Path(i).stem for i in select_list]) - audio_list = [ - f for f in audio_list if any([pattern in f.stem for pattern in select_set]) - ] - - return audio_list - - -def get_nickname(p: Path) -> str: - return f"{p.parent.name}/{p.name}" - - -""" -System interaction -""" - - -def find_binary( - binary: str, - raise_error: Optional[bool] = True, - logger: Optional[logging.Logger] = None, - binary_path: Optional[str] = None, -) -> Union[Path, None]: - """Attempt to find and return the path to the given binary""" - # prioritise binaries placed in the directory over $PATH - if binary_path is not None: - bin = which(binary, path=binary_path) - else: - bin = which(binary, path=BIN_DIR) - if not bin: - bin = which(binary) - - if not bin and raise_error: - raise FileNotFoundError( - f"Binary {binary} was neither found in {binary_path.absolute()} nor in {BIN_DIR.absolute()} or in $PATH!" - ) - elif not bin: - if logger: - logger.debug(f"Couldn't find binary {binary}") - return None - else: - if logger: - logger.debug(f"Found binary {bin}") - return Path(bin) - - -def get_devnull(): - return devnull - - -def get_gitsha(): - try: - git_sha = sp.check_output( - ["git", "rev-parse", "HEAD"], stderr=sp.STDOUT, text=True - ).strip() - except sp.CalledProcessError: - git_sha = "git repository not found!" 
- - return git_sha - - -def run(cmd, cwd=None, check=True, logger: Optional[logging.Logger] = None): - if logger: - logger.debug(f"Running command {' '.join([str(c) for c in cmd])}; cwd = {cwd}") - - try: - result = sp.run(cmd, check=check, capture_output=True, text=True, cwd=cwd) - except sp.CalledProcessError as e: - raise SystemError( - f"Command returned non-zero exit status ({e.returncode}): {' '.join([str(c) for c in e.cmd])}\n{e.stderr}\n{e.stdout}" - ) - - if logger: - logger.debug(result.stderr.strip()) - logger.debug(result.stdout.strip()) - - return result - - -""" -Utility functions -""" - - -def apply_func_parallel( - func: Callable, - args: Iterable, - kwargs: Optional[Iterable] = None, - type: Optional[str] = None, - show_progress: Optional[bool] = True, -) -> list: - """ - Apply a function iteratively to a list of arguments and keyword arguments - Optionally with multiprocessing or multithreading - - Parameters - ---------- - func : Callable - Function to use - args : Iterable - List of positional arguments - kwargs: Optional[Iterable] - List of keyword arguments - type: Optional[str] - Type of parallel processing to use, "mp" for multiprocessing or "mt" for threading, default = None (sequential processing) - show_progress: Optional[bool] - Flag whether to show progress bar - - Returns - ------- - List of function results - """ - - # if no kwargs are specified, repeat the empty dict to avoid issues with zipping and unpacking - if not kwargs: - kwargs = repeat({}) - - args_zip = zip(args, kwargs) - - if type == "mp": - executor = ProcessPoolExecutor - elif type == "mt": - executor = ThreadPoolExecutor - else: - return [ - func(*a, **k) - for a, k in (progressbar(list(args_zip)) if show_progress else args_zip) - ] - - with executor() as e: - results = [e.submit(func, *a, **k) for a, k in args_zip] - return [ - r.result() for r in (progressbar(results) if show_progress else results) - ] - - -def pairwise(iter): - """itertools.pairwise() for python < 
3.10""" - a, b = tee(iter) - next(b, None) - return zip(a, b) - - -def progressbar(iter: Iterable, width=80): - """simple unicode progressbar""" - count = len(iter) - - def update(progress): - fill = int(width * progress / count) - print( - f"{int(progress/count*100):3d}%{u'│'}{u'█'*fill}{(u'░'*(width-fill))}{u'│'}{progress}/{count}", - end="\r", - file=sys.stdout, - flush=True, - ) - - update(0) - for i, item in enumerate(iter): - yield item - update(i + 1) - print("\n", flush=True, file=sys.stdout) - - -def get_binary_paths(yaml_file_with_binary_paths): - with open(yaml_file_with_binary_paths, "r") as f: - data = yaml.safe_load(f) - if data is None: - return {} - else: - return {key: Path(value) for key, value in data.items()} diff --git a/ivas_processing_scripts/audiotools/audiofile.py b/ivas_processing_scripts/audiotools/audiofile.py index 954c91f8441a7a3fad3ae58794537af4a357742f..d5687a89d919bcc734e89e69a1e92cd24a33d10d 100755 --- a/ivas_processing_scripts/audiotools/audiofile.py +++ b/ivas_processing_scripts/audiotools/audiofile.py @@ -110,6 +110,7 @@ def write( filename: Union[str, Path], x: np.ndarray, fs: Optional[int] = 48000, + dtype: Optional[str] = "int16", ) -> None: """ Write audio file (.pcm, .wav or .raw) @@ -122,6 +123,8 @@ def write( Numpy 2D array of dimension: number of channels x number of samples fs: Optional[int] Sampling rate, required for .pcm or .raw input file, default = 48000 (Hz) + dtype: Optional[str] + Data type format required for .pcm or .raw input file, default = 'int16' Returns ------- @@ -141,7 +144,7 @@ def write( x = x.astype(np.int16) wav.write(filename, fs, x) elif file_extension == ".pcm" or file_extension == ".raw": - x = x.astype("int16").reshape(-1, 1) + x = x.astype(dtype).reshape(-1, 1) x.tofile(filename) else: raise ValueError("Wrong input format. 
Use wav, pcm or raw") diff --git a/item_generation_scripts/audiotools/wrappers/reverb.py b/ivas_processing_scripts/audiotools/wrappers/reverb.py similarity index 90% rename from item_generation_scripts/audiotools/wrappers/reverb.py rename to ivas_processing_scripts/audiotools/wrappers/reverb.py index 1c4491bd7e3aeda7bfb097b8101049d6625b19fb..46f4ee33290fb04cfd39acaf156531d18f4397aa 100644 --- a/item_generation_scripts/audiotools/wrappers/reverb.py +++ b/ivas_processing_scripts/audiotools/wrappers/reverb.py @@ -31,18 +31,19 @@ # import os.path -import numpy as np -from scipy.fft import fft from copy import copy from pathlib import Path from tempfile import TemporaryDirectory from typing import Optional, Union -from item_generation_scripts.constants import DEFAULT_CONFIG_BINARIES -from item_generation_scripts.utils import find_binary, run -from item_generation_scripts.audiotools.audio import Audio -from item_generation_scripts.audiotools.audiofile import read, write -from item_generation_scripts.audiotools.wrappers.filter import resample_itu +import numpy as np +from scipy.fft import fft + +from ivas_processing_scripts.audiotools.audio import Audio +from ivas_processing_scripts.audiotools.audiofile import read, write +from ivas_processing_scripts.audiotools.wrappers.filter import resample_itu +from ivas_processing_scripts.constants import DEFAULT_CONFIG_BINARIES +from ivas_processing_scripts.utils import find_binary, run def reverb( @@ -62,13 +63,13 @@ def reverb( Impulse response align: float multiplicative factor to apply to the reverberated sound in order to align its energy level with a second filePath to the output file - + Returns ------- output: Audio Convolved audio signal with IR """ - + # find binary if "reverb" in DEFAULT_CONFIG_BINARIES["binary_paths"]: binary = find_binary( @@ -77,10 +78,10 @@ def reverb( ) else: binary = find_binary("reverb") - + with TemporaryDirectory() as tmp_dir: tmp_dir = Path(tmp_dir) - + # resample input audio signal to that of 
the IR old_fs = None tmp_input = copy(input) @@ -92,12 +93,12 @@ def reverb( # write input audio signal to temporary file in .pcm format tmp_input_file = tmp_dir.joinpath(f"tmp_reverbIn.pcm") write(tmp_input_file, tmp_input.audio, tmp_input.fs) - + # down-scale IR to prevent saturation # max_value = np.max(np.abs(IR.audio)) # if max_value > 1.0: - # IR.audio = IR.audio / max_value - + # IR.audio = IR.audio / max_value + # write IR to temporary file in .pcm format # note: the reverb tool expects 32b float format tmp_IR_file = tmp_dir.joinpath(f"tmp_IR.pcm") @@ -111,7 +112,7 @@ def reverb( # append multiplicative factor, if provided if align: cmd.extend(["-align", str(align)]) - + # append temporary filenames tmp_output_file = tmp_dir.joinpath(f"tmp_reverbOut.pcm") cmd.extend([tmp_input_file, tmp_IR_file, tmp_output_file]) @@ -119,17 +120,18 @@ def reverb( # run the 'reverb' command run(cmd) - # read the reverberated output file + # read the reverberated output file output = copy(tmp_input) output.audio, _ = read(tmp_output_file, nchannels=1, fs=tmp_input.fs) - + # reverse the resampling if old_fs: output.audio = resample_itu(output, old_fs) output.fs = old_fs - + return output + def reverb_stereo( input: Audio, stereo_IR: Audio, @@ -146,13 +148,13 @@ def reverb_stereo( Impulse response align: float multiplicative factor to apply to the reverberated sound in order to align its energy level with the second file - + Returns ------- output: Audio Convolved audio signal with stereo IR """ - + # convert to float32 stereo_IR.audio = np.float32(stereo_IR.audio) @@ -160,26 +162,26 @@ def reverb_stereo( IR_left = copy(stereo_IR) IR_left.name = "MONO" IR_left.num_channels = 1 - IR_left.audio = np.reshape(stereo_IR.audio[:,0], (-1, 1)) - + IR_left.audio = np.reshape(stereo_IR.audio[:, 0], (-1, 1)) + IR_right = copy(stereo_IR) IR_right.name = "MONO" IR_right.num_channels = 1 - IR_right.audio = np.reshape(stereo_IR.audio[:,1], (-1, 1)) + IR_right.audio = 
np.reshape(stereo_IR.audio[:, 1], (-1, 1)) # calculate the scaling (multiplicative) factor such that the maximum gain of the IR filter across all frequencies is 0dB if align is None: H = fft(stereo_IR.audio, axis=0) align = 1.0 / np.max(np.abs(H)) - + # convolve mono input with left and right IR y_left = reverb(input, IR_left, align=align) y_right = reverb(input, IR_right, align=align) - + # combine into stereo output y = copy(input) y.name = "STEREO" y.num_channels = 2 y.audio = np.column_stack([y_left.audio, y_right.audio]) - + return y diff --git a/item_generation_scripts/__init__.py b/ivas_processing_scripts/generation/__init__.py old mode 100644 new mode 100755 similarity index 90% rename from item_generation_scripts/__init__.py rename to ivas_processing_scripts/generation/__init__.py index 9351646457eb748c46f0ee4ec8443ec712ea72a8..76d10610d5b4c608f40579b63106416b06597157 --- a/item_generation_scripts/__init__.py +++ b/ivas_processing_scripts/generation/__init__.py @@ -35,13 +35,13 @@ import os import yaml -from item_generation_scripts.constants import ( +from ivas_processing_scripts.constants import ( LOGGER_DATEFMT, LOGGER_FORMAT, LOGGER_SUFFIX, ) -from item_generation_scripts.processing import config, process_ism_items, process_stereo_items -from item_generation_scripts.utils import create_dir +from ivas_processing_scripts.generation import config, process_ism_items, process_stereo_items +from ivas_processing_scripts.utils import create_dir def logging_init(args, cfg): @@ -94,7 +94,9 @@ def main(args): fs=cfg.fs, preamble=cfg.preamble, postamble=cfg.postamble, - add_low_level_random_noise=cfg.add_low_level_random_noise, + add_low_level_random_noise=getattr(cfg, "add_low_level_random_noise", False), + # TODO@VM dict.get() can provide a default value if the key is not found + # please check if this is a viable solution - I kept getting "AttributeError: 'TestConfig' object has no attribute 'add_low_level_random_noise'" ) elif cfg.format == "STEREO": # generate 
STEREO items according to scene description @@ -112,7 +114,7 @@ def main(args): postamble=cfg.postamble, add_low_level_random_noise=cfg.add_low_level_random_noise, ) - + # copy configuration to output directory with open(cfg.output_path.joinpath(f"{cfg.format}.yml"), "w") as f: yaml.safe_dump(cfg._yaml_dump, f) diff --git a/item_generation_scripts/__main__.py b/ivas_processing_scripts/generation/__main__.py old mode 100644 new mode 100755 similarity index 98% rename from item_generation_scripts/__main__.py rename to ivas_processing_scripts/generation/__main__.py index b49109d3775b0f40fcb4826dbf4816eb9d7e2eda..9ba00fd5d8394e621e86bb3e6064dd112db27cb2 --- a/item_generation_scripts/__main__.py +++ b/ivas_processing_scripts/generation/__main__.py @@ -32,7 +32,7 @@ import argparse -from item_generation_scripts import main +from ivas_processing_scripts.generation import main if __name__ == "__main__": parser = argparse.ArgumentParser( diff --git a/item_generation_scripts/processing/config.py b/ivas_processing_scripts/generation/config.py similarity index 97% rename from item_generation_scripts/processing/config.py rename to ivas_processing_scripts/generation/config.py index 3e9aaaa5e88bbfb86a6e2eda11df0d69c285202e..ca9dbcc2aeab995b5a68e33db27ff495f4983b5a 100644 --- a/item_generation_scripts/processing/config.py +++ b/ivas_processing_scripts/generation/config.py @@ -35,7 +35,7 @@ from pathlib import Path import yaml -from item_generation_scripts.constants import DEFAULT_CONFIG, REQUIRED_KEYS +from ivas_processing_scripts.generation.constants import DEFAULT_CONFIG, REQUIRED_KEYS def merge_dicts(base: dict, other: dict) -> None: @@ -122,4 +122,4 @@ class TestConfig: # Report missing keys to the user if MISSING_KEYS: - raise KeyError(f"The following key(s) must be specified : {MISSING_KEYS}") + raise KeyError(f"The following key(s) must be specified : {MISSING_KEYS}") \ No newline at end of file diff --git a/item_generation_scripts/constants.py 
b/ivas_processing_scripts/generation/constants.py similarity index 95% rename from item_generation_scripts/constants.py rename to ivas_processing_scripts/generation/constants.py index 6b0d0681be7bddebbfe108bd2eb941bbd423f447..340012075eb020eb81bc7e991b06d41ab61c72b9 100644 --- a/item_generation_scripts/constants.py +++ b/ivas_processing_scripts/generation/constants.py @@ -33,7 +33,7 @@ from datetime import datetime from pathlib import Path -from item_generation_scripts.utils import get_binary_paths +from ivas_processing_scripts.utils import get_binary_paths LOGGER_SUFFIX = ".log" LOGGER_FORMAT = ( @@ -55,7 +55,7 @@ DEFAULT_CONFIG = { DEFAULT_CONFIG_BINARIES = { "binary_paths": get_binary_paths( - Path(__file__).parent.joinpath("binary_paths.yml") + Path(__file__).parent.parent.joinpath("binary_paths.yml") ), } @@ -64,4 +64,4 @@ REQUIRED_KEYS = [ "input_path", "output_path", "scenes", -] +] \ No newline at end of file diff --git a/item_generation_scripts/processing/process_ism_items.py b/ivas_processing_scripts/generation/process_ism_items.py similarity index 86% rename from item_generation_scripts/processing/process_ism_items.py rename to ivas_processing_scripts/generation/process_ism_items.py index b03468ecfa52406b70bdf73bccdff99abe17b301..d788da345f435adcbf0497e7c617cedd4887e86e 100644 --- a/item_generation_scripts/processing/process_ism_items.py +++ b/ivas_processing_scripts/generation/process_ism_items.py @@ -33,16 +33,18 @@ import csv import logging import os +from math import floor from pathlib import Path from typing import Optional + import numpy as np -from math import floor -from item_generation_scripts.audiotools import audio, audiofile -from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness +from ivas_processing_scripts.audiotools import audio, audiofile +from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness SEED_RANDOM_NOISE = 0 + # function for converting nd numpy array to strings with 2 decimal digits def 
csv_formatdata(data): for row in data: @@ -78,21 +80,20 @@ def generate_ism_items( else: y = audio.ChannelBasedAudio("MONO") y_meta = None - + # read the overlap length - if 'overlap' in scene.keys(): + if "overlap" in scene.keys(): source_overlap = float(scene["overlap"]) else: source_overlap = 0.0 - + # repeat for all source files for i in range(N_sources): - # parse parameters from the scene description source_file = np.atleast_1d(scene["source"])[i] source_azi = np.atleast_1d(scene["azimuth"])[i] source_ele = np.atleast_1d(scene["elevation"])[i] - + logger.info( f"Encoding {source_file} at position(s) {source_azi},{source_ele}" ) @@ -102,7 +103,7 @@ def generate_ism_items( # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / x.fs * 50) - + # trim the source signal to align to 20ms boundary # N_trim = int(N_frames * x.fs / 50) # x.audio = x.audio[:N_trim] @@ -177,18 +178,18 @@ def generate_ism_items( # arrange all metadata fields column-wise into a matrix x_meta = np.column_stack((azi, ele, dist, spread, gain)) - + # shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap) if i > 0 and source_overlap != 0.0: # get the length of the first source file - N_delay = len(y.audio[:,0]) - + N_delay = len(y.audio[:, 0]) + # add the shift N_delay += int(source_overlap * x.fs) - + # ensure delay is a multiple of 20ms # N_delay = int(floor(source_shift * 50) / 50 * x.fs) - + # insert all-zero preamble pre = np.zeros((N_delay, x.audio.shape[1])) x.audio = np.concatenate([pre, x.audio]) @@ -196,14 +197,14 @@ def generate_ism_items( # insert neutral position as a pre-amble pre = np.tile( [0.00, 0.00, 1.00, 0.00, 1.00], (N_delay, 1) - ) # !!!! TBD - check if we should insert netrual position or the first position of the metadata + ) # !!!! 
TBD - check if we should insert netrual position or the first position of the metadata x_meta = np.concatenate([pre, x_meta]) - - # pad with zeros to ensure that the signal length is a multiple of 20ms + + # pad with zeros to ensure that the signal length is a multiple of 20ms N_frame = x.fs / 50 if len(x.audio) % N_frame != 0: N_pad = int(N_frame - len(x.audio) % N_frame) - + # insert all-zero preamble pre = np.zeros((N_pad, x.audio.shape[1])) x.audio = np.concatenate([pre, x.audio]) @@ -211,7 +212,7 @@ def generate_ism_items( # insert neutral position as a pre-amble pre = np.tile( [0.00, 0.00, 1.00, 0.00, 1.00], (N_pad, 1) - ) # !!!! TBD - check if we should insert netrual position or the first position of the metadata + ) # !!!! TBD - check if we should insert netrual position or the first position of the metadata x_meta = np.concatenate([pre, x_meta]) # add source signal to the array of all source signals @@ -221,14 +222,28 @@ def generate_ism_items( else: # pad with zeros to have the same length of all source signals if x.audio.shape[0] > y.audio.shape[0]: - y.audio = np.vstack((y.audio, np.zeros((x.audio.shape[0] - y.audio.shape[0], y.audio.shape[1])))) + y.audio = np.vstack( + ( + y.audio, + np.zeros( + (x.audio.shape[0] - y.audio.shape[0], y.audio.shape[1]) + ), + ) + ) elif y.audio.shape[0] > x.audio.shape[0]: - x.audio = np.vstack((x.audio, np.zeros((y.audio.shape[0] - x.audio.shape[0], x.audio.shape[1])))) + x.audio = np.vstack( + ( + x.audio, + np.zeros( + (y.audio.shape[0] - x.audio.shape[0], x.audio.shape[1]) + ), + ) + ) y.audio = np.hstack((y.audio, x.audio)) # add metadata to the array of all metadata # make sure x_meta is a 3d array - x_meta = x_meta[np.newaxis, :] + x_meta = x_meta[np.newaxis, :] if y_meta is None: y_meta = x_meta else: @@ -239,25 +254,19 @@ def generate_ism_items( if x_meta.shape[1] > y_meta.shape[1]: N_delta = x_meta.shape[1] - y_meta.shape[1] # reshape to 2d array - y_meta = y_meta.reshape(y_meta.shape[1], -1) + y_meta = 
y_meta.reshape(y_meta.shape[1], -1) # repeat last row N_delta times and append to the array - y_meta = np.vstack( - (y_meta, np.tile(y_meta[-1, :], (N_delta, 1))) - ) + y_meta = np.vstack((y_meta, np.tile(y_meta[-1, :], (N_delta, 1)))) # reshape back to 3d array - y_meta = y_meta.reshape( - N_srcs, -1, N_meta_features - ) + y_meta = y_meta.reshape(N_srcs, -1, N_meta_features) elif y_meta.shape[1] > x_meta.shape[1]: N_delta = y_meta.shape[1] - x_meta.shape[1] # reshape to 2d array - x_meta = x_meta.reshape(x_meta.shape[1], -1) + x_meta = x_meta.reshape(x_meta.shape[1], -1) # repeat last row N_delta times and append to the array - x_meta = np.vstack( - (x_meta, np.tile(x_meta[-1, :], (N_delta, 1))) - ) + x_meta = np.vstack((x_meta, np.tile(x_meta[-1, :], (N_delta, 1)))) # reshape back to 3d array - x_meta = np.expand_dims(x_meta, axis=0) + x_meta = np.expand_dims(x_meta, axis=0) y_meta = np.concatenate([y_meta, x_meta]) @@ -265,7 +274,7 @@ def generate_ism_items( if preamble != 0.0: # ensure that pre-mable is a multiple of 20ms N_pre = int(floor(preamble * 50) / 50 * y.fs) - + # insert all-zero preamble to all sources pre = np.zeros((N_pre, y.audio.shape[1])) y.audio = np.concatenate([pre, y.audio]) @@ -273,13 +282,13 @@ def generate_ism_items( # insert neutral position as a pre-amble to all sources pre = np.tile( [0.00, 0.00, 1.00, 0.00, 1.00], (y_meta.shape[0], N_pre, 1) - ) # !!!! TBD - check if we should insert netrual position or the first position of the metadata + ) # !!!! 
TBD - check if we should insert netrual position or the first position of the metadata y_meta = np.concatenate([pre, y_meta], axis=1) - + if postamble != 0.0: # ensure that post-mable is a multiple of 20ms N_post = int(floor(postamble * 50) / 50 * y.fs) - + # append all-zero postamble to all sources post = np.zeros((N_post, y.audio.shape[1])) y.audio = np.concatenate([y.audio, post]) @@ -287,17 +296,17 @@ def generate_ism_items( # append neutral position as a post-amble to all sources post = np.tile( [0.00, 0.00, 1.00, 0.00, 1.00], (y_meta.shape[0], N_post, 1) - ) # !!!! TBD - check if we should insert netrual position or the last position of the metadata + ) # !!!! TBD - check if we should insert netrual position or the last position of the metadata y_meta = np.concatenate([y_meta, post], axis=1) - + # add random noise if add_low_level_random_noise: # create uniformly distributed noise between -4 and 4 np.random.seed(SEED_RANDOM_NOISE) - noise = np.random.randint( - low=-4, high=5, size=y.audio.shape - ).astype("float") - + noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype( + "float" + ) + # superimpose y.audio += noise @@ -312,7 +321,12 @@ def generate_ism_items( # generate .csv filename (should end with .0.csv, .1.csv, ...) 
csv_filename = os.path.normpath(f"{output_filename}.{i}.csv") - with open(os.path.join(output_path, csv_filename), 'w', newline='', encoding='utf-8') as f: + with open( + os.path.join(output_path, csv_filename), + "w", + newline="", + encoding="utf-8", + ) as f: # create csv writer writer = csv.writer(f) diff --git a/item_generation_scripts/processing/process_stereo_items.py b/ivas_processing_scripts/generation/process_stereo_items.py similarity index 87% rename from item_generation_scripts/processing/process_stereo_items.py rename to ivas_processing_scripts/generation/process_stereo_items.py index a6ed6c8a3384a7a4418e3f92ae11188de765cb35..feae1b26636c43898dbdd7263f209595dc4ecb9c 100644 --- a/item_generation_scripts/processing/process_stereo_items.py +++ b/ivas_processing_scripts/generation/process_stereo_items.py @@ -34,15 +34,16 @@ import csv import logging import os +from copy import copy +from math import floor from pathlib import Path from typing import Optional -from copy import copy + import numpy as np -from math import floor -from item_generation_scripts.audiotools import audio, audiofile -from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness -from item_generation_scripts.audiotools.wrappers.reverb import reverb_stereo +from ivas_processing_scripts.audiotools import audio, audiofile +from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness +from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_stereo SEED_RANDOM_NOISE = 0 @@ -71,13 +72,13 @@ def generate_stereo_items( # get the number of scenes N_scenes = len(scenes) - + for scene_name, scene in scenes.items(): logger.info(f"Processing scene: {scene_name} out of {N_scenes} scenes") # extract the number of audio sources N_sources = len(np.atleast_1d(scene["source"])) - + # read the IR (check if stereo or two mono files were provided) source_IR = np.atleast_1d(scene["IR"]) @@ -89,7 +90,6 @@ def generate_stereo_items( y = 
audio.ChannelBasedAudio("STEREO") for i in range(N_sources): - # parse parameters from the scene description source_file = np.atleast_1d(scene["source"])[i] IR_file = np.atleast_1d(scene["IR"])[i] @@ -109,7 +109,7 @@ def generate_stereo_items( # convolve with stereo IR x_rev = reverb_stereo(x, IR) - + # adjust the level of the stereo signal _, scale_factor = get_loudness(x_rev, target_level, "STEREO") x_rev.audio *= scale_factor @@ -145,11 +145,31 @@ def generate_stereo_items( else: # pad with zeros to have equal length of all source signals if x_rev.audio.shape[0] > y.audio.shape[0]: - y.audio = np.vstack((y.audio, np.zeros((x_rev.audio.shape[0] - y.audio.shape[0], y.audio.shape[1])))) + y.audio = np.vstack( + ( + y.audio, + np.zeros( + ( + x_rev.audio.shape[0] - y.audio.shape[0], + y.audio.shape[1], + ) + ), + ) + ) elif y.audio.shape[0] > x_rev.audio.shape[0]: - x_rev.audio = np.vstack((x_rev.audio, np.zeros((y.audio.shape[0] - x_rev.audio.shape[0], x_rev.audio.shape[1])))) - - # superimpose + x_rev.audio = np.vstack( + ( + x_rev.audio, + np.zeros( + ( + y.audio.shape[0] - x_rev.audio.shape[0], + x_rev.audio.shape[1], + ) + ), + ) + ) + + # superimpose y.audio += x_rev.audio # append pre-amble and post-amble to all sources @@ -186,4 +206,4 @@ def generate_stereo_items( os.path.join(output_path, output_filename), y.audio, y.fs ) # !!!! TBD: replace all os.path.xxx operations with the Path object - return \ No newline at end of file + return