support of .raw format (81628b69) · Commits · IVAS Codec Public Collaboration / IVAS Processing Scripts

item_generation_scripts/init.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -91,6 +91,7 @@ def main(args):
		cfg.output_path,
		cfg.scenes,
		logger,
		fs=cfg.fs
		)

		# copy configuration to output directory

item_generation_scripts/config/ISM1_CONFIG.yml

+39 −39

Original line number	Diff line number	Diff line
		@@ -13,7 +13,7 @@ format: "ISM1"
		# delete_tmp: true

		### Output sampling rate in Hz needed for headerless audio files; default = 48000
		# fs: 32000
		fs: 48000

		### Any relative paths will be interpreted relative to the working directory the script is called from!
		### Usage of absolute paths is recommended.
		@@ -21,10 +21,10 @@ format: "ISM1"
		### For Windows users: please use double backslash '\\' in paths and add '.exe' to executable definitions

		### Input path to mono files
		input_path: "/mnt/c/Work/IVAS/3gpp_forge_gitlab/ivas-processing-scripts/items_mono"
		input_path: "./items_mono"

		### Output path for generated test items and metadata files
		output_path: "/mnt/c/Work/IVAS/3gpp_forge_gitlab/ivas-processing-scripts/output"
		output_path: "./output"

		### Target loudness in LKFS; default = null (no loudness normalization applied)
		loudness: -26
		@@ -51,7 +51,7 @@ scenes:
		a1:
		name: "G1S1.wav"
		description: "Talker sitting at a table"
		source: "f2s5a_Talker1.wav"
		source: "test_single.wav"
		azimuth: 0
		elevation: 0
		delay: 0
		@@ -59,7 +59,7 @@ scenes:
		a2:
		name: "G6S2.wav"
		description: "Talker sitting at a table"
		source: "f5s10a_Talker1.wav"
		source: "test_single.wav"
		azimuth: 60
		elevation: 0
		delay: 0
		@@ -67,7 +67,7 @@ scenes:
		a3:
		name: "G5S3.wav"
		description: "Talker sitting at a table"
		source: "f2s5a_Talker1.wav"
		source: "test_single.wav"
		azimuth: 120
		elevation: 0
		delay: 0
		@@ -75,7 +75,7 @@ scenes:
		a4:
		name: "G4S4.wav"
		description: "Talker sitting at a table"
		source: "m4s11b_Talker1.wav"
		source: "test_single.wav"
		azimuth: 180
		elevation: 0
		delay: 0
		@@ -83,7 +83,7 @@ scenes:
		a5:
		name: "G3S5.wav"
		description: "Talker sitting at a table"
		source: "m1s4a_Talker1.wav"
		source: "test_single.wav"
		azimuth: 240
		elevation: 0
		delay: 0
		@@ -91,7 +91,7 @@ scenes:
		a6:
		name: "G2S6.wav"
		description: "Talker sitting at a table"
		source: "f5s10a_Talker1.wav"
		source: "test_single.wav"
		azimuth: 300
		elevation: 0
		delay: 0
		@@ -99,7 +99,7 @@ scenes:
		b1:
		name: "G2S1.wav"
		description: "standing talker."
		source: "f5s10b_Talker1.wav"
		source: "test_single.wav"
		azimuth: 120
		elevation: 35
		delay: 0
		@@ -107,7 +107,7 @@ scenes:
		b2:
		name: "G1S2.wav"
		description: "standing talker."
		source: "f2s1a_Talker1.wav"
		source: "test_single.wav"
		azimuth: 180
		elevation: 35
		delay: 0
		@@ -115,7 +115,7 @@ scenes:
		b3:
		name: "G6S3.wav"
		description: "standing talker."
		source: "f5s10b_Talker1.wav"
		source: "test_single.wav"
		azimuth: 240
		elevation: 35
		delay: 0
		@@ -123,7 +123,7 @@ scenes:
		b4:
		name: "G5S4.wav"
		description: "standing talker."
		source: "f2s1a_Talker1.wav"
		source: "test_single.wav"
		azimuth: 300
		elevation: 35
		delay: 0
		@@ -131,7 +131,7 @@ scenes:
		b5:
		name: "G4S5.wav"
		description: "standing talker."
		source: "m4s11a_Talker1.wav"
		source: "test_single.wav"
		azimuth: 0
		elevation: 35
		delay: 0
		@@ -139,7 +139,7 @@ scenes:
		b6:
		name: "G3S6.wav"
		description: "standing talker."
		source: "m1s2b_Talker1.wav"
		source: "test_single.wav"
		azimuth: 60
		elevation: 35
		delay: 0
		@@ -147,7 +147,7 @@ scenes:
		c1:
		name: "G3S1.wav"
		description: "Smaller talker (child) walking around a table."
		source: "m1s6b_Talker1.wav"
		source: "test_single.wav"
		azimuth: "0:1:360"
		elevation: 0
		delay: 0
		@@ -155,7 +155,7 @@ scenes:
		c2:
		name: "G2S2.wav"
		description: "Smaller talker (child) walking around a table."
		source: "f5s14a_Talker1.wav"
		source: "test_single.wav"
		azimuth: "60:1:60+360"
		elevation: 0
		delay: 0
		@@ -163,7 +163,7 @@ scenes:
		c3:
		name: "G1S3.wav"
		description: "Smaller talker (child) walking around a table."
		source: "f2s6a_Talker1.wav"
		source: "test_single.wav"
		azimuth: "120:1:120+360"
		elevation: 0
		delay: 0
		@@ -171,7 +171,7 @@ scenes:
		c4:
		name: "G6S4.wav"
		description: "Smaller talker (child) walking around a table."
		source: "f5s14a_Talker1.wav"
		source: "test_single.wav"
		azimuth: "180:1:180+360"
		elevation: 0
		delay: 0
		@@ -179,7 +179,7 @@ scenes:
		c5:
		name: "G5S5.wav"
		description: "Smaller talker (child) walking around a table."
		source: "f2s6a_Talker1.wav"
		source: "test_single.wav"
		azimuth: "240:1:240+360"
		elevation: 0
		delay: 0
		@@ -187,7 +187,7 @@ scenes:
		c6:
		name: "G4S6.wav"
		description: "Smaller talker (child) walking around a table."
		source: "m4s13a_Talker1.wav"
		source: "test_single.wav"
		azimuth: "300:1:300+360"
		elevation: 0
		delay: 0
		@@ -195,7 +195,7 @@ scenes:
		d1:
		name: "G4S1.wav"
		description: "Talker walking around the table."
		source: "m4s12b_Talker1.wav"
		source: "test_single.wav"
		azimuth: "0:-1:-360"
		elevation: 35
		delay: 0
		@@ -203,7 +203,7 @@ scenes:
		d2:
		name: "G3S2.wav"
		description: "Talker walking around the table."
		source: "m1s12a_Talker1.wav"
		source: "test_single.wav"
		azimuth: "60:-1:60-360"
		elevation: 35
		delay: 0
		@@ -211,7 +211,7 @@ scenes:
		d3:
		name: "G2S3.wav"
		description: "Talker walking around the table."
		source: "f5s15b_Talker1.wav"
		source: "test_single.wav"
		azimuth: "120:-1:120-360"
		elevation: 35
		delay: 0
		@@ -219,7 +219,7 @@ scenes:
		d4:
		name: "G1S4.wav"
		description: "Talker walking around the table."
		source: "f2s3b_Talker1.wav"
		source: "test_single.wav"
		azimuth: "180:-1:180-360"
		elevation: 35
		delay: 0
		@@ -227,7 +227,7 @@ scenes:
		d5:
		name: "G6S5.wav"
		description: "Talker walking around the table."
		source: "f5s15b_Talker1.wav"
		source: "test_single.wav"
		azimuth: "240:-1:240-360"
		elevation: 35
		delay: 0
		@@ -235,7 +235,7 @@ scenes:
		d6:
		name: "G5S6.wav"
		description: "Talker walking around the table."
		source: "f2s3b_Talker1.wav"
		source: "test_single.wav"
		azimuth: "300:-1:300-360"
		elevation: 35
		delay: 0
		@@ -243,7 +243,7 @@ scenes:
		e1:
		name: "G5S1.wav"
		description: "Elevation displacement."
		source: "f2s4a_Talker1.wav"
		source: "test_single.wav"
		azimuth: 240
		elevation: "-90:0.5:90"
		delay: 0
		@@ -251,7 +251,7 @@ scenes:
		e2:
		name: "G4S2.wav"
		description: "Elevation displacement."
		source: "m4s16a_Talker1.wav"
		source: "test_single.wav"
		azimuth: 300
		elevation: 0
		delay: 0
		@@ -259,7 +259,7 @@ scenes:
		e3:
		name: "G3S3.wav"
		description: "Elevation displacement."
		source: "m1s16b_Talker1.wav"
		source: "test_single.wav"
		azimuth: 0
		elevation: "-90:0.5:90"
		delay: 0
		@@ -267,7 +267,7 @@ scenes:
		e4:
		name: "G2S4.wav"
		description: "Elevation displacement."
		source: "f5s19a_Talker1.wav"
		source: "test_single.wav"
		azimuth: 60
		elevation: "-90:0.5:90"
		delay: 0
		@@ -275,7 +275,7 @@ scenes:
		e5:
		name: "G1S5.wav"
		description: "Elevation displacement."
		source: "f2s4a_Talker1.wav"
		source: "test_single.wav"
		azimuth: 120
		elevation: "-90:0.5:90"
		delay: 0
		@@ -283,7 +283,7 @@ scenes:
		e6:
		name: "G6S6.wav"
		description: "Elevation displacement."
		source: "f5s19a_Talker1.wav"
		source: "test_single.wav"
		azimuth: 180
		elevation: "-90:0.5:90"
		delay: 0
		@@ -291,7 +291,7 @@ scenes:
		f1:
		name: "G6S1.wav"
		description: "Azimuth and elevation displacement."
		source: "f5s15a_Talker1.wav"
		source: "test_single.wav"
		azimuth: "60:0.5:60+180"
		elevation: "35:-0.2:-35"
		delay: 0
		@@ -299,7 +299,7 @@ scenes:
		f2:
		name: "G5S2.wav"
		description: "Azimuth and elevation displacement."
		source: "f2s7b_Talker1.wav"
		source: "test_single.wav"
		azimuth: "120:0.5:120+180"
		elevation: "35:-0.2:-35"
		delay: 0
		@@ -307,7 +307,7 @@ scenes:
		f3:
		name: "G4S3.wav"
		description: "Azimuth and elevation displacement."
		source: "m4s14a_Talker1.wav"
		source: "test_single.wav"
		azimuth: "180:0.5:180+180"
		elevation: "35:-0.2:-35"
		delay: 0
		@@ -315,7 +315,7 @@ scenes:
		f4:
		name: "G3S4.wav"
		description: "Azimuth and elevation displacement."
		source: "m1s7a_Talker1.wav"
		source: "test_single.wav"
		azimuth: "240:0.5:240+180"
		elevation: "35:-0.2:-35"
		delay: 0
		@@ -323,7 +323,7 @@ scenes:
		f5:
		name: "G2S5.wav"
		description: "Azimuth and elevation displacement."
		source: "f5s15a_Talker1.wav"
		source: "test_single.wav"
		azimuth: "300:0.5:300+180"
		elevation: "35:-0.2:-35"
		delay: 0
		@@ -331,7 +331,7 @@ scenes:
		f6:
		name: "G1S6.wav"
		description: "Azimuth and elevation displacement."
		source: "f2s7b_Talker1.wav"
		source: "test_single.wav"
		azimuth: "0:0.5:0+180"
		elevation: "35:-0.2:-35"
		delay: 0

item_generation_scripts/config/ISM2_CONFIG.yml

+3 −3

Original line number	Diff line number	Diff line
		@@ -13,7 +13,7 @@ format: "ISM2"
		# delete_tmp: true

		### Output sampling rate in Hz needed for headerless audio files; default = 48000
		# fs: 32000
		fs: 48000

		### Any relative paths will be interpreted relative to the working directory the script is called from!
		### Usage of absolute paths is recommended.
		@@ -21,10 +21,10 @@ format: "ISM2"
		### For Windows users: please use double backslash '\\' in paths and add '.exe' to executable definitions

		### Input path to mono files
		input_path: "/mnt/c/Work/IVAS/3gpp_forge_gitlab/ivas-processing-scripts/items_mono"
		input_path: "./items_mono"

		### Output path for generated test items and metadata files
		output_path: "/mnt/c/Work/IVAS/3gpp_forge_gitlab/ivas-processing-scripts/output"
		output_path: "./output"

		### Target loudness in LKFS; default = null (no loudness normalization applied)
		loudness: -26

item_generation_scripts/processing/config.py

+1 −4

Original line number	Diff line number	Diff line
		@@ -35,10 +35,7 @@ from pathlib import Path

		import yaml

		from item_generation_scripts.constants import (
		DEFAULT_CONFIG,
		REQUIRED_KEYS,
		)
		from item_generation_scripts.constants import DEFAULT_CONFIG, REQUIRED_KEYS


		def merge_dicts(base: dict, other: dict) -> None:

item_generation_scripts/processing/process_ism_items.py

+4 −9

Original line number	Diff line number	Diff line
		@@ -35,6 +35,7 @@ import csv
		import logging
		import os
		from pathlib import Path
		from typing import Optional

		import numpy as np

		@@ -55,6 +56,7 @@ def generate_ism_items(
		output_path: Path,
		scenes: dict,
		logger: logging.Logger,
		fs: Optional[int] = 48000,
		):
		"""Generate ISM items with metadata from mono items based on scene description"""

		@@ -73,7 +75,6 @@ def generate_ism_items(
		source_file = np.atleast_1d(scene["source"])[i]
		source_azi = np.atleast_1d(scene["azimuth"])[i]
		source_ele = np.atleast_1d(scene["elevation"])[i]
		# source_type = "speech" # !!!! TBD - support generic audio + background noise and speech in the .yml file
		source_delay = np.atleast_1d(scene["delay"])[i]

		logger.info(
		@@ -81,10 +82,7 @@ def generate_ism_items(
		)

		# read source file
		# x, fs = audiofile.read(os.path.join(input_path, source_file)) # !!!! TBD - check the support for headerless .raw files
		# pdb.set_trace()
		audio_object = audio.fromfile("MONO", os.path.join(input_path, source_file))

		audio_object = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs)
		x = audio_object.audio
		fs = audio_object.fs

		@@ -93,7 +91,6 @@ def generate_ism_items(

		# adjust the level of the source file
		_, scale_factor = get_loudness(audio_object, target_level, "MONO")
		# print(f"Scaling loudness with factor: {scale_factor}")
		x *= scale_factor

		# read azimuth information and create array
		@@ -156,9 +153,7 @@ def generate_ism_items(
		)

		# additional metadata
		dist = np.ones(
		N_frames
		) # !!!! TBD - check what to do with these metadata
		dist = np.ones(N_frames) # !!!! TBD - check what to do with these metadata
		spread = np.zeros(N_frames)
		gain = np.ones(N_frames)