Commit 8cc50861 authored by Vladimir Malenovsky's avatar Vladimir Malenovsky
Browse files

improve printout messages, apply fade-in and fade-out, correct input source shifting

parent bc5275f3
Loading
Loading
Loading
Loading
Loading
+23 −11
Original line number Diff line number Diff line
@@ -188,8 +188,9 @@ def generate_ambi_scene(
        - Writes the processed FOA/HOA2/HOA3 audio to the output file.
    """

    scenes = list(cfg.scenes.keys())
    logger.info(
        f"Processing scene: {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}"
        f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}"
    )

    # extract the number of audio sources
@@ -238,6 +239,7 @@ def generate_ambi_scene(
            source_shift = 0.0

        # convert overlap to samples and ensure it is a multiple of 20ms
        source_shift_in_seconds = source_shift
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
@@ -254,7 +256,7 @@ def generate_ambi_scene(
        else:
            level = -26

        logger.info(f"Convolving {source_file} with {IR_file}")
        logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds")

        # read source file
        x = audio.fromfile("MONO", input_filename)
@@ -279,7 +281,7 @@ def generate_ambi_scene(
        elif cfg.format == "HOA3":
            x = reverb_hoa3(x, IR)

        # adjust the level of the target signal
        # adjust the level of the FOA/HOA2/HOA3 signal
        x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")

        # ensure the length of the audio source signal is a multiple of 20ms
@@ -294,9 +296,10 @@ def generate_ambi_scene(
            # add source signal to the array of all source signals
            y.audio = x.audio.copy()

            # if source_shift < 0:
            #     # insert zeros to the new audio source signal to shift it right
            #     metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
            if source_shift < 0:
                # insert zeros to the new audio source signal to shift it right
                y.audio = audioarray.trim_meta(y.audio, y.fs, limits=[source_shift, 0], samples=True)
            else:
                offset = source_shift
        else:
            # shift the beginning of the audio source signal
@@ -344,20 +347,29 @@ def generate_ambi_scene(
    if len(y.audio) != duration:
        y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True)

    # adjust the loudness of the output signal
    logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS")
    y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL")

    # apply fade-in and fade-out
    if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0:
        logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds")
        y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000)

    # write the FOA/HOA2/HOA3 audio signal into output file
    audiofile.write(output_filename, y.audio, y.fs)

    # convert to BINAURAL, if option was chosen
    if cfg.binaural_output:
        binaudio = audio.fromtype("BINAURAL")
        binaudio.fs = y.fs
        convert_scenebased(y, binaudio)
        binaural_output_filename = output_filename.with_name(
            output_filename.stem + "_BINAURAL" + output_filename.suffix
        )
        logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}")
        binaudio = audio.fromtype("BINAURAL")
        binaudio.fs = y.fs
        convert_scenebased(y, binaudio)
        audiofile.write(
            binaural_output_filename,
            binaudio.audio,
            binaudio.fs,
        )
        logger.info(f"Written BINAURAL output to: {binaural_output_filename}")
+23 −9
Original line number Diff line number Diff line
@@ -176,8 +176,9 @@ def generate_ismN_scene(
        - Writes the processed audio and metadata to output files.
    """

    scenes = list(cfg.scenes.keys())
    logger.info(
        f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}"
        f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}"
    )

    # extract the number of audio sources
@@ -236,6 +237,7 @@ def generate_ismN_scene(
            source_shift = 0.0

        # convert overlap to samples and ensure it is a multiple of 20ms
        source_shift_in_seconds = source_shift
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
@@ -250,7 +252,7 @@ def generate_ismN_scene(
        else:
            level = -26

        logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}")
        logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds")

        # read source file
        x = audio.fromtype("ISM1")
@@ -357,9 +359,11 @@ def generate_ismN_scene(
            y.audio = x.audio.copy()
            y.object_pos = x.object_pos.copy()
            y.fs = x.fs
            # if source_shift < 0:
            #     # insert zeros to the new audio source signal to shift it right
            #     metadata.trim_meta(y, limits=[source_shift, 0], samples=True)

            if source_shift < 0:
                # insert zeros to the new audio source signal to shift it right
                metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
            else:
                offset = source_shift
        else:
            # shift the beginning of the audio source signal
@@ -410,18 +414,28 @@ def generate_ismN_scene(
    if len(y.audio) != duration:
        metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)

    # adjust the loudness of the output signal
    logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS")
    y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL")

    # apply fade-in and fade-out
    if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0:
        logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds")
        y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000)

    # write the ISMn output to .wav file in an interleaved format and ISM metadata in .csv files
    audiofile.write(output_filename, y.audio, y.fs)
    metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files)

    # convert to BINAURAL, if option was chosen
    if cfg.binaural_output:
        binaudio = audio.fromtype("BINAURAL")
        binaudio.fs = y.fs
        convert_objectbased(y, binaudio)
        binaural_output_filename = output_filename.with_name(
            output_filename.stem + "_BINAURAL" + output_filename.suffix
        )
        logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}")
        binaudio = audio.fromtype("BINAURAL")
        binaudio.fs = y.fs
        convert_objectbased(y, binaudio)
        audiofile.write(
            binaural_output_filename,
            binaudio.audio,
+22 −9
Original line number Diff line number Diff line
@@ -175,8 +175,9 @@ def generate_OMASA_scene(
        - Handles various audio formats (e.g., MONO, FOA, HOA2) and applies transformations like loudness normalization, trimming, and padding.
    """

    scenes = list(cfg.scenes.keys())
    logger.info(
        f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}"
        f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}"
    )

    # extract the number of audio sources
@@ -238,6 +239,7 @@ def generate_OMASA_scene(
            source_shift = 0.0

        # convert overlap to samples and ensure it is a multiple of 20ms
        source_shift_in_seconds = source_shift
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
@@ -254,7 +256,7 @@ def generate_OMASA_scene(
        else:
            level = -26

        logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}")
        logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds")

        # get the number of channels from the .wav file header
        wav_header = audiofile.parse_wave_header(input_filename)
@@ -404,9 +406,10 @@ def generate_OMASA_scene(
                # if ISM, append object position to the OMASA object
                y.object_pos = x.object_pos.copy()

            # if source_shift < 0:
            #     # insert zeros to the new audio source signal to shift it right
            #     metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
            if source_shift < 0:
                # insert zeros to the new audio source signal to shift it right
                metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
            else:
                offset = source_shift
        else:
            # shift the beginning of the audio source signal
@@ -458,18 +461,28 @@ def generate_OMASA_scene(
    if len(y.audio) != duration:
        metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)

    # adjust the loudness of the output signal
    logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS")
    y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL")

    # apply fade-in and fade-out
    if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0:
        logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds")
        y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000)

    # write the OMASA audio output to .wav file in an interleaved format and ISM metadata in .csv files
    audiofile.write(output_filename, y.audio, y.fs)
    metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files[:-1])

    # convert to OMASA output to BINAURAL, if option was chosen
    if cfg.binaural_output:
        binaudio = audio.fromtype("BINAURAL")
        binaudio.fs = y.fs
        convert_omasa(y, binaudio)
        binaural_output_filename = output_filename.with_name(
            output_filename.stem + "_BINAURAL" + output_filename.suffix
        )
        logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}")
        binaudio = audio.fromtype("BINAURAL")
        binaudio.fs = y.fs
        convert_omasa(y, binaudio)
        audiofile.write(
            binaural_output_filename,
            binaudio.audio,
+23 −10
Original line number Diff line number Diff line
@@ -173,8 +173,9 @@ def generate_OSBA_scene(
        - Handles various audio formats (e.g., FOA, HOA2, HOA3) and applies transformations like loudness normalization, trimming, and padding.
    """

    scenes = list(cfg.scenes.keys())
    logger.info(
        f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}"
        f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}"
    )

    # extract the number of audio sources
@@ -233,6 +234,7 @@ def generate_OSBA_scene(
            source_shift = 0.0

        # convert overlap to samples and ensure it is a multiple of 20ms
        source_shift_in_seconds = source_shift
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
@@ -249,7 +251,7 @@ def generate_OSBA_scene(
        else:
            level = -26

        logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}")
        logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds")

        # get the number of channels from the .wav file header
        wav_header = audiofile.parse_wave_header(input_filename)
@@ -266,7 +268,7 @@ def generate_OSBA_scene(
        elif N_channels == 16:
            fmt = "HOA3"
        else:
            logger.info(
            logger.error(
                f"Error: Input format of the source file with {N_channels} channels is not supported!"
            )
            sys.exit(-1)
@@ -386,9 +388,10 @@ def generate_OSBA_scene(
                # if ISM, append object position to the OSBA object
                y.object_pos = x.object_pos.copy()

            # if source_shift < 0:
            #     # insert zeros to the new audio source signal to shift it right
            #     metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
            if source_shift < 0:
                # insert zeros to the new audio source signal to shift it right
                metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
            else:
                offset = source_shift
        else:
            # shift the beginning of the audio source signal
@@ -440,18 +443,28 @@ def generate_OSBA_scene(
    if len(y.audio) != duration:
        metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)

    # adjust the loudness of the output signal
    logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS")
    y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL")

    # apply fade-in and fade-out
    if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0:
        logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds")
        y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000)

    # write the OSBA audio output to .wav file in an interleaved format and ISM metadata in .csv files
    audiofile.write(output_filename, y.audio, y.fs)
    metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files)

    # convert the OSBA output to BINAURAL, if option was chosen
    if cfg.binaural_output:
        binaudio = audio.fromtype("BINAURAL")
        binaudio.fs = y.fs
        convert_osba(y, binaudio)
        binaural_output_filename = output_filename.with_name(
            output_filename.stem + "_BINAURAL" + output_filename.suffix
        )
        logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}")
        binaudio = audio.fromtype("BINAURAL")
        binaudio.fs = y.fs
        convert_osba(y, binaudio)
        audiofile.write(
            binaural_output_filename,
            binaudio.audio,
+19 −7
Original line number Diff line number Diff line
@@ -194,8 +194,9 @@ def generate_stereo_scene(
        - Writes the processed STEREO audio to output file.
    """

    scenes = list(cfg.scenes.keys())
    logger.info(
        f"Processing scene: {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}"
        f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}"
    )

    # extract the number of audio sources
@@ -244,6 +245,7 @@ def generate_stereo_scene(
            source_shift = 0.0

        # convert overlap to samples and ensure it is a multiple of 20ms
        source_shift_in_seconds = source_shift
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
@@ -260,7 +262,7 @@ def generate_stereo_scene(
        else:
            level = -26

        logger.info(f"Convolving {source_file} with {IR_file}")
        logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds")

        # read source file
        x = audio.fromfile("MONO", input_filename)
@@ -280,7 +282,7 @@ def generate_stereo_scene(
        # convolve MONO source audio with STEREO IR -> results in STEREO audio object
        x = reverb_stereo(x, IR)

        # adjust the level of the stereo signal
        # adjust the level of the STEREO signal
        x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")

        # ensure the length of the audio source signal is a multiple of 20ms
@@ -295,9 +297,10 @@ def generate_stereo_scene(
            # add source signal to the array of all source signals
            y.audio = x.audio.copy()

            # if source_shift < 0:
            #     # insert zeros to the new audio source signal to shift it right
            #     metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
            if source_shift < 0:
                # insert zeros to the new audio source signal to shift it right
                y.audio = audioarray.trim(y.audio, x.fs, limits=[source_shift, 0], samples=True)
            else:
                offset = source_shift
        else:
            # shift the beginning of the audio source signal
@@ -345,5 +348,14 @@ def generate_stereo_scene(
    if len(y.audio) != duration:
        y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True)

    # adjust the loudness of the output signal
    logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS")
    y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="STEREO")

    # apply fade-in and fade-out
    if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0:
        logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds")
        y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000)

    # write the STEREO audio signal into output file
    audiofile.write(output_filename, y.audio, y.fs)