Add server-side waveform loudness profiling on track upload
ILoudnessAlgorithm strategy (RmsLoudnessAlgorithm first impl), WaveformProfileService stores quantized byte[] sidecar in new MediaFileVault (profiles vault), wired into UnifiedTrackService.UploadAsync; failure is logged and swallowed. WaveformProfileDto and WaveformProfileOptions in shared projects.
This commit is contained in:
@@ -9,4 +9,9 @@ public static class VaultConstants
|
||||
/// Vault name for storing audio tracks
|
||||
/// </summary>
|
||||
public const string Tracks = "tracks";
|
||||
|
||||
/// <summary>
|
||||
/// Vault name for storing waveform loudness profile sidecars, keyed by track EntryKey.
|
||||
/// </summary>
|
||||
public const string WaveformProfiles = "waveform-profiles";
|
||||
}
|
||||
@@ -12,6 +12,7 @@
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.0" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options" Version="10.0.0" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
||||
@@ -219,6 +219,32 @@ public class AudioVault : MediaVault
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Vault for plain <see cref="MediaBinary"/> entries, i.e. vault type
/// <see cref="MediaVaultType.Media"/>: raw bytes plus a file extension, with no audio- or
/// image-specific metadata. Intended for sidecar artifacts such as waveform loudness profiles.
/// Storage behavior comes entirely from the <see cref="MediaVault"/> base class; this type only
/// supplies the async factory, mirroring the factories on the Image and Audio vaults.
/// </summary>
public class MediaFileVault : MediaVault
{
    // Private: instances are only handed out by FromAsync, once an index is available.
    private MediaFileVault(string rootPath, VaultIndex index, IndexFactoryService? factoryService = null)
        : base(rootPath, index, factoryService)
    {
    }

    /// <summary>
    /// Loads (or creates) the Media-typed vault index under <paramref name="rootPath"/> and wraps
    /// it in a <see cref="MediaFileVault"/>.
    /// </summary>
    /// <param name="rootPath">Root directory of the vault.</param>
    /// <param name="factoryService">
    /// Index factory to use; a new <see cref="IndexFactoryService"/> is created when omitted.
    /// </param>
    /// <returns>The vault, or <c>null</c> when the factory yields no index.</returns>
    public static async Task<MediaFileVault?> FromAsync(string rootPath, IndexFactoryService? factoryService = null)
    {
        factoryService ??= new IndexFactoryService();

        var loadedIndex = await factoryService.LoadOrCreateVaultIndexAsync(rootPath, MediaVaultType.Media);

        return loadedIndex != null
            ? new MediaFileVault(rootPath, (VaultIndex)loadedIndex, factoryService)
            : null;
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// An open read-only stream over a vault entry plus the extension needed to
|
||||
/// resolve its MIME type. Caller owns the stream and must dispose it.
|
||||
|
||||
@@ -11,6 +11,7 @@ public static class MediaVaultFactory
|
||||
{
|
||||
return mediaType switch
|
||||
{
|
||||
MediaVaultType.Media => await MediaFileVault.FromAsync(rootPath, factoryService),
|
||||
MediaVaultType.Image => await ImageVault.FromAsync(rootPath, factoryService),
|
||||
MediaVaultType.Audio => await AudioVault.FromAsync(rootPath, factoryService),
|
||||
_ => null
|
||||
|
||||
@@ -31,7 +31,8 @@ public class SimpleMediaTypeRegistry : IMediaTypeRegistry
|
||||
dto => MediaBinary.From(dto),
|
||||
binary => new MediaBinaryDto(binary),
|
||||
(key, ext, _) => new MetaData(key, ext),
|
||||
(binary, meta) => new MediaBinaryParams(binary.Buffer, binary.Size, meta.Extension));
|
||||
(binary, meta) => new MediaBinaryParams(binary.Buffer, binary.Size, meta.Extension),
|
||||
async path => await MediaFileVault.FromAsync(path));
|
||||
|
||||
RegisterType<ImageBinary, ImageBinaryParams, ImageBinaryDto, ImageMetaData>(
|
||||
MediaVaultType.Image,
|
||||
|
||||
@@ -45,6 +45,55 @@ public class AudioProcessor
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Locates the PCM sample region of a WAV buffer and returns it together with the format
/// parameters needed to decode it. Structure validation, header parsing and parameter
/// validation are delegated to the same helpers used for metadata extraction.
/// Unlike <see cref="ExtractWavMetadata"/>, no synthetic defaults are substituted on failure:
/// a loudness profile computed over fabricated silence would be misleading, so the method
/// reports "nothing computable" instead.
/// </summary>
/// <param name="buffer">The complete WAV file bytes.</param>
/// <returns>
/// The PCM region and its format, or <c>null</c> when the structure is invalid, the header
/// cannot be parsed or validated, or no sample bytes are present.
/// </returns>
public PcmData? TryExtractPcm(ReadOnlySpan<byte> buffer)
{
    // The shared parsers are array-based, so the span is materialized once. The memory
    // returned to the caller is a window into this same array, not a further copy.
    var wav = buffer.ToArray();

    var structure = ValidateWavStructure(wav);
    if (!structure.IsValid)
    {
        return null;
    }

    WavMetadata header;
    try
    {
        header = ParseWavMetadata(wav, structure);
        ValidateAudioParameters(header);
    }
    catch
    {
        // Any parse/validation failure means "not a usable PCM WAV" for this caller.
        return null;
    }

    // Sample bytes follow the 4-byte "data" chunk id and the 4-byte chunk size.
    var payloadOffset = structure.DataChunkPos + 8;
    if (payloadOffset > wav.Length)
    {
        return null;
    }

    // Some encoders declare a data size larger than the file; never read past the end.
    var remaining = wav.Length - payloadOffset;
    var payloadLength = Math.Min(header.DataSize, remaining);
    if (payloadLength <= 0)
    {
        return null;
    }

    return new PcmData(
        new ReadOnlyMemory<byte>(wav, payloadOffset, payloadLength),
        header.Channels,
        header.SampleRate,
        header.BitsPerSample);
}
|
||||
|
||||
/// <summary>
|
||||
/// Extracts metadata from WAV file buffer with comprehensive validation
|
||||
/// </summary>
|
||||
@@ -268,4 +317,18 @@ public class AudioProcessor
|
||||
public int FmtChunkPos { get; set; }
|
||||
public int DataChunkPos { get; set; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// The sample region of a WAV file bundled with the format parameters required to decode it.
/// <see cref="Pcm"/> covers the data chunk only; the header is not included.
/// </summary>
/// <param name="Pcm">PCM sample bytes, little-endian and interleaved by channel.</param>
/// <param name="Channels">How many channels are interleaved in each frame.</param>
/// <param name="SampleRate">Sampling frequency in samples per second.</param>
/// <param name="BitsPerSample">Bit depth of one sample (8, 16, 24, or 32).</param>
public readonly record struct PcmData(ReadOnlyMemory<byte> Pcm, int Channels, int SampleRate, int BitsPerSample);
|
||||
@@ -0,0 +1,23 @@
|
||||
namespace DeepDrftContent.Processors;
|
||||
|
||||
/// <summary>
/// Strategy that reduces PCM audio to a fixed-length, peak-normalized loudness envelope.
/// Keeping the measure behind an interface lets it change (RMS today, LUFS later) without
/// touching <c>WaveformProfileService</c>, the stored wire format, or the frontend renderer.
/// </summary>
public interface ILoudnessAlgorithm
{
    /// <summary>
    /// Builds a peak-normalized loudness profile from raw interleaved PCM bytes.
    /// </summary>
    /// <param name="pcmData">The WAV data chunk: interleaved, little-endian PCM sample bytes.</param>
    /// <param name="channels">Count of interleaved channels; each frame is averaged down to mono.</param>
    /// <param name="sampleRate">
    /// Samples per second. RMS does not need it, but it is part of the contract for measures that do.
    /// </param>
    /// <param name="bitsPerSample">Sample bit depth used for decoding: 8 (unsigned) or 16/24/32 (signed).</param>
    /// <param name="bucketCount">How many equal time slices the signal is reduced to.</param>
    /// <returns>
    /// An array of length <paramref name="bucketCount"/> with every value in [0, 1], scaled so the
    /// loudest bucket equals 1. The array is all zeros when the signal is silent (peak of 0) or
    /// contains no samples.
    /// </returns>
    double[] Compute(
        ReadOnlySpan<byte> pcmData,
        int channels,
        int sampleRate,
        int bitsPerSample,
        int bucketCount);
}
|
||||
@@ -0,0 +1,138 @@
|
||||
namespace DeepDrftContent.Processors;
|
||||
|
||||
/// <summary>
/// Loudness measured as the root-mean-square amplitude of each time bucket. PCM samples are
/// decoded (8-bit unsigned; 16/24/32-bit signed little-endian), the channels of every frame are
/// averaged to mono, the frames are partitioned into consecutive equal-duration slices, the RMS
/// of each slice is taken, and the result is divided by the largest bucket so the loudest
/// bucket is 1. Operates directly on the WAV data-chunk bytes with no external audio dependency.
/// </summary>
public class RmsLoudnessAlgorithm : ILoudnessAlgorithm
{
    /// <summary>
    /// Computes the peak-normalized RMS profile of <paramref name="pcmData"/>.
    /// </summary>
    /// <param name="pcmData">Interleaved, little-endian PCM sample bytes.</param>
    /// <param name="channels">Number of interleaved channels; non-positive yields all zeros.</param>
    /// <param name="sampleRate">Not used by this measure.</param>
    /// <param name="bitsPerSample">Bit depth: 8, 16, 24 or 32. Values below 8 yield all zeros.</param>
    /// <param name="bucketCount">Number of buckets in the returned profile.</param>
    /// <returns>
    /// An array of <paramref name="bucketCount"/> values in [0, 1]. All zeros when there are no
    /// complete frames or the signal is silent. A bucket that receives no frames (possible only
    /// when there are fewer frames than buckets) stays 0.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="bucketCount"/> is not positive, or at least one frame is present and
    /// <paramref name="bitsPerSample"/> is not a supported depth.
    /// </exception>
    public double[] Compute(ReadOnlySpan<byte> pcmData, int channels, int sampleRate, int bitsPerSample, int bucketCount)
    {
        if (bucketCount <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(bucketCount), "Bucket count must be positive.");
        }

        var profile = new double[bucketCount];

        var bytesPerSample = bitsPerSample / 8;
        if (channels <= 0 || bytesPerSample <= 0)
        {
            return profile;
        }

        var bytesPerFrame = bytesPerSample * channels;
        long frameCount = pcmData.Length / bytesPerFrame;
        if (frameCount <= 0)
        {
            // No complete frame (or a nonsensical frame size): nothing to measure.
            return profile;
        }

        // Frame f belongs to bucket floor(f * bucketCount / frameCount), so bucket b owns the
        // contiguous frame range [ceil(b * frameCount / bucketCount), ceil((b + 1) * frameCount / bucketCount)).
        // Walking the ranges directly needs no per-frame division and no bounds clamp.
        var peak = 0.0;
        long bucketStart = 0;
        for (var bucket = 0; bucket < bucketCount; bucket++)
        {
            // 64-bit math: (bucket + 1) * frameCount can exceed the int range on large inputs.
            var bucketEnd = ((bucket + 1L) * frameCount + bucketCount - 1) / bucketCount;
            var framesInBucket = bucketEnd - bucketStart;
            if (framesInBucket == 0)
            {
                continue;
            }

            double sumSquares = 0;
            for (var frame = bucketStart; frame < bucketEnd; frame++)
            {
                var frameBytes = pcmData.Slice((int)(frame * bytesPerFrame), bytesPerFrame);

                double channelSum = 0;
                for (var ch = 0; ch < channels; ch++)
                {
                    channelSum += ReadSampleNormalized(
                        frameBytes.Slice(ch * bytesPerSample, bytesPerSample), bitsPerSample);
                }

                var mono = channelSum / channels;
                sumSquares += mono * mono;
            }

            var rms = Math.Sqrt(sumSquares / framesInBucket);
            profile[bucket] = rms;
            if (rms > peak)
            {
                peak = rms;
            }

            bucketStart = bucketEnd;
        }

        if (peak <= 0)
        {
            // Silence: every bucket is already 0, so there is nothing to normalize.
            return profile;
        }

        for (var i = 0; i < profile.Length; i++)
        {
            profile[i] /= peak;
        }

        return profile;
    }

    /// <summary>
    /// Decodes the single PCM sample held in <paramref name="sample"/> to an amplitude in [-1, 1).
    /// 8-bit samples are unsigned with midpoint 128; 16/24/32-bit samples are signed little-endian.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">The bit depth is not 8, 16, 24 or 32.</exception>
    private static double ReadSampleNormalized(ReadOnlySpan<byte> sample, int bitsPerSample) => bitsPerSample switch
    {
        8 => (sample[0] - 128) / 128.0,
        16 => System.Buffers.Binary.BinaryPrimitives.ReadInt16LittleEndian(sample) / 32768.0,
        // The top byte is read as sbyte so shifting it into place sign-extends the 24-bit value.
        24 => (sample[0] | (sample[1] << 8) | ((sbyte)sample[2] << 16)) / 8388608.0,
        32 => System.Buffers.Binary.BinaryPrimitives.ReadInt32LittleEndian(sample) / 2147483648.0,
        _ => throw new ArgumentOutOfRangeException(
            nameof(bitsPerSample), bitsPerSample, "Unsupported PCM bit depth."),
    };
}
|
||||
@@ -0,0 +1,11 @@
|
||||
namespace DeepDrftContent.Processors;
|
||||
|
||||
/// <summary>
/// Settings for waveform loudness profiling.
/// </summary>
public class WaveformProfileOptions
{
    // Resolution used when configuration does not override it.
    private const int DefaultBucketCount = 512;

    /// <summary>
    /// The stored resolution: how many loudness buckets are computed and persisted for each
    /// track. This is also the number of bars the frontend WaveformSeeker renders.
    /// </summary>
    public int BucketCount { get; set; } = DefaultBucketCount;
}
|
||||
@@ -0,0 +1,123 @@
|
||||
using DeepDrftContent.Constants;
|
||||
using DeepDrftContent.FileDatabase.Models;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using FileDb = DeepDrftContent.FileDatabase.Services.FileDatabase;
|
||||
|
||||
namespace DeepDrftContent.Processors;
|
||||
|
||||
/// <summary>
/// Computes a track's waveform loudness profile from its WAV bytes and persists it as a sidecar
/// entry in the <see cref="VaultConstants.WaveformProfiles"/> vault under the track's EntryKey.
/// The profile is produced at upload time, off the playback path, and is what the frontend
/// fetches to draw the WaveformSeeker. The loudness measure is injected as an
/// <see cref="ILoudnessAlgorithm"/> so it can be replaced without changing storage or the wire format.
/// </summary>
public class WaveformProfileService
{
    private const string ProfileExtension = ".wfp";

    private readonly FileDb _fileDatabase;
    private readonly AudioProcessor _audioProcessor;
    private readonly ILoudnessAlgorithm _loudnessAlgorithm;
    private readonly WaveformProfileOptions _options;
    private readonly ILogger<WaveformProfileService> _logger;

    public WaveformProfileService(
        FileDb fileDatabase,
        AudioProcessor audioProcessor,
        ILoudnessAlgorithm loudnessAlgorithm,
        IOptions<WaveformProfileOptions> options,
        ILogger<WaveformProfileService> logger)
    {
        _fileDatabase = fileDatabase;
        _audioProcessor = audioProcessor;
        _loudnessAlgorithm = loudnessAlgorithm;
        _options = options.Value;
        _logger = logger;
    }

    /// <summary>
    /// Computes the loudness profile of <paramref name="wavBytes"/> and stores it under
    /// <paramref name="entryKey"/>.
    /// </summary>
    /// <param name="wavBytes">The complete WAV file bytes of the track.</param>
    /// <param name="entryKey">The track's entry key; the profile is stored under the same key.</param>
    /// <returns>
    /// <c>true</c> when the profile was stored. <c>false</c> (after logging) when PCM could not be
    /// extracted, the vault write reported failure, or any exception other than
    /// <see cref="OperationCanceledException"/> occurred. A missing profile is handled downstream,
    /// so upload callers can log-and-continue instead of failing the upload.
    /// </returns>
    public async Task<bool> ComputeAndStoreAsync(ReadOnlyMemory<byte> wavBytes, string entryKey)
    {
        try
        {
            if (_audioProcessor.TryExtractPcm(wavBytes.Span) is not { } audio)
            {
                _logger.LogWarning(
                    "Waveform profile not computed for {EntryKey}: WAV PCM could not be extracted.",
                    entryKey);
                return false;
            }

            var profile = _loudnessAlgorithm.Compute(
                audio.Pcm.Span,
                audio.Channels,
                audio.SampleRate,
                audio.BitsPerSample,
                _options.BucketCount);

            var quantized = Quantize(profile);

            // The profiles vault is created lazily, on the first profile that needs storing.
            await EnsureVaultAsync();

            var binary = new MediaBinary(new MediaBinaryParams(quantized, quantized.Length, ProfileExtension));
            var stored = await _fileDatabase.RegisterResourceAsync(
                VaultConstants.WaveformProfiles, entryKey, binary);

            if (!stored)
            {
                _logger.LogWarning("Waveform profile vault write failed for {EntryKey}.", entryKey);
                return false;
            }

            return true;
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(ex, "Waveform profile computation failed for {EntryKey}.", entryKey);
            return false;
        }
    }

    /// <summary>
    /// Returns the stored quantized profile for a track. Each byte is a peak-normalized loudness
    /// value in [0, 255].
    /// </summary>
    /// <param name="entryKey">The track's entry key.</param>
    /// <returns>
    /// The profile bytes, or <c>null</c> when none is stored: tracks uploaded before profiling
    /// existed have no profile, computation may have failed, and the profiles vault itself does
    /// not exist until the first profile has been stored.
    /// </returns>
    public async Task<byte[]?> GetProfileAsync(string entryKey)
    {
        // The vault is only created by ComputeAndStoreAsync. Until then there is nothing to
        // load, so answer "no profile" without querying a vault that is not there.
        if (!_fileDatabase.HasVault(VaultConstants.WaveformProfiles))
        {
            return null;
        }

        var binary = await _fileDatabase.LoadResourceAsync<MediaBinary>(
            VaultConstants.WaveformProfiles, entryKey);
        return binary?.Buffer;
    }

    /// <summary>
    /// Maps each bucket from [0, 1] to a byte in [0, 255] by multiplying by 255 and truncating,
    /// so exactly 1.0 maps to 255. Values outside [0, 1] are clamped and NaN becomes 0: the
    /// algorithm is swappable, and an unchecked double-to-byte cast of an out-of-range or NaN
    /// value has an unspecified result.
    /// </summary>
    private static byte[] Quantize(double[] profile)
    {
        var bytes = new byte[profile.Length];
        for (var i = 0; i < profile.Length; i++)
        {
            var level = profile[i];

            // NaN compares false against both bounds, so Math.Clamp would pass it through.
            var bounded = double.IsNaN(level) ? 0.0 : Math.Clamp(level, 0.0, 1.0);
            bytes[i] = (byte)(bounded * 255);
        }

        return bytes;
    }

    // Creates the Media-typed profiles vault on first use.
    private async Task EnsureVaultAsync()
    {
        if (!_fileDatabase.HasVault(VaultConstants.WaveformProfiles))
        {
            await _fileDatabase.CreateVaultAsync(VaultConstants.WaveformProfiles, MediaVaultType.Media);
        }
    }
}
|
||||
Reference in New Issue
Block a user