namespace DeepDrftContent.Processors;
/// <summary>
/// Loudness via root-mean-square amplitude per time bucket. Decodes integer PCM (8-bit unsigned;
/// 16/24/32-bit signed little-endian), averages channels to mono, partitions the frames into
/// equal time slices, takes the RMS of each slice, applies a ~15 ms envelope-follower smoothing
/// so the contour reads as a smooth curve rather than a spiky polygon, then peak-normalizes so
/// the loudest bucket is 1. No external audio dependency — operates directly on the WAV
/// data-chunk bytes.
/// </summary>
public class RmsLoudnessAlgorithm : ILoudnessAlgorithm
{
    /// <summary>
    /// Envelope-follower time constant, seconds. ~15 ms is the smoothing target (Phase 10
    /// tuning, reduced from 50 ms which was over-smoothed): long enough to round off the
    /// per-bucket RMS spikes into a smooth ribbon contour, short enough that real loudness
    /// transients (kicks, drops) still read. Applied as a symmetric (forward+backward) one-pole
    /// filter so the smoothing introduces no time lag.
    /// </summary>
    public const double SmoothingTimeConstantSeconds = 0.015;

    /// <summary>
    /// Computes a peak-normalized, smoothed RMS loudness contour for interleaved PCM data.
    /// </summary>
    /// <param name="pcmData">Raw interleaved PCM bytes. Trailing bytes that do not form a whole frame are ignored.</param>
    /// <param name="channels">Channels per frame. A non-positive value yields an all-zero result.</param>
    /// <param name="sampleRate">
    /// Frames per second; only used to derive the smoothing coefficient. A non-positive value
    /// produces a non-finite or negative bucket duration, which skips smoothing.
    /// </param>
    /// <param name="bitsPerSample">
    /// Bit depth: 8 (unsigned) or 16/24/32 (signed little-endian integers). Values below 8 yield
    /// an all-zero result.
    /// </param>
    /// <param name="bucketCount">Number of equal-duration buckets to produce. Must be positive.</param>
    /// <returns>
    /// An array of <paramref name="bucketCount"/> values in [0, 1] whose maximum is 1, or all
    /// zeros when the input is silent or contains no complete frame.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="bucketCount"/> is not positive, or <paramref name="bitsPerSample"/> is an
    /// unsupported depth and at least one frame has to be decoded.
    /// </exception>
    public double[] Compute(ReadOnlySpan<byte> pcmData, int channels, int sampleRate, int bitsPerSample, int bucketCount)
    {
        if (bucketCount <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(bucketCount), "Bucket count must be positive.");
        }

        var result = new double[bucketCount];
        if (channels <= 0)
        {
            return result;
        }

        var bytesPerSample = bitsPerSample / 8;
        if (bytesPerSample <= 0)
        {
            return result;
        }

        // Widen before multiplying: a 32-bit product can wrap for absurd channel counts, which
        // would either divide by zero below or let the channel loop index past the span.
        var bytesPerFrameWide = (long)bytesPerSample * channels;
        if (bytesPerFrameWide > pcmData.Length)
        {
            // Not even one complete frame available.
            return result;
        }

        // Safe narrowing: the value is positive and no larger than pcmData.Length (an int).
        var bytesPerFrame = (int)bytesPerFrameWide;
        var frameCount = pcmData.Length / bytesPerFrame;

        // Sum of squared mono amplitudes and the frame count, per bucket. A frame's bucket is
        // determined by its position in the timeline so buckets are equal-duration slices.
        var sumSquares = new double[bucketCount];
        var counts = new long[bucketCount];
        for (var frame = 0; frame < frameCount; frame++)
        {
            var frameStart = frame * bytesPerFrame;
            double channelSum = 0;
            for (var ch = 0; ch < channels; ch++)
            {
                var sampleStart = frameStart + ch * bytesPerSample;
                channelSum += ReadSampleNormalized(pcmData, sampleStart, bitsPerSample);
            }

            var mono = channelSum / channels;

            // long math avoids overflow on large files before the divide back into a bucket
            // index. Because frame < frameCount, the quotient is always < bucketCount.
            var bucket = (int)((long)frame * bucketCount / frameCount);
            sumSquares[bucket] += mono * mono;
            counts[bucket]++;
        }

        // NOTE(review): when bucketCount exceeds frameCount, buckets that received no frame stay
        // at 0 here — confirm that is the intended rendering for very short clips.
        for (var i = 0; i < bucketCount; i++)
        {
            if (counts[i] > 0)
            {
                result[i] = Math.Sqrt(sumSquares[i] / counts[i]);
            }
        }

        // Envelope smoothing (~15 ms): round the spiky per-bucket RMS into a smooth contour before
        // peak-normalization, so the rendered ribbon reads as a continuous curve, not faceted polygons.
        // Each bucket spans (totalSeconds / bucketCount) of audio; the filter coefficient is derived
        // from that against the time constant so the smoothing is duration-aware, not a fixed window.
        var totalSeconds = (double)frameCount / sampleRate;
        var bucketSeconds = totalSeconds / bucketCount;
        SmoothEnvelope(result, bucketSeconds);

        var peak = 0.0;
        for (var i = 0; i < bucketCount; i++)
        {
            if (result[i] > peak)
            {
                peak = result[i];
            }
        }

        if (peak <= 0)
        {
            // Silence — no bucket rose above zero, so return a clean all-zero contour.
            Array.Clear(result);
            return result;
        }

        for (var i = 0; i < bucketCount; i++)
        {
            result[i] /= peak;
        }

        return result;
    }

    /// <summary>
    /// Symmetric one-pole envelope smoothing over the per-bucket loudness, in place. A forward pass
    /// then a backward pass cancels the single-pole phase lag, so the smoothed contour stays aligned
    /// with the audio (no rightward time shift). The coefficient <c>a = exp(−bucketSeconds / τ)</c>,
    /// with τ = <see cref="SmoothingTimeConstantSeconds"/>, scales the response to the bucket
    /// duration: each bucket blends <c>(1 − a)</c> of itself with <c>a</c> of the running envelope.
    /// A non-positive or non-finite bucket duration, or fewer than two buckets, leaves the data
    /// untouched (nothing to smooth meaningfully).
    /// </summary>
    /// <param name="data">Per-bucket loudness values; overwritten with the smoothed contour.</param>
    /// <param name="bucketSeconds">Duration of audio covered by one bucket, in seconds.</param>
    private static void SmoothEnvelope(double[] data, double bucketSeconds)
    {
        if (data.Length < 2 || bucketSeconds <= 0 || !double.IsFinite(bucketSeconds))
        {
            return;
        }

        var a = Math.Exp(-bucketSeconds / SmoothingTimeConstantSeconds);

        // a→1 means buckets are far finer than τ (heavy smoothing); a→0 means each bucket already
        // spans ≫ τ, so smoothing is a no-op. Either extreme is handled by the blend below.

        // Forward pass, seeded with the first value so the start of the contour is not pulled toward 0.
        var env = data[0];
        for (var i = 0; i < data.Length; i++)
        {
            env = a * env + (1 - a) * data[i];
            data[i] = env;
        }

        // Backward pass (zero-phase): smooth the forward result in reverse so the net lag is zero.
        env = data[^1];
        for (var i = data.Length - 1; i >= 0; i--)
        {
            env = a * env + (1 - a) * data[i];
            data[i] = env;
        }
    }

    /// <summary>
    /// Decodes one PCM sample starting at <paramref name="offset"/> to a normalized amplitude in
    /// [-1, 1). 8-bit is unsigned (0..255, centered at 128); 16/24/32-bit are read as signed
    /// little-endian integers.
    /// </summary>
    /// <param name="data">Buffer containing the sample bytes.</param>
    /// <param name="offset">Index of the sample's first (least-significant) byte.</param>
    /// <param name="bitsPerSample">Bit depth of the sample: 8, 16, 24 or 32.</param>
    /// <returns>The sample divided by the magnitude of the most negative value for its bit depth.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="bitsPerSample"/> is not 8, 16, 24 or 32.</exception>
    private static double ReadSampleNormalized(ReadOnlySpan<byte> data, int offset, int bitsPerSample)
    {
        switch (bitsPerSample)
        {
            case 8:
                // Unsigned, midpoint 128.
                return (data[offset] - 128) / 128.0;

            case 16:
            {
                // The int → short narrowing deliberately reinterprets 0x8000..0xFFFF as negative;
                // 'unchecked' keeps that working even if the project enables overflow checking.
                short sample = unchecked((short)(data[offset] | (data[offset + 1] << 8)));
                return sample / 32768.0;
            }

            case 24:
            {
                // Sign-extend the 24-bit little-endian value into an int.
                int raw = data[offset] | (data[offset + 1] << 8) | (data[offset + 2] << 16);
                if ((raw & 0x800000) != 0)
                {
                    raw |= unchecked((int)0xFF000000);
                }

                return raw / 8388608.0;
            }

            case 32:
            {
                // Shifting the top byte by 24 places its high bit in the int's sign bit.
                int sample = data[offset]
                    | (data[offset + 1] << 8)
                    | (data[offset + 2] << 16)
                    | (data[offset + 3] << 24);
                return sample / 2147483648.0;
            }

            default:
                throw new ArgumentOutOfRangeException(
                    nameof(bitsPerSample), bitsPerSample, "Unsupported PCM bit depth.");
        }
    }
}