namespace DeepDrftContent.Processors;

/// <summary>
/// Loudness via root-mean-square amplitude per time bucket. Decodes integer PCM (8-bit unsigned,
/// 16/24/32-bit signed little-endian), averages channels to mono, partitions the frames into
/// equal time slices, takes the RMS of each slice, applies a ~15 ms envelope-follower smoothing
/// so the contour reads as a smooth curve rather than a spiky polygon, then peak-normalizes so
/// the loudest bucket is 1. No external audio dependency — operates directly on the raw PCM
/// bytes (e.g. the data chunk of a WAV file).
/// </summary>
public class RmsLoudnessAlgorithm : ILoudnessAlgorithm
{
    /// <summary>
    /// Envelope-follower time constant, seconds. ~15 ms is the smoothing target (Phase 10
    /// tuning, reduced from 50 ms which was over-smoothed): long enough to round off the
    /// per-bucket RMS spikes into a smooth ribbon contour, short enough that real loudness
    /// transients (kicks, drops) still read. Applied as a symmetric (forward + backward)
    /// one-pole filter so the smoothing introduces no time lag.
    /// </summary>
    public const double SmoothingTimeConstantSeconds = 0.015;

    /// <summary>
    /// Computes a peak-normalized, smoothed RMS loudness contour over interleaved PCM data.
    /// </summary>
    /// <param name="pcmData">
    /// Interleaved PCM sample bytes. Trailing bytes that do not form a complete frame are ignored.
    /// </param>
    /// <param name="channels">
    /// Number of interleaved channels. A non-positive value yields an all-zero result.
    /// </param>
    /// <param name="sampleRate">
    /// Frames per second; used only to derive the smoothing coefficient. A non-positive value
    /// produces a negative or non-finite bucket duration, in which case smoothing is skipped.
    /// </param>
    /// <param name="bitsPerSample">
    /// Sample width in bits: 8 (unsigned) or 16/24/32 (signed little-endian). Values below 8
    /// yield an all-zero result.
    /// </param>
    /// <param name="bucketCount">Number of equal-duration buckets to produce. Must be positive.</param>
    /// <returns>
    /// An array of <paramref name="bucketCount"/> values. For non-silent input the largest value
    /// is 1; for silent, empty, or undecodable-layout input every value is 0. Buckets that
    /// receive no frames (possible only when there are fewer frames than buckets) contribute 0
    /// to the contour before smoothing.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="bucketCount"/> is not positive, or <paramref name="bitsPerSample"/> is 8 or
    /// more but not one of 8, 16, 24, 32 while the data contains at least one complete frame.
    /// </exception>
    public double[] Compute(ReadOnlySpan<byte> pcmData, int channels, int sampleRate, int bitsPerSample, int bucketCount)
    {
        if (bucketCount <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(bucketCount), "Bucket count must be positive.");
        }

        var result = new double[bucketCount];

        if (channels <= 0)
        {
            return result;
        }

        var bytesPerSample = bitsPerSample / 8;
        if (bytesPerSample <= 0)
        {
            return result;
        }

        var bytesPerFrame = bytesPerSample * channels;
        var frameCount = pcmData.Length / bytesPerFrame;
        if (frameCount == 0)
        {
            return result;
        }

        // Sum of squared mono amplitudes and the frame count, per bucket. A frame's bucket is
        // determined by its position in the timeline so buckets are equal-duration slices.
        var sumSquares = new double[bucketCount];
        var counts = new long[bucketCount];

        for (var frame = 0; frame < frameCount; frame++)
        {
            var frameStart = frame * bytesPerFrame;

            double channelSum = 0;
            for (var ch = 0; ch < channels; ch++)
            {
                var sampleStart = frameStart + ch * bytesPerSample;
                channelSum += ReadSampleNormalized(pcmData, sampleStart, bitsPerSample);
            }

            var mono = channelSum / channels;

            // long math avoids overflow on large files before the divide back into bucket index.
            var bucket = (int)((long)frame * bucketCount / frameCount);
            if (bucket >= bucketCount)
            {
                // Defensive clamp; frame < frameCount already keeps the quotient below bucketCount.
                bucket = bucketCount - 1;
            }

            sumSquares[bucket] += mono * mono;
            counts[bucket]++;
        }

        for (var i = 0; i < bucketCount; i++)
        {
            if (counts[i] > 0)
            {
                result[i] = Math.Sqrt(sumSquares[i] / counts[i]);
            }
        }

        // Envelope smoothing (~15 ms): round the spiky per-bucket RMS into a smooth contour before
        // peak-normalization, so the rendered ribbon reads as a continuous curve, not faceted polygons.
        // Each bucket spans (totalSeconds / bucketCount) of audio; the filter coefficient is derived
        // from that against the time constant so the smoothing is duration-aware, not a fixed window.
        var totalSeconds = (double)frameCount / sampleRate;
        var bucketSeconds = totalSeconds / bucketCount;
        SmoothEnvelope(result, bucketSeconds);

        var peak = 0.0;
        for (var i = 0; i < bucketCount; i++)
        {
            if (result[i] > peak)
            {
                peak = result[i];
            }
        }

        if (peak <= 0)
        {
            // Silence — return all zeros.
            Array.Clear(result);
            return result;
        }

        for (var i = 0; i < bucketCount; i++)
        {
            result[i] /= peak;
        }

        return result;
    }

    /// <summary>
    /// Symmetric one-pole envelope smoothing over the per-bucket loudness, in place. A forward pass
    /// then a backward pass cancels the single-pole phase lag, so the smoothed contour stays aligned
    /// with the audio (no rightward time shift). The coefficient a = exp(−bucketSeconds / τ), with
    /// τ = <see cref="SmoothingTimeConstantSeconds"/>, ties the amount of smoothing to the bucket
    /// duration: each bucket blends (1 − a) of itself with a of the running envelope. Fewer than
    /// two buckets, or a non-positive or non-finite bucket duration, leaves the data untouched
    /// (nothing to smooth meaningfully).
    /// </summary>
    /// <param name="data">Per-bucket loudness values; overwritten with the smoothed contour.</param>
    /// <param name="bucketSeconds">Duration of audio covered by one bucket, in seconds.</param>
    private static void SmoothEnvelope(double[] data, double bucketSeconds)
    {
        if (data.Length < 2 || bucketSeconds <= 0 || !double.IsFinite(bucketSeconds))
        {
            return;
        }

        var a = Math.Exp(-bucketSeconds / SmoothingTimeConstantSeconds);

        // a→1 means buckets are far finer than τ (heavy smoothing); a→0 means each bucket already
        // spans ≫ τ, so smoothing is a no-op. Either extreme is handled by the blend below.

        // Forward pass.
        var env = data[0];
        for (var i = 0; i < data.Length; i++)
        {
            env = a * env + (1 - a) * data[i];
            data[i] = env;
        }

        // Backward pass (zero-phase): smooth the forward result in reverse so the net lag is zero.
        env = data[^1];
        for (var i = data.Length - 1; i >= 0; i--)
        {
            env = a * env + (1 - a) * data[i];
            data[i] = env;
        }
    }

    /// <summary>
    /// Decodes one PCM sample at <paramref name="offset"/> to a normalized amplitude in [-1, 1).
    /// 8-bit is unsigned (0..255, centered at 128); 16/24/32-bit are signed little-endian.
    /// </summary>
    /// <param name="data">Buffer containing the sample bytes.</param>
    /// <param name="offset">Index of the sample's first (least significant) byte.</param>
    /// <param name="bitsPerSample">Sample width in bits: 8, 16, 24, or 32.</param>
    /// <returns>The sample divided by the magnitude of the most negative value at that width.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="bitsPerSample"/> is not 8, 16, 24, or 32.
    /// </exception>
    private static double ReadSampleNormalized(ReadOnlySpan<byte> data, int offset, int bitsPerSample)
    {
        switch (bitsPerSample)
        {
            case 8:
                // Unsigned, midpoint 128.
                return (data[offset] - 128) / 128.0;

            case 16:
            {
                // The narrowing cast deliberately reinterprets 0x8000..0xFFFF as negative values;
                // unchecked keeps that true even when the project enables overflow checking.
                short sample = unchecked((short)(data[offset] | (data[offset + 1] << 8)));
                return sample / 32768.0;
            }

            case 24:
            {
                // Sign-extend the 24-bit little-endian value into an int.
                int raw = data[offset] | (data[offset + 1] << 8) | (data[offset + 2] << 16);
                if ((raw & 0x800000) != 0)
                {
                    raw |= unchecked((int)0xFF000000);
                }

                return raw / 8388608.0;
            }

            case 32:
            {
                int sample = data[offset]
                    | (data[offset + 1] << 8)
                    | (data[offset + 2] << 16)
                    | (data[offset + 3] << 24);
                return sample / 2147483648.0;
            }

            default:
                throw new ArgumentOutOfRangeException(
                    nameof(bitsPerSample), bitsPerSample, "Unsupported PCM bit depth.");
        }
    }
}