// deepdrft/DeepDrftContent/Processors/RmsLoudnessAlgorithm.cs

namespace DeepDrftContent.Processors;

/// <summary>
/// Loudness contour built from per-bucket root-mean-square amplitude. Integer PCM is decoded
/// (8-bit unsigned; 16/24/32-bit signed little-endian), the channels of each frame are averaged
/// to mono, and the frames are split into equal-duration buckets. Each bucket's RMS is then
/// smoothed with a ~50 ms zero-lag envelope follower and the whole contour is scaled so its
/// largest value is 1. Works directly on the raw PCM bytes; no audio library is involved.
/// </summary>
public class RmsLoudnessAlgorithm : ILoudnessAlgorithm
{
    /// <summary>
    /// Time constant (τ) of the envelope follower, in seconds. At ~50 ms the per-bucket RMS
    /// spikes are rounded into a smooth ribbon while genuine loudness transients remain visible.
    /// The filter is run forward and then backward, so the smoothing adds no time shift.
    /// </summary>
    public const double SmoothingTimeConstantSeconds = 0.05;

    /// <summary>
    /// Computes a peak-normalized loudness contour of <paramref name="bucketCount"/> values.
    /// </summary>
    /// <param name="pcmData">Interleaved PCM bytes. Trailing bytes that do not fill a whole frame are ignored.</param>
    /// <param name="channels">Channels per frame; a non-positive value yields an all-zero result.</param>
    /// <param name="sampleRate">Frames per second; only used to size the smoothing relative to the bucket duration.</param>
    /// <param name="bitsPerSample">Bit depth: 8, 16, 24 or 32. Values below 8 yield an all-zero result.</param>
    /// <param name="bucketCount">Number of equal-duration buckets to produce; must be positive.</param>
    /// <returns>
    /// An array of length <paramref name="bucketCount"/>. For audible input the values lie in
    /// [0, 1] with the loudest bucket at exactly 1; for silence, or when there is not enough
    /// data for a single frame, every value is 0.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="bucketCount"/> is not positive, or samples had to be read at a bit depth
    /// other than 8, 16, 24 or 32.
    /// </exception>
    public double[] Compute(ReadOnlySpan<byte> pcmData, int channels, int sampleRate, int bitsPerSample, int bucketCount)
    {
        if (bucketCount <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(bucketCount), "Bucket count must be positive.");
        }

        var loudness = new double[bucketCount];

        // Degenerate layouts (no channels, sub-byte samples) produce a flat-zero contour.
        var sampleBytes = bitsPerSample / 8;
        if (channels <= 0 || sampleBytes <= 0)
        {
            return loudness;
        }

        var frameBytes = sampleBytes * channels;
        var totalFrames = pcmData.Length / frameBytes;
        if (totalFrames == 0)
        {
            return loudness;
        }

        // Pass 1: accumulate squared mono amplitude straight into the output array, tracking
        // how many frames landed in each bucket. Buckets are assigned by timeline position,
        // so every bucket covers the same span of time.
        var framesInBucket = new long[bucketCount];
        var cursor = 0;

        for (var frameIndex = 0; frameIndex < totalFrames; frameIndex++)
        {
            var mixed = 0.0;
            for (var remaining = channels; remaining > 0; remaining--)
            {
                mixed += ReadSampleNormalized(pcmData, cursor, bitsPerSample);
                cursor += sampleBytes;
            }

            var mono = mixed / channels;

            // Multiply in 64-bit so frameIndex * bucketCount cannot wrap before the division.
            var slot = Math.Min((int)((long)frameIndex * bucketCount / totalFrames), bucketCount - 1);

            loudness[slot] += mono * mono;
            framesInBucket[slot]++;
        }

        // Pass 2: turn each energy sum into an RMS value; empty buckets stay at zero.
        for (var slot = 0; slot < bucketCount; slot++)
        {
            if (framesInBucket[slot] > 0)
            {
                loudness[slot] = Math.Sqrt(loudness[slot] / framesInBucket[slot]);
            }
        }

        // Pass 3: smooth before normalizing. The filter strength follows from how much audio a
        // single bucket represents, so it adapts to the clip length instead of using a fixed window.
        var clipSeconds = (double)totalFrames / sampleRate;
        SmoothEnvelope(loudness, clipSeconds / bucketCount);

        // Pass 4: scale so the loudest bucket is 1.
        var peak = 0.0;
        foreach (var level in loudness)
        {
            if (level > peak)
            {
                peak = level;
            }
        }

        if (peak > 0)
        {
            for (var slot = 0; slot < bucketCount; slot++)
            {
                loudness[slot] /= peak;
            }
        }
        else
        {
            // Silent input: hand back a clean all-zero contour.
            Array.Clear(loudness);
        }

        return loudness;
    }

    /// <summary>
    /// Smooths <paramref name="data"/> in place with a one-pole low-pass run in both directions.
    /// Running the same filter forward and then backward cancels its phase lag, so the contour
    /// is not shifted in time. The per-step retention is <c>a = exp(−bucketSeconds / τ)</c> with
    /// τ = <see cref="SmoothingTimeConstantSeconds"/>: every output is <c>a</c> parts running
    /// envelope plus <c>1 − a</c> parts current value. Nothing is changed when there are fewer
    /// than two values or when <paramref name="bucketSeconds"/> is non-positive or non-finite.
    /// </summary>
    private static void SmoothEnvelope(double[] data, double bucketSeconds)
    {
        var smoothable = data.Length >= 2 && bucketSeconds > 0 && double.IsFinite(bucketSeconds);
        if (!smoothable)
        {
            return;
        }

        // Close to 1 when buckets are much shorter than τ (strong smoothing); close to 0 when a
        // bucket already spans far more than τ (the filter degenerates to a pass-through).
        var a = Math.Exp(-bucketSeconds / SmoothingTimeConstantSeconds);

        Sweep(data, 0, 1, a);                 // left to right
        Sweep(data, data.Length - 1, -1, a);  // right to left, undoing the lag of the first sweep

        // One directional pass of the filter, seeded with the first value it visits.
        static void Sweep(double[] values, int start, int step, double keep)
        {
            var envelope = values[start];
            for (var i = start; i >= 0 && i < values.Length; i += step)
            {
                envelope = keep * envelope + (1 - keep) * values[i];
                values[i] = envelope;
            }
        }
    }

    /// <summary>
    /// Reads the PCM sample starting at <paramref name="offset"/> and scales it to [-1, 1].
    /// 8-bit samples are unsigned with 128 as the zero level; 16-, 24- and 32-bit samples are
    /// signed and stored least-significant byte first.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="bitsPerSample"/> is not 8, 16, 24 or 32.
    /// </exception>
    private static double ReadSampleNormalized(ReadOnlySpan<byte> data, int offset, int bitsPerSample)
    {
        if (bitsPerSample is not (8 or 16 or 24 or 32))
        {
            throw new ArgumentOutOfRangeException(
                nameof(bitsPerSample), bitsPerSample, "Unsupported PCM bit depth.");
        }

        if (bitsPerSample == 8)
        {
            // Offset-binary: re-center 0..255 so that 128 becomes zero.
            return (data[offset] - 128) / 128.0;
        }

        // Gather the little-endian bytes, least-significant first.
        var raw = 0;
        for (int index = 0, shift = 0; shift < bitsPerSample; index++, shift += 8)
        {
            raw |= data[offset + index] << shift;
        }

        // Push the sample's sign bit up to bit 31, then shift back arithmetically to
        // sign-extend. For 32-bit samples the shift distance is zero and the value is unchanged.
        var padding = 32 - bitsPerSample;
        var signed = (raw << padding) >> padding;

        // Full scale is 2^(bits − 1): 32768, 8388608 or 2147483648.
        return signed / (double)(1L << (bitsPerSample - 1));
    }
}