288 lines
10 KiB
C#
288 lines
10 KiB
C#
namespace DeepDrftContent.Processors;
|
||
|
||
/// <summary>
/// Loudness via root-mean-square amplitude per time bucket. Decodes PCM samples (8-bit unsigned,
/// 16/24/32-bit signed little-endian), averages channels to mono, partitions the frames into
/// equal time slices, takes the RMS of each slice, applies a ~15 ms envelope-follower smoothing
/// so the contour reads as a smooth curve rather than a spiky polygon, then peak-normalizes so
/// the loudest bucket is 1. No external audio dependency — operates directly on the raw PCM bytes.
/// </summary>
public class RmsLoudnessAlgorithm : ILoudnessAlgorithm
{
    /// <summary>
    /// Envelope-follower time constant, in seconds. 15 ms is the smoothing target (reduced from
    /// 50 ms, which was over-smoothed): long enough to round off the per-bucket RMS spikes into a
    /// smooth contour, short enough that real loudness transients still read. Applied by
    /// <see cref="SmoothEnvelope"/> as a symmetric (forward + backward) one-pole filter so the
    /// smoothing introduces no time lag.
    /// </summary>
    // Was 0.005 (5 ms), which contradicted the documented ~15 ms target and under-smoothed the contour.
    public const double SmoothingTimeConstantSeconds = 0.015;

    /// <summary>
    /// Whole-buffer reduction. Defined in terms of <see cref="CreateAccumulator"/> so the streaming and
    /// whole-buffer paths share one decode + finalize implementation rather than two parallel ones.
    /// </summary>
    /// <param name="pcmData">Raw interleaved PCM bytes.</param>
    /// <param name="channels">Number of interleaved channels per frame.</param>
    /// <param name="sampleRate">Frames per second; used only to derive the bucket duration for smoothing.</param>
    /// <param name="bitsPerSample">Bit depth of one sample (8, 16, 24 or 32).</param>
    /// <param name="bucketCount">Number of loudness buckets to produce; must be positive.</param>
    /// <returns>An array of <paramref name="bucketCount"/> loudness values.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="bucketCount"/> is zero or negative.</exception>
    public double[] Compute(ReadOnlySpan<byte> pcmData, int channels, int sampleRate, int bitsPerSample, int bucketCount)
    {
        var accumulator = CreateAccumulator(pcmData.Length, channels, sampleRate, bitsPerSample, bucketCount);
        accumulator.Add(pcmData);
        return accumulator.Finish();
    }

    /// <summary>
    /// Creates a streaming accumulator for PCM data of a known total byte length.
    /// </summary>
    /// <param name="pcmByteLength">Total number of PCM bytes that will be fed to the accumulator.</param>
    /// <param name="channels">Number of interleaved channels per frame.</param>
    /// <param name="sampleRate">Frames per second.</param>
    /// <param name="bitsPerSample">Bit depth of one sample.</param>
    /// <param name="bucketCount">Number of loudness buckets to produce; must be positive.</param>
    /// <returns>A new <see cref="RmsLoudnessAccumulator"/>.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="bucketCount"/> is zero or negative.</exception>
    public ILoudnessAccumulator CreateAccumulator(
        long pcmByteLength, int channels, int sampleRate, int bitsPerSample, int bucketCount)
    {
        if (bucketCount <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(bucketCount), "Bucket count must be positive.");
        }

        return new RmsLoudnessAccumulator(pcmByteLength, channels, sampleRate, bitsPerSample, bucketCount);
    }

    /// <summary>
    /// Symmetric one-pole envelope smoothing over the per-bucket loudness, in place. A forward pass
    /// followed by a backward pass cancels the single-pole phase lag, so the smoothed contour stays
    /// aligned with the audio (no rightward time shift). The coefficient
    /// <c>a = exp(-bucketSeconds / SmoothingTimeConstantSeconds)</c> means each bucket blends
    /// <c>(1 - a)</c> of itself with <c>a</c> of the running envelope.
    /// </summary>
    /// <param name="data">Per-bucket loudness values; overwritten with the smoothed values.</param>
    /// <param name="bucketSeconds">
    /// Duration of one bucket in seconds. A non-positive or non-finite value, or fewer than two
    /// buckets, leaves <paramref name="data"/> untouched.
    /// </param>
    internal static void SmoothEnvelope(double[] data, double bucketSeconds)
    {
        if (data.Length < 2 || bucketSeconds <= 0 || !double.IsFinite(bucketSeconds))
        {
            return;
        }

        var a = Math.Exp(-bucketSeconds / SmoothingTimeConstantSeconds);
        // a -> 1 means buckets are far finer than the time constant (heavy smoothing); a -> 0 means each
        // bucket already spans much more than it, so smoothing is a no-op. Both extremes fall out of the blend.

        // Forward pass, seeded with the first bucket so data[0] is unchanged by this pass.
        var env = data[0];
        for (var i = 0; i < data.Length; i++)
        {
            env = a * env + (1 - a) * data[i];
            data[i] = env;
        }

        // Backward pass (zero-phase): smooth the forward result in reverse so the net lag is zero.
        env = data[^1];
        for (var i = data.Length - 1; i >= 0; i--)
        {
            env = a * env + (1 - a) * data[i];
            data[i] = env;
        }
    }

    /// <summary>
    /// Decodes one PCM sample at <paramref name="offset"/> to a normalized amplitude in [-1, 1).
    /// 8-bit is unsigned (0..255, centered at 128); 16/24/32-bit are signed little-endian.
    /// </summary>
    /// <param name="data">Buffer holding the sample bytes.</param>
    /// <param name="offset">Index of the sample's first (least significant) byte.</param>
    /// <param name="bitsPerSample">Bit depth: 8, 16, 24 or 32.</param>
    /// <returns>The sample divided by the magnitude of the most negative value at that bit depth.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="bitsPerSample"/> is not 8, 16, 24 or 32.</exception>
    internal static double ReadSampleNormalized(ReadOnlySpan<byte> data, int offset, int bitsPerSample)
    {
        switch (bitsPerSample)
        {
            case 8:
                // Unsigned, midpoint 128.
                return (data[offset] - 128) / 128.0;

            case 16:
            {
                // The cast to short reinterprets the top bit as the sign.
                short sample = (short)(data[offset] | (data[offset + 1] << 8));
                return sample / 32768.0;
            }

            case 24:
            {
                // Sign-extend the 24-bit little-endian value into an int.
                int raw = data[offset] | (data[offset + 1] << 8) | (data[offset + 2] << 16);
                if ((raw & 0x800000) != 0)
                {
                    raw |= unchecked((int)0xFF000000);
                }
                return raw / 8388608.0;
            }

            case 32:
            {
                int sample = data[offset]
                    | (data[offset + 1] << 8)
                    | (data[offset + 2] << 16)
                    | (data[offset + 3] << 24);
                return sample / 2147483648.0;
            }

            default:
                throw new ArgumentOutOfRangeException(
                    nameof(bitsPerSample), bitsPerSample, "Unsupported PCM bit depth.");
        }
    }
}
|
||
|
||
/// <summary>
/// Streaming RMS accumulator behind <see cref="RmsLoudnessAlgorithm"/>. PCM bytes are supplied through
/// <see cref="Add"/> in chunks of any size; when a chunk ends mid-frame the leftover bytes are parked in
/// a one-frame buffer and completed by the next chunk. Frames are decoded, assigned to a bucket and
/// accumulated strictly in arrival order, so the outcome does not depend on how the input was split.
/// <see cref="Finish"/> converts the sums to RMS, smooths the envelope and peak-normalizes.
/// Working memory is two arrays of bucketCount entries plus a single frame of bytes.
/// </summary>
public sealed class RmsLoudnessAccumulator : ILoudnessAccumulator
{
    // Stream format and bucket layout, fixed at construction.
    private readonly int _channels;
    private readonly int _sampleRate;
    private readonly int _bitsPerSample;
    private readonly int _bucketCount;
    private readonly int _bytesPerSample;
    private readonly int _bytesPerFrame;
    private readonly long _frameCount;

    // Per-bucket running sum of squared mono samples and the number of frames behind each sum.
    private readonly double[] _sumSquares;
    private readonly long[] _counts;

    // Holds the bytes of a frame that was split across two Add calls.
    private readonly byte[] _carry;
    private int _carryFill;

    // Index of the next frame to be processed; also the count of frames consumed so far.
    private long _nextFrame;

    /// <summary>
    /// Sets up the bucket arrays and derives the frame size and expected frame count.
    /// Non-positive <paramref name="channels"/> or a bit depth below 8 yields an expected frame
    /// count of zero, which makes <see cref="Add"/> a no-op and <see cref="Finish"/> return zeros.
    /// </summary>
    internal RmsLoudnessAccumulator(long pcmByteLength, int channels, int sampleRate, int bitsPerSample, int bucketCount)
    {
        _channels = channels;
        _sampleRate = sampleRate;
        _bitsPerSample = bitsPerSample;
        _bucketCount = bucketCount;
        _sumSquares = new double[bucketCount];
        _counts = new long[bucketCount];
        _bytesPerSample = bitsPerSample / 8;

        var formatIsUsable = channels > 0 && _bytesPerSample > 0;
        if (formatIsUsable)
        {
            _bytesPerFrame = _bytesPerSample * channels;
            // Integer division: trailing bytes that do not make up a whole frame are not counted.
            _frameCount = pcmByteLength / _bytesPerFrame;
            _carry = new byte[_bytesPerFrame];
        }
        else
        {
            _bytesPerFrame = 0;
            _frameCount = 0;
            _carry = [];
        }
    }

    /// <summary>
    /// Consumes the next chunk of PCM bytes. Calls made after every expected frame has been
    /// processed (or on a degenerate format) are ignored.
    /// </summary>
    /// <param name="pcmChunk">The next consecutive slice of the PCM byte stream.</param>
    public void Add(ReadOnlySpan<byte> pcmChunk)
    {
        if (_nextFrame >= _frameCount)
        {
            return;
        }

        var cursor = 0;

        // A frame left unfinished by the previous chunk is topped up before anything else.
        if (_carryFill > 0)
        {
            var missing = _bytesPerFrame - _carryFill;
            var copied = missing < pcmChunk.Length ? missing : pcmChunk.Length;
            pcmChunk[..copied].CopyTo(_carry.AsSpan(_carryFill));
            _carryFill += copied;
            cursor = copied;

            if (_carryFill < _bytesPerFrame)
            {
                // The chunk was too short to finish the frame; wait for more bytes.
                return;
            }

            ProcessFrame(_carry);
            _carryFill = 0;

            if (_nextFrame >= _frameCount)
            {
                return;
            }
        }

        // Every complete frame that lies wholly inside this chunk.
        for (; _nextFrame < _frameCount && cursor + _bytesPerFrame <= pcmChunk.Length; cursor += _bytesPerFrame)
        {
            ProcessFrame(pcmChunk.Slice(cursor, _bytesPerFrame));
        }

        // Park an incomplete tail for the next call, but only if more frames are still expected;
        // once the expected frame count is reached any surplus bytes are discarded.
        var tail = pcmChunk[cursor..];
        if (_nextFrame < _frameCount && !tail.IsEmpty)
        {
            tail.CopyTo(_carry);
            _carryFill = tail.Length;
        }
    }

    /// <summary>
    /// Decodes one frame, averages its channels to mono and adds the squared value to the
    /// bucket that the frame's position maps to.
    /// </summary>
    private void ProcessFrame(ReadOnlySpan<byte> frame)
    {
        var total = 0.0;
        var sampleOffset = 0;
        for (var c = 0; c < _channels; c++, sampleOffset += _bytesPerSample)
        {
            total += RmsLoudnessAlgorithm.ReadSampleNormalized(frame, sampleOffset, _bitsPerSample);
        }

        var mono = total / _channels;

        // The product is evaluated in 64-bit arithmetic before dividing back down to a bucket index;
        // the index is then clamped to the last bucket.
        var slot = Math.Min((int)(_nextFrame * _bucketCount / _frameCount), _bucketCount - 1);

        _sumSquares[slot] += mono * mono;
        _counts[slot]++;
        _nextFrame++;
    }

    /// <summary>
    /// Produces the loudness profile from whatever has been accumulated so far.
    /// </summary>
    /// <returns>
    /// A new array with one value per bucket: all zeros when no frames were expected or the
    /// signal is silent, otherwise the smoothed RMS scaled so the largest value is 1.
    /// </returns>
    public double[] Finish()
    {
        var profile = new double[_bucketCount];
        if (_frameCount == 0)
        {
            return profile;
        }

        // RMS per bucket; buckets that received no frames stay at zero.
        for (var b = 0; b < profile.Length; b++)
        {
            var frames = _counts[b];
            if (frames > 0)
            {
                profile[b] = Math.Sqrt(_sumSquares[b] / frames);
            }
        }

        // Bucket duration is derived from the expected frame count and the sample rate, and the
        // smoothing time constant is RmsLoudnessAlgorithm.SmoothingTimeConstantSeconds.
        var secondsPerBucket = (double)_frameCount / _sampleRate / _bucketCount;
        RmsLoudnessAlgorithm.SmoothEnvelope(profile, secondsPerBucket);

        var peak = 0.0;
        foreach (var level in profile)
        {
            if (level > peak)
            {
                peak = level;
            }
        }

        if (peak <= 0)
        {
            Array.Clear(profile);
            return profile;
        }

        // Scale so the loudest bucket becomes exactly 1.
        for (var b = 0; b < profile.Length; b++)
        {
            profile[b] /= peak;
        }

        return profile;
    }
}
|