Files

288 lines
10 KiB
C#
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
namespace DeepDrftContent.Processors;
/// <summary>
/// Loudness via root-mean-square amplitude per time bucket. Decodes integer PCM (8-bit unsigned;
/// 16/24/32-bit signed little-endian), averages channels to mono, partitions the frames into
/// equal time slices, takes the RMS of each slice, applies a ~15 ms envelope-follower smoothing
/// so the contour reads as a smooth curve rather than a spiky polygon, then peak-normalizes so
/// the loudest bucket is 1. No external audio dependency — operates directly on the WAV data-chunk bytes.
/// </summary>
public class RmsLoudnessAlgorithm : ILoudnessAlgorithm
{
    /// <summary>
    /// Envelope-follower time constant, seconds. ~15 ms is the smoothing target (Phase 10
    /// tuning, reduced from 50 ms which was over-smoothed): long enough to round off the
    /// per-bucket RMS spikes into a smooth ribbon contour, short enough that real loudness
    /// transients (kicks, drops) still read. Applied as a symmetric (forward+backward) one-pole
    /// filter so the smoothing introduces no time lag.
    /// </summary>
    // 0.015 s == 15 ms, matching the documented target (the value previously read 0.005, i.e. 5 ms).
    public const double SmoothingTimeConstantSeconds = 0.015;

    /// <summary>
    /// Whole-buffer reduction. Defined in terms of <see cref="CreateAccumulator"/> so the streaming and
    /// whole-buffer paths share one decode + finalize implementation — byte-identical output by
    /// construction, not by parallel maintenance.
    /// </summary>
    /// <param name="pcmData">Raw interleaved PCM bytes (the WAV data chunk).</param>
    /// <param name="channels">Number of interleaved channels per frame.</param>
    /// <param name="sampleRate">Frames per second; used only to derive the bucket duration for smoothing.</param>
    /// <param name="bitsPerSample">PCM bit depth: 8, 16, 24 or 32.</param>
    /// <param name="bucketCount">Number of loudness buckets to produce; must be positive.</param>
    /// <returns>
    /// An array of <paramref name="bucketCount"/> values, peak-normalized so the loudest bucket is 1,
    /// or all zeros when the input is degenerate or silent.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="bucketCount"/> is not positive, or a sample is decoded with an unsupported
    /// <paramref name="bitsPerSample"/>.
    /// </exception>
    public double[] Compute(ReadOnlySpan<byte> pcmData, int channels, int sampleRate, int bitsPerSample, int bucketCount)
    {
        var accumulator = CreateAccumulator(pcmData.Length, channels, sampleRate, bitsPerSample, bucketCount);
        accumulator.Add(pcmData);
        return accumulator.Finish();
    }

    /// <summary>
    /// Creates a streaming accumulator sized for <paramref name="pcmByteLength"/> bytes of PCM, to be
    /// fed chunk by chunk and then finalized.
    /// </summary>
    /// <param name="pcmByteLength">Total number of PCM bytes that will be fed to the accumulator.</param>
    /// <param name="channels">Number of interleaved channels per frame.</param>
    /// <param name="sampleRate">Frames per second.</param>
    /// <param name="bitsPerSample">PCM bit depth.</param>
    /// <param name="bucketCount">Number of loudness buckets to produce; must be positive.</param>
    /// <returns>A new <see cref="RmsLoudnessAccumulator"/>.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="bucketCount"/> is not positive.</exception>
    public ILoudnessAccumulator CreateAccumulator(
        long pcmByteLength, int channels, int sampleRate, int bitsPerSample, int bucketCount)
    {
        if (bucketCount <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(bucketCount), "Bucket count must be positive.");
        }

        return new RmsLoudnessAccumulator(pcmByteLength, channels, sampleRate, bitsPerSample, bucketCount);
    }

    /// <summary>
    /// Symmetric one-pole envelope smoothing over the per-bucket loudness, in place. A forward pass
    /// then a backward pass cancels the single-pole phase lag, so the smoothed contour stays aligned
    /// with the audio (no rightward time shift). The coefficient <c>a = exp(-bucketSeconds / τ)</c>,
    /// with τ = <see cref="SmoothingTimeConstantSeconds"/>, gives a
    /// <paramref name="bucketSeconds"/>-relative response targeting the ~15 ms time constant:
    /// each bucket blends <c>(1 - a)</c> of itself with <c>a</c> of the running envelope. Fewer than
    /// two buckets, or a non-positive or non-finite bucket duration, leaves the data untouched
    /// (nothing to smooth meaningfully).
    /// </summary>
    /// <param name="data">Per-bucket loudness values; overwritten with the smoothed contour.</param>
    /// <param name="bucketSeconds">Duration of one bucket, in seconds.</param>
    internal static void SmoothEnvelope(double[] data, double bucketSeconds)
    {
        if (data.Length < 2 || bucketSeconds <= 0 || !double.IsFinite(bucketSeconds))
        {
            return;
        }

        var a = Math.Exp(-bucketSeconds / SmoothingTimeConstantSeconds);

        // a→1 means buckets are far finer than τ (heavy smoothing); a→0 means each bucket already
        // spans ≫ τ, so smoothing is a no-op. Either extreme is handled by the blend below.

        // Forward pass, seeded with the first bucket so the contour does not ramp up from zero.
        var env = data[0];
        for (var i = 0; i < data.Length; i++)
        {
            env = a * env + (1 - a) * data[i];
            data[i] = env;
        }

        // Backward pass (zero-phase): smooth the forward result in reverse so the net lag is zero.
        env = data[^1];
        for (var i = data.Length - 1; i >= 0; i--)
        {
            env = a * env + (1 - a) * data[i];
            data[i] = env;
        }
    }

    /// <summary>
    /// Decodes one PCM sample at <paramref name="offset"/> to a normalized amplitude in [-1, 1).
    /// 8-bit is unsigned (0..255, centered at 128); 16/24/32-bit are signed little-endian.
    /// </summary>
    /// <param name="data">Buffer holding the sample bytes.</param>
    /// <param name="offset">Index of the sample's first (least-significant) byte within <paramref name="data"/>.</param>
    /// <param name="bitsPerSample">PCM bit depth: 8, 16, 24 or 32.</param>
    /// <returns>The sample divided by its format's full-scale magnitude.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="bitsPerSample"/> is not 8, 16, 24 or 32.</exception>
    internal static double ReadSampleNormalized(ReadOnlySpan<byte> data, int offset, int bitsPerSample)
    {
        switch (bitsPerSample)
        {
            case 8:
                // Unsigned, midpoint 128.
                return (data[offset] - 128) / 128.0;
            case 16:
            {
                // The cast to short reinterprets the assembled 16 bits as a signed value.
                short sample = (short)(data[offset] | (data[offset + 1] << 8));
                return sample / 32768.0;
            }
            case 24:
            {
                // Sign-extend the 24-bit little-endian value into an int.
                int raw = data[offset] | (data[offset + 1] << 8) | (data[offset + 2] << 16);
                if ((raw & 0x800000) != 0)
                {
                    raw |= unchecked((int)0xFF000000);
                }

                return raw / 8388608.0;
            }
            case 32:
            {
                // Shifting the top byte by 24 lands its high bit in the int's sign bit.
                int sample = data[offset]
                    | (data[offset + 1] << 8)
                    | (data[offset + 2] << 16)
                    | (data[offset + 3] << 24);
                return sample / 2147483648.0;
            }
            default:
                throw new ArgumentOutOfRangeException(
                    nameof(bitsPerSample), bitsPerSample, "Unsupported PCM bit depth.");
        }
    }
}
/// <summary>
/// Streaming, single-pass RMS accumulator that backs <see cref="RmsLoudnessAlgorithm"/>. PCM bytes
/// are supplied through <see cref="Add"/> in chunks of any size; when a frame is split across two
/// chunks its leading bytes are parked in a one-frame carry buffer until the rest arrives. Frames
/// are decoded, assigned to a bucket and accumulated strictly in arrival order, so the result does
/// not depend on how the input was chunked. <see cref="Finish"/> converts the sums to RMS, smooths
/// the envelope and peak-normalizes. Storage is two arrays of <c>bucketCount</c> entries plus one frame.
/// </summary>
public sealed class RmsLoudnessAccumulator : ILoudnessAccumulator
{
    private readonly int _channels;
    private readonly int _sampleRate;
    private readonly int _bitsPerSample;
    private readonly int _bucketCount;
    private readonly int _bytesPerSample;
    private readonly int _bytesPerFrame;
    private readonly long _frameCount;
    private readonly double[] _sumSquares;
    private readonly long[] _counts;
    private readonly byte[] _carry;
    private int _carryLen;
    private long _frameIndex;

    /// <summary>
    /// Sizes the accumulator for <paramref name="pcmByteLength"/> bytes of PCM. A non-positive
    /// channel count or a bit depth below one byte marks the input as degenerate: zero frames are
    /// expected, <see cref="Add"/> ignores everything and <see cref="Finish"/> yields all zeros.
    /// </summary>
    internal RmsLoudnessAccumulator(long pcmByteLength, int channels, int sampleRate, int bitsPerSample, int bucketCount)
    {
        _channels = channels;
        _sampleRate = sampleRate;
        _bitsPerSample = bitsPerSample;
        _bucketCount = bucketCount;
        _sumSquares = new double[bucketCount];
        _counts = new long[bucketCount];
        _bytesPerSample = bitsPerSample / 8;

        if (channels > 0 && _bytesPerSample > 0)
        {
            _bytesPerFrame = _bytesPerSample * channels;
            // Integer division: a trailing partial frame is not counted.
            _frameCount = pcmByteLength / _bytesPerFrame;
            _carry = new byte[_bytesPerFrame];
        }
        else
        {
            _bytesPerFrame = 0;
            _frameCount = 0;
            _carry = [];
        }
    }

    /// <summary>
    /// Consumes the next chunk of PCM bytes. Processing stops once the expected number of frames has
    /// been seen; bytes beyond that point are ignored.
    /// </summary>
    /// <param name="pcmChunk">The next contiguous run of PCM bytes; need not be frame-aligned.</param>
    public void Add(ReadOnlySpan<byte> pcmChunk)
    {
        // Nothing to do for degenerate input or once all expected frames are in.
        if (_frameIndex >= _frameCount)
        {
            return;
        }

        var cursor = 0;

        // Step 1: top up a frame left unfinished by the previous chunk.
        if (_carryLen > 0)
        {
            var missing = _bytesPerFrame - _carryLen;
            var copied = Math.Min(missing, pcmChunk.Length);
            pcmChunk[..copied].CopyTo(_carry.AsSpan(_carryLen));
            _carryLen += copied;
            cursor = copied;

            if (_carryLen < _bytesPerFrame)
            {
                // The chunk was too short to complete the frame; wait for more bytes.
                return;
            }

            ProcessFrame(_carry);
            _carryLen = 0;

            if (_frameIndex >= _frameCount)
            {
                return;
            }
        }

        // Step 2: consume every complete frame directly out of the chunk.
        for (; _frameIndex < _frameCount && cursor + _bytesPerFrame <= pcmChunk.Length; cursor += _bytesPerFrame)
        {
            ProcessFrame(pcmChunk.Slice(cursor, _bytesPerFrame));
        }

        // Step 3: park leftover bytes for the next call, but only if more frames are still due.
        // Leftover bytes after the last expected frame are discarded.
        if (_frameIndex >= _frameCount || cursor >= pcmChunk.Length)
        {
            return;
        }

        var tail = pcmChunk[cursor..];
        tail.CopyTo(_carry);
        _carryLen = tail.Length;
    }

    /// <summary>
    /// Decodes one frame, averages its channels to mono and adds the squared value to the bucket
    /// that the current frame index maps to.
    /// </summary>
    private void ProcessFrame(ReadOnlySpan<byte> frame)
    {
        var total = 0.0;
        var offset = 0;
        for (var c = 0; c < _channels; c++, offset += _bytesPerSample)
        {
            total += RmsLoudnessAlgorithm.ReadSampleNormalized(frame, offset, _bitsPerSample);
        }

        var mono = total / _channels;

        // _frameIndex is a long, so the product is evaluated in 64-bit before the divide;
        // the result is clamped to the last bucket.
        var slot = Math.Min((int)(_frameIndex * _bucketCount / _frameCount), _bucketCount - 1);

        _sumSquares[slot] += mono * mono;
        _counts[slot] += 1;
        _frameIndex += 1;
    }

    /// <summary>
    /// Produces the loudness profile: per-bucket RMS, smoothed by
    /// <see cref="RmsLoudnessAlgorithm.SmoothEnvelope"/>, then scaled so the largest value is 1.
    /// </summary>
    /// <returns>
    /// An array with one value per bucket; all zeros when no frames were expected or the peak is not positive.
    /// </returns>
    public double[] Finish()
    {
        var profile = new double[_bucketCount];
        if (_frameCount == 0)
        {
            // Degenerate input: hand back the zero-initialized profile.
            return profile;
        }

        // Buckets that received no frames stay at zero.
        for (var b = 0; b < profile.Length; b++)
        {
            var n = _counts[b];
            if (n > 0)
            {
                profile[b] = Math.Sqrt(_sumSquares[b] / n);
            }
        }

        // Bucket duration = total duration / bucket count; it drives the smoothing coefficient.
        var secondsPerBucket = (double)_frameCount / _sampleRate / _bucketCount;
        RmsLoudnessAlgorithm.SmoothEnvelope(profile, secondsPerBucket);

        var loudest = 0.0;
        foreach (var level in profile)
        {
            if (level > loudest)
            {
                loudest = level;
            }
        }

        if (loudest > 0)
        {
            for (var b = 0; b < profile.Length; b++)
            {
                profile[b] /= loudest;
            }
        }
        else
        {
            // No positive peak to normalize against: report zeros.
            Array.Clear(profile);
        }

        return profile;
    }
}