feature: Phase 23 Track A — env-gated /robots.txt + /sitemap.xml public crawl endpoints

This commit is contained in:
daniel-c-harvey
2026-06-23 07:23:42 -04:00
parent 9a4b79d377
commit 5f4807cc4a
6 changed files with 432 additions and 0 deletions
@@ -0,0 +1,111 @@
using System.Net.Http.Json;
using System.Text.Json;
using DeepDrftModels.DTOs;
using Models.Common;
using DeepDrftPublic.Client.Common;
using DeepDrftPublic.Seo;
using Microsoft.AspNetCore.Mvc;
namespace DeepDrftPublic.Controllers;
/// <summary>
/// Serves the public crawl-directive surfaces (Phase 23): <c>GET /robots.txt</c> and
/// <c>GET /sitemap.xml</c>. Both are environment-gated server-side via
/// <see cref="IWebHostEnvironment.IsProduction"/> read directly here — not the WASM-only
/// <c>SeoEnvironment</c> bridge — and fail safe closed (non-production is uncrawlable, Invariant E1).
///
/// <para>
/// This is a thin host boundary: it owns the gate and the release walk, and delegates all body composition
/// to the pure <see cref="RobotsTxt"/> / <see cref="SitemapXml"/> builders. The sitemap walk reuses the
/// existing <c>"DeepDrft.API"</c> named client server-to-server (the same client SSR prerender uses) — it
/// <b>enumerates and transforms</b> releases into XML rather than relaying verbatim like the proxy controllers.
/// No new API endpoint, no schema change (Phase 22 C5 holds).
/// </para>
/// </summary>
[ApiController]
public class CrawlDirectiveController : ControllerBase
{
    // A generous page size keeps the walk to a handful of round-trips even for a large catalogue.
    private const int WalkPageSize = 200;

    // The release walk deserializes a bare PagedResult<ReleaseDto> (no ApiResultDto envelope), matching TrackClient.
    private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web);

    private readonly IWebHostEnvironment _environment;
    private readonly SeoOptions _seoOptions;
    private readonly HttpClient _upstream;
    private readonly ILogger<CrawlDirectiveController> _logger;

    /// <summary>
    /// Captures the host environment (the gate), the SEO options (the canonical base URL), the
    /// <c>"DeepDrft.API"</c> named client used for the release walk, and a logger.
    /// </summary>
    public CrawlDirectiveController(
        IWebHostEnvironment environment,
        SeoOptions seoOptions,
        IHttpClientFactory httpClientFactory,
        ILogger<CrawlDirectiveController> logger)
    {
        _environment = environment;
        _seoOptions = seoOptions;
        _upstream = httpClientFactory.CreateClient("DeepDrft.API");
        _logger = logger;
    }

    /// <summary>
    /// <c>GET /robots.txt</c>. Production: allow + FramePlayer/api disallows + sitemap pointer. Any
    /// non-production environment: <c>Disallow: /</c> with no sitemap pointer (E1). Always <c>text/plain</c>.
    /// </summary>
    /// <returns>The robots body composed by <see cref="RobotsTxt.Build"/>.</returns>
    [HttpGet("/robots.txt")]
    public ContentResult GetRobots()
    {
        var body = RobotsTxt.Build(_environment.IsProduction(), _seoOptions.BaseUrl);
        return Content(body, "text/plain");
    }

    /// <summary>
    /// <c>GET /sitemap.xml</c>. Non-production: 404 (the non-prod robots carries no sitemap pointer, so
    /// nothing references it). Production: the static roots plus one entry per release. Resilient — a
    /// partial/empty/failed release read yields a well-formed (possibly roots-only) document, never a 500.
    /// </summary>
    /// <param name="ct">Request cancellation token, flowed into every upstream read.</param>
    /// <returns>404 outside Production; otherwise the <c>application/xml</c> sitemap document.</returns>
    [HttpGet("/sitemap.xml")]
    public async Task<ActionResult> GetSitemap(CancellationToken ct = default)
    {
        if (!_environment.IsProduction())
        {
            return NotFound();
        }

        var releases = await GatherReleasesAsync(ct);
        var xml = SitemapXml.Build(_seoOptions.BaseUrl, releases);
        return Content(xml, "application/xml");
    }

    /// <summary>
    /// Walks <c>GET api/release</c> page by page until every release is read. On any upstream failure it
    /// returns the releases gathered so far (possibly none) so the sitemap degrades to a well-formed
    /// roots-only document rather than 500ing — a sitemap that errors trains crawlers to stop fetching it
    /// (AC-S5).
    /// </summary>
    /// <param name="ct">Cancellation token passed to each page request.</param>
    /// <returns>Every release read before the walk completed, failed, or was cancelled.</returns>
    private async Task<IReadOnlyList<ReleaseDto>> GatherReleasesAsync(CancellationToken ct)
    {
        var gathered = new List<ReleaseDto>();
        var page = 1;

        try
        {
            while (true)
            {
                var result = await _upstream.GetFromJsonAsync<PagedResult<ReleaseDto>>(
                    $"api/release?page={page}&pageSize={WalkPageSize}", JsonOptions, ct);

                // A missing body or missing item list ends the walk with whatever was read so far.
                if (result?.Items is null)
                {
                    break;
                }

                gathered.AddRange(result.Items);

                // Stop once the reported total is reached, or on an empty page so a misreported
                // total cannot keep the loop running.
                if (gathered.Count >= result.TotalCount || !result.Items.Any())
                {
                    break;
                }

                page++;
            }
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // The caller's own token fired (e.g. the request was aborted). That is not an upstream fault,
            // so it is not logged as an error. A cancellation that did not come from this token (such as
            // an HttpClient timeout) falls through to the general handler below.
            _logger.LogDebug("Sitemap release walk cancelled by the caller after gathering {Count} release(s)", gathered.Count);
        }
        catch (Exception ex)
        {
            // Deliberate best-effort: swallow and log so the endpoint still serves a partial sitemap.
            _logger.LogError(ex, "Sitemap release walk failed after gathering {Count} release(s); serving a partial sitemap", gathered.Count);
        }

        return gathered;
    }
}
+34
View File
@@ -0,0 +1,34 @@
namespace DeepDrftPublic.Seo;
/// <summary>
/// Composes the <c>robots.txt</c> body as a pure string function (Phase 23 wave 23.1). The caller decides
/// the environment: the endpoint evaluates <c>IsProduction()</c> server-side and hands the resulting
/// boolean in, which keeps the production-vs-beta branch in a single, testable spot. The default is a
/// closed door — whatever is not Production gets <c>Disallow: /</c> (Invariant E1).
/// </summary>
public static class RobotsTxt
{
    /// <summary>
    /// Produces the directive text, one directive per <c>\n</c>-terminated line. Production allows the
    /// whole site except the embed shell and the proxy API paths and advertises the sitemap via a
    /// <c>Sitemap:</c> line (OQ-R2). Every other environment gets only <c>Disallow: /</c> and no sitemap
    /// line, so a crawler hitting beta sees nothing and is never told where the sitemap lives.
    /// </summary>
    /// <param name="isProduction">The server-side <c>IsProduction()</c> result — the single gate.</param>
    /// <param name="baseUrl">Canonical origin for the <c>Sitemap:</c> line; trailing slashes are trimmed. Read in Production only.</param>
    /// <returns>The complete robots.txt body, ending with a newline.</returns>
    public static string Build(bool isProduction, string baseUrl)
    {
        const string anyAgent = "User-agent: *";

        // baseUrl is only dereferenced on the Production side of the conditional.
        string[] directives = isProduction
            ? new[]
            {
                anyAgent,
                "Allow: /",
                "Disallow: /FramePlayer",
                "Disallow: /api/",
                $"Sitemap: {baseUrl.TrimEnd('/')}/sitemap.xml",
            }
            : new[]
            {
                anyAgent,
                "Disallow: /",
            };

        // Join with LF and terminate the final line with LF as well.
        return string.Join("\n", directives) + "\n";
    }
}
+68
View File
@@ -0,0 +1,68 @@
using System.Text;
using System.Xml;
using System.Xml.Linq;
using DeepDrftModels.DTOs;
using DeepDrftPublic.Client.Common;
namespace DeepDrftPublic.Seo;
/// <summary>
/// Pure composition of the sitemaps.org <c>urlset</c> document (Phase 23 wave 23.2). Enumerates the fixed
/// indexable roots plus one entry per release, every <c>&lt;loc&gt;</c> absolutized against
/// <see cref="SeoOptions.BaseUrl"/> and per-release paths resolved through
/// <see cref="ReleaseRoutes.DetailHref(string, DeepDrftModels.Enums.ReleaseMedium)"/> — so each sitemap URL
/// equals the page's <c>SeoHead</c> canonical by construction. No fetch, no env logic: the endpoint owns the
/// gate and the release walk; this turns the gathered DTOs into XML and never throws on partial input.
/// </summary>
public static class SitemapXml
{
    private static readonly XNamespace Ns = "http://www.sitemaps.org/schemas/sitemap/0.9";

    /// <summary>
    /// The indexable static roots (OQ-S3). An explicit list, deliberately NOT derived from the nav index:
    /// the indexable set is not the nav set (e.g. <c>/FramePlayer</c> is nav-absent and must stay out, and a
    /// new nav entry is not automatically sitemap-worthy). Revisit here if the indexable-roots set grows.
    /// </summary>
    public static readonly IReadOnlyList<string> StaticRoots = ["/", "/about", "/cuts", "/sessions", "/mixes", "/archive"];

    /// <summary>
    /// Builds the full <c>urlset</c>: the static roots (no <c>lastmod</c>) followed by one <c>&lt;url&gt;</c>
    /// per release. A release carries a <c>&lt;lastmod&gt;</c> sourced from <see cref="ReleaseDto.ReleaseDate"/>
    /// in W3C <c>YYYY-MM-DD</c> form when present (OQ-S2 — the release date, accepted as a plausible crawl hint).
    /// A null/empty release set yields a well-formed roots-only document.
    /// </summary>
    /// <param name="baseUrl">Canonical origin every <c>&lt;loc&gt;</c> is built from; trailing slashes are trimmed.</param>
    /// <param name="releases">The gathered releases; may be null, empty, or partial after an upstream failure.</param>
    /// <returns>The serialized XML document, declared and encoded as UTF-8 (no byte-order mark).</returns>
    public static string Build(string baseUrl, IEnumerable<ReleaseDto>? releases)
    {
        var origin = baseUrl.TrimEnd('/');

        var roots = StaticRoots.Select(path => UrlElement(origin + path, lastmod: null));

        // Null is treated as "no releases" so the documented roots-only fallback actually holds.
        var releaseUrls = (releases ?? Enumerable.Empty<ReleaseDto>()).Select(release => UrlElement(
            origin + ReleaseRoutes.DetailHref(release.EntryKey, release.Medium),
            // Format with the invariant culture: the current culture may use a non-Gregorian calendar,
            // which would emit a different year and break the W3C date format.
            release.ReleaseDate?.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture)));

        var urlset = new XElement(Ns + "urlset", roots.Concat(releaseUrls));
        var document = new XDocument(new XDeclaration("1.0", "UTF-8", null), urlset);

        // Save through a byte-based UTF-8 stream so the XML declaration reads encoding="utf-8". An
        // XmlWriter over a StringBuilder/StringWriter is character-based (UTF-16) and would stamp the
        // declaration utf-16, which is wrong for a body served as application/xml.
        using var stream = new MemoryStream();
        var settings = new XmlWriterSettings
        {
            Encoding = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false),
            Indent = true,
        };

        // The writer must be disposed (flushed) before the stream is read back.
        using (var xmlWriter = XmlWriter.Create(stream, settings))
        {
            document.Save(xmlWriter);
        }

        return Encoding.UTF8.GetString(stream.ToArray());
    }

    /// <summary>
    /// Creates one <c>&lt;url&gt;</c> element holding a <c>&lt;loc&gt;</c> and, when
    /// <paramref name="lastmod"/> is non-null, a <c>&lt;lastmod&gt;</c>.
    /// </summary>
    private static XElement UrlElement(string loc, string? lastmod)
    {
        var element = new XElement(Ns + "url", new XElement(Ns + "loc", loc));
        if (lastmod is not null)
        {
            element.Add(new XElement(Ns + "lastmod", lastmod));
        }

        return element;
    }
}
+3
View File
@@ -40,6 +40,9 @@
The queue is pure domain logic, unit-testable against a fake IStreamingPlayerService The queue is pure domain logic, unit-testable against a fake IStreamingPlayerService
with no browser/JS. --> with no browser/JS. -->
<ProjectReference Include="..\DeepDrftPublic.Client\DeepDrftPublic.Client.csproj" /> <ProjectReference Include="..\DeepDrftPublic.Client\DeepDrftPublic.Client.csproj" />
<!-- Referenced for the Phase 23 crawl-directive builders (RobotsTxt / SitemapXml) — pure
string/XML composition over the env flag and release DTOs, unit-testable without HTTP. -->
<ProjectReference Include="..\DeepDrftPublic\DeepDrftPublic.csproj" />
</ItemGroup> </ItemGroup>
</Project> </Project>
+62
View File
@@ -0,0 +1,62 @@
using DeepDrftPublic.Seo;
namespace DeepDrftTests;
/// <summary>
/// Exercises <see cref="RobotsTxt"/>, the pure builder behind the robots.txt body (Phase 23 wave 23.1).
/// What matters is the environment branch: a Production body allows crawling, advertises the sitemap and
/// disallows the non-page routes, while any other environment produces a closed door without a sitemap
/// pointer (Invariant E1).
/// </summary>
[TestFixture]
public class RobotsTxtTests
{
    private const string BaseUrl = "https://deepdrft.com";

    // Builds the Production body against the given origin (the shared BaseUrl by default).
    private static string ProductionBody(string baseUrl = BaseUrl) => RobotsTxt.Build(isProduction: true, baseUrl);

    // Builds the body any non-production environment receives.
    private static string NonProductionBody() => RobotsTxt.Build(isProduction: false, BaseUrl);

    [Test]
    public void Build_Production_AllowsAndPointsAtSitemap()
    {
        var robots = ProductionBody();

        Assert.Multiple(() =>
        {
            Assert.That(robots, Does.Contain("User-agent: *"));
            Assert.That(robots, Does.Contain("Allow: /"));
            Assert.That(robots, Does.Contain("Sitemap: https://deepdrft.com/sitemap.xml"));
        });
    }

    [Test]
    public void Build_Production_DisallowsFramePlayerAndApi()
    {
        var robots = ProductionBody();

        Assert.Multiple(() =>
        {
            Assert.That(robots, Does.Contain("Disallow: /FramePlayer"));
            Assert.That(robots, Does.Contain("Disallow: /api/"));
        });
    }

    [Test]
    public void Build_NonProduction_DisallowsEverythingWithNoSitemapPointer()
    {
        var robots = NonProductionBody();

        Assert.Multiple(() =>
        {
            Assert.That(robots, Does.Contain("User-agent: *"));
            Assert.That(robots, Does.Contain("Disallow: /"));
            // Case-sensitive on purpose: "Disallow:" must not count as an "Allow:" directive.
            Assert.That(robots, Does.Not.Contain("Allow:"));
            Assert.That(robots, Does.Not.Contain("Sitemap:"));
        });
    }

    [Test]
    public void Build_Production_TrimsTrailingSlashOnBaseUrl()
    {
        var robots = ProductionBody("https://deepdrft.com/");

        Assert.That(robots, Does.Contain("Sitemap: https://deepdrft.com/sitemap.xml"));
    }
}
+154
View File
@@ -0,0 +1,154 @@
using System.Xml.Linq;
using DeepDrftModels.DTOs;
using DeepDrftModels.Enums;
using DeepDrftPublic.Client.Common;
using DeepDrftPublic.Seo;
namespace DeepDrftTests;
/// <summary>
/// Unit tests for <see cref="SitemapXml"/> — the pure sitemaps.org urlset composition (Phase 23 wave 23.2).
/// The document is parsed back to an <see cref="XDocument"/> so each assertion checks real structure, not a
/// substring: that every <c>&lt;loc&gt;</c> is absolute and built through <see cref="ReleaseRoutes"/> (so it
/// equals the page canonical), that <c>&lt;lastmod&gt;</c> tracks the release date, that the static roots are
/// present and FramePlayer is absent, and that empty input still yields a well-formed roots-only document.
/// </summary>
[TestFixture]
public class SitemapXmlTests
{
    private const string BaseUrl = "https://deepdrft.com";
    private static readonly XNamespace Ns = "http://www.sitemaps.org/schemas/sitemap/0.9";

    // Minimal release fixture: only the fields the sitemap reads (key, medium, date) vary per test.
    private static ReleaseDto Release(string entryKey, ReleaseMedium medium, DateOnly? releaseDate = null) => new()
    {
        EntryKey = entryKey,
        Title = "Title",
        Artist = "Artist",
        Medium = medium,
        ReleaseDate = releaseDate,
    };

    // Parses the document and returns every <loc> value in document order.
    private static List<string> Locs(string xml)
    {
        var doc = XDocument.Parse(xml);
        return doc.Root!.Elements(Ns + "url")
            .Select(u => u.Element(Ns + "loc")!.Value)
            .ToList();
    }

    [Test]
    public void Build_EmptyReleases_YieldsWellFormedRootsOnlyDocument()
    {
        var xml = SitemapXml.Build(BaseUrl, []);

        var locs = Locs(xml);

        Assert.Multiple(() =>
        {
            Assert.That(locs, Has.Count.EqualTo(SitemapXml.StaticRoots.Count));
            Assert.That(locs, Does.Contain("https://deepdrft.com/"));
            Assert.That(locs, Does.Contain("https://deepdrft.com/about"));
            Assert.That(locs, Does.Contain("https://deepdrft.com/cuts"));
            Assert.That(locs, Does.Contain("https://deepdrft.com/sessions"));
            Assert.That(locs, Does.Contain("https://deepdrft.com/mixes"));
            Assert.That(locs, Does.Contain("https://deepdrft.com/archive"));
        });
    }

    [Test]
    public void Build_IsWellFormedUrlsetWithSitemapsOrgNamespace()
    {
        var xml = SitemapXml.Build(BaseUrl, []);

        var doc = XDocument.Parse(xml);

        Assert.Multiple(() =>
        {
            Assert.That(doc.Root!.Name, Is.EqualTo(Ns + "urlset"));
            Assert.That(xml, Does.Contain("utf-8").IgnoreCase);
        });
    }

    [Test]
    public void Build_FramePlayerIsNeverAStaticRoot()
    {
        var xml = SitemapXml.Build(BaseUrl, []);

        Assert.That(Locs(xml), Has.None.Contains("FramePlayer"));
    }

    [TestCase(ReleaseMedium.Cut, "https://deepdrft.com/cuts/key-1")]
    [TestCase(ReleaseMedium.Session, "https://deepdrft.com/sessions/key-1")]
    [TestCase(ReleaseMedium.Mix, "https://deepdrft.com/mixes/key-1")]
    public void Build_ReleaseLoc_IsAbsoluteAndResolvedThroughReleaseRoutes(ReleaseMedium medium, string expectedLoc)
    {
        var xml = SitemapXml.Build(BaseUrl, [Release("key-1", medium)]);

        // The loc must equal BaseUrl + ReleaseRoutes.DetailHref — i.e. the page's SeoHead canonical, by construction.
        var expected = BaseUrl + ReleaseRoutes.DetailHref("key-1", medium);

        Assert.Multiple(() =>
        {
            Assert.That(expected, Is.EqualTo(expectedLoc));
            Assert.That(Locs(xml), Does.Contain(expectedLoc));
        });
    }

    [Test]
    public void Build_AllReleasesEnumerated_AppendedAfterStaticRoots()
    {
        var releases = new[]
        {
            Release("a", ReleaseMedium.Cut),
            Release("b", ReleaseMedium.Mix),
            Release("c", ReleaseMedium.Session),
        };

        var xml = SitemapXml.Build(BaseUrl, releases);

        var locs = Locs(xml);
        var rootCount = SitemapXml.StaticRoots.Count;
        var expectedRootLocs = SitemapXml.StaticRoots.Select(root => BaseUrl + root).ToList();
        var expectedReleaseLocs = releases
            .Select(release => BaseUrl + ReleaseRoutes.DetailHref(release.EntryKey, release.Medium))
            .ToList();

        Assert.Multiple(() =>
        {
            Assert.That(locs, Has.Count.EqualTo(rootCount + releases.Length));
            // Ordering is part of the contract this test names: roots first, then releases in input order.
            Assert.That(locs.Take(rootCount).ToList(), Is.EqualTo(expectedRootLocs));
            Assert.That(locs.Skip(rootCount).ToList(), Is.EqualTo(expectedReleaseLocs));
        });
    }

    [Test]
    public void Build_ReleaseWithDate_EmitsW3CLastmod()
    {
        var xml = SitemapXml.Build(BaseUrl, [Release("key-1", ReleaseMedium.Cut, new DateOnly(2026, 5, 12))]);

        var doc = XDocument.Parse(xml);
        var releaseUrl = doc.Root!.Elements(Ns + "url")
            .Single(u => u.Element(Ns + "loc")!.Value.EndsWith("/cuts/key-1"));

        Assert.That(releaseUrl.Element(Ns + "lastmod")!.Value, Is.EqualTo("2026-05-12"));
    }

    [Test]
    public void Build_ReleaseWithoutDate_OmitsLastmod()
    {
        var xml = SitemapXml.Build(BaseUrl, [Release("key-1", ReleaseMedium.Cut)]);

        var doc = XDocument.Parse(xml);
        var releaseUrl = doc.Root!.Elements(Ns + "url")
            .Single(u => u.Element(Ns + "loc")!.Value.EndsWith("/cuts/key-1"));

        Assert.That(releaseUrl.Element(Ns + "lastmod"), Is.Null);
    }

    [Test]
    public void Build_StaticRoots_NeverCarryLastmod()
    {
        var xml = SitemapXml.Build(BaseUrl, []);

        var doc = XDocument.Parse(xml);

        Assert.That(doc.Root!.Elements(Ns + "url").All(u => u.Element(Ns + "lastmod") is null), Is.True);
    }

    [Test]
    public void Build_TrimsTrailingSlashOnBaseUrl()
    {
        var xml = SitemapXml.Build("https://deepdrft.com/", [Release("key-1", ReleaseMedium.Cut)]);

        var locs = Locs(xml);

        Assert.Multiple(() =>
        {
            // No doubled slash on the root or the release URL.
            Assert.That(locs, Does.Contain("https://deepdrft.com/"));
            Assert.That(locs, Does.Contain("https://deepdrft.com/cuts/key-1"));
            Assert.That(locs, Has.None.Contains("deepdrft.com//"));
        });
    }
}