From 5f4807cc4a44386beb9c06d0d016f9b38213a4f4 Mon Sep 17 00:00:00 2001 From: daniel-c-harvey Date: Tue, 23 Jun 2026 07:23:42 -0400 Subject: [PATCH 1/2] =?UTF-8?q?feature:=20Phase=2023=20Track=20A=20?= =?UTF-8?q?=E2=80=94=20env-gated=20/robots.txt=20+=20/sitemap.xml=20public?= =?UTF-8?q?=20crawl=20endpoints?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Controllers/CrawlDirectiveController.cs | 111 +++++++++++++ DeepDrftPublic/Seo/RobotsTxt.cs | 34 ++++ DeepDrftPublic/Seo/SitemapXml.cs | 68 ++++++++ DeepDrftTests/DeepDrftTests.csproj | 3 + DeepDrftTests/RobotsTxtTests.cs | 62 +++++++ DeepDrftTests/SitemapXmlTests.cs | 154 ++++++++++++++++++ 6 files changed, 432 insertions(+) create mode 100644 DeepDrftPublic/Controllers/CrawlDirectiveController.cs create mode 100644 DeepDrftPublic/Seo/RobotsTxt.cs create mode 100644 DeepDrftPublic/Seo/SitemapXml.cs create mode 100644 DeepDrftTests/RobotsTxtTests.cs create mode 100644 DeepDrftTests/SitemapXmlTests.cs diff --git a/DeepDrftPublic/Controllers/CrawlDirectiveController.cs b/DeepDrftPublic/Controllers/CrawlDirectiveController.cs new file mode 100644 index 0000000..40403de --- /dev/null +++ b/DeepDrftPublic/Controllers/CrawlDirectiveController.cs @@ -0,0 +1,111 @@ +using System.Net.Http.Json; +using System.Text.Json; +using DeepDrftModels.DTOs; +using Models.Common; +using DeepDrftPublic.Client.Common; +using DeepDrftPublic.Seo; +using Microsoft.AspNetCore.Mvc; + +namespace DeepDrftPublic.Controllers; + +/// +/// Serves the public crawl-directive surfaces (Phase 23): GET /robots.txt and +/// GET /sitemap.xml. Both are environment-gated server-side via +/// read directly here — not the WASM-only +/// SeoEnvironment bridge — and fail safe closed (non-production is uncrawlable, Invariant E1). +/// +/// +/// This is a thin host boundary: it owns the gate and the release walk, and delegates all body composition +/// to the pure / builders. The sitemap walk reuses the +/// existing "DeepDrft.API" named client server-to-server (the same client SSR prerender uses) — it +/// enumerates and transforms releases into XML rather than relaying verbatim like the proxy controllers. +/// No new API endpoint, no schema change (Phase 22 C5 holds). +/// +/// +[ApiController] +public class CrawlDirectiveController : ControllerBase +{ + // A generous page size keeps the walk to a handful of round-trips even for a large catalogue. + private const int WalkPageSize = 200; + + // The release walk deserializes a bare PagedResult (no ApiResultDto envelope), matching TrackClient. + private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web); + + private readonly IWebHostEnvironment _environment; + private readonly SeoOptions _seoOptions; + private readonly HttpClient _upstream; + private readonly ILogger _logger; + + public CrawlDirectiveController( + IWebHostEnvironment environment, + SeoOptions seoOptions, + IHttpClientFactory httpClientFactory, + ILogger logger) + { + _environment = environment; + _seoOptions = seoOptions; + _upstream = httpClientFactory.CreateClient("DeepDrft.API"); + _logger = logger; + } + + /// + /// GET /robots.txt. Production: allow + FramePlayer/api disallows + sitemap pointer. Any + /// non-production environment: Disallow: / with no sitemap pointer (E1). Always text/plain. + /// + [HttpGet("/robots.txt")] + public ContentResult GetRobots() + { + var body = RobotsTxt.Build(_environment.IsProduction(), _seoOptions.BaseUrl); + return Content(body, "text/plain"); + } + + /// + /// GET /sitemap.xml. Non-production: 404 (the non-prod robots carries no sitemap pointer, so + /// nothing references it). Production: the static roots plus one entry per release. Resilient — a + /// partial/empty/failed release read yields a well-formed (possibly roots-only) document, never a 500. + /// + [HttpGet("/sitemap.xml")] + public async Task GetSitemap(CancellationToken ct = default) + { + if (!_environment.IsProduction()) + return NotFound(); + + var releases = await GatherReleasesAsync(ct); + var xml = SitemapXml.Build(_seoOptions.BaseUrl, releases); + return Content(xml, "application/xml"); + } + + // Walks GET api/release page by page until every release is read. On any upstream failure, returns the + // releases gathered so far (possibly none) so the sitemap degrades to a well-formed roots-only document + // rather than 500ing — a sitemap that errors trains crawlers to stop fetching it (AC-S5). + private async Task> GatherReleasesAsync(CancellationToken ct) + { + var gathered = new List(); + var page = 1; + + try + { + while (true) + { + var result = await _upstream.GetFromJsonAsync>( + $"api/release?page={page}&pageSize={WalkPageSize}", JsonOptions, ct); + + if (result?.Items is null) + break; + + gathered.AddRange(result.Items); + + if (gathered.Count >= result.TotalCount || !result.Items.Any()) + break; + + page++; + } + } + catch (Exception ex) + { + _logger.LogError(ex, "Sitemap release walk failed after gathering {Count} release(s); serving a partial sitemap", gathered.Count); + } + + return gathered; + } +} diff --git a/DeepDrftPublic/Seo/RobotsTxt.cs b/DeepDrftPublic/Seo/RobotsTxt.cs new file mode 100644 index 0000000..a8017ad --- /dev/null +++ b/DeepDrftPublic/Seo/RobotsTxt.cs @@ -0,0 +1,34 @@ +namespace DeepDrftPublic.Seo; + +/// +/// Pure composition of the robots.txt body (Phase 23 wave 23.1). The environment gate is the +/// caller's: the endpoint reads +/// server-side and passes the boolean here, so the production-vs-beta branch lives in one testable place. +/// Fail-safe is closed — anything that is not Production yields Disallow: / (Invariant E1). +/// +public static class RobotsTxt +{ + /// + /// Builds the directive body. In Production: allow everything except the embed shell and the proxy API + /// paths, plus a Sitemap: pointer (OQ-R2). In any non-production environment: a closed door + /// (Disallow: /) with no sitemap pointer, so a crawl of beta sees nothing and the sitemap is + /// never advertised. + /// + /// The server-side IsProduction() result — the single gate. + /// Canonical origin (no trailing slash) for the Sitemap: line; Production only. + public static string Build(bool isProduction, string baseUrl) + { + if (!isProduction) + { + return "User-agent: *\n" + + "Disallow: /\n"; + } + + var origin = baseUrl.TrimEnd('/'); + return "User-agent: *\n" + + "Allow: /\n" + + "Disallow: /FramePlayer\n" + + "Disallow: /api/\n" + + $"Sitemap: {origin}/sitemap.xml\n"; + } +} diff --git a/DeepDrftPublic/Seo/SitemapXml.cs b/DeepDrftPublic/Seo/SitemapXml.cs new file mode 100644 index 0000000..15552b2 --- /dev/null +++ b/DeepDrftPublic/Seo/SitemapXml.cs @@ -0,0 +1,68 @@ +using System.Text; +using System.Xml; +using System.Xml.Linq; +using DeepDrftModels.DTOs; +using DeepDrftPublic.Client.Common; + +namespace DeepDrftPublic.Seo; + +/// +/// Pure composition of the sitemaps.org urlset document (Phase 23 wave 23.2). Enumerates the fixed +/// indexable roots plus one entry per release, every <loc> absolutized against +/// and per-release paths resolved through +/// — so each sitemap URL +/// equals the page's SeoHead canonical by construction. No fetch, no env logic: the endpoint owns the +/// gate and the release walk; this turns the gathered DTOs into XML and never throws on partial input. +/// +public static class SitemapXml +{ + private static readonly XNamespace Ns = "http://www.sitemaps.org/schemas/sitemap/0.9"; + + /// + /// The indexable static roots (OQ-S3). An explicit list, deliberately NOT derived from the nav index: + /// the indexable set is not the nav set (e.g. /FramePlayer is nav-absent and must stay out, and a + /// new nav entry is not automatically sitemap-worthy). Revisit here if the indexable-roots set grows. + /// + public static readonly IReadOnlyList StaticRoots = ["/", "/about", "/cuts", "/sessions", "/mixes", "/archive"]; + + /// + /// Builds the full urlset: the static roots (no lastmod) followed by one <url> + /// per release. A release carries a <lastmod> sourced from + /// in W3C YYYY-MM-DD form when present (OQ-S2 — the release date, accepted as a plausible crawl hint). + /// A null/empty release set yields a well-formed roots-only document. + /// + /// Canonical origin (no trailing slash) every <loc> is built from. + /// The gathered releases; may be empty or partial after an upstream failure. + public static string Build(string baseUrl, IEnumerable releases) + { + var origin = baseUrl.TrimEnd('/'); + + var roots = StaticRoots.Select(path => UrlElement(origin + path, lastmod: null)); + var releaseUrls = releases.Select(release => UrlElement( + origin + ReleaseRoutes.DetailHref(release.EntryKey, release.Medium), + release.ReleaseDate?.ToString("yyyy-MM-dd"))); + + var urlset = new XElement(Ns + "urlset", roots.Concat(releaseUrls)); + var document = new XDocument(new XDeclaration("1.0", "UTF-8", null), urlset); + + // Save through a byte-based UTF-8 stream so the XML declaration reads encoding="utf-8". An + // XmlWriter over a StringBuilder/StringWriter is character-based (UTF-16) and would stamp the + // declaration utf-16, which is wrong for a body served as application/xml. + using var stream = new MemoryStream(); + var settings = new XmlWriterSettings { Encoding = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false), Indent = true }; + using (var xmlWriter = XmlWriter.Create(stream, settings)) + { + document.Save(xmlWriter); + } + + return Encoding.UTF8.GetString(stream.ToArray()); + } + + private static XElement UrlElement(string loc, string? lastmod) + { + var element = new XElement(Ns + "url", new XElement(Ns + "loc", loc)); + if (lastmod is not null) + element.Add(new XElement(Ns + "lastmod", lastmod)); + return element; + } +} diff --git a/DeepDrftTests/DeepDrftTests.csproj b/DeepDrftTests/DeepDrftTests.csproj index b160638..7d55281 100644 --- a/DeepDrftTests/DeepDrftTests.csproj +++ b/DeepDrftTests/DeepDrftTests.csproj @@ -40,6 +40,9 @@ The queue is pure domain logic, unit-testable against a fake IStreamingPlayerService with no browser/JS. --> + + diff --git a/DeepDrftTests/RobotsTxtTests.cs b/DeepDrftTests/RobotsTxtTests.cs new file mode 100644 index 0000000..62a893a --- /dev/null +++ b/DeepDrftTests/RobotsTxtTests.cs @@ -0,0 +1,62 @@ +using DeepDrftPublic.Seo; + +namespace DeepDrftTests; + +/// +/// Unit tests for — the pure environment-branch composition of the robots.txt body +/// (Phase 23 wave 23.1). The gate (Production vs. anything-else) is the load-bearing branch: Production +/// allows + points at the sitemap and disallows the non-page routes; every non-production environment is a +/// closed door with no sitemap pointer (Invariant E1). +/// +[TestFixture] +public class RobotsTxtTests +{ + private const string BaseUrl = "https://deepdrft.com"; + + [Test] + public void Build_Production_AllowsAndPointsAtSitemap() + { + var body = RobotsTxt.Build(isProduction: true, BaseUrl); + + Assert.Multiple(() => + { + Assert.That(body, Does.Contain("User-agent: *")); + Assert.That(body, Does.Contain("Allow: /")); + Assert.That(body, Does.Contain("Sitemap: https://deepdrft.com/sitemap.xml")); + }); + } + + [Test] + public void Build_Production_DisallowsFramePlayerAndApi() + { + var body = RobotsTxt.Build(isProduction: true, BaseUrl); + + Assert.Multiple(() => + { + Assert.That(body, Does.Contain("Disallow: /FramePlayer")); + Assert.That(body, Does.Contain("Disallow: /api/")); + }); + } + + [Test] + public void Build_NonProduction_DisallowsEverythingWithNoSitemapPointer() + { + var body = RobotsTxt.Build(isProduction: false, BaseUrl); + + Assert.Multiple(() => + { + Assert.That(body, Does.Contain("User-agent: *")); + Assert.That(body, Does.Contain("Disallow: /")); + Assert.That(body, Does.Not.Contain("Allow:")); + Assert.That(body, Does.Not.Contain("Sitemap:")); + }); + } + + [Test] + public void Build_Production_TrimsTrailingSlashOnBaseUrl() + { + var body = RobotsTxt.Build(isProduction: true, "https://deepdrft.com/"); + + Assert.That(body, Does.Contain("Sitemap: https://deepdrft.com/sitemap.xml")); + } +} diff --git a/DeepDrftTests/SitemapXmlTests.cs b/DeepDrftTests/SitemapXmlTests.cs new file mode 100644 index 0000000..4020f79 --- /dev/null +++ b/DeepDrftTests/SitemapXmlTests.cs @@ -0,0 +1,154 @@ +using System.Xml.Linq; +using DeepDrftModels.DTOs; +using DeepDrftModels.Enums; +using DeepDrftPublic.Client.Common; +using DeepDrftPublic.Seo; + +namespace DeepDrftTests; + +/// +/// Unit tests for — the pure sitemaps.org urlset composition (Phase 23 wave 23.2). +/// The document is parsed back to an so each assertion checks real structure, not a +/// substring: that every <loc> is absolute and built through (so it +/// equals the page canonical), that <lastmod> tracks the release date, that the static roots are +/// present and FramePlayer is absent, and that empty input still yields a well-formed roots-only document. +/// +[TestFixture] +public class SitemapXmlTests +{ + private const string BaseUrl = "https://deepdrft.com"; + private static readonly XNamespace Ns = "http://www.sitemaps.org/schemas/sitemap/0.9"; + + private static ReleaseDto Release(string entryKey, ReleaseMedium medium, DateOnly? releaseDate = null) => new() + { + EntryKey = entryKey, + Title = "Title", + Artist = "Artist", + Medium = medium, + ReleaseDate = releaseDate, + }; + + private static List Locs(string xml) + { + var doc = XDocument.Parse(xml); + return doc.Root!.Elements(Ns + "url") + .Select(u => u.Element(Ns + "loc")!.Value) + .ToList(); + } + + [Test] + public void Build_EmptyReleases_YieldsWellFormedRootsOnlyDocument() + { + var xml = SitemapXml.Build(BaseUrl, []); + + var locs = Locs(xml); + Assert.Multiple(() => + { + Assert.That(locs, Has.Count.EqualTo(SitemapXml.StaticRoots.Count)); + Assert.That(locs, Does.Contain("https://deepdrft.com/")); + Assert.That(locs, Does.Contain("https://deepdrft.com/about")); + Assert.That(locs, Does.Contain("https://deepdrft.com/cuts")); + Assert.That(locs, Does.Contain("https://deepdrft.com/sessions")); + Assert.That(locs, Does.Contain("https://deepdrft.com/mixes")); + Assert.That(locs, Does.Contain("https://deepdrft.com/archive")); + }); + } + + [Test] + public void Build_IsWellFormedUrlsetWithSitemapsOrgNamespace() + { + var xml = SitemapXml.Build(BaseUrl, []); + var doc = XDocument.Parse(xml); + + Assert.Multiple(() => + { + Assert.That(doc.Root!.Name, Is.EqualTo(Ns + "urlset")); + Assert.That(xml, Does.Contain("utf-8").IgnoreCase); + }); + } + + [Test] + public void Build_FramePlayerIsNeverAStaticRoot() + { + var xml = SitemapXml.Build(BaseUrl, []); + + Assert.That(Locs(xml), Has.None.Contains("FramePlayer")); + } + + [TestCase(ReleaseMedium.Cut, "https://deepdrft.com/cuts/key-1")] + [TestCase(ReleaseMedium.Session, "https://deepdrft.com/sessions/key-1")] + [TestCase(ReleaseMedium.Mix, "https://deepdrft.com/mixes/key-1")] + public void Build_ReleaseLoc_IsAbsoluteAndResolvedThroughReleaseRoutes(ReleaseMedium medium, string expectedLoc) + { + var xml = SitemapXml.Build(BaseUrl, [Release("key-1", medium)]); + + // The loc must equal BaseUrl + ReleaseRoutes.DetailHref — i.e. the page's SeoHead canonical, by construction. + var expected = BaseUrl + ReleaseRoutes.DetailHref("key-1", medium); + Assert.Multiple(() => + { + Assert.That(expected, Is.EqualTo(expectedLoc)); + Assert.That(Locs(xml), Does.Contain(expectedLoc)); + }); + } + + [Test] + public void Build_AllReleasesEnumerated_AppendedAfterStaticRoots() + { + var releases = new[] + { + Release("a", ReleaseMedium.Cut), + Release("b", ReleaseMedium.Mix), + Release("c", ReleaseMedium.Session), + }; + + var xml = SitemapXml.Build(BaseUrl, releases); + + Assert.That(Locs(xml), Has.Count.EqualTo(SitemapXml.StaticRoots.Count + releases.Length)); + } + + [Test] + public void Build_ReleaseWithDate_EmitsW3CLastmod() + { + var xml = SitemapXml.Build(BaseUrl, [Release("key-1", ReleaseMedium.Cut, new DateOnly(2026, 5, 12))]); + + var doc = XDocument.Parse(xml); + var releaseUrl = doc.Root!.Elements(Ns + "url") + .Single(u => u.Element(Ns + "loc")!.Value.EndsWith("/cuts/key-1")); + + Assert.That(releaseUrl.Element(Ns + "lastmod")!.Value, Is.EqualTo("2026-05-12")); + } + + [Test] + public void Build_ReleaseWithoutDate_OmitsLastmod() + { + var xml = SitemapXml.Build(BaseUrl, [Release("key-1", ReleaseMedium.Cut)]); + + var doc = XDocument.Parse(xml); + var releaseUrl = doc.Root!.Elements(Ns + "url") + .Single(u => u.Element(Ns + "loc")!.Value.EndsWith("/cuts/key-1")); + + Assert.That(releaseUrl.Element(Ns + "lastmod"), Is.Null); + } + + [Test] + public void Build_StaticRoots_NeverCarryLastmod() + { + var xml = SitemapXml.Build(BaseUrl, []); + + var doc = XDocument.Parse(xml); + Assert.That(doc.Root!.Elements(Ns + "url").All(u => u.Element(Ns + "lastmod") is null), Is.True); + } + + [Test] + public void Build_TrimsTrailingSlashOnBaseUrl() + { + var xml = SitemapXml.Build("https://deepdrft.com/", [Release("key-1", ReleaseMedium.Cut)]); + + Assert.Multiple(() => + { + // No doubled slash on the root or the release URL. + Assert.That(Locs(xml), Does.Contain("https://deepdrft.com/")); + Assert.That(Locs(xml), Does.Contain("https://deepdrft.com/cuts/key-1")); + }); + } +} From 7a0ccdd784db4c5de1fcd63ed3ea3da8254574c9 Mon Sep 17 00:00:00 2001 From: daniel-c-harvey Date: Tue, 23 Jun 2026 07:33:24 -0400 Subject: [PATCH 2/2] fix: correct WalkPageSize to 100 (actual server PageSize cap) and update comment --- DeepDrftPublic/Controllers/CrawlDirectiveController.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DeepDrftPublic/Controllers/CrawlDirectiveController.cs b/DeepDrftPublic/Controllers/CrawlDirectiveController.cs index 40403de..76dada4 100644 --- a/DeepDrftPublic/Controllers/CrawlDirectiveController.cs +++ b/DeepDrftPublic/Controllers/CrawlDirectiveController.cs @@ -25,8 +25,8 @@ namespace DeepDrftPublic.Controllers; [ApiController] public class CrawlDirectiveController : ControllerBase { - // A generous page size keeps the walk to a handful of round-trips even for a large catalogue. - private const int WalkPageSize = 200; + // 100 is the server-side PageSize cap, so this is the largest page the walk can actually get. + private const int WalkPageSize = 100; // The release walk deserializes a bare PagedResult (no ApiResultDto envelope), matching TrackClient. private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web);