using System.Net.Http.Json;
using System.Text.Json;
using DeepDrftModels.DTOs;
using Models.Common;
using DeepDrftPublic.Client.Common;
using DeepDrftPublic.Seo;
using Microsoft.AspNetCore.Mvc;

namespace DeepDrftPublic.Controllers;

/// <summary>
/// Serves the public crawl-directive surfaces (Phase 23): <c>GET /robots.txt</c> and
/// <c>GET /sitemap.xml</c>. Both are environment-gated server-side via
/// <see cref="IWebHostEnvironment"/> read directly here — not the WASM-only
/// SeoEnvironment bridge — and fail safe closed (non-production is uncrawlable, Invariant E1).
/// </summary>
/// <remarks>
/// This is a thin host boundary: it owns the gate and the release walk, and delegates all body composition
/// to the pure <see cref="RobotsTxt"/> / <see cref="SitemapXml"/> builders. The sitemap walk reuses the
/// existing <c>"DeepDrft.API"</c> named client server-to-server (the same client SSR prerender uses) — it
/// enumerates and transforms releases into XML rather than relaying verbatim like the proxy controllers.
/// No new API endpoint, no schema change (Phase 22 C5 holds).
/// </remarks>
[ApiController]
public class CrawlDirectiveController : ControllerBase
{
    // A generous page size keeps the walk to a handful of round-trips even for a large catalogue.
    private const int WalkPageSize = 200;

    // The release walk deserializes a bare PagedResult (no ApiResultDto envelope), matching TrackClient.
    private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web);

    private readonly IWebHostEnvironment _environment;
    private readonly SeoOptions _seoOptions;
    private readonly HttpClient _upstream;
    private readonly ILogger<CrawlDirectiveController> _logger;

    /// <summary>
    /// Wires the controller to the hosting environment (for the production gate), the SEO options
    /// (for the base URL), the <c>"DeepDrft.API"</c> named HTTP client and a logger.
    /// </summary>
    /// <param name="environment">Hosting environment; only production is crawlable.</param>
    /// <param name="seoOptions">SEO options supplying the base URL passed to the builders.</param>
    /// <param name="httpClientFactory">Factory used to resolve the <c>"DeepDrft.API"</c> named client.</param>
    /// <param name="logger">Logger for release-walk diagnostics.</param>
    public CrawlDirectiveController(
        IWebHostEnvironment environment,
        SeoOptions seoOptions,
        IHttpClientFactory httpClientFactory,
        ILogger<CrawlDirectiveController> logger)
    {
        _environment = environment;
        _seoOptions = seoOptions;
        _upstream = httpClientFactory.CreateClient("DeepDrft.API");
        _logger = logger;
    }

    /// <summary>
    /// <c>GET /robots.txt</c>. Production: allow + FramePlayer/api disallows + sitemap pointer. Any
    /// non-production environment: <c>Disallow: /</c> with no sitemap pointer (E1). Always <c>text/plain</c>.
    /// </summary>
    /// <returns>The robots.txt body composed by <see cref="RobotsTxt"/>.</returns>
    [HttpGet("/robots.txt")]
    public ContentResult GetRobots()
    {
        var body = RobotsTxt.Build(_environment.IsProduction(), _seoOptions.BaseUrl);
        return Content(body, "text/plain");
    }

    /// <summary>
    /// <c>GET /sitemap.xml</c>. Non-production: 404 (the non-prod robots carries no sitemap pointer, so
    /// nothing references it). Production: the static roots plus one entry per release. Resilient — a
    /// partial/empty/failed release read yields a well-formed (possibly roots-only) document, never a 500.
    /// </summary>
    /// <param name="ct">Cancellation token forwarded to the upstream release walk.</param>
    /// <returns>404 outside production; otherwise the sitemap XML as <c>application/xml</c>.</returns>
    [HttpGet("/sitemap.xml")]
    public async Task<IActionResult> GetSitemap(CancellationToken ct = default)
    {
        if (!_environment.IsProduction())
        {
            return NotFound();
        }

        var releases = await GatherReleasesAsync(ct);
        var xml = SitemapXml.Build(_seoOptions.BaseUrl, releases);
        return Content(xml, "application/xml");
    }

    // Walks GET api/release page by page until every release is read. On any upstream failure, returns the
    // releases gathered so far (possibly none) so the sitemap degrades to a well-formed roots-only document
    // rather than 500ing — a sitemap that errors trains crawlers to stop fetching it (AC-S5).
    //
    // NOTE(review): the element type is assumed to be ReleaseDto (from DeepDrftModels.DTOs) — confirm it
    // matches what the release endpoint returns and what SitemapXml.Build accepts.
    private async Task<List<ReleaseDto>> GatherReleasesAsync(CancellationToken ct)
    {
        var gathered = new List<ReleaseDto>();
        var page = 1;

        try
        {
            while (true)
            {
                var result = await _upstream.GetFromJsonAsync<PagedResult<ReleaseDto>>(
                    $"api/release?page={page}&pageSize={WalkPageSize}", JsonOptions, ct);

                if (result?.Items is null)
                {
                    break;
                }

                var countBeforePage = gathered.Count;
                gathered.AddRange(result.Items);

                // Stop once the reported total is reached, or when a page contributed nothing — the
                // latter guards against looping forever if TotalCount overstates the catalogue.
                if (gathered.Count >= result.TotalCount || gathered.Count == countBeforePage)
                {
                    break;
                }

                page++;
            }
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // The caller cancelled (e.g. the request was aborted). That is not an upstream failure, so
            // it must not be reported as an error. An HttpClient timeout also surfaces as an
            // OperationCanceledException but leaves ct unsignalled, so it still reaches the handler below.
            _logger.LogDebug(
                "Sitemap release walk cancelled after gathering {Count} release(s)",
                gathered.Count);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex,
                "Sitemap release walk failed after gathering {Count} release(s); serving a partial sitemap",
                gathered.Count);
        }

        return gathered;
    }
}