using System.Net.Http.Json;
using System.Text.Json;
using DeepDrftModels.DTOs;
using Models.Common;
using DeepDrftPublic.Client.Common;
using DeepDrftPublic.Seo;
using Microsoft.AspNetCore.Mvc;
namespace DeepDrftPublic.Controllers;
/// <summary>
/// Serves the public crawl-directive surfaces (Phase 23): <c>GET /robots.txt</c> and
/// <c>GET /sitemap.xml</c>. Both are environment-gated server-side via
/// <see cref="IWebHostEnvironment"/>, read directly here — not the WASM-only
/// <c>SeoEnvironment</c> bridge — and fail safe closed (non-production is uncrawlable, Invariant E1).
/// </summary>
/// <remarks>
/// This is a thin host boundary: it owns the gate and the release walk, and delegates all body composition
/// to the pure <see cref="RobotsTxt"/> / <see cref="SitemapXml"/> builders. The sitemap walk reuses the
/// existing <c>"DeepDrft.API"</c> named client server-to-server (the same client SSR prerender uses) — it
/// enumerates and transforms releases into XML rather than relaying verbatim like the proxy controllers.
/// No new API endpoint, no schema change (Phase 22 C5 holds).
/// </remarks>
[ApiController]
public class CrawlDirectiveController : ControllerBase
{
// NOTE(review): several generic type arguments look truncated in this file as it stands — the bare
// ILogger field/parameter, the Task return type of GetSitemap (which returns values), and the
// "Task>", "new List()" and "GetFromJsonAsync>(" spellings in GatherReleasesAsync. TODO confirm the
// intended type arguments against version control before building.

// Page size requested on every step of the release walk.
// 100 is the server-side PageSize cap, so this is the largest page the walk can actually get.
private const int WalkPageSize = 100;
// The release walk deserializes a bare PagedResult (no ApiResultDto envelope), matching TrackClient.
// JsonSerializerDefaults.Web selects the web defaults (camelCase names, case-insensitive matching).
private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web);
// Host environment; its IsProduction() result is the gate for both endpoints.
private readonly IWebHostEnvironment _environment;
// SEO configuration; only BaseUrl is read in this class.
private readonly SeoOptions _seoOptions;
// The "DeepDrft.API" named client, used for the server-to-server release walk.
private readonly HttpClient _upstream;
// Receives the error logged when the release walk fails part-way.
private readonly ILogger _logger;
/// <summary>
/// Stores the injected dependencies and creates the <c>"DeepDrft.API"</c> named client from the factory.
/// </summary>
/// <param name="environment">Host environment used to decide whether the site is crawlable.</param>
/// <param name="seoOptions">SEO options; supplies the base URL passed to both builders.</param>
/// <param name="httpClientFactory">Factory from which the <c>"DeepDrft.API"</c> client is created.</param>
/// <param name="logger">Logger used to report a failed release walk.</param>
public CrawlDirectiveController(
IWebHostEnvironment environment,
SeoOptions seoOptions,
IHttpClientFactory httpClientFactory,
ILogger logger)
{
_environment = environment;
_seoOptions = seoOptions;
_upstream = httpClientFactory.CreateClient("DeepDrft.API");
_logger = logger;
}
/// <summary>
/// <c>GET /robots.txt</c>. Production: allow + FramePlayer/api disallows + sitemap pointer. Any
/// non-production environment: <c>Disallow: /</c> with no sitemap pointer (E1). Always <c>text/plain</c>.
/// </summary>
/// <returns>
/// The body produced by <see cref="RobotsTxt"/> for the current environment and base URL, served as
/// <c>text/plain</c>.
/// </returns>
[HttpGet("/robots.txt")]
public ContentResult GetRobots()
{
// The production flag and base URL are the only inputs; all body composition lives in the builder.
var body = RobotsTxt.Build(_environment.IsProduction(), _seoOptions.BaseUrl);
return Content(body, "text/plain");
}
/// <summary>
/// <c>GET /sitemap.xml</c>. Non-production: 404 (the non-prod robots carries no sitemap pointer, so
/// nothing references it). Production: the static roots plus one entry per release. Resilient — a
/// partial/empty/failed release read yields a well-formed (possibly roots-only) document, never a 500.
/// </summary>
/// <param name="ct">Cancellation token passed through to the upstream release walk.</param>
/// <returns>
/// <c>404 Not Found</c> outside production; otherwise the document built by <see cref="SitemapXml"/>,
/// served as <c>application/xml</c>.
/// </returns>
[HttpGet("/sitemap.xml")]
public async Task GetSitemap(CancellationToken ct = default)
{
// Gate first, so a non-production host never makes the upstream call.
if (!_environment.IsProduction())
return NotFound();
var releases = await GatherReleasesAsync(ct);
var xml = SitemapXml.Build(_seoOptions.BaseUrl, releases);
return Content(xml, "application/xml");
}
// Walks GET api/release page by page until every release is read. On any upstream failure, returns the
// releases gathered so far (possibly none) so the sitemap degrades to a well-formed roots-only document
// rather than 500ing — a sitemap that errors trains crawlers to stop fetching it (AC-S5).
private async Task> GatherReleasesAsync(CancellationToken ct)
{
var gathered = new List();
// Page numbering starts at 1.
var page = 1;
try
{
while (true)
{
var result = await _upstream.GetFromJsonAsync>(
$"api/release?page={page}&pageSize={WalkPageSize}", JsonOptions, ct);
// A null payload or a payload without Items ends the walk with whatever has been gathered.
if (result?.Items is null)
break;
gathered.AddRange(result.Items);
// Stop once the reported total is reached. The empty-page check also ends the walk if
// TotalCount is never reached, so the loop cannot keep requesting empty pages.
if (gathered.Count >= result.TotalCount || !result.Items.Any())
break;
page++;
}
}
// Deliberately broad: any failure (HTTP error, bad JSON, timeout) degrades to a partial result.
// NOTE(review): this also catches OperationCanceledException, so a cancelled request is logged at
// error level like a real upstream failure — confirm that is intended.
catch (Exception ex)
{
_logger.LogError(ex, "Sitemap release walk failed after gathering {Count} release(s); serving a partial sitemap", gathered.Count);
}
return gathered;
}
}