// deepdrft/DeepDrftPublic/Controllers/CrawlDirectiveController.cs

using System.Net.Http.Json;
using System.Text.Json;
using DeepDrftModels.DTOs;
using Models.Common;
using DeepDrftPublic.Client.Common;
using DeepDrftPublic.Seo;
using Microsoft.AspNetCore.Mvc;
namespace DeepDrftPublic.Controllers;
/// <summary>
/// Serves the public crawl-directive surfaces (Phase 23): <c>GET /robots.txt</c> and
/// <c>GET /sitemap.xml</c>. Both are environment-gated server-side via
/// <see cref="IWebHostEnvironment.IsProduction"/> read directly here — not the WASM-only
/// <c>SeoEnvironment</c> bridge — and fail safe closed (non-production is uncrawlable, Invariant E1).
///
/// <para>
/// This is a thin host boundary: it owns the gate and the release walk, and delegates all body composition
/// to the pure <see cref="RobotsTxt"/> / <see cref="SitemapXml"/> builders. The sitemap walk reuses the
/// existing <c>"DeepDrft.API"</c> named client server-to-server (the same client SSR prerender uses) — it
/// <b>enumerates and transforms</b> releases into XML rather than relaying verbatim like the proxy controllers.
/// No new API endpoint, no schema change (Phase 22 C5 holds).
/// </para>
/// </summary>
[ApiController]
public class CrawlDirectiveController : ControllerBase
{
    /// <summary>
    /// Page size requested on every <c>api/release</c> call. A generous size keeps the walk to a handful
    /// of round-trips even for a large catalogue.
    /// </summary>
    private const int WalkPageSize = 200;

    /// <summary>
    /// Serializer options for the release walk, which deserializes a bare
    /// <c>PagedResult&lt;ReleaseDto&gt;</c> (no ApiResultDto envelope), matching TrackClient.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web);

    private readonly IWebHostEnvironment _environment;
    private readonly SeoOptions _seoOptions;
    private readonly HttpClient _upstream;
    private readonly ILogger<CrawlDirectiveController> _logger;

    /// <summary>
    /// Captures the host environment (for the production gate), the SEO options (for the base URL),
    /// and resolves the <c>"DeepDrft.API"</c> named client used for the release walk.
    /// </summary>
    /// <param name="environment">Host environment; only <c>IsProduction()</c> is consulted.</param>
    /// <param name="seoOptions">SEO settings; only <c>BaseUrl</c> is read here.</param>
    /// <param name="httpClientFactory">Factory used to create the <c>"DeepDrft.API"</c> named client.</param>
    /// <param name="logger">Logger for release-walk diagnostics.</param>
    public CrawlDirectiveController(
        IWebHostEnvironment environment,
        SeoOptions seoOptions,
        IHttpClientFactory httpClientFactory,
        ILogger<CrawlDirectiveController> logger)
    {
        _environment = environment;
        _seoOptions = seoOptions;
        _upstream = httpClientFactory.CreateClient("DeepDrft.API");
        _logger = logger;
    }

    /// <summary>
    /// <c>GET /robots.txt</c>. Production: allow + FramePlayer/api disallows + sitemap pointer. Any
    /// non-production environment: <c>Disallow: /</c> with no sitemap pointer (E1). Always <c>text/plain</c>.
    /// </summary>
    /// <returns>The robots body produced by <see cref="RobotsTxt"/>, served as <c>text/plain</c>.</returns>
    [HttpGet("/robots.txt")]
    public ContentResult GetRobots()
    {
        var body = RobotsTxt.Build(_environment.IsProduction(), _seoOptions.BaseUrl);
        return Content(body, "text/plain");
    }

    /// <summary>
    /// <c>GET /sitemap.xml</c>. Non-production: 404 (the non-prod robots carries no sitemap pointer, so
    /// nothing references it). Production: the static roots plus one entry per release. Resilient — a
    /// partial/empty/failed release read yields a well-formed (possibly roots-only) document, never a 500.
    /// </summary>
    /// <param name="ct">Cancellation token flowed into every upstream release request.</param>
    /// <returns>404 outside production; otherwise the sitemap document as <c>application/xml</c>.</returns>
    [HttpGet("/sitemap.xml")]
    public async Task<ActionResult> GetSitemap(CancellationToken ct = default)
    {
        if (!_environment.IsProduction())
        {
            return NotFound();
        }

        var releases = await GatherReleasesAsync(ct);
        var xml = SitemapXml.Build(_seoOptions.BaseUrl, releases);
        return Content(xml, "application/xml");
    }

    /// <summary>
    /// Walks <c>GET api/release</c> page by page until every release is read. On any upstream failure,
    /// returns the releases gathered so far (possibly none) so the sitemap degrades to a well-formed
    /// roots-only document rather than 500ing — a sitemap that errors trains crawlers to stop fetching
    /// it (AC-S5).
    /// </summary>
    /// <param name="ct">Cancellation token passed to each upstream request.</param>
    /// <returns>Every release read before the walk completed, failed, or was cancelled. Never throws.</returns>
    private async Task<IReadOnlyList<ReleaseDto>> GatherReleasesAsync(CancellationToken ct)
    {
        var gathered = new List<ReleaseDto>();
        var page = 1;

        try
        {
            while (true)
            {
                var result = await _upstream.GetFromJsonAsync<PagedResult<ReleaseDto>>(
                    $"api/release?page={page}&pageSize={WalkPageSize}", JsonOptions, ct);

                // A null payload or a payload without an item list ends the walk.
                if (result?.Items is null)
                {
                    break;
                }

                // Detect an empty page by comparing counts so Items is enumerated exactly once.
                var countBeforePage = gathered.Count;
                gathered.AddRange(result.Items);
                var pageWasEmpty = gathered.Count == countBeforePage;

                // Stop once the reported total is reached, or when a page adds nothing (which guards
                // against looping forever if the reported total is never reached).
                if (pageWasEmpty || gathered.Count >= result.TotalCount)
                {
                    break;
                }

                page++;
            }
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // The caller's token fired, so this is not an upstream fault: keep it out of the error log.
            // A cancellation raised while ct is NOT signalled (e.g. an HttpClient timeout) does not match
            // this filter and is handled as a failure below.
            _logger.LogDebug("Sitemap release walk cancelled after gathering {Count} release(s)", gathered.Count);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Sitemap release walk failed after gathering {Count} release(s); serving a partial sitemap", gathered.Count);
        }

        return gathered;
    }
}