# ============================================================================
# Robots.txt for Pressonify.ai
# AI Discovery Protocol v2.1 Compliant
# Last Updated: 2025-12-29
# ============================================================================

# Sitemaps for Indexing
# Note: sitemap-ai.xml intentionally omitted from robots.txt
# (uses custom AI namespace that Google doesn't support)
# AI crawlers discover it via /.well-known/ai.json manifest
Sitemap: https://pressonify.ai/sitemap.xml
Sitemap: https://pressonify.ai/news-sitemap.xml

# ============================================================================
# STANDARD CRAWLER RULES
# Default group: applies to every crawler that is NOT matched by a more
# specific User-agent group elsewhere in this file. A crawler that matches a
# named group follows that group only; these rules are not merged into it
# (RFC 9309, section 2.2.1).
# ============================================================================

User-agent: *
Allow: /
# NOTE: Crawl-delay is not part of RFC 9309; support varies by crawler and
# some (e.g. Googlebot) ignore it.
Crawl-delay: 1

# ----------------------------------------------------------------------------
# AI DISCOVERY ENDPOINTS (ADP v2.1)
# These endpoints are specifically designed for LLM and AI crawler ingestion
# ----------------------------------------------------------------------------

# Master Discovery Manifest (Industry Standard)
# Format: JSON | Updates: Daily | Contains: all AI endpoint references
Allow: /.well-known/ai.json

# Security Contact (RFC 9116)
Allow: /.well-known/security.txt

# Meta-Index: Entry point for AI discovery, maps all resources
# Format: JSON | Updates: Hourly | Contains: endpoint catalog, entity counts
Allow: /ai-discovery.json
Allow: /ai-discovery.md

# Knowledge Graph: Structured entity catalog using Schema.org vocabulary
# Format: JSON-LD | Updates: Hourly | Contains: Organizations, NewsArticles, Persons
Allow: /knowledge-graph.json

# AI Interaction Protocol: Declares AI interaction preferences and attribution
# Format: Text | Updates: Weekly | Contains: site identity, licensing, discovery links
Allow: /ai.txt

# LLM Context Documents: Human-readable markdown for AI assistants
# Format: Markdown/Text | Updates: Daily | Contains: platform overview, latest PRs
Allow: /llms.txt
Allow: /llms-full.txt
Allow: /llms-lite.txt

# RSS/Atom/JSON Feeds: Syndication feeds for content updates
# Format: XML/JSON | Updates: Real-time on publish
Allow: /feed
Allow: /rss
Allow: /feed.json

# Delta Feed: Incremental updates for AI crawlers
# Format: JSON | Updates: Real-time | Contains: changes since timestamp
Allow: /updates.json

# News-Specific AI Endpoints (v2.9.5)
# Format: Various | Updates: Real-time | Contains: news-only content
Allow: /news/llms.txt
Allow: /news/archive.jsonl
Allow: /news/speakable.json
Allow: /news/changelog.json
Allow: /news/latest.json

# OpenSearch Description Document
# Format: XML | Updates: Weekly | Contains: search autodiscovery
Allow: /opensearch.xml

# ----------------------------------------------------------------------------
# PUBLIC CONTENT - EXPLICITLY ALLOWED
# ----------------------------------------------------------------------------

# Press release archive and individual articles
Allow: /news/
Allow: /news/*

# Public pages
Allow: /pricing
Allow: /about
Allow: /how-it-works
Allow: /blog/
Allow: /contact
Allow: /free-tools
Allow: /ai-visibility-checker
Allow: /free-headline-generator
Allow: /glossary
Allow: /changelog
Allow: /demo

# API documentation
Allow: /api/docs

# Static assets
Allow: /static/

# ----------------------------------------------------------------------------
# PROTECTED PATHS
# Administrative and internal endpoints
# ----------------------------------------------------------------------------

# Admin and dashboard (requires authentication)
Disallow: /admin/
Disallow: /dashboard/
Disallow: /my-releases/
Disallow: /user-dashboard

# Internal API endpoints (also covered by the broader /api/v1/ rule below;
# kept as an explicit entry)
Disallow: /api/v1/internal/

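# OpenGraph image API — generates images, not pages. Wastes crawl budget.
# NOTE(review): if og:image meta tags point at this path, link-preview bots
# that honor robots.txt may be unable to fetch card images — confirm.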
Disallow: /api/og-image/

# All v1 API endpoints — REST API surface, not user-facing pages
Disallow: /api/v1/

# OpenAPI/docs JSON — schema, not content
Disallow: /openapi.json

# NOTE: Filter URLs (?tag=, ?category=, ?company=, ?search=, ?page=, ?q=)
# on /blog and /news are intentionally NOT blocked here.
# Instead the handlers serve X-Robots-Tag: noindex,follow when query
# params are present. This avoids the "Indexed, though blocked by
# robots.txt" warning while still keeping filter pages out of the index.

# Authentication flows
Disallow: /login
Disallow: /register
Disallow: /forgot-password
Disallow: /verify-email

# Form pages (not content pages)
Disallow: /generate

# Legacy URL format (redirects to /news/)
Disallow: /pr/

# NOTE: legacy /news/draft_* URLs are intentionally NOT blocked.
# They 301-redirect to canonical slugs in main.py (see press_release_detail_seo),
# so allowing crawl lets Google follow the redirect and drop the old URL.
# Blocking caused "Indexed, though blocked by robots.txt" warnings in GSC.

# ============================================================================
# AI-SPECIFIC CRAWLER PERMISSIONS
# Major AI platforms are granted access to all public content.
#
# IMPORTANT: a crawler that matches a named User-agent group obeys ONLY that
# group and ignores the "User-agent: *" rules (RFC 9309, section 2.2.1).
# A bare "Allow: /" here would therefore open /admin/, /dashboard/, /api/v1/,
# /login, etc. to these bots. The protected-path Disallow rules are repeated
# below; keep them in sync with the PROTECTED PATHS section of the "*" group.
#
# All agents share one group. There are intentionally no blank lines inside
# the group, so parsers that treat a blank line as a group terminator still
# apply every rule to every agent listed.
# ============================================================================

# OpenAI (ChatGPT, GPT-4)
User-agent: GPTBot
User-agent: ChatGPT-User
User-agent: OAI-SearchBot
# Anthropic (Claude)
User-agent: Claude-Web
User-agent: ClaudeBot
User-agent: anthropic-ai
# Perplexity
User-agent: PerplexityBot
# Google AI (Gemini)
User-agent: Google-Extended
# Apple (Siri, Apple Intelligence)
User-agent: Applebot-Extended
# Microsoft (Bing search, Copilot)
User-agent: Bingbot
# You.com
User-agent: YouBot
# Cohere
User-agent: cohere-ai
# Meta AI
User-agent: Meta-ExternalAgent
# Public content: everything not explicitly disallowed below
Allow: /
# Admin and dashboard (requires authentication)
Disallow: /admin/
Disallow: /dashboard/
Disallow: /my-releases/
Disallow: /user-dashboard
# API surface (includes /api/v1/internal/), OG image generator, OpenAPI schema
Disallow: /api/v1/
Disallow: /api/og-image/
Disallow: /openapi.json
# Authentication flows
Disallow: /login
Disallow: /register
Disallow: /forgot-password
Disallow: /verify-email
# Form pages (not content pages)
Disallow: /generate
# Legacy URL format (redirects to /news/)
Disallow: /pr/

# ============================================================================
# NOTES FOR AI CRAWLERS
# ============================================================================
#
# For structured data ingestion, use these endpoints in order:
# 1. /ai-discovery.json - Get the meta-index first
# 2. /knowledge-graph.json - Full entity graph with relationships
# 3. /llms.txt - Human-readable context and latest content
#
# All endpoints support:
# - CORS (Access-Control-Allow-Origin: *)
# - HTTP caching (ETag, Last-Modified)
# - Content versioning
#
# Contact: support@pressonify.ai
# Documentation: https://pressonify.ai/blog/seo-to-aeo-geo-llmo-adp-evolution
# ============================================================================