# robots.txt for Raw.Space
# Created: 15 October 2025

# --- Default group: fallback for crawlers not named in any other group ---
# A crawler that matches a named User-agent group elsewhere in this file
# follows that group instead and does not inherit the rules below.
# NOTE(review): keep the Disallow lines ahead of "Allow: /" — some parsers
# evaluate rules in file order (first match wins) rather than by longest match.
User-agent: *
Disallow: /content/
Disallow: /fr/
Allow: /

# Sitemap location; this line is independent of the User-agent groups.
Sitemap: https://raw.space/sitemap.xml

# --- Explicit group for major search crawlers ---
# A crawler obeys only the most specific User-agent group that matches it and
# ignores "User-agent: *", so the path exclusions must be repeated here.
# With only "Allow: /", these crawlers would be permitted into /content/ and /fr/.
# Several User-agent lines may share one group; the rules apply to each of them.
User-agent: Googlebot
User-agent: Bingbot
User-agent: DuckDuckBot
User-agent: Applebot
Disallow: /content/
Disallow: /fr/
Allow: /

# --- AI and extended crawlers (for training / knowledge indexing) ---
# These directives let large language model crawlers access public content
# for inclusion in search and AI assistants (subject to your licensing).
# A named group replaces "User-agent: *" for the crawlers it lists, so the
# /content/ and /fr/ exclusions are repeated here; "Allow: /" alone would
# open those paths to these crawlers.
User-agent: GPTBot
User-agent: Google-Extended
User-agent: ClaudeBot
User-agent: PerplexityBot
User-agent: Applebot-Extended
Disallow: /content/
Disallow: /fr/
Allow: /

# --- Optional site-wide denial for known bad or irrelevant bots ---
# One shared group: every user agent listed here gets the same blanket Disallow.
User-agent: CCBot
User-agent: MJ12bot
User-agent: AhrefsBot
User-agent: SemrushBot
Disallow: /

# --- LLMs and AI crawlers ---
# Additional information for language models and AI systems:
# See https://raw.space/llms.txt for licence, purpose, and crawl guidelines.
# NOTE(review): "AI:" is not a field defined by the Robots Exclusion Protocol
# (RFC 9309). Standard parsers are expected to skip unrecognised lines, and
# some validators may flag this one as unknown syntax — confirm it is wanted.
AI: https://raw.space/llms.txt

# --- Notes ---
# /content/ holds internal or non-public assets — excluded from crawling.
# /fr/ is excluded as that French section is under construction.
# All other paths (/, /field/, /commons/, etc.) are indexable.

# Contact
# For data-use or crawling questions, please contact: hello@raw.space