From 8db2bd5720bece573e2985057082b86b3aaa5a43 Mon Sep 17 00:00:00 2001
From: Jakob Meier
Date: Sun, 18 Feb 2024 18:09:30 +0100
Subject: [PATCH] Added more blacklist items

---
Review notes (free-form text between the "---" marker and the diffstat is
not part of the commit message and is skipped when the patch is applied):

* This patch appends 43 lines to static/robots.txt: one "User-agent" /
  "Disallow: /" group per crawler, plus comments naming the lists the
  entries were taken from.
* The hunk heading below shows that the file already has a
  "User-agent: gptbot" group. RFC 9309 matches product tokens
  case-insensitively and combines all matching groups, so the
  "User-agent: GPTBot" group added here is redundant with it but harmless.
* NOTE(review): the img2dataset entry is marked by the author as possibly
  ineffective; confirm against the linked upstream issue before relying on it.

 static/robots.txt | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/static/robots.txt b/static/robots.txt
index 5354eca..fa7e1ba 100644
--- a/static/robots.txt
+++ b/static/robots.txt
@@ -2,3 +2,46 @@ User-agent: gptbot
 Disallow: /
 User-agent: robots
 Disallow: /
+
+# taken from https://codeberg.org/benjaminhollon/robots.txt-deny-llm/src/branch/main/robots.txt
+
+# from https://neil-clarke.com/block-the-bots-that-feed-ai-models-by-scraping-your-website/
+
+User-agent: CCBot
+Disallow: /
+
+User-agent: ChatGPT-User
+Disallow: /
+
+User-agent: GPTBot
+Disallow: /
+
+User-agent: Google-Extended
+Disallow: /
+
+User-agent: anthropic-ai
+Disallow: /
+
+User-agent: Omgilibot
+Disallow: /
+
+User-agent: Omgili
+Disallow: /
+
+User-agent: FacebookBot
+Disallow: /
+
+User-agent: Bytespider
+Disallow: /
+
+# from https://github.com/healsdata/ai-training-opt-out
+
+# may not work, needs more research (see https://github.com/rom1504/img2dataset/issues/48)
+User-agent: img2dataset
+Disallow: /
+
+User-agent: Claude-Web
+Disallow: /
+
+User-agent: magpie-crawler
+Disallow: /
-- 
2.38.5