NeuralCrawl

South China Morning Post / robots.txt snapshot

← back to scmp.com · fetched 2026-06-20T01:10:30Z (15h ago) · HTTP 200 · 4349 bytes · sha256 3a4d4b15232a97ca · raw

final URL: https://www.scmp.com/robots.txt

1#
2# robots.txt
3#
4# This file is to prevent the crawling and indexing of certain parts
5# of your site by web crawlers and spiders run by sites like Yahoo!
6# and Google. By telling these "robots" where not to go on your site,
7# you save bandwidth and server resources.
8#
9# This file will be ignored unless it is at the root of your host:
10# Used: http://example.com/robots.txt
11# Ignored: http://example.com/site/robots.txt
12#
13# For more information about the robots.txt standard, see:
14# http://www.robotstxt.org/robotstxt.html
15#
16# For syntax checking, see:
17# http://www.sxw.org.uk/computing/robots/check.html
18
19# AmazonAdBot
20User-agent: AmazonAdBot
21Allow: /
22
23User-agent: *
24Crawl-delay: 10
25
26# PWA
27
28# Directories
29Disallow: /public/
30Disallow: /static/
31# Path
32Disallow: /login
33Disallow: /signin
34Disallow: /register
35Disallow: /logout
36Disallow: /login/facebook
37Disallow: /login/facebook/*
38Disallow: /styleguide/*
39Disallow: /healthz
40Disallow: /.well-known/*
41Disallow: /*/firebase-messaging-sw.js
42Disallow: /google97d8d43559c9b155.html
43Allow: /.well-known/amphtml/apikey.pub
44
45# CSS, JS, Image
46Allow: /static/*.css$
47Allow: /static/*.css?
48Allow: /static/*.js$
49Allow: /static/*.js?
50Allow: /static/*.gif
51Allow: /static/*.jpg
52Allow: /static/*.jpeg
53Allow: /static/*.png
54Allow: /public/*.css$
55Allow: /public/*.css?
56Allow: /public/*.js$
57Allow: /public/*.js?
58Allow: /public/*.gif
59Allow: /public/*.jpg
60Allow: /public/*.jpeg
61Allow: /public/*.png
62
63# Directories
64Disallow: /includes/
65Disallow: /misc/
66Disallow: /modules/
67Disallow: /profiles/
68Disallow: /scripts/
69Disallow: /themes/
70# Files
71Disallow: /CHANGELOG.txt
72Disallow: /cron.php
73Disallow: /INSTALL.mysql.txt
74Disallow: /INSTALL.pgsql.txt
75Disallow: /INSTALL.sqlite.txt
76Disallow: /install.php
77Disallow: /INSTALL.txt
78Disallow: /LICENSE.txt
79Disallow: /MAINTAINERS.txt
80Disallow: /update.php
81Disallow: /UPGRADE.txt
82Disallow: /xmlrpc.php
83Disallow: /sites/default/files/*.pdf
84Disallow: /sites/default/files/*.doc
85Disallow: /sites/default/files/*.docx
86Disallow: /sites/default/files/*.swf
87Disallow: /sites/default/files/styles/*.jpg
88Disallow: /_next/data/*.json
89
90# Paths (clean URLs)
91Disallow: /admin/
92Disallow: /comment/reply/
93Disallow: /filter/tips/
94Disallow: /node/add/
95Disallow: /user/register/
96Disallow: /user/password/
97Disallow: /user/login/
98Disallow: /user/logout/
99Disallow: *?destination=*
100Disallow: /ajax_comments/
101Disallow: /scmp_comments/
102Disallow: *Article_type=*
103Disallow: *field_article*
104Disallow: /label/
105Disallow: /node/*/nodequeue
106Disallow: /node/*/edit
107Disallow: /ajax
108Disallow: /ajax/*
109Disallow: /facebook-instant-articles/feed/*
110Disallow: /epaper
111Disallow: /epaper/*
112Disallow: /story/style/*
113
114# Paths (no clean URLs)
115Disallow: /?q=admin/
116Disallow: /?q=comment/reply/
117Disallow: /?q=filter/tips/
118Disallow: /?q=node/add/
119Disallow: /?q=user/password/
120Disallow: /?q=user/register/
121Disallow: /?q=user/login/
122Disallow: /?q=user/logout/
123Disallow: /?q=node/*/edit
124Disallow: /?q=node/*/nodequeue
125Disallow: /?q=epaper
126Disallow: /?q=epaper/*
127Disallow: /?q=facebook-instant-articles/feed/*
128
129Disallow: /*/logSend$
130Disallow: /*/spmException$
131Disallow: /*/spmact$
132Disallow: /*/antiSpam$
133Disallow: /*/nameStorage$
134Disallow: /*/spmMonitor$
135Disallow: /*/pvData$
136Disallow: /*/goldlog$
137Disallow: /*/initLoad$
138Disallow: /*/beforeUnload$
139Disallow: /*/util$
140Disallow: /*/metaInfo$
141Disallow: /*/beaconBase$
142Disallow: /*/spm$
143Disallow: /*/makeid$
144Disallow: /*/referrer$
145Disallow: /*/pvid$
146Disallow: /*/etag$
147Disallow: /*/iframe$
148Disallow: /*/client$
149Disallow: /*/windvane$
150Disallow: /*/cookie$
151Disallow: /*/sendpv$
152Disallow: /*/personality/index$
153Disallow: /*/misc$
154Disallow: /*/client$
155Disallow: /*/log$
156Disallow: /*/compose$
157Disallow: /*/lib_b/*$
158Disallow: /print/
159Disallow: /?q=print/
160
161# NewsNow
162User-agent: NewsNow
163Allow: /print/
164Allow: /?q=print/
165
166# GrapeShot
167User-agent: grapeshot
168Allow: /*/article/*$
169
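170# Ads — NOTE(review): no User-agent line precedes these rules, so parsers attach them (and /navigator-info below) to the "grapeshot" group above rather than to "*"; confirm this is intended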
171Disallow: /*?*cid=*
172Disallow: /*?*showonlyads=*
173Disallow: /*?*nograpeshot=*
174Disallow: /*?*noixwrapper=*
175Disallow: /*?*nogtm=*
176Disallow: /*?*nochartbeat=*
177Disallow: /*?*noga=*
178Disallow: /*?*nomoatyi=*
179Disallow: /*?*nomoat=*
180
181# Bot score and country
182Disallow: /navigator-info
183
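184# Googlebot: disallow any URL whose query string contains "campaign=". NOTE(review): a crawler follows only its most specific matching group, so Googlebot will not apply the "*" rules above; confirm this is intended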
185User-agent: Googlebot
186Disallow: /*?*campaign=*
187
188# Sitemap
189
190Sitemap: https://www.scmp.com/sitemap/sitemap.xml
191Sitemap: https://www.scmp.com/sitemap/archives-0.xml