Cornell University / robots.txt snapshot

← back to cornell.edu · fetched 2026-06-26T14:15:22Z (4h ago) · HTTP 200 · 1018 bytes · sha256 cf32953b99203e16 · raw

final URL: https://www.cornell.edu/robots.txt

1	User-agent: *
2	Crawl-Delay: 6
3	Disallow: /_dynamic_files/
4	Disallow: /_tasks/
5	Disallow: /test/
6	Disallow: /tools/
7	Disallow: /template/
8	Disallow: /search/
9	Disallow: /visit/plan/
10	Disallow: /video/kaltura/
11	Disallow: /video/tasks/
12	Disallow: /server-health-check/
13
14
15	# Siteimprove should ignore these pages, particularly because they aren't actually used but are still linked for historical reasons
16	User-agent: Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) SiteCheck-sitecrawl by Siteimprove.com
17	Disallow: /cuinfo/specialconditions/
18	Disallow: /_includes/header.cfm
19
20	User-agent: Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) LinkCheck by Siteimprove.com
21	Disallow: /cuinfo/specialconditions/
22	Disallow: /_includes/header.cfm
23
24	User-agent: HTML validator: Siteimprove_W3C_Validator/1.3
25	Disallow: /cuinfo/specialconditions/
26	Disallow: /_includes/header.cfm
27
28	User-agent: CSS Validator: Jigsaw/2.3.0 W3C_CSS_Validator_JFouffa/2.0
29	Disallow: /cuinfo/specialconditions/
30	Disallow: /_includes/header.cfm