From 575298cbde54940021ad6d1f396e36ecd888f478 Mon Sep 17 00:00:00 2001 From: tallison Date: Mon, 16 Oct 2023 10:36:20 -0400 Subject: [PATCH] TIKA-4153 -- revert changes to robots.txt detection and add unit test for robots file starting with comments (cherry picked from commit 7825b59cb383411a4928ecbadf25d4a3f6f07c28) --- .../org/apache/tika/mime/tika-mimetypes.xml | 18 ++++++++---------- .../org/apache/tika/mime/TestMimeTypes.java | 1 + .../resources/test-documents/testRobots2.txt | 11 +++++++++++ 3 files changed, 20 insertions(+), 10 deletions(-) create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testRobots2.txt diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index b49e355e64..53808c752f 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -2135,16 +2135,14 @@ - - - - - - - - - - + + + + + + + + diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java index 73945f3552..3dad7d6aff 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -1023,6 +1023,7 @@ public void testEmail() throws IOException { @Test public void testRobots() throws Exception { assertTypeByData("text/x-robots", "testRobots.txt"); + assertTypeByData("text/x-robots", "testRobots2.txt"); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testRobots2.txt b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testRobots2.txt new file mode 100644 index 0000000000..2ad0152d07 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testRobots2.txt @@ -0,0 +1,11 @@ +# elevate robots begin +# robots.txt, added by the Elevate plugin for WordPress +# file version: 1 +User-agent: * +Disallow: /wp-admin/ +Disallow: /readme.html +Disallow: /trackback/ +Allow: /wp-admin/admin-ajax.php +Allow: /wp-content/uploads +Sitemap: https://blahdeblah.com/sitemap.xml +# elevate robots end \ No newline at end of file