From 3c137eb119550874b2b3e27d12b733ca67033377 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 28 Feb 2021 06:26:40 +0900 Subject: [PATCH] Fix a parser bug that some data may be ignored before DOCTYPE HackerOne: HO-1104077 For example, "x 0 #STDERR.puts @source.encoding - @source.read if @source.buffer.size<2 #STDERR.puts "BUFFER = #{@source.buffer.inspect}" if @document_status == nil - #@source.consume( /^\s*/um ) - word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um ) + word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um ) word = word[1] unless word.nil? #STDERR.puts "WORD = #{word.inspect}" case word @@ -257,18 +255,16 @@ def pull_event @stack << [ :end_doctype ] end return args - when /^\s+/ + when /\A\s+/ else @document_status = :after_doctype - @source.read if @source.buffer.size<2 - md = @source.match(/\s*/um, true) if @source.encoding == "UTF-8" @source.buffer.force_encoding(::Encoding::UTF_8) end end end if @document_status == :in_doctype - md = @source.match(/\s*(.*?>)/um) + md = @source.match(/\A\s*(.*?>)/um) case md[1] when SYSTEMENTITY match = @source.match( SYSTEMENTITY, true )[1] @@ -349,7 +345,11 @@ def pull_event return [ :end_doctype ] end end + if @document_status == :after_doctype + @source.match(/\A\s*/um, true) + end begin + @source.read if @source.buffer.size<2 if @source.buffer[0] == ?< if @source.buffer[1] == ?/ @nsstack.shift @@ -392,6 +392,7 @@ def pull_event unless md raise REXML::ParseException.new("malformed XML: missing tag start", @source) end + @document_status = :in_element prefixes = Set.new prefixes << md[2] if md[2] @nsstack.unshift(curr_ns=Set.new) diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index a23513fc..f0c0c24e 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -20,6 +20,25 @@ def test_no_name DETAIL end + + def test_garbage_text + # TODO: This should be parse error. + # Create test/parse/test_document.rb or something and move this to it. + doc = parse(<<-XML) +x?> + + XML + pi = doc.children[1] + assert_equal([ + "x", + "y\n