Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add benchmarks for XML deserialization #605

Merged
merged 3 commits into from
Mar 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

package aws.smithy.kotlin.runtime.serde.xml

import aws.smithy.kotlin.runtime.serde.xml.tokenization.StringTextStream
import aws.smithy.kotlin.runtime.serde.xml.tokenization.LexingXmlStreamReader
import aws.smithy.kotlin.runtime.serde.xml.tokenization.StringTextStream
import aws.smithy.kotlin.runtime.serde.xml.tokenization.XmlLexer

/**
Expand Down Expand Up @@ -84,6 +84,3 @@ fun xmlStreamReader(payload: ByteArray): XmlStreamReader {
val lexer = XmlLexer(stream)
return LexingXmlStreamReader(lexer)
}

// TODO remove me!
expect fun xmlPull(payload: ByteArray): XmlStreamReader
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ internal sealed class LexerState {
* Describes the state of being inside a tag.
*/
sealed class Tag : LexerState() {
override val depth: Int by lazy { (parent?.depth ?: 0) + 1 }

abstract val name: XmlToken.QualifiedName
abstract val parent: OpenTag?

Expand All @@ -46,14 +44,18 @@ internal sealed class LexerState {
override val name: XmlToken.QualifiedName,
override val parent: OpenTag?,
val seenChildren: Boolean,
) : Tag()
) : Tag() {
override val depth: Int = (parent?.depth ?: 0) + 1
}

/**
* The lexer has read a self-closing tag (e.g., '<foo />') but only returned the [XmlToken.BeginElement] token
* to the caller. The subsequent [XmlLexer.parseNext] call will return an [XmlToken.EndElement] without
* actually reading more from the source.
*/
data class EmptyTag(override val name: XmlToken.QualifiedName, override val parent: OpenTag?) : Tag()
data class EmptyTag(override val name: XmlToken.QualifiedName, override val parent: OpenTag?) : Tag() {
override val depth: Int = (parent?.depth ?: 0) + 1
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,145 +21,216 @@ class StringTextStream(private val source: String) {
private var offset = 0

/**
* Checks whether the bounds of the stream would be exceeded by advancing the given number of characters and, if so,
* throws an exception.
* @param length The amount beyond the current position to check.
* @param errMessage A provider of an error message to include in the exception.
* Advance the position by the given [length]. Throws an exception if this would advance beyond the end of the
* stream.
* @param length The length by which to advance the stream position.
*/
private fun checkBounds(length: Int, errMessage: () -> String) {
if (offset + length > end) error(errMessage())
fun advance(length: Int, errCondition: String) {
checkBounds(length, errCondition)
offset += length
}

/**
* Throws a [DeserializationException] with the given message and location string.
* @param msg The error message to include with the exception.
* Advances the position if the given [text] is next in the stream. Otherwise, the offset is not updated.
* @param text The text to look for at the current offset.
* @return True if the given [text] was found and the offset was advanced; otherwise, false.
*/
fun advanceIf(text: String): Boolean {
    // Look for the expected text at the current position before touching the offset.
    val found = source.startsWith(text, offset)
    if (found) {
        // Consume the matched text; on a miss the offset is left exactly where it was.
        offset += text.length
    }
    return found
}

/**
 * Moves the position forward until it rests on a whitespace character (one of ' ', '\r', '\n', '\t'). If no
 * whitespace remains, the position stops once it is no longer below the stream's end.
 */
fun advanceUntilSpace() {
    while (offset < end) {
        val current = source[offset]
        val isSpace = current == ' ' || current == '\r' || current == '\n' || current == '\t'
        if (isSpace) return
        offset++
    }
}

/**
 * Moves the position forward past any run of whitespace characters (' ', '\r', '\n', '\t'), stopping on the first
 * character that is not whitespace or once the position is no longer below the stream's end.
 */
fun advanceWhileSpace() {
    while (offset < end) {
        val current = source[offset]
        val isSpace = current == ' ' || current == '\r' || current == '\n' || current == '\t'
        if (!isSpace) return
        offset++
    }
}

/**
* Checks whether the bounds of the stream would be exceeded by advancing the given number of characters and, if so,
* throws an exception.
* @param length The amount beyond the current position to check.
* @param errCondition The condition to include in an error message if necessary.
*/
@Suppress("NOTHING_TO_INLINE")
internal inline fun error(msg: String): Nothing {
val fullMsg = "$msg\n$locationMultilineString"
throw DeserializationException(fullMsg)
private inline fun checkBounds(length: Int, errCondition: String) {
if (offset + length > end) error("Unexpected end-of-doc while $errCondition")
}

/**
* Gets a multiline string that shows the current offset and a preview of the surrounding characters. For example:
* Throws a [DeserializationException] with the given message and location string. Automatically includes the
* current offset and a preview of the surrounding characters. For example:
* ```
* DeserializationException: Error msg
* At offset 123 (showing range 120-126):
* <b>!</b
* ^
* ```
* @param msg The error message to include with the exception.
*/
val locationMultilineString: String
get() {
val start = max(0, offset - 3)
val end = min(end - 1, offset + 3)
val snippet = source.substring(start..end).replace(nonAscii, "·")
val caretPos = offset - start
val caret = " ".repeat(caretPos) + "^"
return "At offset $offset (showing range $start-$end):\n$snippet\n$caret"
}
@Suppress("NOTHING_TO_INLINE")
internal inline fun error(msg: String): Nothing {
val start = max(0, offset - 3)
val end = min(end - 1, offset + 3)

/**
* Returns the next [length] characters in the stream without advancing the position. The return is truncated if
* [length] would exceed the end of the stream.
* @param length The number of characters (at most) to return.
*/
fun peekAtMost(length: Int): String {
val actualLength = min(length, end - offset)
return sliceByLength(actualLength)
val snippet = source.substring(start, end + 1).replace(nonAscii, "·")

val caretPos = offset - start
val caret = " ".repeat(caretPos) + "^"

val locationMultilineString = "At offset $offset (showing range $start-$end):\n$snippet\n$caret"

val fullMsg = "$msg\n$locationMultilineString"
throw DeserializationException(fullMsg)
}

/**
* Determines if the next several characters in the stream match the given text without advancing the position.
*/
fun peekMatches(text: String): Boolean = peekAtMost(text.length) == text

/**
* Returns the next character in the stream without advancing the position. Throws an exception if the position is
* at the stream's end.
* @param errMessage A provider of an error message to include in the exception.
*/
fun peekOrThrow(errMessage: () -> String): Char {
checkBounds(1, errMessage)
return source[offset]
fun peekMatches(text: String): Boolean {
    // Never request more characters than remain between the current offset and the end of the stream.
    val remaining = end - offset
    val span = text.length.coerceAtMost(remaining)
    val upcoming = sliceByLength(span)
    return upcoming == text
}

/**
* Returns the next [length] characters in the stream without advancing the position. Throws an exception if the end
* of the stream would be exceeded.
* @param length The number of characters to read.
* @param errMessage A provider of an error message to include in the exception.
* Returns the next character in the stream and advances the position. Throws an exception if the end of the stream
* would be exceeded.
* @param errCondition The condition to include in an error message if necessary.
*/
fun peekOrThrow(length: Int, errMessage: () -> String): String {
checkBounds(length, errMessage)
return sliceByLength(length)
fun readOrThrow(errCondition: String): Char {
    // Fails via checkBounds when no character is left to read.
    checkBounds(1, errCondition)

    // Step past the character first, then hand it back (same ordering as a post-increment index).
    val index = offset
    offset = index + 1
    return source[index]
}

/**
* Returns contents of the stream up to and including the given text without advancing the position. Throws an
* exception if the text is not encountered before the end of the stream.
* @param text The text to seek
* @param errMessage A provider of an error message to include in the exception.
* Returns contents of the stream up to and including the given text and advances the position. Throws an exception
* if the text is not encountered before the end of the stream.
* @param text The text to seek.
* @param errCondition The condition to include in an error message if necessary.
* @return The stream contents from the current position up to and including [text].
*/
fun peekThrough(text: String, errMessage: () -> String): String {
fun readThrough(text: String, errCondition: String): String {
val charIndex = source.indexOf(text, startIndex = offset)
if (charIndex < 0) error(errMessage())
return sliceByEnd(charIndex + text.length)
if (charIndex < 0) error("Unexpected end-of-doc while $errCondition")

val endOfResult = charIndex + text.length
val result = sliceByEnd(endOfResult)
offset = endOfResult
return result
}

/**
* Returns zero or more characters from the stream while the given predicate is matched without advancing the
* position.
* @param predicate The evaluation function for each character.
* Returns contents of the stream up to but not including the given text and advances the position. Throws an
* exception if the text is not encountered before the end of the stream.
* @param text The text to seek.
* @param errCondition The condition to include in an error message if necessary.
* @return The stream contents from the current position up to but not including [text].
*/
fun peekWhile(predicate: (Char) -> Boolean): String {
var peekOffset = offset
while (peekOffset < end && predicate(source[peekOffset])) {
peekOffset++
}
return sliceByEnd(peekOffset)
fun readUntil(text: String, errCondition: String): String {
    // Find the delimiter at or after the current position; a negative index means it never appears.
    val delimiterStart = source.indexOf(text, startIndex = offset)
    if (delimiterStart < 0) error("Unexpected end-of-doc while $errCondition")

    // Return everything before the delimiter and leave the position resting on the delimiter itself.
    return sliceByEnd(delimiterStart).also { offset = delimiterStart }
}

/**
* Returns the next [length] characters in the stream and advances the position. Throws an exception if the end of
* the stream would be exceeded.
* @param length The number of characters to read.
* @param errMessage A provider of an error message to include in the exception.
* Returns an XML name from the stream and advances the position. Throws an exception if unable to find a valid XML
* name start character. See https://www.w3.org/TR/xml/#NT-Name for name character rules.
*/
fun readOrThrow(errMessage: () -> String): Char =
peekOrThrow(errMessage).also { offset++ }
fun readWhileXmlName(): String {
val c = source[offset]
if (
!(
'a' <= c && c <= 'z' ||
'A' <= c && c <= 'Z' ||
c == ':' ||
c == '_' ||
'\u00c0' <= c && c <= '\u00d6' ||
'\u00d8' <= c && c <= '\u00f6' ||
'\u00f8' <= c && c <= '\u02ff' ||
'\u0370' <= c && c <= '\u037d' ||
'\u037f' <= c && c <= '\u1fff' ||
'\u200c' <= c && c <= '\u200d' ||
'\u2070' <= c && c <= '\u218f' ||
'\u2c00' <= c && c <= '\u2fef' ||
'\u3001' <= c && c <= '\ud7ff'
)
) {
error("Found '$c' but expected a valid XML start name character")
}

/**
* Returns the next [length] characters in the stream and advances the position. Throws an exception if the end of
* the stream would be exceeded.
* @param length The number of characters to read.
* @param errMessage A provider of an error message to include in the exception.
*/
fun readOrThrow(length: Int, errMessage: () -> String): String =
peekOrThrow(length, errMessage).also { offset += length }
var peekOffset = offset + 1
while (peekOffset < end) {
val ch = source[peekOffset]
if (
Copy link
Contributor

@aajtodd aajtodd Mar 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The multiline if statement is fine. Though it may be cleaner to look for the valid character ranges rather than the invalid ones. I'd also expect that most XML we get is going to end a name by finding the end of the tag > (or a space indicating start of an attribute).

The way this is currently structured you actually end up checking until you hit an invalid character. It may be faster to look for valid characters (prioritizing ascii) first. In other words right now we end up checking every branch to prove that a character isn't an invalid name char.

The opposite may be quicker as we expect in most cases to find valid chars (especially given that the XML names come from smithy shape names which is a restricted character set anyway)

when(val ch = source[peekOffset]) {
    in 'a'..'z',
    in 'A'..'Z',
    ...,
    in  '\u203f'..'\u2040' -> { peekOffset++; continue }
    else -> error(...)
}

You could even prioritize the common branches for how a name will end:

when (val ch = source[peekOffset++]) {
     in 'a'..'z', in 'A'..'Z' -> continue
     ' ', '>' -> break
     // other valid cases
     else -> invalid
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right, optimizing for valid and expected characters first improves the performance. Using when or range checks (e.g., c in 'A'..'Z') actually hurts performance so I'll skip those (although they are more readable).

!(
'a' <= ch && ch <= 'z' ||
'A' <= ch && ch <= 'Z' ||
'0' <= ch && ch <= '9' ||
ch == ':' ||
ch == '-' ||
ch == '.' ||
ch == '_' ||
ch == '\u00b7' ||
'\u00c0' <= ch && ch <= '\u00d6' ||
'\u00d8' <= ch && ch <= '\u00f6' ||
'\u00f8' <= ch && ch <= '\u02ff' ||
'\u0300' <= ch && ch <= '\u036f' ||
'\u0370' <= ch && ch <= '\u037d' ||
'\u037f' <= ch && ch <= '\u1fff' ||
'\u200c' <= ch && ch <= '\u200d' ||
'\u203f' <= ch && ch <= '\u2040' ||
'\u2070' <= ch && ch <= '\u218f' ||
'\u2c00' <= ch && ch <= '\u2fef' ||
'\u3001' <= ch && ch <= '\ud7ff'
)
) {
// Found end of name
break
}

/**
* Returns contents of the stream up to and including the given text and advances the position. Throws an exception
* if the text is not encountered before the end of the stream.
* @param text The text to seek
* @param errMessage A provider of an error message to include in the exception.
* @return The stream contents from the current position up to and including [text].
*/
fun readThrough(text: String, errMessage: () -> String): String =
peekThrough(text, errMessage).also { offset += it.length }
peekOffset++
}
return sliceByEnd(peekOffset).also { offset = peekOffset }
}

/**
* Returns zero or more characters from the stream while the given predicate is matched and advances the position.
* @param predicate The evaluation function for each character.
* Moves the stream position back by [length] characters. Throws an exception if this would exceed the bounds of the
* stream.
* @param length The amount of characters to go back.
* @param errCondition The condition to include in an error message if necessary.
*/
fun readWhile(predicate: (Char) -> Boolean): String =
peekWhile(predicate).also { offset += it.length }
fun rewind(length: Int, errCondition: String) {
    // Moves the stream position back by [length] characters, reporting an error (via `error`) when the move would
    // leave the stream on either side.

    // Upper bound: only reachable when [length] is negative, i.e. the "rewind" actually moves forward.
    checkBounds(-length, errCondition)

    // Lower bound: checkBounds only compares against the end of the stream, so rewinding past the start used to
    // go unnoticed and left a negative offset that failed later with an unrelated index error.
    if (offset - length < 0) {
        error("Cannot rewind $length character(s) past the start of the document while $errCondition")
    }

    offset -= length
}

/**
* Returns a slice of the source up to (but not including) the given end position.
* @param endExclusive The exclusive end position.
*/
private fun sliceByEnd(endExclusive: Int): String = source.substring(offset until endExclusive)
private fun sliceByEnd(endExclusive: Int): String = source.substring(offset, endExclusive)

/**
* Returns a slice of the source that is [length] characters long.
Expand Down
Loading