Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: initial KMP XML reader implementation #601

Merged
merged 2 commits into from
Mar 14, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@

package aws.smithy.kotlin.runtime.serde.xml

import aws.smithy.kotlin.runtime.serde.xml.tokenization.StringTextStream
import aws.smithy.kotlin.runtime.serde.xml.tokenization.XmlLexer
import aws.smithy.kotlin.runtime.serde.xml.tokenization.XmlScanner

/**
* Provides stream-style access to an XML payload. This abstraction
* supports the ability to look ahead an arbitrary number of elements. It can also
Expand All @@ -22,7 +26,7 @@ interface XmlStreamReader {
/**
* The subtree's minimum depth is the same as the current node depth + 1.
*/
CHILD
CHILD,
}
/**
* Return the last token that was consumed by the reader.
Expand All @@ -39,7 +43,7 @@ interface XmlStreamReader {
/**
* Return the next actionable token or null if stream is exhausted.
*
* @throws XmlGenerationException upon any error.
* @throws [aws.smithy.kotlin.runtime.serde.DeserializationException] upon any error.
*/
fun nextToken(): XmlToken?

Expand All @@ -63,17 +67,23 @@ interface XmlStreamReader {
*/
inline fun <reified T : XmlToken> XmlStreamReader.seek(selectionPredicate: (T) -> Boolean = { true }): T? {
    // Begin with the token the reader consumed most recently, so a match that is already current is returned
    // without advancing the reader.
    var token: XmlToken? = lastToken

    // `do`/`while` guarantees at least one pass: if the starting token is null (presumably because nothing has
    // been read yet), the first iteration still pulls a token via `nextToken()` rather than giving up immediately.
    do {
        val foundMatch = if (token is T) selectionPredicate.invoke(token) else false
        if (!foundMatch) token = nextToken()
    } while (token != null && !foundMatch)

    // Either the matching token (an instance of T) or null once `nextToken()` reports exhaustion.
    return token as T?
}

/**
 * Creates an [XmlStreamReader] instance that reads the given XML [payload].
 *
 * The payload is decoded to a string with [ByteArray.decodeToString] (UTF-8), wrapped in a [StringTextStream],
 * scanned by an [XmlScanner], and exposed through an [XmlLexer].
 *
 * @param payload The raw bytes of the XML document.
 * @return A reader over the decoded document.
 */
fun xmlStreamReader(payload: ByteArray): XmlStreamReader =
    XmlLexer(XmlScanner(StringTextStream(payload.decodeToString())))

// TODO remove me!
// Platform-provided (`expect`) factory with the same signature as `xmlStreamReader`.
// NOTE(review): presumably retained only until the common implementation fully replaces it — confirm before removal.
expect fun xmlPull(payload: ByteArray): XmlStreamReader
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
package aws.smithy.kotlin.runtime.serde.xml

/**
* Raw tokens produced when reading a XML document as a stream
* Raw tokens produced when reading an XML document as a stream
*/
sealed class XmlToken {

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.
*/
package aws.smithy.kotlin.runtime.serde.xml.tokenization

import aws.smithy.kotlin.runtime.serde.xml.XmlToken

/**
 * Represents one internal state of an [XmlScanner] while it works through a document.
 *
 * NOTE(review): a reviewer asked whether this type should be `internal`; visibility is left unchanged here.
 */
sealed class ScannerState {
    /**
     * The node depth at which the scanner is parsing tokens. As with the depth concept in [XmlToken], the value is
     * 1 at the root element and 0 outside of it.
     */
    abstract val depth: Int

    /**
     * The starting state at the beginning of a document, before any tags, DTD, or prolog have been read.
     */
    object Initial : ScannerState() {
        override val depth: Int = 0
    }

    /**
     * The scanner expects the root tag to come next.
     */
    object BeforeRootTag : ScannerState() {
        override val depth: Int = 0
    }

    /**
     * The family of states in which the scanner is positioned inside a tag.
     */
    sealed class Tag : ScannerState() {
        /** The qualified name of the tag the scanner is inside. */
        abstract val name: XmlToken.QualifiedName

        /** The enclosing open tag, or `null` when there is none. */
        abstract val parent: OpenTag?

        // One more than the parent's depth (1 when there is no parent). Computed on first access via `lazy`;
        // `parent` is supplied by subclass constructors, which run after this base class is initialized.
        override val depth: Int by lazy { 1 + (parent?.depth ?: 0) }

        /**
         * The scanner is inside a tag. The next close tag should match the name of this tag.
         *
         * @property seenChildren NOTE(review): presumably records whether child content has already been
         * encountered inside this tag — confirm against [XmlScanner].
         */
        data class OpenTag(
            override val name: XmlToken.QualifiedName,
            override val parent: OpenTag?,
            val seenChildren: Boolean,
        ) : Tag()

        /**
         * The scanner has read a self-closing tag (e.g., '<foo />') but has so far handed only the
         * [XmlToken.BeginElement] token to the caller. The following [XmlScanner.parseNext] call will return an
         * [XmlToken.EndElement] without reading anything further from the source.
         */
        data class EmptyTag(
            override val name: XmlToken.QualifiedName,
            override val parent: OpenTag?,
        ) : Tag()
    }

    /**
     * The end of the document has been reached; no more data is available.
     */
    object EndOfDocument : ScannerState() {
        override val depth: Int = 0
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
/*
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.
*/
package aws.smithy.kotlin.runtime.serde.xml.tokenization

import aws.smithy.kotlin.runtime.serde.DeserializationException
import kotlin.math.max
import kotlin.math.min

// Matches any single character outside the printable ASCII range (space through tilde).
private val nonAscii = Regex("""[^\x20-\x7E]""")

/**
 * A stream of text characters that can be processed sequentially. This stream maintains a current position (i.e.,
 * offset in the string) from which all reading operations begin. The stream is advanced by `read` operations. The
 * stream is **not** advanced by `peek` operations.
 * @param source The source text for this stream.
 */
class StringTextStream(private val source: String) {
    private val end = source.length
    private var offset = 0

    /**
     * Checks whether the bounds of the stream would be exceeded by advancing the given number of characters and, if so,
     * throws an exception.
     * @param length The amount beyond the current position to check.
     * @param errMessage A provider of an error message to include in the exception.
     */
    private fun checkBounds(length: Int, errMessage: () -> String) {
        // Compare against the remaining character count instead of computing `offset + length`: that sum can
        // overflow `Int` for very large requests and would then wrongly pass the check.
        if (length > end - offset) error(errMessage())
    }

    /**
     * Throws a [DeserializationException] with the given message and location string.
     * @param msg The error message to include with the exception.
     */
    @Suppress("NOTHING_TO_INLINE")
    internal inline fun error(msg: String): Nothing {
        val fullMsg = "$msg\n$locationMultilineString"
        throw DeserializationException(fullMsg)
    }

    /**
     * Gets a multiline string that shows the current offset and a preview of the surrounding characters (up to three
     * on either side). Characters outside the printable ASCII range are shown as `·`. For example:
     * ```
     * At offset 123 (showing range 120-126):
     * <b>!</b
     *    ^
     * ```
     */
    val locationMultilineString: String
        get() {
            val start = max(0, offset - 3)
            // Named `last` (inclusive) so it does not shadow the class-level `end` property.
            val last = min(end - 1, offset + 3)
            val snippet = source.substring(start..last).replace(nonAscii, "·")
            val caret = " ".repeat(offset - start) + "^"
            return "At offset $offset (showing range $start-$last):\n$snippet\n$caret"
        }

    /**
     * Returns the next [length] characters in the stream without advancing the position. The return is truncated if
     * [length] would exceed the end of the stream.
     * @param length The number of characters (at most) to return.
     */
    fun peekAtMost(length: Int): String {
        val actualLength = min(length, end - offset)
        return sliceByLength(actualLength)
    }

    /**
     * Determines if the next several characters in the stream match the given text without advancing the position.
     * @param text The text to compare against the upcoming characters.
     * @return `true` if the stream content at the current position starts with [text].
     */
    fun peekMatches(text: String): Boolean =
        // Compares in place; same result as slicing `text.length` characters and comparing, minus the allocation.
        source.startsWith(text, startIndex = offset)

    /**
     * Returns the next character in the stream without advancing the position. Throws an exception if the position is
     * at the stream's end.
     * @param errMessage A provider of an error message to include in the exception.
     */
    fun peekOrThrow(errMessage: () -> String): Char {
        checkBounds(1, errMessage)
        return source[offset]
    }

    /**
     * Returns the next [length] characters in the stream without advancing the position. Throws an exception if the end
     * of the stream would be exceeded.
     * @param length The number of characters to read.
     * @param errMessage A provider of an error message to include in the exception.
     */
    fun peekOrThrow(length: Int, errMessage: () -> String): String {
        checkBounds(length, errMessage)
        return sliceByLength(length)
    }

    /**
     * Returns contents of the stream up to and including the given text without advancing the position. Throws an
     * exception if the text is not encountered before the end of the stream.
     * @param text The text to seek
     * @param errMessage A provider of an error message to include in the exception.
     * @return The stream contents from the current position up to and including [text].
     */
    fun peekThrough(text: String, errMessage: () -> String): String {
        val charIndex = source.indexOf(text, startIndex = offset)
        if (charIndex < 0) error(errMessage())
        return sliceByEnd(charIndex + text.length)
    }

    /**
     * Returns zero or more characters from the stream while the given predicate is matched without advancing the
     * position.
     * @param predicate The evaluation function for each character.
     */
    fun peekWhile(predicate: (Char) -> Boolean): String {
        var peekOffset = offset
        while (peekOffset < end && predicate(source[peekOffset])) {
            peekOffset++
        }
        return sliceByEnd(peekOffset)
    }

    /**
     * Returns the next character in the stream and advances the position by one. Throws an exception if the position
     * is at the stream's end.
     * @param errMessage A provider of an error message to include in the exception.
     */
    // NOTE(review): a reviewer suggested inlining this per-character path to avoid a function call per character,
    // pending benchmark results.
    fun readOrThrow(errMessage: () -> String): Char =
        peekOrThrow(errMessage).also { offset++ }

    /**
     * Returns the next [length] characters in the stream and advances the position. Throws an exception if the end of
     * the stream would be exceeded.
     * @param length The number of characters to read.
     * @param errMessage A provider of an error message to include in the exception.
     */
    fun readOrThrow(length: Int, errMessage: () -> String): String =
        peekOrThrow(length, errMessage).also { offset += length }

    /**
     * Returns contents of the stream up to and including the given text and advances the position. Throws an exception
     * if the text is not encountered before the end of the stream.
     * @param text The text to seek
     * @param errMessage A provider of an error message to include in the exception.
     * @return The stream contents from the current position up to and including [text].
     */
    fun readThrough(text: String, errMessage: () -> String): String =
        peekThrough(text, errMessage).also { offset += it.length }

    /**
     * Returns zero or more characters from the stream while the given predicate is matched and advances the position.
     * @param predicate The evaluation function for each character.
     */
    fun readWhile(predicate: (Char) -> Boolean): String =
        peekWhile(predicate).also { offset += it.length }

    /**
     * Returns a slice of the source from the current position up to (but not including) the given end position.
     * @param endExclusive The exclusive end position.
     */
    private fun sliceByEnd(endExclusive: Int): String = source.substring(offset until endExclusive)

    /**
     * Returns a slice of the source, starting at the current position, that is [length] characters long.
     * @param length The number of characters to return.
     */
    private fun sliceByLength(length: Int): String = sliceByEnd(offset + length)
}
Loading