From 67263a370d5489d9a9040e26503065db4ebd9672 Mon Sep 17 00:00:00 2001 From: belugabehr <12578579+belugabehr@users.noreply.github.com> Date: Mon, 7 Oct 2024 07:16:09 -0400 Subject: [PATCH] AVRO-4065: Do Not Copy Array Contents when Expanding UTF-8 Arrays (#3181) --- .../main/java/org/apache/avro/util/Utf8.java | 46 +++++++++---------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java b/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java index ae4df8e5c42..22c21c76be5 100644 --- a/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java +++ b/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java @@ -69,46 +69,44 @@ public Utf8(byte[] bytes) { } /** - * Return UTF-8 encoded bytes. Only valid through {@link #getByteLength()}. + * Return UTF-8 encoded bytes. Only valid through {@link #getByteLength()} + * assuming the bytes have been fully copied into the underlying buffer from the + * source. + * + * @see #setByteLength(int) + * @return a reference to the underlying byte array */ public byte[] getBytes() { return bytes; } - /** - * Return length in bytes. - * - * @deprecated call {@link #getByteLength()} instead. - */ - @Deprecated - public int getLength() { - return length; - } - /** Return length in bytes. */ public int getByteLength() { return length; } /** - * Set length in bytes. Should called whenever byte content changes, even if the - * length does not change, as this also clears the cached String. + * Set length in bytes. When calling this method, even if the new length is the + * same as the current length, the cached contents of this Utf8 object will be + * wiped out. After calling this method, no assumptions should be made about the + * internal state (e.g., contents, hashcode, equality, etc.) of this Utf8 String + * other than the internal buffer being large enough to accommodate a String of + * the new length. This should be called immediately before reading a String + * from the underlying data source. * - * @deprecated call {@link #setByteLength(int)} instead. - */ - @Deprecated - public Utf8 setLength(int newLength) { - return setByteLength(newLength); - } - - /** - * Set length in bytes. Should called whenever byte content changes, even if the - * length does not change, as this also clears the cached String. + * @param newLength the new length of the underlying buffer + * @return a reference to this object. + * @see org.apache.avro.io.BinaryDecoder#readString(Utf8) */ public Utf8 setByteLength(int newLength) { SystemLimitException.checkMaxStringLength(newLength); + + // Note that if the buffer size increases, the internal buffer is zero-ed out. + // If the buffer is large enough, just the length pointer moves and the old + // contents remain. For consistency's sake, we could zero-out the buffer in + // both cases, but would be a perf hit. if (this.bytes.length < newLength) { - this.bytes = Arrays.copyOf(this.bytes, newLength); + this.bytes = new byte[newLength]; } this.length = newLength; this.string = null;