Skip to content

Commit

Permalink
Fix IntegerOverflow exception in postings encoding as group-varint (#…
Browse files Browse the repository at this point in the history
…13376)

The exception happen because the tail postings list block, which encoding with GroupVInt, had a docID delta that was >= 1<<30, when the postings are also storing freqs.
  • Loading branch information
easyice committed May 17, 2024
1 parent f12e489 commit 9dd068b
Show file tree
Hide file tree
Showing 9 changed files with 218 additions and 75 deletions.
2 changes: 1 addition & 1 deletion lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ http://s.apache.org/luceneversions

Bug Fixes
---------------------
(No changes)
* GITHUB#13376: Fix integer overflow exception in postings encoding as group-varint. (Zhang Chao, Guo Feng)

======================== Lucene 9.10.0 =======================

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,31 +140,6 @@ public void init(IndexInput termsIn, SegmentReadState state) throws IOException
}
}

/** Read values that have been written using variable-length encoding instead of bit-packing. */
static void readVIntBlock(
IndexInput docIn,
long[] docBuffer,
long[] freqBuffer,
int num,
boolean indexHasFreq,
boolean decodeFreq)
throws IOException {
docIn.readGroupVInts(docBuffer, num);
if (indexHasFreq && decodeFreq) {
for (int i = 0; i < num; ++i) {
freqBuffer[i] = docBuffer[i] & 0x01;
docBuffer[i] >>= 1;
if (freqBuffer[i] == 0) {
freqBuffer[i] = docIn.readVInt();
}
}
} else if (indexHasFreq) {
for (int i = 0; i < num; ++i) {
docBuffer[i] >>= 1;
}
}
}

static void prefixSum(long[] buffer, int count, long base) {
buffer[0] += base;
for (int i = 1; i < count; ++i) {
Expand Down Expand Up @@ -475,7 +450,7 @@ private void refillDocs() throws IOException {
blockUpto++;
} else {
// Read vInts:
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, needsFreq);
PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, needsFreq);
prefixSum(docBuffer, left, accum);
docBuffer[left] = NO_MORE_DOCS;
blockUpto += left;
Expand Down Expand Up @@ -768,7 +743,7 @@ private void refillDocs() throws IOException {
docBuffer[1] = NO_MORE_DOCS;
blockUpto++;
} else {
readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true);
PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true);
prefixSum(docBuffer, left, accum);
docBuffer[left] = NO_MORE_DOCS;
blockUpto += left;
Expand Down Expand Up @@ -1155,7 +1130,7 @@ private void refillDocs() throws IOException {
}
blockUpto += BLOCK_SIZE;
} else {
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreqs, true);
PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreqs, true);
prefixSum(docBuffer, left, accum);
docBuffer[left] = NO_MORE_DOCS;
blockUpto += left;
Expand Down Expand Up @@ -1363,7 +1338,7 @@ private void refillDocs() throws IOException {
forDeltaUtil.decodeAndPrefixSum(docIn, accum, docBuffer);
pforUtil.decode(docIn, freqBuffer);
} else {
readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true);
PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, true, true);
prefixSum(docBuffer, left, accum);
docBuffer[left] = NO_MORE_DOCS;
}
Expand Down Expand Up @@ -1753,7 +1728,7 @@ private void refillDocs() throws IOException {
false; // freq block will be loaded lazily when necessary, we don't load it here
}
} else {
readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true);
PostingsUtil.readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq, true);
prefixSum(docBuffer, left, accum);
docBuffer[left] = NO_MORE_DOCS;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -371,20 +371,7 @@ public void finishTerm(BlockTermState _state) throws IOException {
} else {
singletonDocID = -1;
// Group vInt encode the remaining doc deltas and freqs:
if (writeFreqs) {
for (int i = 0; i < docBufferUpto; i++) {
docDeltaBuffer[i] = (docDeltaBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0);
}
}
docOut.writeGroupVInts(docDeltaBuffer, docBufferUpto);
if (writeFreqs) {
for (int i = 0; i < docBufferUpto; i++) {
final int freq = (int) freqBuffer[i];
if (freq != 1) {
docOut.writeVInt(freq);
}
}
}
PostingsUtil.writeVIntBlock(docOut, docDeltaBuffer, freqBuffer, docBufferUpto, writeFreqs);
}

final long lastPosBlockOffset;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;

import java.io.IOException;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;

/** Utility class to encode/decode postings block. */
final class PostingsUtil {

/**
* Read values that have been written using variable-length encoding and group-varint encoding
* instead of bit-packing.
*/
static void readVIntBlock(
IndexInput docIn,
long[] docBuffer,
long[] freqBuffer,
int num,
boolean indexHasFreq,
boolean decodeFreq)
throws IOException {
docIn.readGroupVInts(docBuffer, num);
if (indexHasFreq && decodeFreq) {
for (int i = 0; i < num; ++i) {
freqBuffer[i] = docBuffer[i] & 0x01;
docBuffer[i] >>= 1;
if (freqBuffer[i] == 0) {
freqBuffer[i] = docIn.readVInt();
}
}
} else if (indexHasFreq) {
for (int i = 0; i < num; ++i) {
docBuffer[i] >>= 1;
}
}
}

/** Write freq buffer with variable-length encoding and doc buffer with group-varint encoding. */
static void writeVIntBlock(
IndexOutput docOut, long[] docBuffer, long[] freqBuffer, int num, boolean writeFreqs)
throws IOException {
if (writeFreqs) {
for (int i = 0; i < num; i++) {
docBuffer[i] = (docBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0);
}
}
docOut.writeGroupVInts(docBuffer, num);
if (writeFreqs) {
for (int i = 0; i < num; i++) {
final int freq = (int) freqBuffer[i];
if (freq != 1) {
docOut.writeVInt(freq);
}
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ public final void readGroupVInts(long[] dst, int limit) throws IOException {
readGroupVInt(dst, i);
}
for (; i < limit; ++i) {
dst[i] = readVInt();
dst[i] = readVInt() & 0xFFFFFFFFL;
}
}

Expand Down
33 changes: 5 additions & 28 deletions lucene/core/src/java/org/apache/lucene/store/DataOutput.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import java.util.Set;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.GroupVIntUtil;

/**
* Abstract base class for performing write operations of Lucene's low-level data types.
Expand All @@ -30,7 +30,7 @@
* internal state like file position).
*/
public abstract class DataOutput {
private final BytesRefBuilder groupVIntBytes = new BytesRefBuilder();
private byte[] groupVIntBytes;

/**
* Writes a single byte.
Expand Down Expand Up @@ -335,32 +335,9 @@ public void writeSetOfStrings(Set<String> set) throws IOException {
* @lucene.experimental
*/
public void writeGroupVInts(long[] values, int limit) throws IOException {
int off = 0;

// encode each group
while ((limit - off) >= 4) {
byte flag = 0;
groupVIntBytes.setLength(1);
flag |= (encodeGroupValue(Math.toIntExact(values[off++])) - 1) << 6;
flag |= (encodeGroupValue(Math.toIntExact(values[off++])) - 1) << 4;
flag |= (encodeGroupValue(Math.toIntExact(values[off++])) - 1) << 2;
flag |= (encodeGroupValue(Math.toIntExact(values[off++])) - 1);
groupVIntBytes.setByteAt(0, flag);
writeBytes(groupVIntBytes.bytes(), groupVIntBytes.length());
}

// tail vints
for (; off < limit; off++) {
writeVInt(Math.toIntExact(values[off]));
if (groupVIntBytes == null) {
groupVIntBytes = new byte[GroupVIntUtil.MAX_LENGTH_PER_GROUP];
}
}

private int encodeGroupValue(int v) {
int lastOff = groupVIntBytes.length();
do {
groupVIntBytes.append((byte) (v & 0xFF));
v >>>= 8;
} while (v != 0);
return groupVIntBytes.length() - lastOff;
GroupVIntUtil.writeGroupVInts(this, groupVIntBytes, values, limit);
}
}
52 changes: 51 additions & 1 deletion lucene/core/src/java/org/apache/lucene/util/GroupVIntUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;

/**
* This class contains utility methods and constants for group varint
Expand All @@ -27,7 +28,9 @@
public final class GroupVIntUtil {
// the maximum length of a single group-varint is 4 integers + 1 byte flag.
public static final int MAX_LENGTH_PER_GROUP = 17;
private static final int[] MASKS = new int[] {0xFF, 0xFFFF, 0xFFFFFF, 0xFFFFFFFF};

// we use long array instead of int array to make negative integer to be read as positive long.
private static final long[] MASKS = new long[] {0xFFL, 0xFFFFL, 0xFFFFFFL, 0xFFFFFFFFL};

/**
* Default implementation of read single group, for optimal performance, you should use {@link
Expand Down Expand Up @@ -111,4 +114,51 @@ public static int readGroupVInt(
pos += 1 + n4Minus1;
return (int) (pos - posStart);
}

private static int numBytes(int v) {
// | 1 to return 1 when v = 0
return Integer.BYTES - (Integer.numberOfLeadingZeros(v | 1) >> 3);
}

private static int toInt(long value) {
if ((Long.compareUnsigned(value, 0xFFFFFFFFL) > 0)) {
throw new ArithmeticException("integer overflow");
}
return (int) value;
}

/**
* The implementation for group-varint encoding, It uses a maximum of {@link
* #MAX_LENGTH_PER_GROUP} bytes scratch buffer.
*/
public static void writeGroupVInts(DataOutput out, byte[] scratch, long[] values, int limit)
throws IOException {
int readPos = 0;

// encode each group
while ((limit - readPos) >= 4) {
int writePos = 0;
final int n1Minus1 = numBytes(toInt(values[readPos])) - 1;
final int n2Minus1 = numBytes(toInt(values[readPos + 1])) - 1;
final int n3Minus1 = numBytes(toInt(values[readPos + 2])) - 1;
final int n4Minus1 = numBytes(toInt(values[readPos + 3])) - 1;
int flag = (n1Minus1 << 6) | (n2Minus1 << 4) | (n3Minus1 << 2) | (n4Minus1);
scratch[writePos++] = (byte) flag;
BitUtil.VH_LE_INT.set(scratch, writePos, (int) (values[readPos++]));
writePos += n1Minus1 + 1;
BitUtil.VH_LE_INT.set(scratch, writePos, (int) (values[readPos++]));
writePos += n2Minus1 + 1;
BitUtil.VH_LE_INT.set(scratch, writePos, (int) (values[readPos++]));
writePos += n3Minus1 + 1;
BitUtil.VH_LE_INT.set(scratch, writePos, (int) (values[readPos++]));
writePos += n4Minus1 + 1;

out.writeBytes(scratch, writePos);
}

// tail vints
for (; readPos < limit; readPos++) {
out.writeVInt(toInt(values[readPos]));
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene99;

import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;

public class TestPostingsUtil extends LuceneTestCase {

// checks for bug described in https://github.com/apache/lucene/issues/13373
public void testIntegerOverflow() throws IOException {
final int size = TestUtil.nextInt(random(), 1, ForUtil.BLOCK_SIZE);
final long[] docDeltaBuffer = new long[size];
final long[] freqBuffer = new long[size];

final int delta = 1 << 30;
docDeltaBuffer[0] = delta;
try (Directory dir = newDirectory()) {
try (IndexOutput out = dir.createOutput("test", IOContext.DEFAULT)) {
// In old implementation, this would cause integer overflow exception.
PostingsUtil.writeVIntBlock(out, docDeltaBuffer, freqBuffer, size, true);
}
long[] restoredDocs = new long[size];
long[] restoredFreqs = new long[size];
try (IndexInput in = dir.openInput("test", IOContext.DEFAULT)) {
PostingsUtil.readVIntBlock(in, restoredDocs, restoredFreqs, size, true, true);
}
assertEquals(delta, restoredDocs[0]);
}
}
}
Loading

0 comments on commit 9dd068b

Please sign in to comment.