Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-14901: [Java] ListSubfieldEncoder and StructSubfieldEncoder can decode without DictionaryHashTable #14902

Merged
merged 1 commit into from
Dec 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,10 @@ public ValueVector encode(ValueVector vector) {
}

/**
* Decodes a vector with the built hash table in this encoder.
* Decodes a vector with the dictionary in this encoder.
*
* {@link DictionaryEncoder#decode(ValueVector, Dictionary, BufferAllocator)} should be used instead if only decoding
* is required as it can avoid building the {@link DictionaryHashTable} which only makes sense when encoding.
*/
public ValueVector decode(ValueVector indices) {
return decode(indices, dictionary, allocator);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,11 @@ public ListSubfieldEncoder(Dictionary dictionary, BufferAllocator allocator, Arr
hashTable = new DictionaryHashTable(getDataVector(dictVector), hasher);
}

private FieldVector getDataVector(BaseListVector vector) {
private static FieldVector getDataVector(BaseListVector vector) {
  // The child at position 0 of the list vector is what this encoder treats as its data vector.
  final FieldVector dataVector = vector.getChildrenFromFields().get(0);
  return dataVector;
}

private BaseListVector cloneVector(BaseListVector vector) {
private static BaseListVector cloneVector(BaseListVector vector, BufferAllocator allocator) {

final FieldType fieldType = vector.getField().getFieldType();
BaseListVector cloned = (BaseListVector) fieldType.createNewSingleVector(vector.getField().getName(),
Expand All @@ -84,7 +84,7 @@ public BaseListVector encodeListSubField(BaseListVector vector) {
Field valueField = new Field(vector.getField().getName(), indexFieldType, null);

// clone list vector and initialize data vector
BaseListVector encoded = cloneVector(vector);
BaseListVector encoded = cloneVector(vector, allocator);
encoded.initializeChildrenFromFields(Collections.singletonList(valueField));
BaseIntVector indices = (BaseIntVector) getDataVector(encoded);

Expand All @@ -103,17 +103,35 @@ public BaseListVector encodeListSubField(BaseListVector vector) {

/**
 * Restores the original values of a list vector whose data vector was dictionary encoded, using the
 * dictionary and allocator this encoder was set up with.
 *
 * <p>If only decoding is required, prefer the static
 * {@link ListSubfieldEncoder#decodeListSubField(BaseListVector, Dictionary, BufferAllocator)}: it avoids
 * building the {@link DictionaryHashTable}, which only makes sense when encoding.
 *
 * @param vector dictionary encoded vector, its data vector must be int type
 * @return vector with values restored from dictionary
 */
public BaseListVector decodeListSubField(BaseListVector vector) {
  // Pure delegation: the static overload does the work with this encoder's dictionary and allocator.
  final BaseListVector decoded = decodeListSubField(vector, dictionary, allocator);
  return decoded;
}

/**
* Decodes a dictionary subfields encoded vector using the provided dictionary.
*
* @param vector dictionary encoded vector, its data vector must be int type
* @param dictionary dictionary used to decode the values
* @param allocator allocator the decoded values use
* @return vector with values restored from dictionary
*/
public static BaseListVector decodeListSubField(BaseListVector vector,
Dictionary dictionary,
BufferAllocator allocator) {
int valueCount = vector.getValueCount();
BaseListVector dictionaryVector = (BaseListVector) dictionary.getVector();
int dictionaryValueCount = getDataVector(dictionaryVector).getValueCount();

// clone list vector and initialize data vector
BaseListVector decoded = cloneVector(vector);
BaseListVector decoded = cloneVector(vector, allocator);
Field dataVectorField = getDataVector(dictionaryVector).getField();
decoded.initializeChildrenFromFields(Collections.singletonList(dataVectorField));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,11 @@ public StructSubfieldEncoder(
dictionaryIdToHashTable.put(id, new DictionaryHashTable(provider.lookup(id).getVector(), hasher)));
}

private FieldVector getChildVector(StructVector vector, int index) {
private static FieldVector getChildVector(StructVector vector, int index) {
  // Children are addressed by their position in the struct's child list.
  final FieldVector child = vector.getChildrenFromFields().get(index);
  return child;
}

private StructVector cloneVector(StructVector vector) {
private static StructVector cloneVector(StructVector vector, BufferAllocator allocator) {

final FieldType fieldType = vector.getField().getFieldType();
StructVector cloned = (StructVector) fieldType.createNewSingleVector(
Expand Down Expand Up @@ -117,7 +117,7 @@ public StructVector encode(StructVector vector, Map<Integer, Long> columnToDicti
}

// clone list vector and initialize data vector
StructVector encoded = cloneVector(vector);
StructVector encoded = cloneVector(vector, allocator);
encoded.initializeChildrenFromFields(childrenFields);
encoded.setValueCount(valueCount);

Expand All @@ -139,20 +139,38 @@ public StructVector encode(StructVector vector, Map<Integer, Long> columnToDicti

/**
 * Restores the original values of a struct vector whose child vectors were dictionary encoded, using the
 * dictionary provider and allocator this encoder was set up with.
 *
 * <p>If only decoding is required, prefer the static
 * {@link StructSubfieldEncoder#decode(StructVector, DictionaryProvider.MapDictionaryProvider, BufferAllocator)}:
 * it avoids building the {@link DictionaryHashTable}, which only makes sense when encoding.
 *
 * @param vector dictionary encoded vector, its child vector must be int type
 * @return vector with values restored from dictionary
 */
public StructVector decode(StructVector vector) {
  // Pure delegation: the static overload does the work with this encoder's provider and allocator.
  final StructVector decoded = decode(vector, provider, allocator);
  return decoded;
}

/**
* Decodes a dictionary subfields encoded vector using the provided dictionary.
*
* @param vector dictionary encoded vector, its child vector must be int type
* @param provider dictionary provider used to decode the values
* @param allocator allocator the decoded values use
* @return vector with values restored from dictionary
*/
public static StructVector decode(StructVector vector,
DictionaryProvider.MapDictionaryProvider provider,
BufferAllocator allocator) {
final int valueCount = vector.getValueCount();
final int childCount = vector.getChildrenFromFields().size();

// clone list vector and initialize child vectors
StructVector decoded = cloneVector(vector);
StructVector decoded = cloneVector(vector, allocator);
List<Field> childFields = new ArrayList<>();
for (int i = 0; i < childCount; i++) {
FieldVector childVector = getChildVector(vector, i);
Dictionary dictionary = getChildVectorDictionary(childVector);
Dictionary dictionary = getChildVectorDictionary(childVector, provider);
// childVector is not encoded.
if (dictionary == null) {
childFields.add(childVector.getField());
Expand All @@ -167,7 +185,7 @@ public StructVector decode(StructVector vector) {
// get child vector
FieldVector childVector = getChildVector(vector, index);
FieldVector decodedChildVector = getChildVector(decoded, index);
Dictionary dictionary = getChildVectorDictionary(childVector);
Dictionary dictionary = getChildVectorDictionary(childVector, provider);
if (dictionary == null) {
childVector.makeTransferPair(decodedChildVector).splitAndTransfer(0, valueCount);
} else {
Expand All @@ -184,7 +202,8 @@ public StructVector decode(StructVector vector) {
/**
* Get the child vector dictionary, return null if not dictionary encoded.
*/
private Dictionary getChildVectorDictionary(FieldVector childVector) {
private static Dictionary getChildVectorDictionary(FieldVector childVector,
DictionaryProvider.MapDictionaryProvider provider) {
DictionaryEncoding dictionaryEncoding = childVector.getField().getDictionary();
if (dictionaryEncoding != null) {
Dictionary dictionary = provider.lookup(dictionaryEncoding.getId());
Expand Down