Skip to content

Commit

Permalink
GH-14901: [Java] ListSubfieldEncoder and StructSubfieldEncoder can de…
Browse files Browse the repository at this point in the history
…code without DictionaryHashTable (#14902)

* Closes: #14901

Authored-by: 郭峰 <[email protected]>
Signed-off-by: David Li <[email protected]>
  • Loading branch information
gf2121 authored Dec 10, 2022
1 parent a7502c9 commit 7e09f38
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,10 @@ public ValueVector encode(ValueVector vector) {
}

/**
* Decodes a vector with the built hash table in this encoder.
* Decodes a vector with the dictionary in this encoder.
*
* {@link DictionaryEncoder#decode(ValueVector, Dictionary, BufferAllocator)} should be used instead if only decoding
* is required as it can avoid building the {@link DictionaryHashTable} which only makes sense when encoding.
*/
public ValueVector decode(ValueVector indices) {
return decode(indices, dictionary, allocator);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,11 @@ public ListSubfieldEncoder(Dictionary dictionary, BufferAllocator allocator, Arr
hashTable = new DictionaryHashTable(getDataVector(dictVector), hasher);
}

private FieldVector getDataVector(BaseListVector vector) {
private static FieldVector getDataVector(BaseListVector vector) {
return vector.getChildrenFromFields().get(0);
}

private BaseListVector cloneVector(BaseListVector vector) {
private static BaseListVector cloneVector(BaseListVector vector, BufferAllocator allocator) {

final FieldType fieldType = vector.getField().getFieldType();
BaseListVector cloned = (BaseListVector) fieldType.createNewSingleVector(vector.getField().getName(),
Expand All @@ -84,7 +84,7 @@ public BaseListVector encodeListSubField(BaseListVector vector) {
Field valueField = new Field(vector.getField().getName(), indexFieldType, null);

// clone list vector and initialize data vector
BaseListVector encoded = cloneVector(vector);
BaseListVector encoded = cloneVector(vector, allocator);
encoded.initializeChildrenFromFields(Collections.singletonList(valueField));
BaseIntVector indices = (BaseIntVector) getDataVector(encoded);

Expand All @@ -103,17 +103,35 @@ public BaseListVector encodeListSubField(BaseListVector vector) {

/**
* Decodes a dictionary-subfield-encoded vector using the dictionary held by this encoder.
*
* {@link ListSubfieldEncoder#decodeListSubField(BaseListVector, Dictionary, BufferAllocator)} should be used instead
* if only decoding is required as it can avoid building the {@link DictionaryHashTable} which only makes sense when
* encoding.
*
* @param vector dictionary encoded vector, its data vector must be int type
* @return vector with values restored from dictionary
*/
public BaseListVector decodeListSubField(BaseListVector vector) {
return decodeListSubField(vector, dictionary, allocator);
}

/**
* Decodes a dictionary subfields encoded vector using the provided dictionary.
*
* @param vector dictionary encoded vector, its data vector must be int type
* @param dictionary dictionary used to decode the values
* @param allocator allocator used to create the decoded vector
* @return vector with values restored from dictionary
*/
public static BaseListVector decodeListSubField(BaseListVector vector,
Dictionary dictionary,
BufferAllocator allocator) {
int valueCount = vector.getValueCount();
BaseListVector dictionaryVector = (BaseListVector) dictionary.getVector();
int dictionaryValueCount = getDataVector(dictionaryVector).getValueCount();

// clone list vector and initialize data vector
BaseListVector decoded = cloneVector(vector);
BaseListVector decoded = cloneVector(vector, allocator);
Field dataVectorField = getDataVector(dictionaryVector).getField();
decoded.initializeChildrenFromFields(Collections.singletonList(dataVectorField));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,11 @@ public StructSubfieldEncoder(
dictionaryIdToHashTable.put(id, new DictionaryHashTable(provider.lookup(id).getVector(), hasher)));
}

private FieldVector getChildVector(StructVector vector, int index) {
private static FieldVector getChildVector(StructVector vector, int index) {
return vector.getChildrenFromFields().get(index);
}

private StructVector cloneVector(StructVector vector) {
private static StructVector cloneVector(StructVector vector, BufferAllocator allocator) {

final FieldType fieldType = vector.getField().getFieldType();
StructVector cloned = (StructVector) fieldType.createNewSingleVector(
Expand Down Expand Up @@ -117,7 +117,7 @@ public StructVector encode(StructVector vector, Map<Integer, Long> columnToDicti
}

// clone struct vector and initialize child vectors
StructVector encoded = cloneVector(vector);
StructVector encoded = cloneVector(vector, allocator);
encoded.initializeChildrenFromFields(childrenFields);
encoded.setValueCount(valueCount);

Expand All @@ -139,20 +139,38 @@ public StructVector encode(StructVector vector, Map<Integer, Long> columnToDicti

/**
* Decodes a dictionary-subfield-encoded vector using the dictionaries from this encoder's provider.
*
* {@link StructSubfieldEncoder#decode(StructVector, DictionaryProvider.MapDictionaryProvider, BufferAllocator)}
* should be used instead if only decoding is required as it can avoid building the {@link DictionaryHashTable}
* which only makes sense when encoding.
*
* @param vector dictionary encoded vector, its child vector must be int type
* @return vector with values restored from dictionary
*/
public StructVector decode(StructVector vector) {
return decode(vector, provider, allocator);
}

/**
* Decodes a dictionary-subfield-encoded vector using dictionaries looked up from the given provider.
*
* @param vector dictionary encoded vector, its encoded child vectors must be int type
* @param provider dictionary provider used to decode the values
* @param allocator allocator used to create the decoded vector
* @return vector with values restored from dictionary
*/
public static StructVector decode(StructVector vector,
DictionaryProvider.MapDictionaryProvider provider,
BufferAllocator allocator) {
final int valueCount = vector.getValueCount();
final int childCount = vector.getChildrenFromFields().size();

// clone struct vector and initialize child vectors
StructVector decoded = cloneVector(vector);
StructVector decoded = cloneVector(vector, allocator);
List<Field> childFields = new ArrayList<>();
for (int i = 0; i < childCount; i++) {
FieldVector childVector = getChildVector(vector, i);
Dictionary dictionary = getChildVectorDictionary(childVector);
Dictionary dictionary = getChildVectorDictionary(childVector, provider);
// childVector is not encoded.
if (dictionary == null) {
childFields.add(childVector.getField());
Expand All @@ -167,7 +185,7 @@ public StructVector decode(StructVector vector) {
// get child vector
FieldVector childVector = getChildVector(vector, index);
FieldVector decodedChildVector = getChildVector(decoded, index);
Dictionary dictionary = getChildVectorDictionary(childVector);
Dictionary dictionary = getChildVectorDictionary(childVector, provider);
if (dictionary == null) {
childVector.makeTransferPair(decodedChildVector).splitAndTransfer(0, valueCount);
} else {
Expand All @@ -184,7 +202,8 @@ public StructVector decode(StructVector vector) {
/**
* Get the child vector dictionary, return null if not dictionary encoded.
*/
private Dictionary getChildVectorDictionary(FieldVector childVector) {
private static Dictionary getChildVectorDictionary(FieldVector childVector,
DictionaryProvider.MapDictionaryProvider provider) {
DictionaryEncoding dictionaryEncoding = childVector.getField().getDictionary();
if (dictionaryEncoding != null) {
Dictionary dictionary = provider.lookup(dictionaryEncoding.getId());
Expand Down

0 comments on commit 7e09f38

Please sign in to comment.