Skip to content

Commit

Permalink
Rename NodeHash to FSTSuffixNodeCache (apache#13259)
Browse files Browse the repository at this point in the history
  • Loading branch information
dungba88 authored Nov 1, 2024
1 parent cfdd20f commit 1328527
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 13 deletions.
12 changes: 6 additions & 6 deletions lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ public class FSTCompiler<T> {
// it will throw exceptions if attempt to call getReverseBytesReader() or writeTo(DataOutput)
private static final FSTReader NULL_FST_READER = new NullFSTReader();

private final NodeHash<T> dedupHash;
// a temporary FST used during building for NodeHash cache
private final FSTSuffixNodeCache<T> suffixDedupCache;
// a temporary FST used during building, backing the FSTSuffixNodeCache
final FST<T> fst;
private final T NO_OUTPUT;

Expand Down Expand Up @@ -178,9 +178,9 @@ private FSTCompiler(
if (suffixRAMLimitMB < 0) {
throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB);
} else if (suffixRAMLimitMB > 0) {
dedupHash = new NodeHash<>(this, suffixRAMLimitMB);
suffixDedupCache = new FSTSuffixNodeCache<>(this, suffixRAMLimitMB);
} else {
dedupHash = null;
suffixDedupCache = null;
}
NO_OUTPUT = outputs.getNoOutput();

Expand Down Expand Up @@ -379,12 +379,12 @@ public long getArcCount() {
private CompiledNode compileNode(UnCompiledNode<T> nodeIn) throws IOException {
final long node;
long bytesPosStart = numBytesWritten;
if (dedupHash != null) {
if (suffixDedupCache != null) {
if (nodeIn.numArcs == 0) {
node = addNode(nodeIn);
lastFrozenNode = node;
} else {
node = dedupHash.add(nodeIn);
node = suffixDedupCache.add(nodeIn);
}
} else {
node = addNode(nodeIn);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,24 @@
// TODO: couldn't we prune naturally back until we see a transition with an output? it's highly
// unlikely (mostly impossible) such suffixes can be shared?

// Used to dedup states (lookup already-frozen states)
final class NodeHash<T> {
/**
* This is essentially an LRU cache to maintain and look up node suffixes. An un-compiled node
* can be added into the cache, and if a similar node exists we will return its address in the
* FST. A node is defined as similar if it has the same label, arcs, outputs & other properties
* that identify a node.
*
* <p>The total size of the cache is controlled through the constructor parameter <code>ramLimitMB
* </code>. Implementation-wise, we maintain two lookup tables: a primary table where nodes can be
* looked up, and a fallback lookup table in case the lookup in the primary table fails. Nodes
* from the fallback table can also be promoted to the primary table when that happens. When the
* primary table is full, we swap it with the fallback table and clear out the primary table.
*
* <p>To look up the node address, we build a special hash table, called <code>PagedGrowableHash
* </code>, which maps from the node hash value to the node address in the FST. Internally it uses
* {@link PagedGrowableWriter} to store the mapping, which allows efficient packing of the hash &
* address long values, and uses {@link ByteBlockPool} to store the actual node content (arcs &
* outputs).
*/
final class FSTSuffixNodeCache<T> {

// primary table -- we add nodes into this until it reaches the requested tableSizeLimit/2, then
// we move it to fallback
Expand Down Expand Up @@ -60,7 +76,7 @@ final class NodeHash<T> {
* recently used suffixes are discarded, and the FST is no longer minimal. Still, larger
* ramLimitMB will make the FST smaller (closer to minimal).
*/
public NodeHash(FSTCompiler<T> fstCompiler, double ramLimitMB) {
public FSTSuffixNodeCache(FSTCompiler<T> fstCompiler, double ramLimitMB) {
if (ramLimitMB <= 0) {
throw new IllegalArgumentException("ramLimitMB must be > 0; got: " + ramLimitMB);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,16 @@
import com.carrotsearch.randomizedtesting.generators.RandomBytes;
import org.apache.lucene.tests.util.LuceneTestCase;

public class TestNodeHash extends LuceneTestCase {
public class TestFSTSuffixNodeCache extends LuceneTestCase {

public void testCopyFallbackNodeBytes() {
// we don't need the FSTCompiler in this test
NodeHash<Object> nodeHash = new NodeHash<>(null, 1);
FSTSuffixNodeCache<Object> suffixCache = new FSTSuffixNodeCache<>(null, 1);

NodeHash<Object>.PagedGrowableHash primaryHashTable = nodeHash.new PagedGrowableHash();
NodeHash<Object>.PagedGrowableHash fallbackHashTable = nodeHash.new PagedGrowableHash();
FSTSuffixNodeCache<Object>.PagedGrowableHash primaryHashTable =
suffixCache.new PagedGrowableHash();
FSTSuffixNodeCache<Object>.PagedGrowableHash fallbackHashTable =
suffixCache.new PagedGrowableHash();
int nodeLength = atLeast(500);
long fallbackHashSlot = 1;
byte[] fallbackBytes = RandomBytes.randomBytesOfLength(random(), nodeLength);
Expand Down

0 comments on commit 1328527

Please sign in to comment.