Is your feature request related to a problem? Please describe.
I noticed that virtual Dataset performance is very slow compared to non-virtual Datasets. This is particularly noticeable for very small queries, since the overhead incurred in opening a virtual Dataset then totally dominates performance.
In particular, I observed that almost all of the time is spent in H5D__virtual_copy_layout and H5D__virtual_reset_layout, which just allocate and free memory.
I attached a benchmark script (included under Additional context below) which shows that reading a single point from a virtual Dataset is 3-4 orders of magnitude slower than reading from a regular Dataset.
Describe the solution you'd like
Can we speed up the opening (+closing) of virtual Datasets?
Describe alternatives you've considered
Some ideas:
- Can we remove the deep copy in H5O__layout_copy and H5D__virtual_copy_layout? I assume it's there for a good reason, but when I naively commented those lines out, reading from and writing to a virtual Dataset in HDF5 still worked.
- If we cannot remove the deep copy, can we make it faster? Instead of performing individual malloc/free calls for each mapping's filename, selection, etc., we could allocate a block of memory in one shot (see the sketch after this list).
- Could we make loading of the layout lazy? On its own this would not help, since a read must iterate over all mappings anyway, but if the mappings were arranged in some spatial tree we could load them lazily, which would also greatly improve read performance.
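To make the one-shot allocation idea concrete, here is a minimal sketch in plain C. The mapping_t struct and copy_mappings function are made up for illustration; they are not HDF5's actual internal data structures:

/*
 * Illustrative only: copy n mappings (n > 0 assumed) with two
 * allocations instead of one malloc per string. A first pass totals
 * the sizes, a second pass carves the single block into pieces.
 */
#include <stdlib.h>
#include <string.h>

typedef struct {
    char *filename; /* points into the shared block */
    char *dsetname; /* points into the shared block */
} mapping_t;

static mapping_t *
copy_mappings(const mapping_t *src, size_t n, char **block_out)
{
    size_t     total = 0, i;
    mapping_t *dst;
    char      *block, *p;

    /* First pass: measure everything that needs to be copied. */
    for (i = 0; i < n; i++)
        total += strlen(src[i].filename) + 1 + strlen(src[i].dsetname) + 1;

    dst   = malloc(n * sizeof(*dst));
    block = malloc(total);
    if (dst == NULL || block == NULL) {
        free(dst);
        free(block);
        return NULL;
    }

    /* Second pass: copy each string and point the copy at it. */
    p = block;
    for (i = 0; i < n; i++) {
        size_t len = strlen(src[i].filename) + 1;
        memcpy(p, src[i].filename, len);
        dst[i].filename = p;
        p += len;

        len = strlen(src[i].dsetname) + 1;
        memcpy(p, src[i].dsetname, len);
        dst[i].dsetname = p;
        p += len;
    }

    *block_out = block;
    return dst;
}

Releasing such a copy then costs two free() calls (free(dst) and free(*block_out)) instead of one per field, which should shrink the time spent in H5D__virtual_reset_layout correspondingly.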
Additional context
Note that I ran into this performance issue while using versioned-hdf5, a Python library which uses virtual Datasets to maintain all versions of the data. It does that by recording each version as a virtual Dataset which maps each chunk of the virtual Dataset to a chunk in a common "raw" Dataset.
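Here is the attached benchmark script (the file name and build line in the header comment are just one plausible way to compile it, assuming HDF5's h5cc wrapper is installed):

/*
 * Benchmark: create a source dataset and a virtual dataset mapped onto
 * it chunk by chunk, then time 100 open/read-one-point/close cycles on
 * each. One way to build and run it:
 *   h5cc -o vds_bench vds_bench.c && ./vds_bench
 */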
#include "hdf5.h"
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define FILE "h5-vds-performance-data.h5"
#define RANK 2
#define SRC_DATASET "/A"
#define SRC_DIM0 1000
#define SRC_DIM1 100
#define VDS_DATASET "/B"
#define VDS_DIM0 1000
#define VDS_DIM1 10000
#define VDS_CHUNK0 10
#define VDS_CHUNK1 100
#define MEM_DIM0 1
#define MEM_DIM1 1
double benchmark(const char *dataset, hsize_t *dims)
{
    hid_t   file, space, dset, mem_space;      /* Handles */
    herr_t  status;
    hsize_t memdims[2] = {MEM_DIM0, MEM_DIM1}; /* Memory dimensions for reading */
    int     rdata[MEM_DIM0][MEM_DIM1];         /* Read buffer for performance testing */
    hsize_t i;
    hsize_t start[2], count[2], block[2];      /* Hyperslab parameters */
    clock_t clock_start, clock_end;
    double  cpu_time_used;

    file = H5Fopen(FILE, H5F_ACC_RDONLY, H5P_DEFAULT);

    /* Time 100 open/read-one-point/close cycles on the given dataset. */
    clock_start = clock();
    for (i = 0; i < 100; i++) {
        dset = H5Dopen2(file, dataset, H5P_DEFAULT);

        /* Read a single point at (0, 0). */
        start[0] = 0;
        start[1] = 0;
        count[0] = 1;
        count[1] = 1;
        block[0] = 1;
        block[1] = 1;
        mem_space = H5Screate_simple(RANK, memdims, NULL);
        status    = H5Sselect_hyperslab(mem_space, H5S_SELECT_SET, start, NULL, count, block);
        space     = H5Screate_simple(RANK, dims, NULL);
        status    = H5Sselect_hyperslab(space, H5S_SELECT_SET, start, NULL, count, block);
        status    = H5Dread(dset, H5T_NATIVE_INT, mem_space, space, H5P_DEFAULT, rdata[0]);

        status = H5Sclose(space);
        status = H5Sclose(mem_space);
        status = H5Dclose(dset);
    }
    clock_end = clock();

    status = H5Fclose(file);

    cpu_time_used = ((double)(clock_end - clock_start)) / CLOCKS_PER_SEC;
    return cpu_time_used;
}
int
main(void)
{
    hid_t   file, src_space, vds_space, src_dset, vds_dset; /* Handles */
    hid_t   dcpl;
    herr_t  status;
    hsize_t vdsdims[2] = {VDS_DIM0, VDS_DIM1}; /* Virtual dataset dimensions */
    hsize_t srcdims[2] = {SRC_DIM0, SRC_DIM1}; /* Source dataset dimensions */
    int     wdata[SRC_DIM0][SRC_DIM1];         /* Write buffer for source dataset */
    hsize_t i, j;
    hsize_t start[2], count[2], block[2];      /* Hyperslab parameters */
    double  cpu_time_used;

    /* Each mapping spans full source rows. */
    assert(VDS_CHUNK1 == SRC_DIM1);

    /* File for both the source and the virtual dataset. */
    file = H5Fcreate(FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

    /*
     * Initialize data.
     */
    for (i = 0; i < SRC_DIM0; i++)
        for (j = 0; j < SRC_DIM1; j++)
            wdata[i][j] = (int)(i + j);

    /*
     * Create the source dataset and write data.
     */
    src_space = H5Screate_simple(RANK, srcdims, NULL);
    src_dset  = H5Dcreate2(file, SRC_DATASET, H5T_NATIVE_INT, src_space, H5P_DEFAULT, H5P_DEFAULT,
                           H5P_DEFAULT);
    status = H5Dwrite(src_dset, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, H5P_DEFAULT, wdata[0]);
    status = H5Sclose(src_space);
    status = H5Dclose(src_dset);

    /* Create the VDS dataspace. */
    src_space = H5Screate_simple(RANK, srcdims, NULL);
    vds_space = H5Screate_simple(RANK, vdsdims, NULL);
    dcpl      = H5Pcreate(H5P_DATASET_CREATE);

    /*
     * Build the mappings, one per VDS chunk.
     */
    for (i = 0; i < VDS_DIM0; i += VDS_CHUNK0) {
        for (j = 0; j < VDS_DIM1; j += VDS_CHUNK1) {
            /* Initialize hyperslab values for src. */
            start[0] = (i + j) % SRC_DIM0;
            start[1] = 0;
            count[0] = 1;
            count[1] = 1;
            block[0] = VDS_CHUNK0;
            block[1] = VDS_CHUNK1;
            status   = H5Sselect_hyperslab(src_space, H5S_SELECT_SET, start, NULL, count, block);

            /* Initialize hyperslab values for dst. */
            start[0] = i;
            start[1] = j;
            count[0] = 1;
            count[1] = 1;
            block[0] = VDS_CHUNK0;
            block[1] = VDS_CHUNK1;
            status   = H5Sselect_hyperslab(vds_space, H5S_SELECT_SET, start, NULL, count, block);

            /* Add the mapping. */
            status = H5Pset_virtual(dcpl, vds_space, FILE, SRC_DATASET, src_space);
        }
    }

    /* Create the virtual dataset. */
    vds_dset = H5Dcreate2(file, VDS_DATASET, H5T_NATIVE_INT, vds_space, H5P_DEFAULT, dcpl, H5P_DEFAULT);
    status   = H5Pclose(dcpl);
    status   = H5Sclose(vds_space);
    status   = H5Sclose(src_space);
    status   = H5Dclose(vds_dset);
    status   = H5Fclose(file);

    /*
     * Test performance of repeatedly reopening each dataset and reading a single point.
     */
    cpu_time_used = benchmark(SRC_DATASET, srcdims);
    printf("SRC time: %f seconds\n", cpu_time_used);

    cpu_time_used = benchmark(VDS_DATASET, vdsdims);
    printf("VDS time: %f seconds\n", cpu_time_used);

    return 0;
}
Here's the same benchmark as above, using versioned-hdf5 / virtual datasets. I made some changes to h5py to avoid the call to H5Dget_create_plist (see h5py/h5py#2552), which removes some virtual-dataset overhead added by h5py, so the performance difference here is due solely to HDF5:
In [2]: with h5py.File('/tmp/data.h5', 'w') as f:
...: vf = VersionedHDF5File(f)
...: with vf.stage_version('r0') as sv:
...: sv.create_dataset('values', (1000, 10000), chunks=(10, 100), maxshape=(None, None),
...: data=[[i + j for j in range(10000)] for i in range(1000)])
...:
In [3]: def benchmark(cv):
...: values = cv['values']
...: _ = values[0, 0]
...:
In [4]: with h5py.File('/tmp/data.h5', 'r') as f:
...: vf = VersionedHDF5File(f)
...: cv = vf[vf.current_version]
...: %timeit benchmark(cv)
...:
57.9 ms ± 1.55 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Compared to a regular chunked dataset:
In [1]: with h5py.File('/tmp/data.h5', 'w') as f:
...: f.create_dataset('values', (1000, 10000), chunks=(10, 100), maxshape=(None, None),
...: data=[[i + j for j in range(10000)] for i in range(1000)])
...:
In [2]: def benchmark(f):
...: values = f['values']
...: _ = values[0, 0]
...:
In [3]: with h5py.File('/tmp/data.h5', 'r') as f:
...: %timeit benchmark(f)
...:
70.5 μs ± 292 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
Using virtual datasets is roughly 1000x slower: milliseconds vs. microseconds!