-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
estimate file car size #59
base: main
Are you sure you want to change the base?
Changes from 4 commits
fe18dab
994e0a0
29ca6b6
7427951
df0de45
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ import ( | |
basicnode "github.com/ipld/go-ipld-prime/node/basic" | ||
"github.com/multiformats/go-multicodec" | ||
multihash "github.com/multiformats/go-multihash/core" | ||
"github.com/multiformats/go-varint" | ||
|
||
// raw needed for opening as bytes | ||
_ "github.com/ipld/go-ipld-prime/codec/raw" | ||
|
@@ -57,6 +58,91 @@ func BuildUnixFSFile(r io.Reader, chunker string, ls *ipld.LinkSystem) (ipld.Lin | |
} | ||
} | ||
|
||
// EstimateUnixFSFileDefaultChunking estimates the byte size of the car file | ||
// that would be needed to hold a UnixFS file containing dataLength bytes of | ||
// data, chunked into blocks of chunk.DefaultBlockSize bytes. | ||
// | ||
// The returned total is the sum of: | ||
//   - dataLength itself (the raw leaf payloads); | ||
//   - for every leaf block, the length of an example CID plus the size of the | ||
//     uvarint encoding of (CID length + block length); | ||
//   - for every intermediate node, its encoded length plus the same CID and | ||
//     uvarint framing; | ||
//   - a fixed 59 bytes for the car header. | ||
// | ||
// No file data is read. Intermediate nodes, however, are really built as dagpb | ||
// nodes and stored in an in-memory link system so that their encoded length can | ||
// be read back. Error returns from the builders are discarded, and the number | ||
// of intermediate nodes is printed to standard output via fmt.Printf. | ||
// | ||
// NOTE(review): per the PR discussion, an actual car may be smaller than the | ||
// estimate when the data contains duplicate blocks -- confirm before relying | ||
// on exact equality. | ||
func EstimateUnixFSFileDefaultChunking(dataLength uint64) uint64 { | ||
// Number of full default-sized chunks, and the length of the trailing | ||
// partial chunk (0 when dataLength is an exact multiple of the chunk size). | ||
blkSize := chunk.DefaultBlockSize | ||
blocks := dataLength / uint64(blkSize) | ||
remainder := dataLength % uint64(blkSize) | ||
|
||
// Start from the raw payload; framing and metadata overhead are added below. | ||
size := dataLength | ||
// cidExample is used for its encoded length and as the placeholder target of | ||
// every link built below; the error from Sum is discarded. | ||
cidExample, _ := leafLinkProto.Prefix.Sum([]byte{0}) | ||
cidLength := uint64(len(cidExample.Bytes())) | ||
|
||
// links holds, for each node at the current level of the tree, the number of | ||
// file bytes that node covers. It starts out as the list of leaf chunk sizes. | ||
links := []uint64{} | ||
willscott marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for i := uint64(0); i < blocks; i++ { | ||
links = append(links, uint64(chunk.DefaultBlockSize)) | ||
} | ||
// account for the uvarint + cid length of each full-sized block of raw data. | ||
size += uint64(len(links)) * (cidLength + uint64(varint.UvarintSize(cidLength+uint64(blkSize)))) | ||
// The trailing partial chunk, if any, is one more leaf with its own framing. | ||
if remainder > 0 { | ||
links = append(links, remainder) | ||
size += cidLength + uint64(varint.UvarintSize(cidLength+uint64(remainder))) | ||
} | ||
|
||
// account for the metadata overhead nodes. They are stored in an in-memory | ||
// link system so the encoded bytes of each one can be loaded and measured. | ||
ls := cidlink.DefaultLinkSystem() | ||
storage := cidlink.Memory{} | ||
ls.StorageReadOpener = storage.OpenRead | ||
ls.StorageWriteOpener = storage.OpenWrite | ||
|
||
// icnt counts the intermediate nodes built; it is only used in the message | ||
// printed after the loops. | ||
icnt := 0 | ||
// Each pass of the outer loop builds one level of the tree. The inner loop | ||
// takes up to DefaultLinksPerBlock entries at a time, builds a UnixFS file | ||
// node (FileSize + BlockSizes) wrapped in a dagpb node with one link per | ||
// child, stores it, and adds its encoded length plus cid/uvarint framing to | ||
// size. A single leftover entry is carried up to the next level unchanged. | ||
// The loops stop once only one entry remains. | ||
for len(links) > 1 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would it make sense to update the godoc of this function with the cost complexity of estimating size vs writing the CAR out into a temporary file and getting the size that way? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think the extent of what we can really say is that the size calculation doesn't need memory, but will do the CPU work for generation of intermediate blocks. |
||
nxtLnks := []uint64{} | ||
for len(links) > 1 { | ||
icnt++ | ||
children := uint64(DefaultLinksPerBlock) | ||
if len(links) < DefaultLinksPerBlock { | ||
children = uint64(len(links)) | ||
} | ||
childrenLinks := links[:children] | ||
links = links[children:] | ||
totalSize := uint64(0) | ||
for _, l := range childrenLinks { | ||
totalSize += l | ||
} | ||
|
||
node, _ := BuildUnixFS(func(b *Builder) { | ||
FileSize(b, totalSize) | ||
BlockSizes(b, childrenLinks) | ||
}) | ||
|
||
// Pack into the dagpb node. | ||
dpbb := dagpb.Type.PBNode.NewBuilder() | ||
pbm, _ := dpbb.BeginMap(2) | ||
pblb, _ := pbm.AssembleEntry("Links") | ||
pbl, _ := pblb.BeginList(int64(len(childrenLinks))) | ||
for _, c := range childrenLinks { | ||
pbln, _ := BuildUnixFSDirectoryEntry("", int64(c), cidlink.Link{Cid: cidExample}) | ||
pbl.AssembleValue().AssignNode(pbln) | ||
} | ||
pbl.Finish() | ||
pbm.AssembleKey().AssignString("Data") | ||
pbm.AssembleValue().AssignBytes(data.EncodeUnixFSData(node)) | ||
pbm.Finish() | ||
pbn := dpbb.Build() | ||
pbLnk := ls.MustStore(ipld.LinkContext{}, fileLinkProto, pbn) | ||
pbRcrd, _ := ls.LoadRaw(ipld.LinkContext{}, pbLnk) | ||
|
||
// dagpb overhead | ||
intermediateNodeSize := uint64(len(pbRcrd)) | ||
|
||
size += intermediateNodeSize + cidLength + uint64(varint.UvarintSize(cidLength+intermediateNodeSize)) | ||
nxtLnks = append(nxtLnks, totalSize) | ||
} | ||
if len(links) == 1 { | ||
nxtLnks = append(nxtLnks, links[0]) | ||
} | ||
links = nxtLnks | ||
} | ||
fmt.Printf("estimated %d intermeidate nodes\n", icnt) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use logger? |
||
|
||
// add the car header | ||
size += 59 | ||
|
||
return size | ||
} | ||
|
||
var fileLinkProto = cidlink.LinkPrototype{ | ||
Prefix: cid.Prefix{ | ||
Version: 1, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,25 @@ | ||
package builder | ||
package builder_test | ||
|
||
import ( | ||
"bytes" | ||
"context" | ||
"crypto/rand" | ||
"fmt" | ||
"io" | ||
"testing" | ||
|
||
"github.com/ipfs/go-unixfsnode/data/builder" | ||
"github.com/multiformats/go-multicodec" | ||
|
||
"github.com/ipfs/go-cid" | ||
u "github.com/ipfs/go-ipfs-util" | ||
"github.com/ipfs/go-unixfsnode/file" | ||
"github.com/ipld/go-car/v2" | ||
dagpb "github.com/ipld/go-codec-dagpb" | ||
"github.com/ipld/go-ipld-prime" | ||
"github.com/ipld/go-ipld-prime/linking" | ||
cidlink "github.com/ipld/go-ipld-prime/linking/cid" | ||
selectorparse "github.com/ipld/go-ipld-prime/traversal/selector/parse" | ||
) | ||
|
||
func TestBuildUnixFSFile(t *testing.T) { | ||
|
@@ -23,7 +32,7 @@ func TestBuildUnixFSFile(t *testing.T) { | |
ls.StorageReadOpener = storage.OpenRead | ||
ls.StorageWriteOpener = storage.OpenWrite | ||
|
||
f, _, err := BuildUnixFSFile(r, "", &ls) | ||
f, _, err := builder.BuildUnixFSFile(r, "", &ls) | ||
if err != nil { | ||
t.Fatal(err) | ||
} | ||
|
@@ -43,6 +52,45 @@ func TestBuildUnixFSFile(t *testing.T) { | |
} | ||
} | ||
|
||
// TestEstimateUnixFSFileDefaultChunking checks that the estimate returned by | ||
// builder.EstimateUnixFSFileDefaultChunking exactly equals the number of bytes | ||
// written when a file of the same length is actually built and traversed with | ||
// car.TraverseV1. Lengths tested are 100, 1000, ... up to 100000000 bytes, so | ||
// the largest iteration allocates a 100000000-byte buffer of random data. | ||
func TestEstimateUnixFSFileDefaultChunking(t *testing.T) { | ||
for i := 100; i < 1000000000; i *= 10 { | ||
// Random payload of i bytes; the error return of rand.Read is not checked. | ||
b := make([]byte, i) | ||
rand.Read(b) | ||
|
||
ls := cidlink.DefaultLinkSystem() | ||
storage := cidlink.Memory{} | ||
ls.StorageReadOpener = storage.OpenRead | ||
// nPB counts the dag-pb blocks committed during the build; it is only | ||
// printed as a diagnostic when the estimate does not match. | ||
nPB := 0 | ||
|
||
// Wrap the in-memory write opener so that each committed link can be | ||
// inspected before the commit is forwarded to the underlying storage. | ||
ls.StorageWriteOpener = func(lc linking.LinkContext) (io.Writer, linking.BlockWriteCommitter, error) { | ||
w, bwc, err := storage.OpenWrite(lc) | ||
return w, func(lnk ipld.Link) error { | ||
if lnk.(cidlink.Link).Cid.Prefix().Codec == uint64(multicodec.DagPb) { | ||
nPB++ | ||
} | ||
return bwc(lnk) | ||
}, err | ||
} | ||
// An empty chunker string is passed; the size returned by BuildUnixFSFile | ||
// is discarded here. | ||
rt, _, err := builder.BuildUnixFSFile(bytes.NewReader(b), "", &ls) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Compare total size returned with estimated size? |
||
if err != nil { | ||
t.Fatal(err) | ||
} | ||
|
||
// Traverse the whole DAG from the root into an in-memory buffer; the | ||
// buffer length is the reference size the estimate is compared against. | ||
ob := bytes.NewBuffer(nil) | ||
_, err = car.TraverseV1(context.Background(), &ls, rt.(cidlink.Link).Cid, selectorparse.CommonSelector_ExploreAllRecursively, ob) | ||
if err != nil { | ||
t.Fatal(err) | ||
} | ||
fileLen := len(ob.Bytes()) | ||
|
||
// The estimate must match exactly; any difference fails the test. | ||
estimate := builder.EstimateUnixFSFileDefaultChunking(uint64(i)) | ||
if estimate != uint64(fileLen) { | ||
fmt.Printf("%d intermediate nodes.\n", nPB) | ||
t.Fatalf("estimate for file length %d was %d. should be %d", i, estimate, fileLen) | ||
} | ||
} | ||
} | ||
|
||
func TestUnixFSFileRoundtrip(t *testing.T) { | ||
buf := make([]byte, 10*1024*1024) | ||
u.NewSeededRand(0xdeadbeef).Read(buf) | ||
|
@@ -53,7 +101,7 @@ func TestUnixFSFileRoundtrip(t *testing.T) { | |
ls.StorageReadOpener = storage.OpenRead | ||
ls.StorageWriteOpener = storage.OpenWrite | ||
|
||
f, _, err := BuildUnixFSFile(r, "", &ls) | ||
f, _, err := builder.BuildUnixFSFile(r, "", &ls) | ||
if err != nil { | ||
t.Fatal(err) | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am curious why the word "estimate"? I read that as: the actual size may differ. Is that right? If so, would it make sense to update the godoc to elaborate on the discrepancy?
If not, I recommend avoiding this terminology.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the only real discrepancy I can see at this point is that, depending on your actual data, the resulting CAR may contain de-duplicated blocks and so may be smaller than the estimate.