From f60854a1c4990f6142ade593184c0b00af0e7260 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Thu, 12 Sep 2024 17:39:01 +0800 Subject: [PATCH 01/26] Binary IDL With MessagePack Bytes Signed-off-by: Future-Outlier --- .../5741-binary-idl-with-message-pack.md | 340 ++++++++++++++++++ 1 file changed, 340 insertions(+) create mode 100644 rfc/system/5741-binary-idl-with-message-pack.md diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md new file mode 100644 index 0000000000..3d3cc3d4e1 --- /dev/null +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -0,0 +1,340 @@ +# Binary IDL With MessagePack Bytes + +**Authors:** + +- [@Han-Ru](https://github.com/future-outlier) +- [@Yee Hing Tong](https://github.com/wild-endeavor) +- [@Ping-Su](https://github.com/pingsutw) +- [@Eduardo Apolinario](https://github.com/eapolinario) +- [@Haytham Abuelfutuh](https://github.com/EngHabu) +- [@Ketan Umare](https://github.com/kumare3) + +## 1 Executive Summary +### Literal Value +- To Literal + +| Before | Now | +|-----------------------------------|----------------------------------------------| +| Python Val -> JSON String -> Protobuf Struct | Python Val -> Bytes -> Protobuf Binary (value: MessagePack Bytes, tag: msgpack) | + +- To Python Value + +| Before | Now | +|-----------------------------------|----------------------------------------------| +| Protobuf Struct -> JSON String -> Python Val | Protobuf JSON -> Bytes -> Python Val | + +Use bytes in Protobuf instead of a JSON string to fix case that int is not supported in Protobuf struct. + +### Literal Type +1. + +## 2 Motivation + +In Flytekit, when handling dataclasses, Pydantic base models, and dictionaries, we store data using a JSON string within Protobuf struct datatype. +This approach causes issues with integers, as Protobuf struct does not support int types, leading to their conversion to floats. +This results in performance issues since we need to recursively iterate through all attributes/keys in dataclasses and dictionaries to ensure floats types are converted to int. In addition to performance issues, the required code is complicated and error prone. + +Note: We have more than 10 issues about dict, dataclass and Pydantic. + +This feature can solve them all. + +## 3 Proposed Implementation +### Before +```python +@task +def t1() -> dict: + ... + return {"a": 1} # Protobuf Struct {"a": 1.0} + +@task +def t2(a: dict): + print(a["integer"]) # wrong, will be a float +``` +### After +```python +@task +def t1() -> dict: # JSON Bytes + ... + return {"a": 1} # Protobuf JSON b'\x81\xa1a\x01', produced by msgpack + +@task +def t2(a: dict): + print(a["integer"]) # correct, it will be a integer +``` + +#### Note +- We will use the same type interface and ensure the backward compatibility. + +### How to turn a value to bytes? +#### Use MsgPack to convert a value into bytes +##### Python +```python +import msgpack +import JSON + +# Encode +def to_literal(): + msgpack_bytes = msgpack.dumps(python_val) + return Literal(scalar=Scalar(json=Json(value=msgpack_bytes, serialization_format="msgpack"))) + +# Decode +def to_python_value(): + if lv.scalar.json.serialization_format == "msgpack": + msgpack_bytes = lv.scalar.json.value + return msgpack.loads(msgpack_bytes) +``` +reference: https://github.com/msgpack/msgpack-python + +##### Golang +```go +package main + +import ( + "fmt" + "github.com/vmihailenco/msgpack/v5" +) + +func main() { + // Example data to encode + data := map[string]int{"a": 1} + + // Encode the data + encodedData, err := msgpack.Marshal(data) + if err != nil { + panic(err) + } + + // Print the encoded data + fmt.Printf("Encoded data: %x\n", encodedData) // Output: 81a16101 + + // Decode the data + var decodedData map[string]int + err = msgpack.Unmarshal(encodedData, &decodedData) + if err != nil { + panic(err) + } + + // Print the decoded data + fmt.Printf("Decoded data: %+v\n", decodedData) // Output: map[a:1] +} +``` + +reference: https://github.com/vmihailenco/msgpack + +##### JavaScript +```javascript +import msgpack5 from 'msgpack5'; + +// Create a MessagePack instance +const msgpack = msgpack5(); + +// Example data to encode +const data = { a: 1 }; + +// Encode the data +const encodedData = msgpack.encode(data); + +// Print the encoded data +console.log(encodedData); // + +// Decode the data +const decodedData = msgpack.decode(encodedData); + +// Print the decoded data +console.log(decodedData); // { a: 1 } +``` +reference: https://github.com/msgpack/msgpack-javascript + + +### FlyteIDL +```proto +// Represents a JSON object encoded as a byte array. +// This field is used to store JSON-serialized data, which can include +// dataclasses, dictionaries, Pydantic models, or other structures that +// can be represented as JSON objects. When utilized, the data should be +// deserialized into its corresponding structure. +// This design ensures that the data is stored in a format that can be +// fully reconstructed without loss of information. +message Json { + // The JSON object serialized as a byte array. + bytes value = 1; + + // The format used to serialize the byte array. + // This field identifies the specific format of the serialized JSON data, + // allowing future flexibility in supporting different JSON variants. + string serialization_format = 2; + + // Placeholder for future extensions to support other types of JSON objects, + // such as "eJSON" or "ndJSON". + // reference: https://stackoverflow.com/questions/18692060/different-types-of-json + // string json_type = 3; +} + + +message Scalar { + oneof value { + Primitive primitive = 1; + Blob blob = 2; + Binary binary = 3; + Schema schema = 4; + Void none_type = 5; + Error error = 6; + google.Protobuf.Struct generic = 7; + StructuredDataset structured_dataset = 8; + Union union = 9; + Json json = 10; // New Type + } +} +``` + +### FlytePropeller +1. Attribute Access for dictionary, Dataclass, and Pydantic in workflow. +Dict[type, type] is supported already, we have to support Dataclass, Pydantic and dict now. +```python +from flytekit import task, workflow +from dataclasses import dataclass + +@dataclass +class DC: + a: int + +@task +def t1() -> DC: + return DC(a=1) + +@task +def t2(x: int): + print("x:", x) + return + +@workflow +def wf(): + o = t1() + t2(x=o.a) +``` +2. Create a Literal Type for Scalar when doing type validation. +```go +func literalTypeForScalar(scalar *core.Scalar) *core.LiteralType { + ... + case *core.Scalar_JSON: + literalType = &core.LiteralType{Type: &core.LiteralType_Simple{Simple: core.SimpleType_JSON}} + ... + return literalType +} +``` +3. Support input and default input. +```go +// Literal Input +func ExtractFromLiteral(literal *core.Literal) (interface{}, error) { + switch literalValue := literal.Value.(type) { + case *core.Literal_Scalar: + ... + case *core.Scalar_JSON: + return scalarValue.Json.GetValue(), nil + } +} +// Default Input +func MakeDefaultLiteralForType(typ *core.LiteralType) (*core.Literal, error) { + switch t := typ.GetType().(type) { + case *core.LiteralType_Simple: + ... + case core.SimpleType_JSON: + return &core.Literal{ + Value: &core.Literal_Scalar{ + Scalar: &core.Scalar{ + Value: &core.Scalar_JSON{ + JSON: &core.JSON{ + Value: []byte(""), + }, + SerializationFormat: "msgpack", + }, + }, + }, + }, nil + } +} +``` +4. Compiler (Backward Compatibility with `Struct` type) +```go +if upstreamTypeCopy.GetSimple() == flyte.SimpleType_STRUCT && downstreamTypeCopy.GetSimple() == flyte.SimpleType_JSON { + return true + } +``` +### FlyteKit +#### pyflyte run +The behavior will remain unchanged. +We will pass the value to our class, which inherits from `click.ParamType`, and use the corresponding type transformer to convert the input to the correct type. + +### Dict Transformer + +| **Stage** | **Conversion** | **Description** | +| --- | --- |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **Before** | Python Value to Literal | 1. `Dict[type, type]` uses type hints to construct a LiteralMap.
2. `dict` uses `JSON.dumps` to turn a `dict` value to a JSON string, and store it to Protobuf Struct. | +| | Literal to Python Value | 1. `Dict[type, type]` uses type hints to convert LiteralMap to Python Value.
2. `dict` uses `JSON.loads` to turn a JSON string into a dict value and store it to Protobuf Struct. | +| **After** | Python Value to Literal | 1. `Dict[type, type]` stays the same.
2. `dict` uses `msgpack.dumps` to turn a dict into msgpack bytes, and store it to Protobuf JSON. | +| | Literal to Python Value | 1. `Dict[type, type]` uses type hints to convert LiteralMap to Python Value.
2. `dict` conversion: msgpack bytes -> dict value, method: `msgpack.loads`. | + +### Dataclass Transformer + +| **Stage** | **Conversion** | **Description** | +| --- | --- |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **Before** | Python Value to Literal | Uses `mashumaro JSON Encoder` to turn a dataclass value to a JSON string, and store it to Protobuf `Struct`. | +| | Literal to Python Value | Uses `mashumaro JSON Decoder` to turn a JSON string to a python value, and recursively fixed int attributes to int (it will be float because we stored it in to `Struct`). | +| **After** | Python Value to Literal | Uses `mashumaro MessagePackEncoder` to convert a dataclass value into msgpack bytes, storing them in the Protobuf `JSON` field. | +| | Literal to Python Value | Uses `mashumaro MessagePackDecoder` to convert msgpack bytes back into a Python value. | + +### Pydantic Transformer + +| **Stage** | **Conversion** | **Description** | +| --- | --- | --- | +| **Before** | Python Value to Literal | Convert `BaseModel` to a JSON string, and then convert it to a Protobuf `Struct`. | +| | Literal to Python Value | Convert Protobuf `Struct` to a JSON string and then convert it to a `BaseModel`. | +| **After** | Python Value to Literal | Converts the Pydantic `BaseModel` to a dictionary, then serializes it into msgpack bytes using `msgpack.dumps`. | +| | Literal to Python Value | Deserializes `msgpack` bytes into a dictionary, then converts it back into a Pydantic `BaseModel`. | + +Note: Pydantic BaseModel can't be serialized directly by `msgpack`, but this implementation will still ensure 100% correct. + +### FlyteCtl +In FlyteCtl, we can construct input for the execution, so we have to make sure the values we passed to FlyteAdmin +can all be constructed to Literal. + +reference: https://github.com/flyteorg/flytectl/blob/131d6a20c7db601ca9156b8d43d243bc88669829/cmd/create/serialization_utils.go#L48 + +### FlyteConsole +#### Show input/output on FlyteConsole +We will get the node's input and output literal values by FlyteAdmin’s API, and obtain the JSON IDL bytes from the literal value. + +We can use MsgPack dumps the MsgPack into a dictionary, and shows it to the flyteconsole. +#### Construct Input +We should use `msgpack.encode` to encode input value and store it to the literal’s JSON field. + +## 4 Metrics & Dashboards + +None + +## 5 Drawbacks + +None + +## 6 Alternatives + +None, it's doable. + + +## 7 Potential Impact and Dependencies +We should check whether `serialization_format` is specified and supported in the Flyte backend, Flytekit, and Flyteconsole. Currently, we use `msgpack` as our default serialization format. + +In the future, we might want to support different JSON types such as "eJSON" or "ndJSON." We can add `json_type` to the JSON IDL to accommodate this. + +There are 3 reasons why we add `serialization_format` to the JSON IDL rather than the literal's `metadata`: +1. Metadata use cases are more related to when the data is created, where the data is stored, etc. +2. This is required information for all JSON IDLs, and it will seem more important if we include it as a field in the IDL. +3. If we want to add `json_type` or other JSON IDL-specific use cases in the future, we can include them in the JSON IDL field, making it more readable. + +## 8 Unresolved questions +None. + +## 9 Conclusion +MsgPack is better because it's more smaller and faster. +You can see the performance comparison here: https://github.com/flyteorg/flyte/pull/5607#issuecomment-2333174325 +We will use `msgpack` to do it. \ No newline at end of file From d97bb2a78091f00657cf565ff35160c4426e091b Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Thu, 12 Sep 2024 18:14:27 +0800 Subject: [PATCH 02/26] Finish part 1 and part 2 Signed-off-by: Future-Outlier --- .../5741-binary-idl-with-message-pack.md | 40 ++++++++++++++----- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index 3d3cc3d4e1..9ac7b113c7 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -11,22 +11,37 @@ ## 1 Executive Summary ### Literal Value +Literal Value will be `Binary`. +Use `bytes` in `Binary` instead of `Protobuf struct`. + - To Literal | Before | Now | |-----------------------------------|----------------------------------------------| -| Python Val -> JSON String -> Protobuf Struct | Python Val -> Bytes -> Protobuf Binary (value: MessagePack Bytes, tag: msgpack) | +| Python Val -> JSON String -> Protobuf Struct | Python Val -> (Dict ->) Bytes -> Binary (value: MessagePack Bytes, tag: msgpack) IDL Object | - To Python Value | Before | Now | |-----------------------------------|----------------------------------------------| -| Protobuf Struct -> JSON String -> Python Val | Protobuf JSON -> Bytes -> Python Val | +| Protobuf Struct -> JSON String -> Python Val | Binary (value: MessagePack Bytes, tag: msgpack) IDL Object -> Bytes -> (Dict ->) -> Python Val | + +Note: if a python value can't directly be converted to `MessagePack Bytes`, we can convert it to `Dict`, and then convert it to `MessagePack Bytes`. + +For example, the pydantic to literal function will be `BaseModel` -> `dict` -> `MessagePack Bytes` -> `Binary (value: MessagePack Bytes, tag: msgpack) IDL Object`. -Use bytes in Protobuf instead of a JSON string to fix case that int is not supported in Protobuf struct. +For pure `dict` in python, the to literal function will be `dict` -> `MessagePack Bytes` -> `Binary (value: MessagePack Bytes, tag: msgpack) IDL Object`. ### Literal Type -1. +Literal Type will be `Protobuf struct`. +`Json Schema` will be stored in `Literal Type's metadata`. + +1. Dataclass, Pydantic BaseModel and pure dict in python will all use `Protobuf Struct`. +2. We will put `Json Schema` in Literal Type's `metadata` field, this will be used in flytekit remote api to construct dataclass/Pydantic BaseModel by `Json Schema`. +3. We will use libraries written in golang to compare `Json Schema` to solve this issue: ["[BUG] Union types fail for e.g. two different dataclasses"](https://github.com/flyteorg/flyte/issues/5489). + + +Note: The `metadata` of `Literal Type` and `Literal Value` are not the same. ## 2 Motivation @@ -53,9 +68,9 @@ def t2(a: dict): ### After ```python @task -def t1() -> dict: # JSON Bytes +def t1() -> dict: # Literal(scalar=Scalar(binary=Binary(value=b'msgpack_bytes', tag="msgpack"))) ... - return {"a": 1} # Protobuf JSON b'\x81\xa1a\x01', produced by msgpack + return {"a": 1} # Protobuf Binary value=b'\x81\xa1a\x01', produced by msgpack @task def t2(a: dict): @@ -63,24 +78,27 @@ def t2(a: dict): ``` #### Note -- We will use the same type interface and ensure the backward compatibility. +- We will use implement `to_python_value` to every type transformer to ensure backward compatibility. +For example, `Binary IDL Object` -> python value and `Protobuf Struct IDL Object` -> python value are both supported. ### How to turn a value to bytes? #### Use MsgPack to convert a value into bytes ##### Python ```python import msgpack -import JSON # Encode def to_literal(): msgpack_bytes = msgpack.dumps(python_val) - return Literal(scalar=Scalar(json=Json(value=msgpack_bytes, serialization_format="msgpack"))) + return Literal(scalar=Scalar(binary=Binary(value=b'msgpack_bytes', tag="msgpack"))) # Decode def to_python_value(): - if lv.scalar.json.serialization_format == "msgpack": - msgpack_bytes = lv.scalar.json.value + # lv: literal value + if lv.scalar.binary.tag == "msgpack": + msgpack_bytes = lv.scalar.json.value + else: + raise ValueError(f"{tag} is not supported to decode this Binary Literal: {lv.scalar.binary}.") return msgpack.loads(msgpack_bytes) ``` reference: https://github.com/msgpack/msgpack-python From 96a948ba402fc8e40379c63b70d5468cffa22ea2 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Thu, 12 Sep 2024 18:28:30 +0800 Subject: [PATCH 03/26] Finish the flyteidl part Signed-off-by: Future-Outlier --- .../5741-binary-idl-with-message-pack.md | 61 ++++++++----------- 1 file changed, 26 insertions(+), 35 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index 9ac7b113c7..4ea048e156 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -165,43 +165,34 @@ reference: https://github.com/msgpack/msgpack-javascript ### FlyteIDL +#### Literal Value ```proto -// Represents a JSON object encoded as a byte array. -// This field is used to store JSON-serialized data, which can include -// dataclasses, dictionaries, Pydantic models, or other structures that -// can be represented as JSON objects. When utilized, the data should be -// deserialized into its corresponding structure. -// This design ensures that the data is stored in a format that can be -// fully reconstructed without loss of information. -message Json { - // The JSON object serialized as a byte array. - bytes value = 1; - - // The format used to serialize the byte array. - // This field identifies the specific format of the serialized JSON data, - // allowing future flexibility in supporting different JSON variants. - string serialization_format = 2; - - // Placeholder for future extensions to support other types of JSON objects, - // such as "eJSON" or "ndJSON". - // reference: https://stackoverflow.com/questions/18692060/different-types-of-json - // string json_type = 3; +// A simple byte array with a tag to help different parts of the system communicate about what is in the byte array. +// It's strongly advisable that consumers of this type define a unique tag and validate the tag before parsing the data. +message Binary { + bytes value = 1; // Serialized data (MessagePack) for supported types like Dataclass, Pydantic BaseModel, and dict. + string tag = 2; // The serialization format identifier (e.g., MessagePack). Consumers must define unique tags and validate them before deserialization. } - - -message Scalar { - oneof value { - Primitive primitive = 1; - Blob blob = 2; - Binary binary = 3; - Schema schema = 4; - Void none_type = 5; - Error error = 6; - google.Protobuf.Struct generic = 7; - StructuredDataset structured_dataset = 8; - Union union = 9; - Json json = 10; // New Type - } +``` +#### Literal Type +```proto +import "google/protobuf/struct.proto"; + +enum SimpleType { + NONE = 0; + INTEGER = 1; + FLOAT = 2; + STRING = 3; + BOOLEAN = 4; + DATETIME = 5; + DURATION = 6; + BINARY = 7; + ERROR = 8; + STRUCT = 9; // Use this one. +} +message LiteralType { + SimpleType simple = 1; // Use this one. + google.protobuf.Struct metadata = 6; // Store Json Schema to differentiate different dataclass. } ``` From ae16b44ab1cd389ce960c8d0bb5d64701c681a72 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Thu, 12 Sep 2024 20:18:47 +0800 Subject: [PATCH 04/26] Start from flyteclt Signed-off-by: Future-Outlier --- .../5741-binary-idl-with-message-pack.md | 195 ++++++++++++++---- 1 file changed, 151 insertions(+), 44 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index 4ea048e156..d5f838bfe3 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -225,8 +225,8 @@ def wf(): ```go func literalTypeForScalar(scalar *core.Scalar) *core.LiteralType { ... - case *core.Scalar_JSON: - literalType = &core.LiteralType{Type: &core.LiteralType_Simple{Simple: core.SimpleType_JSON}} + case *core.Scalar_Binary: + literalType = &core.LiteralType{Type: &core.LiteralType_Simple{Simple: core.SimpleType_BINARY}} ... return literalType } @@ -238,36 +238,81 @@ func ExtractFromLiteral(literal *core.Literal) (interface{}, error) { switch literalValue := literal.Value.(type) { case *core.Literal_Scalar: ... - case *core.Scalar_JSON: - return scalarValue.Json.GetValue(), nil + case *core.Scalar_Binary: + return scalarValue.Binary, nil } } // Default Input func MakeDefaultLiteralForType(typ *core.LiteralType) (*core.Literal, error) { - switch t := typ.GetType().(type) { + switch t := typ.GetType().(type) { case *core.LiteralType_Simple: - ... - case core.SimpleType_JSON: - return &core.Literal{ - Value: &core.Literal_Scalar{ - Scalar: &core.Scalar{ - Value: &core.Scalar_JSON{ - JSON: &core.JSON{ - Value: []byte(""), - }, - SerializationFormat: "msgpack", - }, - }, - }, - }, nil + case core.SimpleType_BINARY: + return MakeLiteral([]byte{}) } } +// Use Message Pack as Default Tag for deserialization. +func MakeBinaryLiteral(v []byte) *core.Literal { + return &core.Literal{ + Value: &core.Literal_Scalar{ + Scalar: &core.Scalar{ + Value: &core.Scalar_Binary{ + Binary: &core.Binary{ + Value: v, + Tag: "msgpack", + }, + }, + }, + }, + } +} ``` -4. Compiler (Backward Compatibility with `Struct` type) +4. Compiler ```go -if upstreamTypeCopy.GetSimple() == flyte.SimpleType_STRUCT && downstreamTypeCopy.GetSimple() == flyte.SimpleType_JSON { - return true +func (t trivialChecker) CastsFrom(upstreamType *flyte.LiteralType) bool { + if upstreamType.GetEnumType() != nil { + if t.literalType.GetSimple() == flyte.SimpleType_STRING { + return true + } } + + if t.literalType.GetEnumType() != nil { + if upstreamType.GetSimple() == flyte.SimpleType_STRING { + return true + } + } + + if GetTagForType(upstreamType) != "" && GetTagForType(t.literalType) != GetTagForType(upstreamType) { + return false + } + + // Here is the new way to check if dataclass/pydantic BaseModel are castable or not. + if upstreamTypeCopy.GetSimple() == flyte.SimpleType_STRUCT &&\ + downstreamTypeCopy.GetSimple() == flyte.SimpleType_STRUCT { + // Json Schema is stored in Metadata + upstreamMetadata := upstreamTypeCopy.GetMetadata() + downstreamMetadata := downstreamTypeCopy.GetMetadata() + + // There's bug in flytekit's dataclass Transformer to generate JSON Scheam before, + // in some case, we the JSON Schema will be nil, so we can only pass it to support + // backward compatible. (reference task should be supported.) + if upstreamMetadata == nil || downstreamMetadata == nil { + return true + } + + return isSameTypeInJSON(upstreamMetadata, downstreamMetadata) ||\ + isSuperTypeInJSON(upstreamMetadata, downstreamMetadata) + } + + upstreamTypeCopy := *upstreamType + downstreamTypeCopy := *t.literalType + upstreamTypeCopy.Structure = &flyte.TypeStructure{} + downstreamTypeCopy.Structure = &flyte.TypeStructure{} + upstreamTypeCopy.Metadata = &structpb.Struct{} + downstreamTypeCopy.Metadata = &structpb.Struct{} + upstreamTypeCopy.Annotation = &flyte.TypeAnnotation{} + downstreamTypeCopy.Annotation = &flyte.TypeAnnotation{} + return upstreamTypeCopy.String() == downstreamTypeCopy.String() +} ``` ### FlyteKit #### pyflyte run @@ -275,39 +320,101 @@ The behavior will remain unchanged. We will pass the value to our class, which inherits from `click.ParamType`, and use the corresponding type transformer to convert the input to the correct type. ### Dict Transformer +There are 2 cases in Dict Transformer, `Dict[type, type]` and `dict`. +For `Dict[type, type]`, we will stay everything the same as before. -| **Stage** | **Conversion** | **Description** | -| --- | --- |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| **Before** | Python Value to Literal | 1. `Dict[type, type]` uses type hints to construct a LiteralMap.
2. `dict` uses `JSON.dumps` to turn a `dict` value to a JSON string, and store it to Protobuf Struct. | -| | Literal to Python Value | 1. `Dict[type, type]` uses type hints to convert LiteralMap to Python Value.
2. `dict` uses `JSON.loads` to turn a JSON string into a dict value and store it to Protobuf Struct. | -| **After** | Python Value to Literal | 1. `Dict[type, type]` stays the same.
2. `dict` uses `msgpack.dumps` to turn a dict into msgpack bytes, and store it to Protobuf JSON. | -| | Literal to Python Value | 1. `Dict[type, type]` uses type hints to convert LiteralMap to Python Value.
2. `dict` conversion: msgpack bytes -> dict value, method: `msgpack.loads`. | +#### Literal Value +For `dict`, the life cycle of it will be as below. + +Before: +- `to_literal`: `dict` -> `JSON String` -> `Protobuf Struct` +- `to_python_val`: `Protobuf Struct` -> `JSON String` -> `dict` + +After: +- `to_literal`: `dict` -> `msgpack bytes` -> `Binary(value=b'msgpack_bytes', tag="msgpack")` +- `to_python_val`: `Binary(value=b'msgpack_bytes', tag="msgpack")` -> `msgpack bytes` -> `dict` +#### JSON Schema +The JSON Schema of `dict` will be empty. ### Dataclass Transformer +#### Literal Value +Before: +- `to_literal`: `dataclass` -> `JSON String` -> `Protobuf Struct` +- `to_python_val`: `Protobuf Struct` -> `JSON String` -> `dataclass` + +After: +- `to_literal`: `dataclass` -> `msgpack bytes` -> `Binary(value=b'msgpack_bytes', tag="msgpack")` +- `to_python_val`: `Binary(value=b'msgpack_bytes', tag="msgpack")` -> `msgpack bytes` -> `dataclass` -| **Stage** | **Conversion** | **Description** | -| --- | --- |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| **Before** | Python Value to Literal | Uses `mashumaro JSON Encoder` to turn a dataclass value to a JSON string, and store it to Protobuf `Struct`. | -| | Literal to Python Value | Uses `mashumaro JSON Decoder` to turn a JSON string to a python value, and recursively fixed int attributes to int (it will be float because we stored it in to `Struct`). | -| **After** | Python Value to Literal | Uses `mashumaro MessagePackEncoder` to convert a dataclass value into msgpack bytes, storing them in the Protobuf `JSON` field. | -| | Literal to Python Value | Uses `mashumaro MessagePackDecoder` to convert msgpack bytes back into a Python value. | +Note: We will use mashumaro's `MessagePackEncoder` and `MessagePackDecoder` to serialize and deserialize dataclass value in python. +```python +from mashumaro.codecs.msgpack import MessagePackDecoder, MessagePackEncoder +``` + +#### JSON Schema +The JSON Schema of `dataclass` will be generated by `marshmallow` or `mashumaro`. +Check here: https://github.com/flyteorg/flytekit/blob/8c6f6f0f17d113447e1b10b03e25a34bad79685c/flytekit/core/type_engine.py#L442-L474 ### Pydantic Transformer +#### Literal Value +Pydantic can't be serialized to `msgpack bytes` directly. +But `dict` can. -| **Stage** | **Conversion** | **Description** | -| --- | --- | --- | -| **Before** | Python Value to Literal | Convert `BaseModel` to a JSON string, and then convert it to a Protobuf `Struct`. | -| | Literal to Python Value | Convert Protobuf `Struct` to a JSON string and then convert it to a `BaseModel`. | -| **After** | Python Value to Literal | Converts the Pydantic `BaseModel` to a dictionary, then serializes it into msgpack bytes using `msgpack.dumps`. | -| | Literal to Python Value | Deserializes `msgpack` bytes into a dictionary, then converts it back into a Pydantic `BaseModel`. | +- `to_literal`: `BaseModel` -> `dict` -> `msgpack bytes` -> `Binary(value=b'msgpack_bytes', tag="msgpack")` +- `to_python_val`: `Binary(value=b'msgpack_bytes', tag="msgpack")` -> `msgpack bytes` -> `dict` -> `BaseModel` Note: Pydantic BaseModel can't be serialized directly by `msgpack`, but this implementation will still ensure 100% correct. -### FlyteCtl -In FlyteCtl, we can construct input for the execution, so we have to make sure the values we passed to FlyteAdmin -can all be constructed to Literal. +```python +@dataclass +class DC_inside: + a: int + b: float + +@dataclass +class DC: + a: int + b: float + c: str + d: Dict[str, int] + e: DC_inside + +class MyDCModel(BaseModel): + dc: DC + +my_dc = MyDCModel(dc=DC(a=1, b=2.0, c="3", d={"4": 5}, e=DC_inside(a=6, b=7.0))) +# {'dc': {'a': 1, 'b': 2.0, 'c': '3', 'd': {'4': 5}, 'e': {'a': 6, 'b': 7.0}}} +``` -reference: https://github.com/flyteorg/flytectl/blob/131d6a20c7db601ca9156b8d43d243bc88669829/cmd/create/serialization_utils.go#L48 +#### JSON Schema +The JSON Schema of `BaseModel` will be generated by Pydantic's API. +```python +@dataclass +class DC_inside: + a: int + b: float + +@dataclass +class DC: + a: int + b: float + c: str + d: Dict[str, int] + e: DC_inside + +class MyDCModel(BaseModel): + dc: DC + +my_dc = MyDCModel(dc=DC(a=1, b=2.0, c="3", d={"4": 5}, e=DC_inside(a=6, b=7.0))) +my_dc.model_json_schema() +""" +{'$defs': {'DC': {'properties': {'a': {'title': 'A', 'type': 'integer'}, 'b': {'title': 'B', 'type': 'number'}, 'c': {'title': 'C', 'type': 'string'}, 'd': {'additionalProperties': {'type': 'integer'}, 'title': 'D', 'type': 'object'}, 'e': {'$ref': '#/$defs/DC_inside'}}, 'required': ['a', 'b', 'c', 'd', 'e'], 'title': 'DC', 'type': 'object'}, 'DC_inside': {'properties': {'a': {'title': 'A', 'type': 'integer'}, 'b': {'title': 'B', 'type': 'number'}}, 'required': ['a', 'b'], 'title': 'DC_inside', 'type': 'object'}}, 'properties': {'dc': {'$ref': '#/$defs/DC'}}, 'required': ['dc'], 'title': 'MyDCModel', 'type': 'object'} +""" +``` + +### FlyteCtl +In FlyteCtl, we can construct input for the execution. +We can ### FlyteConsole #### Show input/output on FlyteConsole From fd4aa0046cd78c309498711618f1d92774fc44b1 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Thu, 12 Sep 2024 20:35:05 +0800 Subject: [PATCH 05/26] TBD: flyteconsole Signed-off-by: Future-Outlier --- .../5741-binary-idl-with-message-pack.md | 43 +++++++++++++------ 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index d5f838bfe3..cd96d5f059 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -414,16 +414,40 @@ my_dc.model_json_schema() ### FlyteCtl In FlyteCtl, we can construct input for the execution. -We can + +We can construct a `Binary IDL Object` when we receive `Literal Type Struct`. + +In `flyteidl/clients/go/coreutils/literals.go`: +```go +if newT.Simple == core.SimpleType_STRUCT { + if _, isValueStringType := v.(string); !isValueStringType { + byteValue, err := msgpack.Marshal(v) + if err != nil { + return nil, fmt.Errorf("unable to marshal to json string for struct value %v", v) + } + strValue = string(byteValue) + } +} +``` ### FlyteConsole #### Show input/output on FlyteConsole -We will get the node's input and output literal values by FlyteAdmin’s API, and obtain the JSON IDL bytes from the literal value. -We can use MsgPack dumps the MsgPack into a dictionary, and shows it to the flyteconsole. +1. Get Bytes from the Binary IDL Object. +2. Get Tag from the Binary IDL Object. +3. Use Tag to determine how to deserialize `Bytes`, and show the deserialized value in FlyteConsole. + +#### Copy Input + + #### Construct Input +##### Input Bytes +1. Encode + We should use `msgpack.encode` to encode input value and store it to the literal’s JSON field. +##### Launch Form + ## 4 Metrics & Dashboards None @@ -438,19 +462,12 @@ None, it's doable. ## 7 Potential Impact and Dependencies -We should check whether `serialization_format` is specified and supported in the Flyte backend, Flytekit, and Flyteconsole. Currently, we use `msgpack` as our default serialization format. - -In the future, we might want to support different JSON types such as "eJSON" or "ndJSON." We can add `json_type` to the JSON IDL to accommodate this. - -There are 3 reasons why we add `serialization_format` to the JSON IDL rather than the literal's `metadata`: -1. Metadata use cases are more related to when the data is created, where the data is stored, etc. -2. This is required information for all JSON IDLs, and it will seem more important if we include it as a field in the IDL. -3. If we want to add `json_type` or other JSON IDL-specific use cases in the future, we can include them in the JSON IDL field, making it more readable. +None. ## 8 Unresolved questions None. ## 9 Conclusion -MsgPack is better because it's more smaller and faster. +MsgPack is a good choice because it's more smaller and faster than UTF-8 Encoded JSON String. You can see the performance comparison here: https://github.com/flyteorg/flyte/pull/5607#issuecomment-2333174325 -We will use `msgpack` to do it. \ No newline at end of file +We will use `msgpack` to do it. From 933bc387f3bf4106b34523d62a6d3f962dad8989 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Thu, 12 Sep 2024 20:54:35 +0800 Subject: [PATCH 06/26] TBD: flyteconsole Signed-off-by: Future-Outlier --- rfc/system/5741-binary-idl-with-message-pack.md | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index cd96d5f059..48618817d4 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -315,6 +315,13 @@ func (t trivialChecker) CastsFrom(upstreamType *flyte.LiteralType) bool { } ``` ### FlyteKit +#### Attribute Access +In most transformers, we should create a function `from_binary` to convert the Binary IDL Object into the desired type. + +When performing attribute access, Propeller will deserialize the msgpack bytes into a map object, retrieve the attribute, and then serialize it back into msgpack bytes (a Binary IDL Object containing msgpack bytes). + +This means that when converting a literal to a Python value, we will receive `msgpack bytes` instead of our `expected Python type`. + #### pyflyte run The behavior will remain unchanged. We will pass the value to our class, which inherits from `click.ParamType`, and use the corresponding type transformer to convert the input to the correct type. @@ -438,15 +445,17 @@ if newT.Simple == core.SimpleType_STRUCT { 3. Use Tag to determine how to deserialize `Bytes`, and show the deserialized value in FlyteConsole. #### Copy Input - +Return `MessagePack Bytes` to your clipboard, it can be used in `Input Bytes` below. #### Construct Input ##### Input Bytes -1. Encode - -We should use `msgpack.encode` to encode input value and store it to the literal’s JSON field. +1. Input `MessagePack Bytes` to the console, for example, `{"a": 1}` in python will be `b'\x81\xa1a\x01'`. +2. Tag can only use `msgpack`, it's forced now. ##### Launch Form +1. Use `JSON Schema` to know what every input field, and make users input it 1 by 1. +2. Serialize the input value into `MessagePack Bytes` +3. Create a `Binary IDL Object`, value is `MessagePack Bytes` and tag is `msgpack`. ## 4 Metrics & Dashboards From 78093383fb8e2e7af2b47fe588e92bc322fedb4c Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Thu, 12 Sep 2024 20:55:09 +0800 Subject: [PATCH 07/26] RFC Done Signed-off-by: Future-Outlier From 90bdcfffaa12b0cdb51ccaef214187ac09edef6f Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Thu, 12 Sep 2024 20:58:53 +0800 Subject: [PATCH 08/26] nit Signed-off-by: Future-Outlier --- rfc/system/5741-binary-idl-with-message-pack.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index 48618817d4..9084d49444 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -12,6 +12,7 @@ ## 1 Executive Summary ### Literal Value Literal Value will be `Binary`. + Use `bytes` in `Binary` instead of `Protobuf struct`. - To Literal @@ -46,8 +47,12 @@ Note: The `metadata` of `Literal Type` and `Literal Value` are not the same. ## 2 Motivation In Flytekit, when handling dataclasses, Pydantic base models, and dictionaries, we store data using a JSON string within Protobuf struct datatype. + This approach causes issues with integers, as Protobuf struct does not support int types, leading to their conversion to floats. -This results in performance issues since we need to recursively iterate through all attributes/keys in dataclasses and dictionaries to ensure floats types are converted to int. In addition to performance issues, the required code is complicated and error prone. + +This results in performance issues since we need to recursively iterate through all attributes/keys in dataclasses and dictionaries to ensure floats types are converted to int. + +In addition to performance issues, the required code is complicated and error prone. Note: We have more than 10 issues about dict, dataclass and Pydantic. @@ -316,18 +321,20 @@ func (t trivialChecker) CastsFrom(upstreamType *flyte.LiteralType) bool { ``` ### FlyteKit #### Attribute Access -In most transformers, we should create a function `from_binary` to convert the Binary IDL Object into the desired type. +In most transformers, we should create a function `from_binary_idl` to convert the Binary IDL Object into the desired type. When performing attribute access, Propeller will deserialize the msgpack bytes into a map object, retrieve the attribute, and then serialize it back into msgpack bytes (a Binary IDL Object containing msgpack bytes). This means that when converting a literal to a Python value, we will receive `msgpack bytes` instead of our `expected Python type`. #### pyflyte run -The behavior will remain unchanged. +The behavior will remain unchanged. + We will pass the value to our class, which inherits from `click.ParamType`, and use the corresponding type transformer to convert the input to the correct type. ### Dict Transformer There are 2 cases in Dict Transformer, `Dict[type, type]` and `dict`. + For `Dict[type, type]`, we will stay everything the same as before. #### Literal Value @@ -478,5 +485,6 @@ None. ## 9 Conclusion MsgPack is a good choice because it's more smaller and faster than UTF-8 Encoded JSON String. + You can see the performance comparison here: https://github.com/flyteorg/flyte/pull/5607#issuecomment-2333174325 We will use `msgpack` to do it. From 16b741e3426f0f8681193a9b4fc68c335d15105b Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Mon, 16 Sep 2024 10:07:29 +0800 Subject: [PATCH 09/26] Update Yee's advice Signed-off-by: Future-Outlier --- rfc/system/5741-binary-idl-with-message-pack.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index 9084d49444..e4e5f8486f 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -27,11 +27,19 @@ Use `bytes` in `Binary` instead of `Protobuf struct`. |-----------------------------------|----------------------------------------------| | Protobuf Struct -> JSON String -> Python Val | Binary (value: MessagePack Bytes, tag: msgpack) IDL Object -> Bytes -> (Dict ->) -> Python Val | -Note: if a python value can't directly be converted to `MessagePack Bytes`, we can convert it to `Dict`, and then convert it to `MessagePack Bytes`. -For example, the pydantic to literal function will be `BaseModel` -> `dict` -> `MessagePack Bytes` -> `Binary (value: MessagePack Bytes, tag: msgpack) IDL Object`. +Note: + +1. If a Python value can't be directly converted to `MessagePack Bytes`, we can first convert it to a `Dict`, and then convert it to `MessagePack Bytes`. + + - **For example:** The Pydantic-to-literal function workflow will be: + `BaseModel` -> `dict` -> `MessagePack Bytes` -> `Binary (value: MessagePack Bytes, tag: msgpack) IDL Object`. + + - **For pure `dict` in Python:** The to-literal function workflow will be: + `dict` -> `MessagePack Bytes` -> `Binary (value: MessagePack Bytes, tag: msgpack) IDL Object`. + +2. There is **NO JSON** involved in the new type at all. Only **JSON Schema** is used to construct `DataClass` or `Pydantic BaseModel`. -For pure `dict` in python, the to literal function will be `dict` -> `MessagePack Bytes` -> `Binary (value: MessagePack Bytes, tag: msgpack) IDL Object`. ### Literal Type Literal Type will be `Protobuf struct`. From b93569a4d41f7f81600f259d54b92bdc6284d453 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Mon, 16 Sep 2024 15:32:57 +0800 Subject: [PATCH 10/26] Update msgpack golang library Signed-off-by: Future-Outlier --- .../5741-binary-idl-with-message-pack.md | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index e4e5f8486f..3cc60f50e3 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -122,7 +122,7 @@ package main import ( "fmt" - "github.com/vmihailenco/msgpack/v5" + "github.com/shamaton/msgpack/v2" ) func main() { @@ -150,7 +150,25 @@ func main() { } ``` -reference: https://github.com/vmihailenco/msgpack +reference: [shamaton/msgpack GitHub Repository](https://github.com/shamaton/msgpack) + +Notes: + +1. **MessagePack Implementations**: + - We can explore all MessagePack implementations for Golang at the [MessagePack official website](https://msgpack.org/index.html). + +2. **Library Comparison**: + - The library [github.com/vmihailenco/msgpack](https://github.com/vmihailenco/msgpack) doesn't support strict type deserialization (for example, `map[int]string`), but `"github.com/shamaton/msgpack/v2"` supports this feature. This is super important for backward copmatibility. + +3. **Library Popularity**: + - While `"github.com/shamaton/msgpack/v2"` has fewer stars on GitHub, it has proven to be reliable in various test cases. All cases created by me have passed successfully, which you can find in this [pull request](https://github.com/flyteorg/flytekit/pull/2751). + +4. **Project Activity**: + - `"github.com/shamaton/msgpack/v2"` is still an actively maintained project. The author responds quickly to issues and questions, making it a well-supported choice for projects requiring ongoing maintenance and active support. + +5. **Testing Process**: + - I initially started with [github.com/vmihailenco/msgpack](https://github.com/vmihailenco/msgpack) but switched to `"github.com/shamaton/msgpack/v2"` due to its better support for strict typing and the active support provided by the author. + ##### JavaScript ```javascript From 343402ca5b40d448cbc686663215c12026c239d5 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Mon, 16 Sep 2024 15:35:39 +0800 Subject: [PATCH 11/26] Update msgpack golang library Signed-off-by: Future-Outlier --- rfc/system/5741-binary-idl-with-message-pack.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index 3cc60f50e3..cdd06e2708 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -161,10 +161,10 @@ Notes: - The library [github.com/vmihailenco/msgpack](https://github.com/vmihailenco/msgpack) doesn't support strict type deserialization (for example, `map[int]string`), but `"github.com/shamaton/msgpack/v2"` supports this feature. This is super important for backward copmatibility. 3. **Library Popularity**: - - While `"github.com/shamaton/msgpack/v2"` has fewer stars on GitHub, it has proven to be reliable in various test cases. All cases created by me have passed successfully, which you can find in this [pull request](https://github.com/flyteorg/flytekit/pull/2751). + - While [github.com/shamaton/msgpack/v2](https://github.com/shamaton/msgpack) has fewer stars on GitHub, it has proven to be reliable in various test cases. All cases created by me have passed successfully, which you can find in this [pull request](https://github.com/flyteorg/flytekit/pull/2751). 4. **Project Activity**: - - `"github.com/shamaton/msgpack/v2"` is still an actively maintained project. The author responds quickly to issues and questions, making it a well-supported choice for projects requiring ongoing maintenance and active support. + - [github.com/shamaton/msgpack/v2](https://github.com/shamaton/msgpack) is still an actively maintained project. The author responds quickly to issues and questions, making it a well-supported choice for projects requiring ongoing maintenance and active support. 5. **Testing Process**: - I initially started with [github.com/vmihailenco/msgpack](https://github.com/vmihailenco/msgpack) but switched to `"github.com/shamaton/msgpack/v2"` due to its better support for strict typing and the active support provided by the author. From affc1821101b630ec71c32372e67cdba6e57048d Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Mon, 16 Sep 2024 20:23:51 +0800 Subject: [PATCH 12/26] better url reference Signed-off-by: Future-Outlier --- rfc/system/5741-binary-idl-with-message-pack.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index cdd06e2708..3a4b398b06 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -158,7 +158,7 @@ Notes: - We can explore all MessagePack implementations for Golang at the [MessagePack official website](https://msgpack.org/index.html). 2. **Library Comparison**: - - The library [github.com/vmihailenco/msgpack](https://github.com/vmihailenco/msgpack) doesn't support strict type deserialization (for example, `map[int]string`), but `"github.com/shamaton/msgpack/v2"` supports this feature. This is super important for backward copmatibility. + - The library [github.com/vmihailenco/msgpack](https://github.com/vmihailenco/msgpack) doesn't support strict type deserialization (for example, `map[int]string`), but [github.com/shamaton/msgpack/v2](https://github.com/shamaton/msgpack) supports this feature. This is super important for backward copmatibility. 3. **Library Popularity**: - While [github.com/shamaton/msgpack/v2](https://github.com/shamaton/msgpack) has fewer stars on GitHub, it has proven to be reliable in various test cases. All cases created by me have passed successfully, which you can find in this [pull request](https://github.com/flyteorg/flytekit/pull/2751). @@ -167,7 +167,7 @@ Notes: - [github.com/shamaton/msgpack/v2](https://github.com/shamaton/msgpack) is still an actively maintained project. The author responds quickly to issues and questions, making it a well-supported choice for projects requiring ongoing maintenance and active support. 5. **Testing Process**: - - I initially started with [github.com/vmihailenco/msgpack](https://github.com/vmihailenco/msgpack) but switched to `"github.com/shamaton/msgpack/v2"` due to its better support for strict typing and the active support provided by the author. + - I initially started with [github.com/vmihailenco/msgpack](https://github.com/vmihailenco/msgpack) but switched to [github.com/shamaton/msgpack/v2](https://github.com/shamaton/msgpack) due to its better support for strict typing and the active support provided by the author. ##### JavaScript From 8c6b57c0581e3d44a281b5d73da661d650191277 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Mon, 16 Sep 2024 23:05:11 +0800 Subject: [PATCH 13/26] add conditional problem in msgpack Signed-off-by: Future-Outlier --- rfc/system/5741-binary-idl-with-message-pack.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index 3a4b398b06..34046dec6e 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -506,8 +506,17 @@ None, it's doable. ## 7 Potential Impact and Dependencies None. -## 8 Unresolved questions -None. +## 8. Unresolved Questions + +Currently, our support for `DataClass/BaseModel/Dict[type, type]` within conditional branches is incomplete. At present, we only support comparisons of primitive types. However, there are two key challenges when attempting to handle these more complex types: + +1. **Primitive Type Comparison vs. Binary IDL Object:** + - In conditional branches, we receive a `Binary IDL Object` during type comparison, which needs to be converted into a `Primitive IDL Object`. + - The issue is that we don't know the expected Python type or primitive type beforehand, making this conversion ambiguous. + +2. **MsgPack Incompatibility with `Primitive_Datetime` and `Primitive_Duration`:** + - MsgPack does not natively support the `Primitive_Datetime` and `Primitive_Duration` types, and instead converts them to strings. + - This can lead to inconsistencies in comparison logic. One potential workaround is to convert both types to strings for comparison. However, it is uncertain whether this approach is the best solution. ## 9 Conclusion MsgPack is a good choice because it's more smaller and faster than UTF-8 Encoded JSON String. From eda65ef52348529799a5438d7a394f2ea6540298 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Tue, 17 Sep 2024 09:31:59 +0800 Subject: [PATCH 14/26] update yee's advice Signed-off-by: Future-Outlier --- rfc/system/5741-binary-idl-with-message-pack.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index 34046dec6e..2e8b7f965b 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -54,7 +54,7 @@ Note: The `metadata` of `Literal Type` and `Literal Value` are not the same. ## 2 Motivation -In Flytekit, when handling dataclasses, Pydantic base models, and dictionaries, we store data using a JSON string within Protobuf struct datatype. +Currently, before this RFC, in flytekit, when handling dataclasses, Pydantic base models, and dictionaries, we store data using a JSON string within Protobuf struct datatype. This approach causes issues with integers, as Protobuf struct does not support int types, leading to their conversion to floats. From 8a822ff6bb0e4e55b70fc2cb01cb7183c69b5855 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Tue, 17 Sep 2024 09:35:26 +0800 Subject: [PATCH 15/26] use SimpleType.STRUCT for literal type Signed-off-by: Future-Outlier --- rfc/system/5741-binary-idl-with-message-pack.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index 2e8b7f965b..b1925f923e 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -42,10 +42,10 @@ Note: ### Literal Type -Literal Type will be `Protobuf struct`. +Literal Type will be `SimpleType.STRUCT`. `Json Schema` will be stored in `Literal Type's metadata`. -1. Dataclass, Pydantic BaseModel and pure dict in python will all use `Protobuf Struct`. +1. Dataclass, Pydantic BaseModel and pure dict in python will all use `SimpleType.STRUCT`. 2. We will put `Json Schema` in Literal Type's `metadata` field, this will be used in flytekit remote api to construct dataclass/Pydantic BaseModel by `Json Schema`. 3. We will use libraries written in golang to compare `Json Schema` to solve this issue: ["[BUG] Union types fail for e.g. two different dataclasses"](https://github.com/flyteorg/flyte/issues/5489). @@ -455,7 +455,7 @@ my_dc.model_json_schema() ### FlyteCtl In FlyteCtl, we can construct input for the execution. -We can construct a `Binary IDL Object` when we receive `Literal Type Struct`. +We can construct a `Binary IDL Object` when we receive `SimpleType.STRUCT`. In `flyteidl/clients/go/coreutils/literals.go`: ```go From f33ef39aa2b70638c7e654b17dcb6d89b62eb8b0 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Tue, 17 Sep 2024 09:36:39 +0800 Subject: [PATCH 16/26] update yee's advice Signed-off-by: Future-Outlier --- rfc/system/5741-binary-idl-with-message-pack.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index b1925f923e..6ee1c04774 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -49,12 +49,11 @@ Literal Type will be `SimpleType.STRUCT`. 2. We will put `Json Schema` in Literal Type's `metadata` field, this will be used in flytekit remote api to construct dataclass/Pydantic BaseModel by `Json Schema`. 3. We will use libraries written in golang to compare `Json Schema` to solve this issue: ["[BUG] Union types fail for e.g. two different dataclasses"](https://github.com/flyteorg/flyte/issues/5489). - Note: The `metadata` of `Literal Type` and `Literal Value` are not the same. ## 2 Motivation -Currently, before this RFC, in flytekit, when handling dataclasses, Pydantic base models, and dictionaries, we store data using a JSON string within Protobuf struct datatype. +Prior to this RFC, in flytekit, when handling dataclasses, Pydantic base models, and dictionaries, we store data using a JSON string within Protobuf struct datatype. This approach causes issues with integers, as Protobuf struct does not support int types, leading to their conversion to floats. From 5c8604f7097d8a3551b9450f62985c6c9e5a082a Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Tue, 17 Sep 2024 13:24:52 +0800 Subject: [PATCH 17/26] update flyteidl part with permenant link Signed-off-by: Future-Outlier --- rfc/system/5741-binary-idl-with-message-pack.md | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index 6ee1c04774..318face3f9 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -196,14 +196,10 @@ reference: https://github.com/msgpack/msgpack-javascript ### FlyteIDL #### Literal Value -```proto -// A simple byte array with a tag to help different parts of the system communicate about what is in the byte array. -// It's strongly advisable that consumers of this type define a unique tag and validate the tag before parsing the data. -message Binary { - bytes value = 1; // Serialized data (MessagePack) for supported types like Dataclass, Pydantic BaseModel, and dict. - string tag = 2; // The serialization format identifier (e.g., MessagePack). Consumers must define unique tags and validate them before deserialization. -} -``` + +Here is the [IDL definition](https://github.com/flyteorg/flyte/blob/7989209e15600b56fcf0f4c4a7c9af7bfeab6f3e/flyteidl/protos/flyteidl/core/literals.proto#L42-L47). + +The `bytes` field is used for serialized data, and the `tag` field specifies the serialization format identifier. #### Literal Type ```proto import "google/protobuf/struct.proto"; From 8f01c8a9b762e3e9f3bf7cef3aa95465f389ef54 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Tue, 17 Sep 2024 13:46:05 +0800 Subject: [PATCH 18/26] update flytectl part by Yee's advice Signed-off-by: Future-Outlier --- .../5741-binary-idl-with-message-pack.md | 33 +++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index 318face3f9..d81eeb95c2 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -448,11 +448,11 @@ my_dc.model_json_schema() ``` ### FlyteCtl + In FlyteCtl, we can construct input for the execution. -We can construct a `Binary IDL Object` when we receive `SimpleType.STRUCT`. +When we receive `SimpleType.STRUCT`, we can construct a `Binary IDL Object` using the following logic in `flyteidl/clients/go/coreutils/literals.go`: -In `flyteidl/clients/go/coreutils/literals.go`: ```go if newT.Simple == core.SimpleType_STRUCT { if _, isValueStringType := v.(string); !isValueStringType { @@ -463,7 +463,34 @@ if newT.Simple == core.SimpleType_STRUCT { strValue = string(byteValue) } } -``` +``` + +This is how users can create an execution by using a YAML file: +```bash +flytectl create execution --execFile ./flytectl/create_dataclass_task.yaml -p flytesnacks -d development +``` + +Example YAML file (`create_dataclass_task.yaml`): +```yaml +iamRoleARN: "" +inputs: + input: + a: 1 + b: 3.14 + c: example_string + d: + "1": 100 + "2": 200 + e: + a: 1 + b: 3.14 +envs: {} +kubeServiceAcct: "" +targetDomain: "" +targetProject: "" +task: dataclass_example.dataclass_task +version: OSyTikiBTAkjBgrL5JVOVw +``` ### FlyteConsole #### Show input/output on FlyteConsole From 6de3b3d40827bdb16058f6bdb4a3080d964c0947 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Tue, 17 Sep 2024 14:36:25 +0800 Subject: [PATCH 19/26] update comments for the flytekit part Signed-off-by: Future-Outlier --- .../5741-binary-idl-with-message-pack.md | 47 +++++++++++++++++-- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index d81eeb95c2..48bd8bed3f 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -108,7 +108,7 @@ def to_literal(): def to_python_value(): # lv: literal value if lv.scalar.binary.tag == "msgpack": - msgpack_bytes = lv.scalar.json.value + msgpack_bytes = lv.scalar.binary.value else: raise ValueError(f"{tag} is not supported to decode this Binary Literal: {lv.scalar.binary}.") return msgpack.loads(msgpack_bytes) @@ -157,7 +157,7 @@ Notes: - We can explore all MessagePack implementations for Golang at the [MessagePack official website](https://msgpack.org/index.html). 2. **Library Comparison**: - - The library [github.com/vmihailenco/msgpack](https://github.com/vmihailenco/msgpack) doesn't support strict type deserialization (for example, `map[int]string`), but [github.com/shamaton/msgpack/v2](https://github.com/shamaton/msgpack) supports this feature. This is super important for backward copmatibility. + - The library [github.com/vmihailenco/msgpack](https://github.com/vmihailenco/msgpack) doesn't support strict type deserialization (for example, `map[int]string`), but [github.com/shamaton/msgpack/v2](https://github.com/shamaton/msgpack) supports this feature. This is super important for backward compatibility. 3. **Library Popularity**: - While [github.com/shamaton/msgpack/v2](https://github.com/shamaton/msgpack) has fewer stars on GitHub, it has proven to be reliable in various test cases. All cases created by me have passed successfully, which you can find in this [pull request](https://github.com/flyteorg/flytekit/pull/2751). @@ -342,11 +342,48 @@ func (t trivialChecker) CastsFrom(upstreamType *flyte.LiteralType) bool { ``` ### FlyteKit #### Attribute Access -In most transformers, we should create a function `from_binary_idl` to convert the Binary IDL Object into the desired type. -When performing attribute access, Propeller will deserialize the msgpack bytes into a map object, retrieve the attribute, and then serialize it back into msgpack bytes (a Binary IDL Object containing msgpack bytes). +In all transformers, we should implement a function called `from_binary_idl` to convert the Binary IDL Object into the desired type. -This means that when converting a literal to a Python value, we will receive `msgpack bytes` instead of our `expected Python type`. +A base method can be added to the `TypeTransformer` class, allowing child classes to override it as needed. + +During attribute access, Flyte Propeller will deserialize the msgpack bytes into a map object in golang, retrieve the specific attribute, and then serialize it back into msgpack bytes (resulting in a Binary IDL Object containing msgpack bytes). + +This implies that when converting a literal to a Python value, we will receive `msgpack bytes` instead of the `expected Python type`. + +```python +# In Mashumaro, the default encoder uses strict_map_key=False, while the default decoder uses strict_map_key=True. +# This is relevant for cases like Dict[int, str]. +# If strict_map_key=False is not used, the decoder will raise an error when trying to decode keys that are not strictly typed. +def _default_flytekit_decoder(data: bytes) -> Any: + return msgpack.unpackb(data, raw=False, strict_map_key=False) + + +def from_binary_idl(self, binary_idl_object: Binary, expected_python_type: Type[T]) -> Optional[T]: + # Handle msgpack serialization + if binary_idl_object.tag == "msgpack": + try: + # Retrieve the existing decoder for the expected type + decoder = self._msgpack_decoder[expected_python_type] + except KeyError: + # Create a new decoder if not already cached + decoder = MessagePackDecoder(expected_python_type, pre_decoder_func=_default_flytekit_decoder) + self._msgpack_decoder[expected_python_type] = decoder + # Decode the binary IDL object into the expected Python type + return decoder.decode(binary_idl_object.value) + else: + # Raise an error if the binary format is not supported + raise TypeTransformerFailedError(f"Unsupported binary format {binary_idl_object.tag}") +``` + +Note: +1. This base method can handle primitive types, nested typed dictionaries, nested typed lists, and combinations of nested typed dictionaries and lists. + +2. Dataclass transformer needs its own `from_binary_idl` method to handle specific cases such as [discriminated classes](https://github.com/flyteorg/flyte/issues/5588). + +3. Flyte types (e.g., FlyteFile, FlyteDirectory, StructuredDataset, and FlyteSchema) will need their own `from_binary_idl` methods, as they must handle downloading files from remote object storage when converting literals to Python values. + +For example, see the FlyteFile implementation: https://github.com/flyteorg/flytekit/pull/2751/files#diff-22cf9c7153b54371b4a77331ddf276a082cf4b3c5e7bd1595dd67232288594fdR522-R552 #### pyflyte run The behavior will remain unchanged. From 434855aa511dd6d1610526bec6563b422ee96ddd Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Tue, 17 Sep 2024 16:27:03 +0800 Subject: [PATCH 20/26] use msgpack as default tag in func MakeBinaryLiteral Signed-off-by: Future-Outlier --- rfc/system/5741-binary-idl-with-message-pack.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index 48bd8bed3f..94b586b219 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -277,14 +277,15 @@ func MakeDefaultLiteralForType(typ *core.LiteralType) (*core.Literal, error) { } } // Use Message Pack as Default Tag for deserialization. -func MakeBinaryLiteral(v []byte) *core.Literal { +// "tag" will default be "msgpack" +func MakeBinaryLiteral(v []byte, tag string) *core.Literal { return &core.Literal{ Value: &core.Literal_Scalar{ Scalar: &core.Scalar{ Value: &core.Scalar_Binary{ Binary: &core.Binary{ Value: v, - Tag: "msgpack", + Tag: tag, }, }, }, From 0e8ed0395caf080f455fe71d20a0c522621d339c Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Tue, 17 Sep 2024 16:49:25 +0800 Subject: [PATCH 21/26] Add FlyteCopilot section Signed-off-by: Future-Outlier --- rfc/system/5741-binary-idl-with-message-pack.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index 94b586b219..6c43b1b0d3 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -530,6 +530,16 @@ task: dataclass_example.dataclass_task version: OSyTikiBTAkjBgrL5JVOVw ``` +### FlyteCopilot + +When we need to pass an attribute access value to a copilot task, we must modify the code to convert a Binary Literal value with the `msgpack` tag into a primitive value. + +(Currently, we will only support primitive values.) + +You can reference the relevant section of code here: + +[FlyteCopilot - Data Download](https://github.com/flyteorg/flyte/blob/7989209e15600b56fcf0f4c4a7c9af7bfeab6f3e/flytecopilot/data/download.go#L88-L95) + ### FlyteConsole #### Show input/output on FlyteConsole From 71744f3ab296ed64d1e51c2a69ea03e8a67a395e Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Tue, 17 Sep 2024 23:11:12 +0800 Subject: [PATCH 22/26] update all Eduardo's advice Signed-off-by: Future-Outlier --- rfc/system/5741-binary-idl-with-message-pack.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index 6c43b1b0d3..50c9e5432d 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -570,8 +570,11 @@ None ## 6 Alternatives -None, it's doable. +MsgPack is a good choice because it's more smaller and faster than UTF-8 Encoded JSON String. + +You can see the performance comparison here: https://github.com/flyteorg/flyte/pull/5607#issuecomment-2333174325 +We will use `msgpack` to do it. ## 7 Potential Impact and Dependencies None. @@ -589,7 +592,7 @@ Currently, our support for `DataClass/BaseModel/Dict[type, type]` within conditi - This can lead to inconsistencies in comparison logic. One potential workaround is to convert both types to strings for comparison. However, it is uncertain whether this approach is the best solution. ## 9 Conclusion -MsgPack is a good choice because it's more smaller and faster than UTF-8 Encoded JSON String. -You can see the performance comparison here: https://github.com/flyteorg/flyte/pull/5607#issuecomment-2333174325 -We will use `msgpack` to do it. +1. Binary IDL with MessagePack Bytes provides a better representation for dataclasses, Pydantic BaseModels, and untyped dictionaries in Flyte. + +2. This approach ensures 100% accuracy of each attribute and enables attribute access. From 3d122dd91bd0b44a365f64df572efdc2ab2cbd36 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Wed, 18 Sep 2024 00:07:22 +0800 Subject: [PATCH 23/26] update Yee's advice Signed-off-by: Future-Outlier --- .../5741-binary-idl-with-message-pack.md | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index 50c9e5432d..2b21ced4d9 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -171,29 +171,25 @@ Notes: ##### JavaScript ```javascript -import msgpack5 from 'msgpack5'; - -// Create a MessagePack instance -const msgpack = msgpack5(); +import { encode, decode } from '@msgpack/msgpack'; // Example data to encode const data = { a: 1 }; // Encode the data -const encodedData = msgpack.encode(data); +const encodedData = encode(data); // Print the encoded data console.log(encodedData); // // Decode the data -const decodedData = msgpack.decode(encodedData); +const decodedData = decode(encodedData); // Print the decoded data console.log(decodedData); // { a: 1 } ``` reference: https://github.com/msgpack/msgpack-javascript - ### FlyteIDL #### Literal Value @@ -541,6 +537,25 @@ You can reference the relevant section of code here: [FlyteCopilot - Data Download](https://github.com/flyteorg/flyte/blob/7989209e15600b56fcf0f4c4a7c9af7bfeab6f3e/flytecopilot/data/download.go#L88-L95) ### FlyteConsole +#### How users input into launch form? +When FlyteConsole receives a literal type of `SimpleType.STRUCT`, the input method depends on the availability of a JSON schema: + +1. No JSON Schema provided: + +The input is expected as an `Object` (e.g., `{"a": 1}`). + +2. JSON Schema provided: + +Users can input values according to the expected type defined by the schema, and construct an appropriate `Object`. + +Note: + +1. For dataclass and Pydantic BaseModel, we will provide JSON Schema in their literal type, and construct the input form for them. + +#### How inputs/outputs are shown in the console? +We should use `msgpack` to deserialize bytes to an `Object`, and show it in the Flyte Console. + + #### Show input/output on FlyteConsole 1. Get Bytes from the Binary IDL Object. @@ -580,7 +595,7 @@ We will use `msgpack` to do it. None. ## 8. Unresolved Questions - +### Conditional Branch Currently, our support for `DataClass/BaseModel/Dict[type, type]` within conditional branches is incomplete. At present, we only support comparisons of primitive types. However, there are two key challenges when attempting to handle these more complex types: 1. **Primitive Type Comparison vs. Binary IDL Object:** From 123232e4313197ceea0216039f3e7bd1b2b40454 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Wed, 18 Sep 2024 00:21:09 +0800 Subject: [PATCH 24/26] FlyteConsole update Signed-off-by: Future-Outlier --- .../5741-binary-idl-with-message-pack.md | 31 +++++-------------- 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index 2b21ced4d9..58fd12eb3f 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -542,38 +542,21 @@ When FlyteConsole receives a literal type of `SimpleType.STRUCT`, the input meth 1. No JSON Schema provided: -The input is expected as an `Object` (e.g., `{"a": 1}`). +Input is expected as an `Object` (e.g., `{"a": 1}`). 2. JSON Schema provided: -Users can input values according to the expected type defined by the schema, and construct an appropriate `Object`. +Users can input values based on the schema's expected type and construct an appropriate `Object`. Note: -1. For dataclass and Pydantic BaseModel, we will provide JSON Schema in their literal type, and construct the input form for them. +For `dataclass` and Pydantic `BaseModel`, a JSON schema will be provided in their literal type, and the input form will be constructed accordingly. -#### How inputs/outputs are shown in the console? -We should use `msgpack` to deserialize bytes to an `Object`, and show it in the Flyte Console. +#### Displaying Inputs/Outputs in the Console +Use `msgpack` to deserialize bytes into an Object and display it in Flyte Console. - -#### Show input/output on FlyteConsole - -1. Get Bytes from the Binary IDL Object. -2. Get Tag from the Binary IDL Object. -3. Use Tag to determine how to deserialize `Bytes`, and show the deserialized value in FlyteConsole. - -#### Copy Input -Return `MessagePack Bytes` to your clipboard, it can be used in `Input Bytes` below. - -#### Construct Input -##### Input Bytes -1. Input `MessagePack Bytes` to the console, for example, `{"a": 1}` in python will be `b'\x81\xa1a\x01'`. -2. Tag can only use `msgpack`, it's forced now. - -##### Launch Form -1. Use `JSON Schema` to know what every input field, and make users input it 1 by 1. -2. Serialize the input value into `MessagePack Bytes` -3. Create a `Binary IDL Object`, value is `MessagePack Bytes` and tag is `msgpack`. +#### Copying Inputs/Outputs in the Console +Allow users to copy the `Object` to the clipboard, as currently implemented. ## 4 Metrics & Dashboards From b6dfb333bf98017958deefc70b2f3f30cd87b151 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Wed, 18 Sep 2024 15:01:07 +0800 Subject: [PATCH 25/26] update Yee's advice Signed-off-by: Future-Outlier --- rfc/system/5741-binary-idl-with-message-pack.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index 58fd12eb3f..c0b5706cb3 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -542,22 +542,31 @@ When FlyteConsole receives a literal type of `SimpleType.STRUCT`, the input meth 1. No JSON Schema provided: -Input is expected as an `Object` (e.g., `{"a": 1}`). +Input is expected as `a Javascript Object` (e.g., `{"a": 1}`). 2. JSON Schema provided: -Users can input values based on the schema's expected type and construct an appropriate `Object`. +Users can input values based on the schema's expected type and construct an appropriate `Javascript Object`. Note: For `dataclass` and Pydantic `BaseModel`, a JSON schema will be provided in their literal type, and the input form will be constructed accordingly. +##### What happens after the user enters data? + +Input values -> Javascript Object -> msgpack bytes -> Binary IDL With MessagePack Bytes + #### Displaying Inputs/Outputs in the Console Use `msgpack` to deserialize bytes into an Object and display it in Flyte Console. #### Copying Inputs/Outputs in the Console Allow users to copy the `Object` to the clipboard, as currently implemented. +#### Pasting and Copying from FlyteConsole +Currently, we should support JSON pasting if the content is a JavaScript object. However, there's a question of how we might handle other formats like YAML or MsgPack bytes, especially if copied from a binary file. + +For now, focusing on supporting JSON pasting makes sense. However, adding support for YAML and MsgPack bytes could be valuable future enhancements. + ## 4 Metrics & Dashboards None From 3716255d8f5ad36ec199f72624abd754348ff412 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Wed, 18 Sep 2024 15:11:48 +0800 Subject: [PATCH 26/26] update Yee's advice Signed-off-by: Future-Outlier --- rfc/system/5741-binary-idl-with-message-pack.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/rfc/system/5741-binary-idl-with-message-pack.md b/rfc/system/5741-binary-idl-with-message-pack.md index c0b5706cb3..ae04b0903f 100644 --- a/rfc/system/5741-binary-idl-with-message-pack.md +++ b/rfc/system/5741-binary-idl-with-message-pack.md @@ -420,10 +420,18 @@ Note: We will use mashumaro's `MessagePackEncoder` and `MessagePackDecoder` to s from mashumaro.codecs.msgpack import MessagePackDecoder, MessagePackEncoder ``` +#### Literal Type's TypeStructure's dataclass_type +This is used for compiling dataclass attribute access. + +With it, we can retrieve the literal type of an attribute and validate it in Flyte's propeller compiler. + +For more details, check here: https://github.com/flyteorg/flytekit/blob/fb55841f8660b2a31e99381dd06e42f8cd22758e/flytekit/core/type_engine.py#L454-L525 + #### JSON Schema The JSON Schema of `dataclass` will be generated by `marshmallow` or `mashumaro`. Check here: https://github.com/flyteorg/flytekit/blob/8c6f6f0f17d113447e1b10b03e25a34bad79685c/flytekit/core/type_engine.py#L442-L474 + ### Pydantic Transformer #### Literal Value Pydantic can't be serialized to `msgpack bytes` directly. @@ -455,6 +463,13 @@ my_dc = MyDCModel(dc=DC(a=1, b=2.0, c="3", d={"4": 5}, e=DC_inside(a=6, b=7.0))) # {'dc': {'a': 1, 'b': 2.0, 'c': '3', 'd': {'4': 5}, 'e': {'a': 6, 'b': 7.0}}} ``` +#### Literal Type's TypeStructure's dataclass_type +This is used for compiling Pydantic BaseModel attribute access. + +With it, we can retrieve an attribute's literal type and validate it in Flyte's propeller compiler. + +Although this feature is not currently implemented, it will function similarly to the dataclass transformer in the future. + #### JSON Schema The JSON Schema of `BaseModel` will be generated by Pydantic's API. ```python