From 08a3c71757762974fb9987f6c7e3a594bddcf0b8 Mon Sep 17 00:00:00 2001
From: piotrczarnas <141012958+piotrczarnas@users.noreply.github.com>
Date: Sat, 11 Jan 2025 17:52:00 +0100
Subject: [PATCH] Support for Avro files.

---
 CHANGELOG.md                                  |  8 +-
 dqops/pom.xml                                 |  2 +-
 .../Dashboard/DatabaseConnection/index.tsx    |  1 +
 .../FileFormatConfiguration.tsx               | 22 ++++-
 .../AvroFormatConfiguration.tsx               | 35 +++++++
 .../frontend/src/components/SvgIcon/index.tsx |  2 +
 .../src/components/SvgIcon/svg/avro-icon.svg  | 20 ++++
 .../src/pages/CreateConnection/index.tsx      |  3 +
 .../src/main/frontend/src/shared/constants.ts |  6 ++
 .../duckdb/DuckdbParametersSpec.java          | 32 ++++++
 .../duckdb/DuckdbSourceConnection.java        |  7 +-
 .../duckdb/config/DuckdbFilesFormatType.java  |  3 +
 .../id/HierarchyNodeResultVisitor.java        |  9 ++
 .../search/AbstractSearchVisitor.java         | 13 +++
 .../sources/fileformat/FileFormatSpec.java    | 34 +++++++
 .../fileformat/FileFormatSpecProvider.java    |  2 +
 .../fileformat/avro/AvroFileFormatSpec.java   | 99 +++++++++++++++++++
 .../swagger-api/dqops-api-swagger-2.json      | 19 +++-
 .../swagger-api/dqops-api-swagger-2.yaml      | 14 +++
 19 files changed, 323 insertions(+), 8 deletions(-)
 create mode 100644 dqops/src/main/frontend/src/components/FileFormatConfiguration/FormatsConfiguration/AvroFormatConfiguration.tsx
 create mode 100644 dqops/src/main/frontend/src/components/SvgIcon/svg/avro-icon.svg
 create mode 100644 dqops/src/main/java/com/dqops/metadata/sources/fileformat/avro/AvroFileFormatSpec.java

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4b332b73e5..d634a36347 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,5 @@
-# 1.10.1
-
-* Small UI fixes to open pages directly from an URL.
-* Fix problems when installing on Windows using pip, when Python was installed from Windows Store and uses a deeply nested folder structure
+# 1.11.0
+* Fix problems when importing files from a non-existing folder
+* Upgrade DuckDB to 1.1.3
+* Support Avro files

diff --git a/dqops/pom.xml b/dqops/pom.xml
index baa894e211..50187e6e32 100644
--- a/dqops/pom.xml
+++ b/dqops/pom.xml
@@ -36,7 +36,7 @@
         0.43.1
         apache_v2
         build
-        1.1.2
+        1.1.3

diff --git a/dqops/src/main/frontend/src/components/Dashboard/DatabaseConnection/index.tsx b/dqops/src/main/frontend/src/components/Dashboard/DatabaseConnection/index.tsx
index 74e663c4c5..bc903d5e10 100644
--- a/dqops/src/main/frontend/src/components/Dashboard/DatabaseConnection/index.tsx
+++ b/dqops/src/main/frontend/src/components/Dashboard/DatabaseConnection/index.tsx
@@ -254,6 +254,7 @@ const DatabaseConnection = ({
               nameOfDatabase === 'CSV' ||
               nameOfDatabase === 'Parquet' ||
               nameOfDatabase === 'JSON' ||
+              nameOfDatabase === 'Avro' ||
               nameOfDatabase === 'Iceberg' ||
               nameOfDatabase === 'Delta Lake'
             }

diff --git a/dqops/src/main/frontend/src/components/FileFormatConfiguration/FileFormatConfiguration.tsx b/dqops/src/main/frontend/src/components/FileFormatConfiguration/FileFormatConfiguration.tsx
index 08b4e7c4eb..4d01fdfb21 100644
--- a/dqops/src/main/frontend/src/components/FileFormatConfiguration/FileFormatConfiguration.tsx
+++ b/dqops/src/main/frontend/src/components/FileFormatConfiguration/FileFormatConfiguration.tsx
@@ -4,7 +4,8 @@ import {
   DuckdbParametersSpecFilesFormatTypeEnum,
   IcebergFileFormatSpec,
   JsonFileFormatSpec,
-  ParquetFileFormatSpec
+  ParquetFileFormatSpec,
+  AvroFileFormatSpec
 } from '../../api';
 import SectionWrapper from '../Dashboard/SectionWrapper';
 import Select from '../Select';
@@ -12,6 +13,7 @@ import CsvFormatConfiguration from './FormatsConfiguration/CsvFormatConfiguration';
 import IcebergFormatConfiguration from './FormatsConfiguration/IcebergFormatConfiguration';
 import JsonFormatConfiguration from './FormatsConfiguration/JsonFormatConfiguration';
 import ParquetFormatConfiguration from './FormatsConfiguration/ParquetFormatConfiguration';
+import AvroFormatConfiguration from './FormatsConfiguration/AvroFormatConfiguration';
 import { TConfiguration } from './TConfiguration';

 type TFileFormatConfigurationProps = {
@@ -37,6 +39,10 @@ const sourceFilesTypeOptions = [
     label: 'Parquet',
     value: DuckdbParametersSpecFilesFormatTypeEnum.parquet
   },
+  {
+    label: 'Avro',
+    value: DuckdbParametersSpecFilesFormatTypeEnum.avro
+  },
   {
     label: 'Iceberg',
     value: DuckdbParametersSpecFilesFormatTypeEnum.iceberg
@@ -76,6 +82,12 @@ export default function FileFormatConfiguration({
     return fileFormatType === DuckdbParametersSpecFilesFormatTypeEnum.parquet;
   }

+  function isAvroFileFormatSpec(
+    config: TConfiguration
+  ): config is AvroFileFormatSpec {
+    return fileFormatType === DuckdbParametersSpecFilesFormatTypeEnum.avro;
+  }
+
   function isIcebergFileFormatSpec(
     config: TConfiguration
   ): config is IcebergFileFormatSpec {
@@ -108,6 +120,14 @@ export default function FileFormatConfiguration({
         />
       ) : null;
     }
+    case DuckdbParametersSpecFilesFormatTypeEnum.avro: {
+      return isAvroFileFormatSpec(configuration) ? (
+        <AvroFormatConfiguration
+          configuration={configuration}
+          onChangeConfiguration={onChangeConfiguration}
+        />
+      ) : null;
+    }
     case DuckdbParametersSpecFilesFormatTypeEnum.iceberg: {
       return isIcebergFileFormatSpec(configuration) ? (

diff --git a/dqops/src/main/frontend/src/components/FileFormatConfiguration/FormatsConfiguration/AvroFormatConfiguration.tsx b/dqops/src/main/frontend/src/components/FileFormatConfiguration/FormatsConfiguration/AvroFormatConfiguration.tsx
new file mode 100644
--- /dev/null
+++ b/dqops/src/main/frontend/src/components/FileFormatConfiguration/FormatsConfiguration/AvroFormatConfiguration.tsx
@@ -0,0 +1,35 @@
+type TAvroConfigurationProps = {
+  configuration: AvroFileFormatSpec;
+  onChangeConfiguration: (params: Partial<AvroFileFormatSpec>) => void;
+};
+
+export default function AvroFormatConfiguration({
+  configuration,
+  onChangeConfiguration
+}: TAvroConfigurationProps) {
+  const avroConfigurationBooleans: TConfigurationItemRowBoolean[] =
+    useMemo(() => {
+      return [
+        {
+          label: 'Filename',
+          value: configuration?.filename,
+          onChange: (value) =>
+            onChangeConfiguration({ filename: value })
+        }
+      ];
+    }, [configuration]);
+
+  return (
+
+  );
+}

diff --git a/dqops/src/main/frontend/src/components/SvgIcon/index.tsx b/dqops/src/main/frontend/src/components/SvgIcon/index.tsx
index 4c9c981d73..ae46378624 100644
--- a/dqops/src/main/frontend/src/components/SvgIcon/index.tsx
+++ b/dqops/src/main/frontend/src/components/SvgIcon/index.tsx
@@ -38,6 +38,7 @@ import { ReactComponent as CommentSvg } from './svg/comment.svg';
 import { ReactComponent as ConfigurationSvg } from './svg/configuration.svg';
 import { ReactComponent as CopyTextSvg } from './svg/copy-text.svg';
 import { ReactComponent as CsvSvg } from './svg/csv-icon.svg';
+import { ReactComponent as AvroSvg } from './svg/avro-icon.svg';
 import { ReactComponent as DashboardsSvg } from './svg/dashboards.svg';
 import { ReactComponent as DataDictionarySvg } from './svg/data-dictionary.svg';
 import { ReactComponent as DataSourcesSvg } from './svg/data_sources.svg';
@@ -245,6 +246,7 @@ const iconsMap: any = {
   datadictionary: DataDictionarySvg,
   duckdb: DuckdbSvg,
   csv: CsvSvg,
+  avro: AvroSvg,
   json: JsonSvg,
   parquet: ParquetSvg,
   comment: CommentSvg,

diff --git a/dqops/src/main/frontend/src/components/SvgIcon/svg/avro-icon.svg b/dqops/src/main/frontend/src/components/SvgIcon/svg/avro-icon.svg
new file mode 100644
index 0000000000..0c9d2451a9
--- /dev/null
+++ b/dqops/src/main/frontend/src/components/SvgIcon/svg/avro-icon.svg
@@ -0,0 +1,20 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/dqops/src/main/frontend/src/pages/CreateConnection/index.tsx b/dqops/src/main/frontend/src/pages/CreateConnection/index.tsx
index 8ff3389042..86b14f885a 100644
--- a/dqops/src/main/frontend/src/pages/CreateConnection/index.tsx
+++ b/dqops/src/main/frontend/src/pages/CreateConnection/index.tsx
@@ -152,6 +152,9 @@ const CreateConnection = () => {
       case 'Parquet':
         fileFormat = DuckdbParametersSpecFilesFormatTypeEnum.parquet;
         break;
+      case 'Avro':
+        fileFormat = DuckdbParametersSpecFilesFormatTypeEnum.avro;
+        break;
       case 'Iceberg':
         fileFormat = DuckdbParametersSpecFilesFormatTypeEnum.iceberg;
         break;

diff --git a/dqops/src/main/frontend/src/shared/constants.ts b/dqops/src/main/frontend/src/shared/constants.ts
index c1df4ac0d2..d92ddc7f00 100644
--- a/dqops/src/main/frontend/src/shared/constants.ts
+++ b/dqops/src/main/frontend/src/shared/constants.ts
@@ -393,6 +393,12 @@ export const databaseOptions: IDatabaseOption[] = [
     iconName: 'csv',
     displayName: 'CSV'
   },
+  {
+    type: ConnectionModelProviderTypeEnum.duckdb,
+    name: 'Avro',
+    iconName: 'avro',
+    displayName: 'Avro'
+  },
   {
     type: ConnectionModelProviderTypeEnum.databricks,
     name: 'Databricks',
diff --git a/dqops/src/main/java/com/dqops/connectors/duckdb/DuckdbParametersSpec.java b/dqops/src/main/java/com/dqops/connectors/duckdb/DuckdbParametersSpec.java
index e9faf03b8c..5396d68b43 100644
--- a/dqops/src/main/java/com/dqops/connectors/duckdb/DuckdbParametersSpec.java
+++ b/dqops/src/main/java/com/dqops/connectors/duckdb/DuckdbParametersSpec.java
@@ -26,6 +26,7 @@
 import com.dqops.metadata.id.ChildHierarchyNodeFieldMap;
 import com.dqops.metadata.id.ChildHierarchyNodeFieldMapImpl;
 import com.dqops.metadata.sources.BaseProviderParametersSpec;
+import com.dqops.metadata.sources.fileformat.avro.AvroFileFormatSpec;
 import com.dqops.metadata.sources.fileformat.csv.CsvFileFormatSpec;
 import com.dqops.metadata.sources.fileformat.deltalake.DeltaLakeFileFormatSpec;
 import com.dqops.metadata.sources.fileformat.iceberg.IcebergFileFormatSpec;
@@ -64,6 +65,7 @@ public class DuckdbParametersSpec extends BaseProviderParametersSpec
             put("csv", o -> o.csv);
             put("json", o -> o.json);
             put("parquet", o -> o.parquet);
+            put("avro", o -> o.avro);
             put("iceberg", o -> o.iceberg);
             put("delta_lake", o -> o.deltaLake);
         }
@@ -101,6 +103,11 @@ public class DuckdbParametersSpec extends BaseProviderParametersSpec
     @JsonSerialize(using = IgnoreEmptyYamlSerializer.class)
     private ParquetFileFormatSpec parquet;

+    @JsonPropertyDescription("Avro file format specification.")
+    @JsonInclude(JsonInclude.Include.NON_EMPTY)
+    @JsonSerialize(using = IgnoreEmptyYamlSerializer.class)
+    private AvroFileFormatSpec avro;
+
     @JsonPropertyDescription("Iceberg file format specification.")
     @JsonInclude(JsonInclude.Include.NON_EMPTY)
     @JsonSerialize(using = IgnoreEmptyYamlSerializer.class)
@@ -281,6 +288,24 @@ public void setParquet(ParquetFileFormatSpec parquet) {
         propagateHierarchyIdToField(parquet, "parquet");
     }

+    /**
+     * Returns the Avro file format specification.
+     * @return Avro file format specification.
+     */
+    public AvroFileFormatSpec getAvro() {
+        return avro;
+    }
+
+    /**
+     * Sets the Avro file format specification.
+     * @param avro Avro file format specification.
+     */
+    public void setAvro(AvroFileFormatSpec avro) {
+        setDirtyIf(!Objects.equals(this.avro, avro));
+        this.avro = avro;
+        propagateHierarchyIdToField(avro, "avro");
+    }
+
     /**
      * Returns the Iceberg table format specification.
      * @return Iceberg table format specification.
@@ -551,6 +576,7 @@ public boolean isSetHivePartitioning(){
             case csv: return getCsv() != null && getCsv().getHivePartitioning() != null && getCsv().getHivePartitioning();
             case json: return getJson() != null && getJson().getHivePartitioning() != null && getJson().getHivePartitioning();
             case parquet: return getParquet() != null && getParquet().getHivePartitioning() != null && getParquet().getHivePartitioning();
+            case avro: return false; // hive partitioning is not supported by the DuckDB avro extension
         }
     }
     return false;
@@ -588,6 +614,8 @@ public String getFullExtension(){
                 return fileTypeExtension + (compressionExtension == null ? "" : compressionExtension);
             }
         }
+        // avro does not support compression
+
         return fileTypeExtension;
     }
@@ -604,6 +632,7 @@ public boolean isFormatSetForType(){
             case csv: return this.getCsv() != null;
             case json: return this.getJson() != null;
             case parquet: return this.getParquet() != null;
+            case avro: return this.getAvro() != null;
             case iceberg: return this.getIceberg() != null;
             case delta_lake: return this.getDeltaLake() != null;
             default: throw new RuntimeException("The file format is not supported : " + filesFormatType);
@@ -681,6 +710,9 @@ public DuckdbParametersSpec expandAndTrim(SecretValueProvider secretValueProvide
         if(cloned.json != null){
             cloned.json = cloned.json.expandAndTrim(secretValueProvider, lookupContext);
         }
+        if(cloned.avro != null){
+            cloned.avro = cloned.avro.expandAndTrim(secretValueProvider, lookupContext);
+        }
         cloned.user = secretValueProvider.expandValue(cloned.user, lookupContext);
         cloned.password = secretValueProvider.expandValue(cloned.password, lookupContext);
         cloned.region = secretValueProvider.expandValue(cloned.region, lookupContext);

diff --git a/dqops/src/main/java/com/dqops/connectors/duckdb/DuckdbSourceConnection.java b/dqops/src/main/java/com/dqops/connectors/duckdb/DuckdbSourceConnection.java
index 80079552be..1606f5cbc0 100644
--- a/dqops/src/main/java/com/dqops/connectors/duckdb/DuckdbSourceConnection.java
+++ b/dqops/src/main/java/com/dqops/connectors/duckdb/DuckdbSourceConnection.java
@@ -273,7 +273,8 @@ private List<String> getAvailableExtensions() {
                 "aws",
                 "azure",
                 "iceberg",
-                "delta"
+                "delta",
+                "avro"
         );
     }
@@ -293,6 +294,10 @@ private void registerExtensions() {
         availableExtensionList.stream().forEach(extensionName -> {
             try {
                 String installExtensionQuery = "INSTALL " + extensionName;
+                if (Objects.equals(extensionName, "avro")) {
+                    installExtensionQuery += " FROM community"; // https://duckdb.org/2024/12/09/duckdb-avro-extension.html
+                }
+
                 this.executeCommand(installExtensionQuery, JobCancellationToken.createDummyJobCancellationToken());

                 String loadExtensionQuery = "LOAD " + extensionName;
                 this.executeCommand(loadExtensionQuery, JobCancellationToken.createDummyJobCancellationToken());

diff --git a/dqops/src/main/java/com/dqops/connectors/duckdb/config/DuckdbFilesFormatType.java b/dqops/src/main/java/com/dqops/connectors/duckdb/config/DuckdbFilesFormatType.java
index 6fbf05a01a..d0f90d3eaa 100644
--- a/dqops/src/main/java/com/dqops/connectors/duckdb/config/DuckdbFilesFormatType.java
+++ b/dqops/src/main/java/com/dqops/connectors/duckdb/config/DuckdbFilesFormatType.java
@@ -15,6 +15,9 @@ public enum DuckdbFilesFormatType {
     @JsonProperty("parquet")
     parquet,

+    @JsonProperty("avro")
+    avro,
+
     @JsonProperty("iceberg")
     iceberg,
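Context for the `FROM community` clause above: DuckDB distributes the Avro reader as a community extension, so a plain `INSTALL avro` against the core repository would fail. A minimal sketch of the statement sequence this connector now produces, shown against a raw DuckDB JDBC connection for illustration; the file path is a placeholder, not taken from this patch:

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class AvroExtensionSketch {
    public static void main(String[] args) throws Exception {
        // In-memory DuckDB database; DQOps performs the same steps through executeCommand().
        try (Connection connection = DriverManager.getConnection("jdbc:duckdb:");
             Statement statement = connection.createStatement()) {
            statement.execute("INSTALL avro FROM community"); // community repository, not the core one
            statement.execute("LOAD avro");

            // Once loaded, read_avro() is available; 'filename = true' mirrors the
            // option that AvroFileFormatSpec exposes later in this patch.
            statement.executeQuery("SELECT * FROM read_avro('data/files/*.avro', filename = true)");
        }
    }
}
```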
diff --git a/dqops/src/main/java/com/dqops/metadata/id/HierarchyNodeResultVisitor.java b/dqops/src/main/java/com/dqops/metadata/id/HierarchyNodeResultVisitor.java
index a49a4fc985..14d9c8e946 100644
--- a/dqops/src/main/java/com/dqops/metadata/id/HierarchyNodeResultVisitor.java
+++ b/dqops/src/main/java/com/dqops/metadata/id/HierarchyNodeResultVisitor.java
@@ -78,6 +78,7 @@
 import com.dqops.metadata.sources.fileformat.FileFormatSpec;
 import com.dqops.metadata.sources.fileformat.FilePathListSpec;
 import com.dqops.metadata.sources.fileformat.ParquetFileFormatSpec;
+import com.dqops.metadata.sources.fileformat.avro.AvroFileFormatSpec;
 import com.dqops.metadata.sources.fileformat.csv.CsvFileFormatSpec;
 import com.dqops.metadata.sources.fileformat.deltalake.DeltaLakeFileFormatSpec;
 import com.dqops.metadata.sources.fileformat.iceberg.IcebergFileFormatSpec;
@@ -1032,4 +1033,12 @@ public interface HierarchyNodeResultVisitor {
      * @return Accept's result.
      */
     R accept(ConnectionSimilarityIndexListImpl connectionSimilarityIndexWrappers, P parameter);
+
+    /**
+     * Accepts Avro file format configuration settings.
+     * @param avroFileFormatSpec Avro file format specification.
+     * @param parameter Additional visitor's parameter.
+     * @return Accept's result.
+     */
+    R accept(AvroFileFormatSpec avroFileFormatSpec, P parameter);
 }
\ No newline at end of file

diff --git a/dqops/src/main/java/com/dqops/metadata/search/AbstractSearchVisitor.java b/dqops/src/main/java/com/dqops/metadata/search/AbstractSearchVisitor.java
index 816c1282eb..10b0cd0f86 100644
--- a/dqops/src/main/java/com/dqops/metadata/search/AbstractSearchVisitor.java
+++ b/dqops/src/main/java/com/dqops/metadata/search/AbstractSearchVisitor.java
@@ -79,6 +79,7 @@
 import com.dqops.metadata.sources.fileformat.FileFormatSpec;
 import com.dqops.metadata.sources.fileformat.FilePathListSpec;
 import com.dqops.metadata.sources.fileformat.ParquetFileFormatSpec;
+import com.dqops.metadata.sources.fileformat.avro.AvroFileFormatSpec;
 import com.dqops.metadata.sources.fileformat.csv.CsvFileFormatSpec;
 import com.dqops.metadata.sources.fileformat.deltalake.DeltaLakeFileFormatSpec;
 import com.dqops.metadata.sources.fileformat.iceberg.IcebergFileFormatSpec;
@@ -1380,4 +1381,16 @@ public TreeNodeTraversalResult accept(ConnectionSimilarityIndexWrapperImpl conne
     public TreeNodeTraversalResult accept(ConnectionSimilarityIndexListImpl connectionSimilarityIndexWrappers, T parameter) {
         return TreeNodeTraversalResult.TRAVERSE_CHILDREN;
     }
+
+    /**
+     * Accepts Avro file format configuration settings.
+     *
+     * @param avroFileFormatSpec Avro file format specification.
+     * @param parameter Additional visitor's parameter.
+     * @return Accept's result.
+     */
+    @Override
+    public TreeNodeTraversalResult accept(AvroFileFormatSpec avroFileFormatSpec, T parameter) {
+        return TreeNodeTraversalResult.TRAVERSE_CHILDREN;
+    }
 }
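The two visitor changes go together: the interface gains an `accept` overload for `AvroFileFormatSpec`, and the abstract base supplies the default traversal so existing visitors keep compiling. A hypothetical concrete visitor, not part of this patch, then only overrides the overload it cares about, for example to collect every Avro spec in the metadata tree:

```java
// Hypothetical example; AvroSpecCollector does not exist in this patch.
// The import of TreeNodeTraversalResult is omitted because its package is not shown here.
import com.dqops.metadata.search.AbstractSearchVisitor;
import com.dqops.metadata.sources.fileformat.avro.AvroFileFormatSpec;

import java.util.List;

public class AvroSpecCollector extends AbstractSearchVisitor<List<AvroFileFormatSpec>> {
    @Override
    public TreeNodeTraversalResult accept(AvroFileFormatSpec avroFileFormatSpec,
                                          List<AvroFileFormatSpec> parameter) {
        parameter.add(avroFileFormatSpec); // remember the spec, then keep walking the tree
        return TreeNodeTraversalResult.TRAVERSE_CHILDREN;
    }
}
```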
diff --git a/dqops/src/main/java/com/dqops/metadata/sources/fileformat/FileFormatSpec.java b/dqops/src/main/java/com/dqops/metadata/sources/fileformat/FileFormatSpec.java
index 6349e41974..2a8d813747 100644
--- a/dqops/src/main/java/com/dqops/metadata/sources/fileformat/FileFormatSpec.java
+++ b/dqops/src/main/java/com/dqops/metadata/sources/fileformat/FileFormatSpec.java
@@ -11,6 +11,7 @@
 import com.dqops.metadata.id.ChildHierarchyNodeFieldMapImpl;
 import com.dqops.metadata.id.HierarchyNodeResultVisitor;
 import com.dqops.metadata.sources.TableSpec;
+import com.dqops.metadata.sources.fileformat.avro.AvroFileFormatSpec;
 import com.dqops.metadata.sources.fileformat.csv.CsvFileFormatSpec;
 import com.dqops.metadata.sources.fileformat.deltalake.DeltaLakeFileFormatSpec;
 import com.dqops.metadata.sources.fileformat.iceberg.IcebergFileFormatSpec;
@@ -43,6 +44,7 @@ public class FileFormatSpec extends AbstractSpec {
             put("csv", o -> o.csv);
             put("json", o -> o.json);
             put("parquet", o -> o.parquet);
+            put("avro", o -> o.avro);
             put("iceberg", o -> o.iceberg);
             put("delta_lake", o -> o.deltaLake);
         }
@@ -63,6 +65,11 @@ public class FileFormatSpec extends AbstractSpec {
     @JsonSerialize(using = IgnoreEmptyYamlSerializer.class)
     private ParquetFileFormatSpec parquet;

+    @JsonPropertyDescription("Avro file format specification.")
+    @JsonInclude(JsonInclude.Include.NON_EMPTY)
+    @JsonSerialize(using = IgnoreEmptyYamlSerializer.class)
+    private AvroFileFormatSpec avro;
+
     @JsonPropertyDescription("Iceberg file format specification.")
     @JsonInclude(JsonInclude.Include.NON_EMPTY)
     @JsonSerialize(using = IgnoreEmptyYamlSerializer.class)
@@ -132,6 +139,24 @@ public void setParquet(ParquetFileFormatSpec parquet) {
         propagateHierarchyIdToField(parquet, "parquet");
     }

+    /**
+     * Returns the Avro file format specification.
+     * @return Avro file format specification.
+     */
+    public AvroFileFormatSpec getAvro() {
+        return avro;
+    }
+
+    /**
+     * Sets the Avro file format specification.
+     * @param avro Avro file format specification.
+     */
+    public void setAvro(AvroFileFormatSpec avro) {
+        setDirtyIf(!Objects.equals(this.avro, avro));
+        this.avro = avro;
+        propagateHierarchyIdToField(avro, "avro");
+    }
+
     /**
      * Returns the Iceberg table format specification.
      * @return Iceberg table format specification.
@@ -196,6 +221,7 @@ public boolean isSetHivePartitioning(DuckdbFilesFormatType duckdbFilesFormatType
             case csv: return getCsv() != null && getCsv().getHivePartitioning() != null && getCsv().getHivePartitioning();
             case json: return getJson() != null && getJson().getHivePartitioning() != null && getJson().getHivePartitioning();
             case parquet: return getParquet() != null && getParquet().getHivePartitioning() != null && getParquet().getHivePartitioning();
+            case avro: return false; // not supported yet by the DuckDB Avro extension
         }
     }
     return false;
@@ -222,6 +248,7 @@ public String buildTableOptionsString(DuckdbParametersSpec duckdb, TableSpec tab
             case csv: return csv.buildSourceTableOptionsString(filePathList, tableSpec);
             case json: return json.buildSourceTableOptionsString(filePathList, tableSpec);
             case parquet: return parquet.buildSourceTableOptionsString(filePathList, tableSpec);
+            case avro: return avro.buildSourceTableOptionsString(filePathList, tableSpec);
             case iceberg: return iceberg.buildSourceTableOptionsString(filePathList, tableSpec);
             case delta_lake: return deltaLake.buildSourceTableOptionsString(filePathList, tableSpec);
             default: throw new RuntimeException("Cant create table options string for the given files: " + filePathList);
@@ -238,6 +265,7 @@ public boolean isFormatSetForType(DuckdbFilesFormatType duckdbFilesFormatType){
             case csv: return this.getCsv() != null;
             case json: return this.getJson() != null;
             case parquet: return this.getParquet() != null;
+            case avro: return this.getAvro() != null;
             case iceberg: return this.getIceberg() != null;
             case delta_lake: return this.getDeltaLake() != null;
             default: throw new RuntimeException("The file format is not supported : " + duckdbFilesFormatType);
@@ -273,6 +301,9 @@ public String getFullExtension(DuckdbFilesFormatType duckdbFilesFormatType){
                 return fileTypeExtension + formatSpec.getCompression().getCompressionExtension();
             }
         }
+        if (duckdbFilesFormatType.equals(DuckdbFilesFormatType.avro) && getAvro() != null) {
+            return fileTypeExtension; // compression not supported yet in DuckDB
+        }
         return fileTypeExtension;
     }
@@ -333,6 +364,9 @@ public FileFormatSpec expandAndTrim(SecretValueProvider secretValueProvider, Sec
         if (cloned.parquet != null) {
             cloned.parquet = cloned.parquet.deepClone();
         }
+        if (cloned.avro != null) {
+            cloned.avro = cloned.avro.deepClone();
+        }
         if (cloned.iceberg != null) {
             cloned.iceberg = cloned.iceberg.deepClone();
         }

diff --git a/dqops/src/main/java/com/dqops/metadata/sources/fileformat/FileFormatSpecProvider.java b/dqops/src/main/java/com/dqops/metadata/sources/fileformat/FileFormatSpecProvider.java
index 4211689557..45455ded4e 100644
--- a/dqops/src/main/java/com/dqops/metadata/sources/fileformat/FileFormatSpecProvider.java
+++ b/dqops/src/main/java/com/dqops/metadata/sources/fileformat/FileFormatSpecProvider.java
@@ -6,6 +6,7 @@
 import com.dqops.connectors.duckdb.fileslisting.aws.AwsConstants;
 import com.dqops.connectors.duckdb.fileslisting.azure.AzureConstants;
 import com.dqops.metadata.sources.TableSpec;
+import com.dqops.metadata.sources.fileformat.avro.AvroFileFormatSpec;
 import com.dqops.metadata.sources.fileformat.csv.CsvFileFormatSpec;
 import com.dqops.metadata.sources.fileformat.deltalake.DeltaLakeFileFormatSpec;
 import com.dqops.metadata.sources.fileformat.iceberg.IcebergFileFormatSpec;
@@ -145,6 +146,7 @@ private static void fillDefaultFileFormat(FileFormatSpec fileFormatSpec, DuckdbF
             case csv: fileFormatSpec.setCsv(new CsvFileFormatSpec()); break;
             case json: fileFormatSpec.setJson(new JsonFileFormatSpec()); break;
             case parquet: fileFormatSpec.setParquet(new ParquetFileFormatSpec()); break;
+            case avro: fileFormatSpec.setAvro(new AvroFileFormatSpec()); break;
             case iceberg: fileFormatSpec.setIceberg(new IcebergFileFormatSpec()); break;
             case delta_lake: fileFormatSpec.setDeltaLake(new DeltaLakeFileFormatSpec()); break;
             default: throw new RuntimeException("Can't fill default file format for files type: " + duckdbFilesFormatType);
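Taken together, these metadata changes make `avro` behave like the existing formats end to end: the spec can be set, defaulted, cloned, and rendered into table options. A sketch of how a DuckDB connection spec could carry an Avro format; the `setFilesFormatType` setter is assumed from the existing format types (the field is referenced but its setter is not shown in this patch):

```java
import com.dqops.connectors.duckdb.DuckdbParametersSpec;
import com.dqops.connectors.duckdb.config.DuckdbFilesFormatType;
import com.dqops.metadata.sources.fileformat.avro.AvroFileFormatSpec;

public class AvroConnectionSketch {
    public static void main(String[] args) {
        DuckdbParametersSpec duckdb = new DuckdbParametersSpec();
        duckdb.setFilesFormatType(DuckdbFilesFormatType.avro); // setter assumed, mirroring csv/parquet usage

        AvroFileFormatSpec avro = new AvroFileFormatSpec();
        avro.setFilename(true); // include the extra filename column in query results
        duckdb.setAvro(avro);

        // The avro branches added above now resolve for this spec:
        // isFormatSetForType() -> true, isSetHivePartitioning() -> false.
        System.out.println(duckdb.isFormatSetForType());
    }
}
```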
diff --git a/dqops/src/main/java/com/dqops/metadata/sources/fileformat/avro/AvroFileFormatSpec.java b/dqops/src/main/java/com/dqops/metadata/sources/fileformat/avro/AvroFileFormatSpec.java
new file mode 100644
index 0000000000..b9277295a7
--- /dev/null
+++ b/dqops/src/main/java/com/dqops/metadata/sources/fileformat/avro/AvroFileFormatSpec.java
@@ -0,0 +1,99 @@
+package com.dqops.metadata.sources.fileformat.avro;
+
+import com.dqops.core.secrets.SecretValueLookupContext;
+import com.dqops.core.secrets.SecretValueProvider;
+import com.dqops.metadata.basespecs.AbstractSpec;
+import com.dqops.metadata.id.ChildHierarchyNodeFieldMap;
+import com.dqops.metadata.id.ChildHierarchyNodeFieldMapImpl;
+import com.dqops.metadata.id.HierarchyNodeResultVisitor;
+import com.dqops.metadata.sources.TableSpec;
+import com.dqops.metadata.sources.fileformat.TableOptionsFormatter;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonPropertyDescription;
+import com.fasterxml.jackson.databind.PropertyNamingStrategies;
+import com.fasterxml.jackson.databind.annotation.JsonNaming;
+import lombok.EqualsAndHashCode;
+import lombok.experimental.FieldNameConstants;
+
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * Avro file format specification for querying data in Avro format files.
+ */
+@JsonInclude(JsonInclude.Include.NON_NULL)
+@JsonNaming(PropertyNamingStrategies.SnakeCaseStrategy.class)
+@EqualsAndHashCode(callSuper = true)
+@FieldNameConstants
+public class AvroFileFormatSpec extends AbstractSpec {
+
+    private static final ChildHierarchyNodeFieldMapImpl<AvroFileFormatSpec> FIELDS = new ChildHierarchyNodeFieldMapImpl<>(AbstractSpec.FIELDS) {
+        {
+        }
+    };
+
+    @JsonPropertyDescription("Whether or not an extra filename column should be included in the result.")
+    @JsonInclude(JsonInclude.Include.NON_EMPTY)
+    private Boolean filename;
+
+    /**
+     * Formats the table options to be used in an SQL query. Only the options that are set (non-null) are added.
+     *
+     * @param filePathList The names of files with data.
+     * @param tableSpec The table specification.
+     * @return The formatted source table with the options.
+     */
+    public String buildSourceTableOptionsString(List<String> filePathList, TableSpec tableSpec) {
+        TableOptionsFormatter tableOptionsFormatter = new TableOptionsFormatter("read_avro", filePathList);
+        tableOptionsFormatter.formatValueWhenSet(Fields.filename, filename);
+        return tableOptionsFormatter.build();
+    }
+
+    /**
+     * Returns whether or not an extra filename column should be included in the result.
+     *
+     * @return The filename option state.
+     */
+    public Boolean getFilename() {
+        return filename;
+    }
+
+    /**
+     * Sets whether an extra filename column should be included in the result.
+     *
+     * @param filename The filename option state.
+     */
+    public void setFilename(Boolean filename) {
+        setDirtyIf(!Objects.equals(this.filename, filename));
+        this.filename = filename;
+    }
+
+    @Override
+    protected ChildHierarchyNodeFieldMap getChildMap() {
+        return FIELDS;
+    }
+
+    @Override
+    public <P, R> R visit(HierarchyNodeResultVisitor<P, R> visitor, P parameter) {
+        return visitor.accept(this, parameter);
+    }
+
+    /**
+     * Creates and returns a deep clone (copy) of this object.
+     */
+    @Override
+    public AvroFileFormatSpec deepClone() {
+        return (AvroFileFormatSpec)super.deepClone();
+    }
+
+    /**
+     * Creates an expanded and trimmed deep copy of the spec.
+     * Configurable properties will be expanded if they contain environment variables or secrets.
+     *
+     * @param secretValueProvider Secret value provider.
+     * @param lookupContext Secret value lookup context used to access shared credentials.
+     * @return Cloned, trimmed and expanded file format specification.
+     */
+    public AvroFileFormatSpec expandAndTrim(SecretValueProvider secretValueProvider, SecretValueLookupContext lookupContext) {
+        AvroFileFormatSpec cloned = this.deepClone();
+        return cloned;
+    }
+}
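For a sense of the SQL this spec produces: judging from how the other formats use `buildSourceTableOptionsString`, the `TableOptionsFormatter` call above should render a `read_avro(...)` source fragment with the `filename` option appended only when it is set. A rough sketch; the path and the exact rendered text are assumptions, not taken from this patch:

```java
import com.dqops.metadata.sources.fileformat.avro.AvroFileFormatSpec;

import java.util.List;

public class ReadAvroRenderingSketch {
    public static void main(String[] args) {
        AvroFileFormatSpec avro = new AvroFileFormatSpec();
        avro.setFilename(true);

        // The tableSpec argument is not used by the avro implementation above,
        // so null is passed here purely for brevity.
        String source = avro.buildSourceTableOptionsString(
                List.of("s3://bucket/path/file.avro"), null);

        // Expected to resemble: read_avro(['s3://bucket/path/file.avro'], filename = true)
        // DQOps wraps a fragment like this in a SELECT when it queries the table.
        System.out.println(source);
    }
}
```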
diff --git a/dqops/src/main/resources/static/swagger-api/dqops-api-swagger-2.json b/dqops/src/main/resources/static/swagger-api/dqops-api-swagger-2.json
index 0b607a729f..0cac7ab62f 100644
--- a/dqops/src/main/resources/static/swagger-api/dqops-api-swagger-2.json
+++ b/dqops/src/main/resources/static/swagger-api/dqops-api-swagger-2.json
@@ -20370,6 +20370,15 @@
       }
     }
   },
+    "AvroFileFormatSpec" : {
+      "type" : "object",
+      "properties" : {
+        "filename" : {
+          "type" : "boolean",
+          "description" : "Whether or not an extra filename column should be included in the result."
+        }
+      }
+    },
     "BetweenFloatsRuleParametersSpec" : {
       "type" : "object",
       "properties" : {
@@ -39393,7 +39402,7 @@
         "files_format_type" : {
           "type" : "string",
           "description" : "Type of source files format for DuckDB.",
-          "enum" : [ "csv", "json", "parquet", "iceberg", "delta_lake" ]
+          "enum" : [ "csv", "json", "parquet", "avro", "iceberg", "delta_lake" ]
         },
         "database" : {
           "type" : "string",
@@ -39418,6 +39427,10 @@
           "description" : "Parquet file format specification.",
           "$ref" : "#/definitions/ParquetFileFormatSpec"
         },
+        "avro" : {
+          "description" : "Avro file format specification.",
+          "$ref" : "#/definitions/AvroFileFormatSpec"
+        },
         "iceberg" : {
           "description" : "Iceberg file format specification.",
           "$ref" : "#/definitions/IcebergFileFormatSpec"
@@ -39942,6 +39955,10 @@
           "description" : "Parquet file format specification.",
           "$ref" : "#/definitions/ParquetFileFormatSpec"
         },
+        "avro" : {
+          "description" : "Avro file format specification.",
+          "$ref" : "#/definitions/AvroFileFormatSpec"
+        },
         "iceberg" : {
           "description" : "Iceberg file format specification.",
           "$ref" : "#/definitions/IcebergFileFormatSpec"

diff --git a/dqops/src/main/resources/static/swagger-api/dqops-api-swagger-2.yaml b/dqops/src/main/resources/static/swagger-api/dqops-api-swagger-2.yaml
index 059c2e3aa0..f35e2a7afe 100644
--- a/dqops/src/main/resources/static/swagger-api/dqops-api-swagger-2.yaml
+++ b/dqops/src/main/resources/static/swagger-api/dqops-api-swagger-2.yaml
@@ -17252,6 +17252,13 @@ definitions:
     schedule:
       description: "Schedule for importing source tables using a CRON scheduler."
      $ref: "#/definitions/CronScheduleSpec"
+  AvroFileFormatSpec:
+    type: "object"
+    properties:
+      filename:
+        type: "boolean"
+        description: "Whether or not an extra filename column should be included in\
+          \ the result."
   BetweenFloatsRuleParametersSpec:
     type: "object"
     properties:
@@ -38132,6 +38139,7 @@ definitions:
       - "csv"
       - "json"
      - "parquet"
+      - "avro"
       - "iceberg"
       - "delta_lake"
     database:
@@ -38153,6 +38161,9 @@ definitions:
     parquet:
       description: "Parquet file format specification."
       $ref: "#/definitions/ParquetFileFormatSpec"
+    avro:
+      description: "Avro file format specification."
+      $ref: "#/definitions/AvroFileFormatSpec"
     iceberg:
       description: "Iceberg file format specification."
       $ref: "#/definitions/IcebergFileFormatSpec"
@@ -38626,6 +38637,9 @@ definitions:
     parquet:
       description: "Parquet file format specification."
       $ref: "#/definitions/ParquetFileFormatSpec"
+    avro:
+      description: "Avro file format specification."
+      $ref: "#/definitions/AvroFileFormatSpec"
     iceberg:
       description: "Iceberg file format specification."
       $ref: "#/definitions/IcebergFileFormatSpec"