Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(validation): additional URN validation adjustments #11973

Merged
merged 3 commits into from
Nov 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions docs/what/urn.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,17 @@ urn:li:dataset:(urn:li:dataPlatform:hdfs,PageViewEvent,EI)

## Restrictions

There are a few restrictions when creating an urn:
There are a few restrictions when creating an URN:

1. Commas are reserved character in URN fields: `,`
2. Parentheses are reserved characters in URN fields: `(` or `)`
3. Colons are reserved characters in URN fields: `:`
4. Urn separator UTF-8 character `␟`
The following characters are not allowed anywhere in the URN

1. Parentheses are reserved characters in URN fields: `(` or `)`
2. The "unit separator" unicode character `␟` (U+241F)

The following characters are not allowed within an URN tuple.

1. Commas are reserved characters in URN tuples: `,`

Example: `urn:li:dashboard:(looker,dashboards.thelook)` is a valid urn, but `urn:li:dashboard:(looker,dashboards.the,look)` is invalid.

Please do not use these characters when creating or generating urns. One approach is to use URL encoding for the characters.
4 changes: 4 additions & 0 deletions metadata-io/metadata-io-api/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,7 @@ dependencies {
testImplementation externalDependency.lombok
testAnnotationProcessor externalDependency.lombok
}

test {
environment 'STRICT_URN_VALIDATION_ENABLED', 'true'
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ public class ValidationApiUtils {
// Related to BrowsePathv2
public static final String URN_DELIMITER_SEPARATOR = "␟";
// https://datahubproject.io/docs/what/urn/#restrictions
public static final Set<String> ILLEGAL_URN_COMPONENT_CHARACTERS = Set.of(":", "(", ")", ",");
public static final Set<String> ILLEGAL_URN_COMPONENT_CHARACTERS = Set.of("(", ")");
public static final Set<String> ILLEGAL_URN_TUPLE_CHARACTERS = Set.of(",");

/**
* Validates a {@link RecordTemplate} and throws {@link ValidationException} if validation fails.
Expand Down Expand Up @@ -86,11 +87,10 @@ public static void validateUrn(
"Error: URN cannot contain " + URN_DELIMITER_SEPARATOR + " character");
}

int totalParts = urn.getEntityKey().getParts().size();
List<String> illegalComponents =
urn.getEntityKey().getParts().stream()
.flatMap(ValidationApiUtils::processUrnPartRecursively)
.filter(
urnPart -> ILLEGAL_URN_COMPONENT_CHARACTERS.stream().anyMatch(urnPart::contains))
.flatMap(part -> processUrnPartRecursively(part, totalParts))
.collect(Collectors.toList());

if (!illegalComponents.isEmpty()) {
Expand All @@ -114,15 +114,25 @@ public static void validateUrn(
}

/** Recursively process URN parts with URL decoding */
private static Stream<String> processUrnPartRecursively(String urnPart) {
private static Stream<String> processUrnPartRecursively(String urnPart, int totalParts) {
String decodedPart =
URLDecoder.decode(URLEncodingFixer.fixURLEncoding(urnPart), StandardCharsets.UTF_8);
if (decodedPart.startsWith("urn:li:")) {
// Recursively process nested URN after decoding
int nestedParts = UrnUtils.getUrn(decodedPart).getEntityKey().getParts().size();
return UrnUtils.getUrn(decodedPart).getEntityKey().getParts().stream()
.flatMap(ValidationApiUtils::processUrnPartRecursively);
.flatMap(part -> processUrnPartRecursively(part, nestedParts));
}
return Stream.of(decodedPart);
if (totalParts > 1) {
if (ILLEGAL_URN_TUPLE_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) {
return Stream.of(urnPart);
}
}
if (ILLEGAL_URN_COMPONENT_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) {
return Stream.of(urnPart);
}

return Stream.empty();
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,36 @@ public void testValidateDatasetUrn() {
// If no exception is thrown, test passes
}

@Test(expectedExceptions = IllegalArgumentException.class)
@Test
public void testSimpleUrnColon() {
Urn invalidUrn = UrnUtils.getUrn("urn:li:corpuser:foo:bar");
ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true);
ValidationApiUtils.validateUrn(
entityRegistry, UrnUtils.getUrn("urn:li:corpuser:foo:bar"), true);
ValidationApiUtils.validateUrn(
entityRegistry, UrnUtils.getUrn("urn:li:dataPlatform:abc:def"), true);
ValidationApiUtils.validateUrn(
entityRegistry, UrnUtils.getUrn("urn:li:corpuser:foo:[email protected]"), true);
// If no exception is thrown, test passes
}

@Test
public void testSimpleUrnComma() {
ValidationApiUtils.validateUrn(entityRegistry, UrnUtils.getUrn("urn:li:corpuser:,"), true);
// If no exception is thrown, test passes
}

@Test(expectedExceptions = IllegalArgumentException.class)
public void testTupleUrnComma() {
ValidationApiUtils.validateUrn(
entityRegistry, UrnUtils.getUrn("urn:li:dashboard:(looker,dashboards,thelook)"), true);
}

@Test(expectedExceptions = IllegalArgumentException.class)
public void testFabricTypeCasing() {
// prod != PROD
ValidationApiUtils.validateUrn(
entityRegistry,
UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,prod)"),
true);
}

@Test
Expand All @@ -34,7 +60,7 @@ public void testComplexUrnColon() throws URISyntaxException {
}

@Test(expectedExceptions = IllegalArgumentException.class)
public void testUrnFabricType() {
public void testFabricTypeParen() {
Urn invalidUrn = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,())");
ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true);
}
Expand Down Expand Up @@ -83,20 +109,20 @@ public void testValidComplexUrn() {
UrnUtils.getUrn(
"urn:li:dataset:(urn:li:dataPlatform:bigquery,myproject.dataset.table,PROD)");

ValidationApiUtils.validateUrn(entityRegistry, validUrn);
ValidationApiUtils.validateUrn(entityRegistry, validUrn, true);
// If no exception is thrown, test passes
}

@Test(expectedExceptions = NullPointerException.class)
public void testUrnNull() {
ValidationApiUtils.validateUrn(entityRegistry, null);
ValidationApiUtils.validateUrn(entityRegistry, null, true);
}

@Test
public void testValidPartialUrlEncode() {
Urn validUrn = UrnUtils.getUrn("urn:li:assertion:123=-%28__% weekly__%29");

ValidationApiUtils.validateUrn(entityRegistry, validUrn);
ValidationApiUtils.validateUrn(entityRegistry, validUrn, true);
// If no exception is thrown, test passes
}

Expand All @@ -106,7 +132,23 @@ public void testValidPartialUrlEncode2() {
UrnUtils.getUrn(
"urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts%prog_maintenance%2CPROD%29,PROD)");

ValidationApiUtils.validateUrn(entityRegistry, validUrn);
ValidationApiUtils.validateUrn(entityRegistry, validUrn, true);
// If no exception is thrown, test passes
}

@Test
public void testValidColon() {
Urn validUrn =
UrnUtils.getUrn("urn:li:dashboard:(looker,dashboards.thelook::cohort_data_tool)");

ValidationApiUtils.validateUrn(entityRegistry, validUrn, true);
// If no exception is thrown, test passes
}

@Test
public void testNoTupleComma() {
Urn invalidUrn = UrnUtils.getUrn("urn:li:corpuser:,");
ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true);
// If no exception is thrown, test passes
}
}
Loading