Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Openapi new auth #4086

Merged
merged 9 commits into from
Feb 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion metadata-ingestion/source_docs/openapi.md
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ source:
name: test_endpoint # this name will appear in DataHub
url: https://test_endpoint.com/
swagger_file: classicapi/doc/swagger.json # where to search for the OpenApi definitions
get_token: True # optional, if you need to get an authentication token beforehand
get_token: # optional, if you need to get an authentication token beforehand
request_type: get
url_complement: api/authentication/login?username={username}&password={password}
username: your_username # optional
password: your_password # optional
forced_examples: # optional
Expand Down Expand Up @@ -137,6 +139,15 @@ and this URL will be called to get back the needed metadata.

## Config details

### Token authentication

If this tool needs an access token to query the endpoints, it can request one beforehand. Two methods are currently available:

* 'get': this requires the username/password combination to be present in the URL. Note that {username} and {password} are mandatory placeholders; they will be replaced with the real credentials at runtime. Because the username and password are sent in the request address, this method is insecure. If your provider supports the other method, prefer it.
* 'post': the username and password will be sent in the body of the POST request.

In both cases, username and password are the ones defined in the configuration file.

### Getting dataset metadata from `forced_example`

Suppose you have an endpoint defined in the swagger file, but without example given, and the tool is
Expand Down
48 changes: 38 additions & 10 deletions metadata-ingestion/src/datahub/ingestion/source/openapi.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import time
import warnings
from abc import ABC
from typing import Dict, Generator, Iterable, Tuple
from typing import Dict, Generator, Iterable, Optional, Tuple

from datahub.configuration.common import ConfigModel
from datahub.emitter.mce_builder import make_tag_urn
Expand Down Expand Up @@ -43,20 +43,46 @@ class OpenApiConfig(ConfigModel):
username: str = ""
password: str = ""
forced_examples: dict = {}
token: str = ""
get_token: bool = False
token: Optional[str] = None
get_token: dict = {}

def get_swagger(self) -> Dict:
if self.get_token: # token based authentication, to be tested
if self.token == "":
if self.get_token or self.token is not None:
if self.token is not None:
...
else:
assert (
"url_complement" in self.get_token.keys()
), "When 'request_type' is set to 'get', an url_complement is needed for the request."
if self.get_token["request_type"] == "get":
assert (
"{username}" in self.get_token["url_complement"]
), "we expect the keyword {username} to be present in the url"
assert (
"{password}" in self.get_token["url_complement"]
), "we expect the keyword {password} to be present in the url"
url4req = self.get_token["url_complement"].replace(
"{username}", self.username
)
url4req = url4req.replace("{password}", self.password)
elif self.get_token["request_type"] == "post":
url4req = self.get_token["url_complement"]
else:
raise KeyError(
"This tool accepts only 'get' and 'post' as method for getting tokens"
)
self.token = get_tok(
url=self.url, username=self.username, password=self.password
url=self.url,
username=self.username,
password=self.password,
tok_url=url4req,
method=self.get_token["request_type"],
)

sw_dict = get_swag_json(
self.url, token=self.token, swagger_file=self.swagger_file
) # load the swagger file
else:

else: # using basic auth for accessing endpoints
sw_dict = get_swag_json(
self.url,
username=self.username,
Expand Down Expand Up @@ -102,7 +128,9 @@ def report_bad_responses(self, status_code: int, key: str) -> None:
elif status_code == 504:
self.report.report_warning(key=key, reason="Timeout for reaching endpoint")
else:
raise Exception(f"Unable to retrieve endpoint, response code {status_code}")
raise Exception(
f"Unable to retrieve endpoint, response code {status_code}, key {key}"
)

def init_dataset(
self, endpoint_k: str, endpoint_dets: dict
Expand Down Expand Up @@ -269,7 +297,7 @@ def close(self):

class OpenApiSource(APISource):
def __init__(self, config: OpenApiConfig, ctx: PipelineContext):
super().__init__(config, ctx, "OpenApi")
Copy link
Contributor

@shirshanka shirshanka Feb 20, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Is this (changing OpenApi to openapi) intentional?
  2. what happens to previously ingested urns? when you re-run ingestion they get new urns, so the old entities will continue to exist?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it's intentional. I noticed there were issues with finding the icon, so I standardized everything to the lowercase format, as in the other ingestion plugins.
The older urns will continue to exist as different entities.

super().__init__(config, ctx, "openapi")

@classmethod
def create(cls, config_dict, ctx):
Expand Down
44 changes: 34 additions & 10 deletions metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,8 @@ def get_endpoints(sw_dict: dict) -> dict: # noqa: C901
if "parameters" in p_o["get"].keys():
url_details[p_k]["parameters"] = p_o["get"]["parameters"]

return url_details
ord_d = dict(sorted(url_details.items())) # sorting for convenience
return ord_d


def guessing_url_name(url: str, examples: dict) -> str:
Expand Down Expand Up @@ -211,6 +212,10 @@ def guessing_url_name(url: str, examples: dict) -> str:
ex2use = root
elif root[:-1] in examples.keys():
ex2use = root[:-1]
elif root.replace("/", ".") in examples.keys():
ex2use = root.replace("/", ".")
elif root[:-1].replace("/", ".") in examples.keys():
ex2use = root[:-1].replace("/", ".")
else:
return url

Expand Down Expand Up @@ -332,19 +337,38 @@ def extract_fields(
return [], {}


def get_tok(url: str, username: str = "", password: str = "") -> str:
def get_tok(
url: str,
username: str = "",
password: str = "",
tok_url: str = "",
method: str = "post",
) -> str:
"""
Trying to post username/password to get auth.
Simplified version: it expect a POST at api/authenticate
"""
data = {"username": username, "password": password}
url2post = url + "api/authenticate/"
response = requests.post(url2post, data=data)
if response.status_code == 200:
cont = json.loads(response.content)
return cont["tokens"]["access"]
token = ""
url4req = url + tok_url
if method == "post":
# this will make a POST call with username and password
data = {"username": username, "password": password}
# url2post = url + "api/authenticate/"
response = requests.post(url4req, data=data)
if response.status_code == 200:
cont = json.loads(response.content)
token = cont["tokens"]["access"]
elif method == "get":
# this will make a GET call with username and password
response = requests.get(url4req)
if response.status_code == 200:
cont = json.loads(response.content)
token = cont["token"]
else:
raise ValueError(f"Method unrecognised: {method}")
if token != "":
return token
else:
raise Exception("Unable to get a valid token")
raise Exception(f"Unable to get a valid token: {response.text}")


def set_metadata(
Expand Down
6 changes: 3 additions & 3 deletions metadata-ingestion/tests/integration/openapi/openapi_mces_golden.json
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"auditHeader": null,
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:OpenApi,test_openapi.root,PROD)",
"urn": "urn:li:dataset:(urn:li:dataPlatform:openapi,test_openapi.root,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
Expand Down Expand Up @@ -95,7 +95,7 @@
"auditHeader": null,
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:OpenApi,test_openapi.v2,PROD)",
"urn": "urn:li:dataset:(urn:li:dataPlatform:openapi,test_openapi.v2,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
Expand Down Expand Up @@ -183,4 +183,4 @@
"properties": null
}
}
]
]