Created get_dataset and update_dataset APIs and fixed delete #1201

Merged · 7 commits · Jun 19, 2024
137 changes: 107 additions & 30 deletions api/apps/dataset_api.py
@@ -20,6 +20,7 @@
from datetime import datetime, timedelta
from flask import request, Response
from flask_login import login_required, current_user
from httpx import HTTPError

from api.db import FileType, ParserType, FileSource, StatusEnum
from api.db.db_models import APIToken, API4Conversation, Task, File
@@ -45,6 +46,7 @@
from api.contants import NAME_LENGTH_LIMIT

# ------------------------------ create a dataset ---------------------------------------

@manager.route('/', methods=['POST'])
@login_required # use login
@validate_request("name") # check name key
@@ -104,18 +106,20 @@ def create_dataset():
request_body["id"] = get_uuid()
request_body["tenant_id"] = tenant_id
request_body["created_by"] = tenant_id
e, t = TenantService.get_by_id(tenant_id)
if not e:
exist, t = TenantService.get_by_id(tenant_id)
if not exist:
return construct_result(code=RetCode.AUTHENTICATION_ERROR, message="Tenant not found.")
request_body["embd_id"] = t.embd_id
if not KnowledgebaseService.save(**request_body):
# failed to create new dataset
return construct_result()
return construct_json_result(data={"dataset_name": request_body["name"]})
return construct_json_result(code=RetCode.SUCCESS,
data={"dataset_name": request_body["name"], "dataset_id": request_body["id"]})
except Exception as e:
return construct_error_response(e)

# -----------------------------list datasets-------------------------------------------------------

@manager.route('/', methods=['GET'])
@login_required
def list_datasets():
@@ -125,67 +129,140 @@ def list_datasets():
desc = request.args.get("desc", True)
try:
tenants = TenantService.get_joined_tenants_by_user_id(current_user.id)
kbs = KnowledgebaseService.get_by_tenant_ids_by_offset(
datasets = KnowledgebaseService.get_by_tenant_ids_by_offset(
[m["tenant_id"] for m in tenants], current_user.id, int(offset), int(count), orderby, desc)
return construct_json_result(data=kbs, code=RetCode.DATA_ERROR, message=f"attempt to list datasets")
return construct_json_result(data=datasets, code=RetCode.SUCCESS, message=f"List datasets successfully!")
except Exception as e:
return construct_error_response(e)
except HTTPError as http_err:
return construct_json_result(http_err)

# ---------------------------------delete a dataset ----------------------------

@manager.route('/<dataset_id>', methods=['DELETE'])
@login_required
@validate_request("dataset_id")
def remove_dataset(dataset_id):
req = request.json
try:
kbs = KnowledgebaseService.query(
created_by=current_user.id, id=req["dataset_id"])
if not kbs:
return construct_json_result(
data=False, message=f'Only owner of knowledgebase authorized for this operation.',
code=RetCode.OPERATING_ERROR)

for doc in DocumentService.query(kb_id=req["dataset_id"]):
if not DocumentService.remove_document(doc, kbs[0].tenant_id):
return construct_json_result(
message="Database error (Document removal)!")
datasets = KnowledgebaseService.query(created_by=current_user.id, id=dataset_id)

# look up the dataset by its id
if not datasets:
return construct_json_result(message=f'The dataset cannot be found for your current account.',
code=RetCode.OPERATING_ERROR)

# Iterating the documents inside the dataset
for doc in DocumentService.query(kb_id=dataset_id):
if not DocumentService.remove_document(doc, datasets[0].tenant_id):
# the process of deleting failed
return construct_json_result(code=RetCode.DATA_ERROR,
message="There was an error during the document removal process. "
"Please check the status of the RAGFlow server and try the removal again.")
# delete the other files
f2d = File2DocumentService.get_by_document_id(doc.id)
FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
File2DocumentService.delete_by_document_id(doc.id)

if not KnowledgebaseService.delete_by_id(req["dataset_id"]):
return construct_json_result(
message="Database error (Knowledgebase removal)!")
return construct_json_result(code=RetCode.DATA_ERROR, message=f"attempt to remove dataset: {dataset_id}")
# delete the dataset
if not KnowledgebaseService.delete_by_id(dataset_id):
return construct_json_result(code=RetCode.DATA_ERROR, message="There was an error during the dataset removal process. "
"Please check the status of the RAGFlow server and try the removal again.")
# success
return construct_json_result(code=RetCode.SUCCESS, message=f"Remove dataset: {dataset_id} successfully")
except Exception as e:
return construct_error_response(e)

# ------------------------------ get details of a dataset ----------------------------------------

@manager.route('/<dataset_id>', methods=['GET'])
@login_required
@validate_request("dataset_id")
def get_dataset():
dataset_id = request.args["dataset_id"]
def get_dataset(dataset_id):
try:
dataset = KnowledgebaseService.get_detail(dataset_id)
if not dataset:
return construct_json_result(
message="Can't find this knowledgebase!")
return construct_json_result(code=RetCode.DATA_ERROR, message=f"attempt to get detail of dataset: {dataset_id}")
return construct_json_result(code=RetCode.DATA_ERROR, message="Can't find this dataset!")
return construct_json_result(data=dataset, code=RetCode.SUCCESS)
except Exception as e:
return construct_json_result(e)

# ------------------------------ update a dataset --------------------------------------------

@manager.route('/<dataset_id>', methods=['PUT'])
@login_required
@validate_request("name")
def update_dataset(dataset_id):
return construct_json_result(code=RetCode.DATA_ERROR, message=f"attempt to update dataset: {dataset_id}")
req = request.json
try:
# the request cannot be empty
if not req:
return construct_json_result(code=RetCode.DATA_ERROR, message="Please input at least one parameter that "
"you want to update!")
# check whether the dataset can be found
if not KnowledgebaseService.query(created_by=current_user.id, id=dataset_id):
return construct_json_result(message=f'Only the owner of knowledgebase is authorized for this operation!',
code=RetCode.OPERATING_ERROR)

exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
# check whether the dataset exists
if not exist:
return construct_json_result(code=RetCode.DATA_ERROR, message="This dataset cannot be found!")

if 'name' in req:
name = req["name"].strip()
# check for a duplicate name
if name.lower() != dataset.name.lower() \
and len(KnowledgebaseService.query(name=name, tenant_id=current_user.id,
status=StatusEnum.VALID.value)) > 1:
return construct_json_result(code=RetCode.DATA_ERROR, message=f"The name: {name.lower()} is already used by other "
f"datasets. Please choose a different name.")

dataset_updating_data = {}
chunk_num = req.get("chunk_num")
# modify the value of 11 parameters

# 2 parameters: embedding id and chunk method
# the embedding id can only be updated while chunk_num is 0
if req.get('embedding_model_id'):
if chunk_num == 0:
dataset_updating_data['embd_id'] = req['embedding_model_id']
else:
construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document in this "
"dataset, so you cannot change the embedding "
"model.")
# the chunk_method can only be updated while chunk_num is 0
if req.get("chunk_method"):
if chunk_num == 0:
dataset_updating_data['parser_id'] = req["chunk_method"]
else:
construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document "
"in this dataset, so you cannot "
"change the chunk method.")
# convert the photo parameter to avatar
if req.get("photo"):
dataset_updating_data['avatar'] = req["photo"]

# layout_recognize
if 'layout_recognize' in req:
if 'parser_config' not in dataset_updating_data:
dataset_updating_data['parser_config'] = {}
dataset_updating_data['parser_config']['layout_recognize'] = req['layout_recognize']

# TODO: updating use_raptor needs to construct a class

# 6 parameters
for key in ['name', 'language', 'description', 'permission', 'id', 'token_num']:
if key in req:
dataset_updating_data[key] = req.get(key)

# update
if not KnowledgebaseService.update_by_id(dataset.id, dataset_updating_data):
return construct_json_result(code=RetCode.OPERATING_ERROR, message="Failed to update! "
"Please check the status of RAGFlow "
"server and try again!")

exist, dataset = KnowledgebaseService.get_by_id(dataset.id)
if not exist:
return construct_json_result(code=RetCode.DATA_ERROR, message="Failed to get the dataset "
"using the dataset ID.")

return construct_json_result(data=dataset.to_json(), code=RetCode.SUCCESS)
except Exception as e:
return construct_error_response(e)
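For anyone reviewing the new routes end to end, below is a minimal sketch of exercising them with plain `requests`. The `/api/v1/dataset` mount point and the host address are taken from the SDK and test config later in this diff; the Bearer-token `Authorization` header format and the placeholder dataset id are assumptions, so adjust them to your deployment.

```python
# Minimal sketch of calling the dataset endpoints touched by this PR over HTTP.
# The Authorization header format and the placeholder dataset_id are assumptions.
import requests

BASE_URL = "http://127.0.0.1:9380/api/v1/dataset"      # host from sdk/python/test/common.py
HEADERS = {"Authorization": "Bearer <YOUR_API_KEY>"}   # assumed header format

# GET / -- list datasets with the documented query parameters
datasets = requests.get(
    BASE_URL,
    params={"offset": 0, "count": -1, "orderby": "create_time", "desc": True},
    headers=HEADERS,
).json()

dataset_id = "<dataset_id returned when the dataset was created>"  # placeholder

# GET /<dataset_id> -- fetch dataset details (new in this PR)
detail = requests.get(f"{BASE_URL}/{dataset_id}", headers=HEADERS).json()

# PUT /<dataset_id> -- update selected fields (new in this PR); embedding_model_id
# and chunk_method are only accepted while chunk_num is 0
updated = requests.put(
    f"{BASE_URL}/{dataset_id}",
    json={"name": "renamed_dataset", "description": "updated via the REST API"},
    headers=HEADERS,
).json()

# DELETE /<dataset_id> -- remove the dataset and its documents (fixed in this PR)
removed = requests.delete(f"{BASE_URL}/{dataset_id}", headers=HEADERS).json()

print(datasets, detail, updated, removed, sep="\n")
```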
3 changes: 2 additions & 1 deletion docs/references/ragflow_api.md
@@ -55,7 +55,8 @@ You are *required* to save the `data.id` value returned in the response data, wh
{
"code": 0,
"data": {
"dataset_name": "kb1"
"dataset_name": "kb1",
"dataset_id": "375e8ada2d3c11ef98f93043d7ee537e"
},
"message": "success"
}
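As a hedged illustration of the documented response above, a create call along the following lines should now return both `dataset_name` and `dataset_id`. The endpoint path and header format are assumptions based on the SDK further down, not something this documentation page specifies.

```python
# Sketch of a create-dataset request whose response now includes dataset_id.
# Endpoint path and Authorization header format are assumptions.
import requests

res = requests.post(
    "http://127.0.0.1:9380/api/v1/dataset",
    json={"name": "kb1"},
    headers={"Authorization": "Bearer <YOUR_API_KEY>"},
)
# Expected shape per the docs above:
# {"code": 0, "data": {"dataset_name": "kb1", "dataset_id": "..."}, "message": "success"}
print(res.json())
```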
60 changes: 15 additions & 45 deletions sdk/python/ragflow/ragflow.py
@@ -17,11 +17,9 @@
import requests
import json

from httpx import HTTPError


class RAGFlow:
def __init__(self, user_key, base_url, version = 'v1'):
def __init__(self, user_key, base_url, version='v1'):
'''
api_url: http://<host_address>/api/v1
dataset_url: http://<host_address>/api/v1/dataset
@@ -41,14 +39,10 @@ def create_dataset(self, dataset_name):

def delete_dataset(self, dataset_name):
dataset_id = self.find_dataset_id_by_name(dataset_name)
if not dataset_id:
return {"success": False, "message": "Dataset not found."}

res = requests.delete(f"{self.dataset_url}/{dataset_id}", headers=self.authorization_header)
if res.status_code == 200:
return {"success": True, "message": "Dataset deleted successfully!"}
else:
return {"success": False, "message": f"Other status code: {res.status_code}"}
endpoint = f"{self.dataset_url}/{dataset_id}"
res = requests.delete(endpoint, headers=self.authorization_header)
return res.json()

def find_dataset_id_by_name(self, dataset_name):
res = requests.get(self.dataset_url, headers=self.authorization_header)
@@ -64,42 +58,18 @@ def list_dataset(self, offset=0, count=-1, orderby="create_time", desc=True):
"orderby": orderby,
"desc": desc
}
try:
response = requests.get(url=self.dataset_url, params=params, headers=self.authorization_header)
response.raise_for_status() # if it is not 200
original_data = response.json()
# TODO: format the data
# print(original_data)
# # Process the original data into the desired format
# formatted_data = {
# "datasets": [
# {
# "id": dataset["id"],
# "created": dataset["create_time"], # Adjust the key based on the actual response
# "fileCount": dataset["doc_num"], # Adjust the key based on the actual response
# "name": dataset["name"]
# }
# for dataset in original_data
# ]
# }
return response.status_code, original_data
except HTTPError as http_err:
print(f"HTTP error occurred: {http_err}")
except Exception as err:
print(f"An error occurred: {err}")
response = requests.get(url=self.dataset_url, params=params, headers=self.authorization_header)
return response.json()

def get_dataset(self, dataset_id):
def get_dataset(self, dataset_name):
dataset_id = self.find_dataset_id_by_name(dataset_name)
endpoint = f"{self.dataset_url}/{dataset_id}"
response = requests.get(endpoint)
if response.status_code == 200:
return response.json()
else:
return None
response = requests.get(endpoint, headers=self.authorization_header)
return response.json()

def update_dataset(self, dataset_name, **params):
dataset_id = self.find_dataset_id_by_name(dataset_name)

def update_dataset(self, dataset_id, params):
endpoint = f"{self.dataset_url}/{dataset_id}"
response = requests.put(endpoint, json=params)
if response.status_code == 200:
return True
else:
return False
response = requests.put(endpoint, json=params, headers=self.authorization_header)
return response.json()
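To close the loop on the SDK side, here is a short usage sketch of the updated client methods. The import path and the exact return payloads are assumptions (only the method signatures are visible in this diff); the key and host mirror `sdk/python/test/common.py`.

```python
# Usage sketch for the updated RAGFlow Python SDK; the import path and return
# values are assumptions based on this diff, not a tested recipe.
from ragflow.ragflow import RAGFlow

client = RAGFlow(user_key="<YOUR_API_KEY>", base_url="http://127.0.0.1:9380")

print(client.create_dataset("kb1"))                          # create a dataset
print(client.list_dataset(offset=0, count=-1))               # list datasets
print(client.get_dataset("kb1"))                             # new: look up by name
print(client.update_dataset("kb1", description="demo kb"))   # new: update by name
print(client.delete_dataset("kb1"))                          # fixed: delete by name
```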
2 changes: 1 addition & 1 deletion sdk/python/test/common.py
@@ -1,4 +1,4 @@


API_KEY = 'ImFmNWQ3YTY0Mjg5NjExZWZhNTdjMzA0M2Q3ZWU1MzdlIg.ZmldwA.9oP9pVtuEQSpg-Z18A2eOkWO-3E'
API_KEY = 'ImFhMmJhZmUwMmQxNzExZWZhZDdmMzA0M2Q3ZWU1MzdlIg.ZnDsIQ.u-0-_qCRU6a4WICxyAPsjaafyOo'
HOST_ADDRESS = 'http://127.0.0.1:9380'