Skip to content

Commit

Permalink
Merge pull request #192 from alan-turing-institute/azure-fileshare-downloader
Browse files: browse the repository at this point in the history

Azure fileshare downloader
  • Loading branch information
rchan26 authored Jun 13, 2024
2 parents bb0a9db + 79bea08 commit 9c32d82
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 16 deletions.
54 changes: 44 additions & 10 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ openai = "^1.33.0"
pandas = "^2.2.2"
pulumi = { version="^3.100.0", optional=true }
pulumi-azure-native = { version="^2.24.0", optional=true }
pydantic = { version="^2.4.1", optional=true }
pydantic = { version="^2.7.4", optional=true }
requests = { version="^2.32.3", optional=true }
safetensors = "^0.4.3"
slack-sdk = "^3.27.2"
Expand All @@ -52,10 +52,11 @@ llama-index-llms-ollama = "^0.1.5"
llama-index-llms-llama-cpp = "^0.1.3"
llama-index-readers-file = "^0.1.23"
llama-index-embeddings-langchain = "^0.1.2"
typer = {extras = ["all"], version = "^0.12.3"}
typer = "^0.12.3"
langchain-community = "^0.2.4"
tiktoken = "^0.7.0"
llama-index-embeddings-huggingface = "^0.2.1"
azure-storage-file-share = "^12.16.0"
rich = "^13.7.1"


Expand Down
28 changes: 28 additions & 0 deletions reginald/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,3 +376,31 @@ def chat(
n_gpu_layers=n_gpu_layers,
device=device,
)


@cli.command()
def download(
    data_dir: Annotated[
        str, typer.Option(envvar="LLAMA_INDEX_DATA_DIR")
    ] = DEFAULT_ARGS["data_dir"],
    which_index: Annotated[
        str, typer.Option(envvar="LLAMA_INDEX_WHICH_INDEX")
    ] = DEFAULT_ARGS["which_index"],
    azure_storage_key: Annotated[
        Optional[str], typer.Option(envvar="AZURE_STORAGE_KEY")
    ] = None,
    connection_str: Annotated[
        Optional[str], typer.Option(envvar="AZURE_STORAGE_CONNECTION_STR")
    ] = None,
) -> None:
    """
    Download data from an Azure file share.
    """
    # NOTE: the docstring above is kept short on purpose - typer surfaces a
    # command's docstring as its help text, so it is user-facing output.
    #
    # Each option can be given on the command line or through the environment
    # variable named in its ``typer.Option``; the two Azure credentials have
    # no default and arrive as ``None`` when neither is supplied.

    # 20 is the numeric value of ``logging.INFO`` in the standard library
    set_up_logging_config(level=20)

    # gather the options that are forwarded unchanged to the "download" runner
    fileshare_options = {
        "data_dir": data_dir,
        "which_index": which_index,
        "azure_storage_key": azure_storage_key,
        "connection_str": connection_str,
    }
    main(cli="download", **fileshare_options)
71 changes: 67 additions & 4 deletions reginald/run.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import logging
import pathlib
import sys
from typing import Final

Expand All @@ -17,16 +18,14 @@
setup_slack_bot,
setup_slack_client,
)
from reginald.utils import create_folder

LISTENING_MSG: Final[str] = "Listening for requests..."


async def run_bot(api_url: str | None, emoji: str) -> None:
if api_url is None:
logging.error(
"API URL is not set. Please set the REGINALD_API_URL "
"environment variable or pass in the --api-url argument"
)
logging.error("api_url is not set.")
sys.exit(1)

# set up slack bot
Expand Down Expand Up @@ -91,11 +90,73 @@ async def connect_client(client: SocketModeClient) -> None:
await asyncio.sleep(float("inf"))


def download_from_fileshare(
data_dir: pathlib.Path | str,
which_index: str,
azure_storage_key: str | None,
connection_str: str | None,
) -> None:
from azure.storage.fileshare import ShareClient
from tqdm import tqdm

if azure_storage_key is None:
logging.error("azure_storage_key is not set.")
sys.exit(1)
if connection_str is None:
logging.error("connection_str is not set.")
sys.exit(1)

# set the file share name and directory
file_share_name = "llama-data"
file_share_directory = f"llama_index_indices/{which_index}"

# create a ShareClient object
share_client = ShareClient.from_connection_string(
conn_str=connection_str,
share_name=file_share_name,
credential=azure_storage_key,
)

# get a reference to the file share directory
file_share_directory_client = share_client.get_directory_client(
file_share_directory
)

# set the local download directory
local_download_directory = (
pathlib.Path(data_dir) / "llama_index_indices" / which_index
)

# create folder if does not exist
create_folder(local_download_directory)

# list all the files in the directory
files_list = file_share_directory_client.list_directories_and_files()

# check if the index exists
try:
files_list = list(files_list)
except:
logging.error(f"Index {which_index} does not exist in the file share")
sys.exit(1)

# iterate through each file in the list and download it
for file in tqdm(files_list):
if not file.is_directory:
file_client = file_share_directory_client.get_file_client(file.name)
download_path = local_download_directory / file.name
with open(download_path, "wb") as file_handle:
data = file_client.download_file()
data.readinto(file_handle)


def main(
cli: str,
api_url: str | None = None,
emoji: str = EMOJI_DEFAULT,
streaming: bool = False,
data_dir: str | None = None,
which_index: str | None = None,
**kwargs,
):
# initialise logging
Expand All @@ -109,6 +170,8 @@ def main(
run_chat_interact(streaming=streaming, **kwargs)
elif cli == "create_index":
create_index(**kwargs)
elif cli == "download":
download_from_fileshare(data_dir=data_dir, which_index=which_index, **kwargs)
else:
logging.info("No run options selected.")

Expand Down
16 changes: 16 additions & 0 deletions reginald/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,19 @@ def get_env_var(
logging.warn(f"Environment variable '{var}' not found.")

return value


def create_folder(folder: str) -> None:
"""
Function to create a folder if it does not already exist.
Parameters
----------
folder : str
Name of the folder to be created.
"""
if not os.path.exists(folder):
logging.info(f"Creating folder '{folder}'")
os.makedirs(folder)
else:
logging.info(f"Folder '{folder}' already exists")

0 comments on commit 9c32d82

Please sign in to comment.