# userFiltering.py

import warnings
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Silence every warning category; this filter applies process-wide.
warnings.filterwarnings(action="ignore")

# Maps the single-letter learner codes to the full names returned to callers.
res = dict(
    A="Auditory",
    V="Visual",
    K="Kinesthetic",
)

# Function to load and preprocess data
def load_and_preprocess_data(file_path):
    """Load the learner dataset and return the scaled training split.

    The CSV must contain a 'Learner' column (the target) and a 'Gender'
    column; both are label-encoded to integers. Every remaining column is
    used as a feature and is presumably already numeric (TODO confirm
    against the dataset).

    Args:
        file_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A 4-tuple ``(X_train, y_train, scaler, label_encoder)``:
        - X_train: standardized feature matrix for the training rows.
        - y_train: pandas Series of encoded 'Learner' labels for those rows.
        - scaler: the ``StandardScaler`` fitted on the training features.
        - label_encoder: the ``LabelEncoder`` fitted on the 'Learner' column,
          needed to map predictions back to the original labels.

    Note:
        The encoder used for 'Gender' is not returned, so callers must
        supply Gender values that are already encoded the same way.
    """
    data = pd.read_csv(file_path)

    # Encode the target column (Learner Type); keep the encoder so callers
    # can invert the mapping.
    label_encoder = LabelEncoder()
    data['Learner'] = label_encoder.fit_transform(data['Learner'])

    # Encode the categorical Gender feature (throw-away encoder).
    data['Gender'] = LabelEncoder().fit_transform(data['Gender'])

    # Separate features and target.
    X = data.drop(columns=['Learner'])
    y = data['Learner']

    # Keep the same deterministic 80/20 split as before so the returned
    # training rows are unchanged; the held-out 20% is not used by this
    # function, so it is discarded instead of being scaled for nothing.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize the features using statistics of the training rows only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

    return X_train, y_train, scaler, label_encoder

# Function to predict the learner type for a new user
def predict_learner_type(new_user_input, file_path="./extras/SL_csv.csv", k=5):
    """Predict a new user's learner type from the k most similar known users.

    The new user's features are standardized with the scaler fitted on the
    training data, compared to every training row by cosine similarity, and
    the most common learner type among the ``k`` most similar rows is
    returned.

    Args:
        new_user_input: Feature values for one user. Must have one value per
            feature column of the dataset (every column except 'Learner');
            presumably in the CSV's column order with Gender already encoded
            numerically (TODO confirm against the caller).
        file_path: Path of the CSV dataset to compare against.
        k: Number of most-similar training users that vote. Must be >= 1.
            If it exceeds the number of training rows, all rows vote.

    Returns:
        The learner type name looked up in ``res`` (e.g. "Auditory").

    Raises:
        ValueError: If ``k`` is less than 1.
        KeyError: If the predicted label is not a key of ``res``.
    """
    # Reject non-positive k before doing any work: the slice [-0:] would
    # silently select *every* training row, and a negative k would merely
    # drop the |k| least similar rows instead of picking neighbours.
    if k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))

    # NOTE: the dataset is re-read and the scaler re-fitted on every call.
    X_train, y_train, scaler, label_encoder = load_and_preprocess_data(file_path)

    # Scale the input using the same scaler (wrapped in a list: one sample).
    new_user_scaled = scaler.transform([new_user_input])

    # Cosine similarity between the new user and each training row.
    new_user_similarity = cosine_similarity(new_user_scaled, X_train).flatten()

    # argsort is ascending, so the last k positions are the most similar rows.
    top_k_indices = np.argsort(new_user_similarity)[-k:]

    # Positional lookup: y_train keeps the original DataFrame index after the
    # split, so .iloc (not label-based indexing) is required here.
    top_k_types = y_train.iloc[top_k_indices]

    # Majority vote; on a tie, mode() lists the tied labels in sorted order,
    # so the smallest encoded label wins.
    predicted_label = top_k_types.mode()[0]
    predicted_type = label_encoder.inverse_transform([predicted_label])[0]
    predicted_type = res[predicted_type]

    return predicted_type