diff --git a/README.md b/README.md
index 1138709..733e34d 100644
--- a/README.md
+++ b/README.md
@@ -121,6 +121,50 @@ By default method='wilson', the wilson interval, which behaves better for smalle
 method can be one of ['wilson', 'normal', 'agresti_coull', 'beta', 'jeffreys', 'binom_test'], or one of the boostrap methods.
 
+## Get a Classification Report
+The [classification_report_with_ci](confidenceinterval%2Fclassification_report.py) function builds a report showing the main classification metrics and their confidence intervals, returned as a pandas DataFrame.
+Each class is first treated as a binary classification problem: the default CI method for precision and recall is Wilson, and the binary Takahashi method is used for F1. The
+micro and macro multi-class metrics are then calculated using the Takahashi methods.
+
+```python
+from confidenceinterval import classification_report_with_ci
+
+y_true = [0, 1, 2, 2, 2, 1, 1, 1, 0, 2, 2, 1, 0, 2, 2, 1, 2, 2, 1, 1]
+y_pred = [0, 1, 0, 0, 2, 1, 1, 1, 0, 2, 2, 1, 0, 1, 2, 1, 2, 2, 1, 1]
+
+classification_report_with_ci(y_true, y_pred)
+
+     Class  Precision  Recall  F1-Score    Precision CI       Recall CI     F1-Score CI  Support
+0  Class 0      0.600   1.000     0.750  (0.231, 0.882)    (0.439, 1.0)  (0.408, 1.092)        3
+1  Class 1      0.889   1.000     0.941   (0.565, 0.98)    (0.676, 1.0)  (0.796, 1.086)        8
+2  Class 2      1.000   0.667     0.800     (0.61, 1.0)  (0.354, 0.879)  (0.562, 1.038)        9
+3    micro      0.850   0.850     0.850  (0.694, 1.006)  (0.694, 1.006)  (0.694, 1.006)       20
+4    macro      0.830   0.889     0.830  (0.702, 0.958)  (0.775, 1.002)  (0.548, 1.113)       20
+```
+You can also provide a custom mapping for the class names, as well as change the binary CI method and the rounding.
+```python
+from confidenceinterval import classification_report_with_ci
+
+y_true = [0, 1, 2, 2, 2, 1, 1, 1, 0, 2, 2, 1, 0, 2, 2, 1, 2, 2, 1, 1]
+y_pred = [0, 1, 0, 0, 2, 1, 1, 1, 0, 2, 2, 1, 0, 1, 2, 1, 2, 2, 1, 1]
+
+numerical_to_label = {
+    0: "Cherries",
+    1: "Olives",
+    2: "Tangerines"
+}
+
+classification_report_with_ci(y_true, y_pred, round_ndigits=2, numerical_to_label_map=numerical_to_label, binary_method='wilson')
+
+        Class  Precision  Recall  F1-Score  Precision CI     Recall CI   F1-Score CI  Support
+0    Cherries       0.60    1.00      0.75  (0.23, 0.88)   (0.44, 1.0)  (0.41, 1.09)        3
+1      Olives       0.89    1.00      0.94  (0.57, 0.98)   (0.68, 1.0)   (0.8, 1.09)        8
+2  Tangerines       1.00    0.67      0.80   (0.61, 1.0)  (0.35, 0.88)  (0.56, 1.04)        9
+3       micro       0.85    0.85      0.85  (0.69, 1.01)  (0.69, 1.01)  (0.69, 1.01)       20
+4       macro       0.83    0.89      0.83   (0.7, 0.96)   (0.78, 1.0)  (0.55, 1.11)       20
+```
+
+
 ## Get a confidence interval for any custom metric with Bootstrapping
 With the bootstrap_ci method, you can get the CI for any metric function that gets y_true and y_pred as arguments.
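Each per-class row in the report above is equivalent to binarizing the labels against that class and calling the package's exported binary metrics directly, which is what `classification_report_with_ci` does internally. A minimal sketch of that equivalence for the `Class 0` row; the expected values in the comment are simply the ones from the table above:

```python
from confidenceinterval import precision_score, recall_score, f1_score

y_true = [0, 1, 2, 2, 2, 1, 1, 1, 0, 2, 2, 1, 0, 2, 2, 1, 2, 2, 1, 1]
y_pred = [0, 1, 0, 0, 2, 1, 1, 1, 0, 2, 2, 1, 0, 1, 2, 1, 2, 2, 1, 1]

# Binarize against class 0 (the "Class 0" row of the report above).
y_true_bin = [1 if y == 0 else 0 for y in y_true]
y_pred_bin = [1 if y == 0 else 0 for y in y_pred]

# Wilson CIs for precision and recall, Takahashi-binary CI for F1,
# matching the defaults of classification_report_with_ci.
p, p_ci = precision_score(y_true_bin, y_pred_bin, average='binary', method='wilson')
r, r_ci = recall_score(y_true_bin, y_pred_bin, average='binary', method='wilson')
f1, f1_ci = f1_score(y_true_bin, y_pred_bin, average='binary')

print(p, p_ci)  # 0.600, (0.231, 0.882) as in the Class 0 row above
```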
diff --git a/confidenceinterval/__init__.py b/confidenceinterval/__init__.py
index 1984478..634a8d9 100644
--- a/confidenceinterval/__init__.py
+++ b/confidenceinterval/__init__.py
@@ -11,3 +11,5 @@
     recall_score, \
     f1_score
 from confidenceinterval.auc import roc_auc_score
+
+from confidenceinterval.classification_report import classification_report_with_ci
diff --git a/confidenceinterval/classification_report.py b/confidenceinterval/classification_report.py
new file mode 100644
index 0000000..db6cbe6
--- /dev/null
+++ b/confidenceinterval/classification_report.py
@@ -0,0 +1,116 @@
+""" Classification report similar to https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html
+"""
+
+from typing import List, Dict, Optional
+
+import pandas as pd
+import numpy as np
+
+from confidenceinterval.takahashi_methods import precision_score, recall_score, f1_score
+
+
+def round_tuple(t, decimals=3):
+    """Round every element of a tuple to the given number of decimals."""
+    return tuple(round(num, decimals) for num in t)
+
+
+def classification_report_with_ci(y_true: List[int], y_pred: List[int],
+                                  binary_method: str = 'wilson',
+                                  round_ndigits: int = 3,
+                                  numerical_to_label_map: Optional[Dict[int, str]] = None,
+                                  confidence_level: float = 0.95) -> pd.DataFrame:
+    """Build a classification report with confidence intervals for every metric.
+
+    Parameters
+    ----------
+    y_true : List[int]
+        The ground truth labels.
+    y_pred : List[int]
+        The predicted categories.
+    binary_method : str = 'wilson'
+        The method used to calculate the CI for binary proportions (per-class precision and recall).
+    round_ndigits : int = 3
+        Number of digits to keep after the decimal point.
+    numerical_to_label_map : Optional[Dict[int, str]]
+        Mapping from class indices to descriptive names.
+    confidence_level : float, optional
+        The confidence level, by default 0.95.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame containing precision, recall, F1-score, and their confidence intervals for each class,
+        as well as micro and macro averages.
+    """
+
+    # Unique classes in the dataset
+    classes = np.unique(y_true)
+
+    # Validate that all unique classes are covered by numerical_to_label_map, if provided
+    if numerical_to_label_map is not None:
+        missing_labels = [cls for cls in classes if cls not in numerical_to_label_map]
+        if missing_labels:
+            raise ValueError(f'Missing labels for classes: {missing_labels}')
+
+    data = []  # List to store one row dictionary per report line
+
+    # Calculate precision, recall and F1 for each class treated as a binary problem
+    for class_ in classes:
+        y_true_binary = [1 if y == class_ else 0 for y in y_true]
+        y_pred_binary = [1 if y == class_ else 0 for y in y_pred]
+
+        # Binary metrics and their confidence intervals for this class
+        precision, precision_ci = precision_score(y_true_binary, y_pred_binary,
+                                                  confidence_level=confidence_level,
+                                                  average='binary', method=binary_method)
+        recall, recall_ci = recall_score(y_true_binary, y_pred_binary,
+                                         confidence_level=confidence_level,
+                                         average='binary', method=binary_method)
+        binary_f1, binary_f1_ci = f1_score(y_true_binary, y_pred_binary,
+                                           confidence_level=confidence_level,
+                                           average='binary')
+
+        class_name = numerical_to_label_map[class_] if (
+            numerical_to_label_map and class_ in numerical_to_label_map) else f'Class {class_}'
+        support = sum(y_true_binary)
+
+        # Append the per-class row to the list
+        data.append({
+            'Class': class_name,
+            'Precision': round(precision, round_ndigits),
+            'Recall': round(recall, round_ndigits),
+            'F1-Score': round(binary_f1, round_ndigits),
+            'Precision CI': round_tuple(precision_ci, round_ndigits),
+            'Recall CI': round_tuple(recall_ci, round_ndigits),
+            'F1-Score CI': round_tuple(binary_f1_ci, round_ndigits),
+            'Support': support
+        })
+
+    # Micro and macro averaged multi-class metrics (Takahashi methods)
+    precision_micro, p_ci_micro = precision_score(y_true, y_pred, confidence_level=confidence_level, average='micro')
+    precision_macro, p_ci_macro = precision_score(y_true, y_pred, confidence_level=confidence_level, average='macro')
+
+    recall_micro, r_ci_micro = recall_score(y_true, y_pred, confidence_level=confidence_level, average='micro')
+    recall_macro, r_ci_macro = recall_score(y_true, y_pred, confidence_level=confidence_level, average='macro')
+
+    f1_micro, f1_ci_micro = f1_score(y_true, y_pred, confidence_level=confidence_level, average='micro')
+    f1_macro, f1_ci_macro = f1_score(y_true, y_pred, confidence_level=confidence_level, average='macro')
+
+    data.append({
+        'Class': 'micro',
+        'Precision': round(precision_micro, round_ndigits),
+        'Recall': round(recall_micro, round_ndigits),
+        'F1-Score': round(f1_micro, round_ndigits),
+        'Precision CI': round_tuple(p_ci_micro, round_ndigits),
+        'Recall CI': round_tuple(r_ci_micro, round_ndigits),
+        'F1-Score CI': round_tuple(f1_ci_micro, round_ndigits),
+        'Support': len(y_true)
+    })
+
+    data.append({
+        'Class': 'macro',
+        'Precision': round(precision_macro, round_ndigits),
+        'Recall': round(recall_macro, round_ndigits),
+        'F1-Score': round(f1_macro, round_ndigits),
+        'Precision CI': round_tuple(p_ci_macro, round_ndigits),
+        'Recall CI': round_tuple(r_ci_macro, round_ndigits),
+        'F1-Score CI': round_tuple(f1_ci_macro, round_ndigits),
+        'Support': len(y_true)
+    })
+
+    df = pd.DataFrame(data)
+
+    return df
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 722f416..f1fc3cf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,6 @@
-statsmodels
-scikit-learn
-scipy
-numpy
\ No newline at end of file
+statsmodels~=0.14.2
+scikit-learn~=1.4.2
+scipy~=1.13.0
+numpy~=1.26.4
+setuptools~=68.2.0
+pandas~=2.2.2
\ No newline at end of file
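Since `classification_report_with_ci` returns a plain pandas DataFrame (hence the new `pandas` requirement above), any further filtering or exporting is ordinary pandas. A small usage sketch, assuming only the arguments documented in the function signature; the 90% confidence level and the output filename are illustrative:

```python
from confidenceinterval import classification_report_with_ci

y_true = [0, 1, 2, 2, 2, 1, 1, 1, 0, 2, 2, 1, 0, 2, 2, 1, 2, 2, 1, 1]
y_pred = [0, 1, 0, 0, 2, 1, 1, 1, 0, 2, 2, 1, 0, 1, 2, 1, 2, 2, 1, 1]

# Build the report with 90% confidence intervals instead of the default 0.95.
report = classification_report_with_ci(y_true, y_pred, confidence_level=0.9)

# Keep only the per-class rows and write them out (illustrative filename).
per_class = report[~report['Class'].isin(['micro', 'macro'])]
per_class.to_csv('classification_report.csv', index=False)
```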