Added code to generate a classification report #9

Merged 5 commits on May 16, 2024

Changes from all commits
44 changes: 44 additions & 0 deletions README.md
@@ -121,6 +121,50 @@ By default method='wilson', the wilson interval, which behaves better for smaller

method can be one of ['wilson', 'normal', 'agresti_coull', 'beta', 'jeffreys', 'binom_test'], or one of the bootstrap methods.

## Get a Classification Report
The `classification_report_with_ci` function (defined in [classification_report.py](confidenceinterval%2Fclassification_report.py)) builds a report showing the main classification metrics and their confidence intervals, returned as a pandas DataFrame.
Each class is first treated as a binary (one-vs-rest) classification problem: by default the Wilson interval is used for precision and recall, and the Takahashi binary method for the F1 score. The
micro and macro multi-class metrics are then calculated using the Takahashi methods.

```python
from confidenceinterval import classification_report_with_ci

y_true = [0, 1, 2, 2, 2, 1, 1, 1, 0, 2, 2, 1, 0, 2, 2, 1, 2, 2, 1, 1]
y_pred = [0, 1, 0, 0, 2, 1, 1, 1, 0, 2, 2, 1, 0, 1, 2, 1, 2, 2, 1, 1]

classification_report_with_ci(y_true, y_pred)

     Class  Precision  Recall  F1-Score    Precision CI       Recall CI     F1-Score CI  Support
0  Class 0      0.600   1.000     0.750  (0.231, 0.882)    (0.439, 1.0)  (0.408, 1.092)        3
1  Class 1      0.889   1.000     0.941   (0.565, 0.98)    (0.676, 1.0)  (0.796, 1.086)        8
2  Class 2      1.000   0.667     0.800     (0.61, 1.0)  (0.354, 0.879)  (0.562, 1.038)        9
3    micro      0.850   0.850     0.850  (0.694, 1.006)  (0.694, 1.006)  (0.694, 1.006)       20
4    macro      0.830   0.889     0.830  (0.702, 0.958)  (0.775, 1.002)  (0.548, 1.113)       20
```
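
Under the hood, each per-class row is computed one-vs-rest: the labels are binarised for that class and the binary metrics are evaluated on them. As a rough sketch of what the `Class 2` row above corresponds to (using the same `precision_score` import that `classification_report.py` uses, and reusing `y_true`/`y_pred` from the snippet above):

```python
from confidenceinterval.takahashi_methods import precision_score

# One-vs-rest view of class 2: 1 where the label is 2, else 0.
y_true_c2 = [1 if y == 2 else 0 for y in y_true]
y_pred_c2 = [1 if y == 2 else 0 for y in y_pred]

p, p_ci = precision_score(y_true_c2, y_pred_c2, average='binary', method='wilson')
print(p, p_ci)  # roughly 1.0, (0.61, 1.0), matching the Class 2 row above
```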
You can also provide a custom mapping for the class names, as well as modify the binary CI method and rounding.
```python
from confidenceinterval import classification_report_with_ci

y_true = [0, 1, 2, 2, 2, 1, 1, 1, 0, 2, 2, 1, 0, 2, 2, 1, 2, 2, 1, 1]
y_pred = [0, 1, 0, 0, 2, 1, 1, 1, 0, 2, 2, 1, 0, 1, 2, 1, 2, 2, 1, 1]

numerical_to_label = {
    0: "Cherries",
    1: "Olives",
    2: "Tangerines"
}

classification_report_with_ci(y_true, y_pred, round_ndigits=2, numerical_to_label_map=numerical_to_label, binary_method='wilson')

        Class  Precision  Recall  F1-Score  Precision CI     Recall CI   F1-Score CI  Support
0    Cherries       0.60    1.00      0.75  (0.23, 0.88)   (0.44, 1.0)  (0.41, 1.09)        3
1      Olives       0.89    1.00      0.94  (0.57, 0.98)   (0.68, 1.0)   (0.8, 1.09)        8
2  Tangerines       1.00    0.67      0.80   (0.61, 1.0)  (0.35, 0.88)  (0.56, 1.04)        9
3       micro       0.85    0.85      0.85  (0.69, 1.01)  (0.69, 1.01)  (0.69, 1.01)       20
4       macro       0.83    0.89      0.83   (0.7, 0.96)   (0.78, 1.0)  (0.55, 1.11)       20
```
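
Since `classification_report_with_ci` returns a `pandas.DataFrame` (see `classification_report.py` below) rather than a plain string, the report can be post-processed like any ordinary DataFrame. A small sketch, reusing `y_true` and `y_pred` from above; the output file name is just an example:

```python
report = classification_report_with_ci(y_true, y_pred, round_ndigits=2)

# Save the report, or pull out a single cell programmatically.
report.to_csv("classification_report.csv", index=False)

macro_f1_ci = report.loc[report["Class"] == "macro", "F1-Score CI"].iloc[0]
print(macro_f1_ci)  # e.g. (0.55, 1.11)
```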


## Get a confidence interval for any custom metric with Bootstrapping
With the bootstrap_ci method, you can get the CI for any metric function that takes y_true and y_pred as arguments.
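
For example, here is a minimal sketch of wrapping an arbitrary scikit-learn metric. The exact keyword names and the `'bootstrap_bca'` method name are assumptions; check `confidenceinterval/bootstrap.py` for the supported arguments:

```python
import numpy as np
from sklearn.metrics import balanced_accuracy_score
from confidenceinterval.bootstrap import bootstrap_ci

y_true = [0, 1, 2, 2, 2, 1, 1, 1, 0, 2, 2, 1, 0, 2, 2, 1, 2, 2, 1, 1]
y_pred = [0, 1, 0, 0, 2, 1, 1, 1, 0, 2, 2, 1, 0, 1, 2, 1, 2, 2, 1, 1]

# Any callable taking (y_true, y_pred) can be passed as the metric.
score, ci = bootstrap_ci(y_true=y_true,
                         y_pred=y_pred,
                         metric=balanced_accuracy_score,
                         confidence_level=0.95,
                         n_resamples=5000,
                         method='bootstrap_bca',  # assumed name of the BCa resampling method
                         random_state=np.random.default_rng(42))
print(score, ci)
```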

2 changes: 2 additions & 0 deletions confidenceinterval/__init__.py
@@ -11,3 +11,5 @@
recall_score, \
f1_score
from confidenceinterval.auc import roc_auc_score

from confidenceinterval.classification_report import classification_report_with_ci
116 changes: 116 additions & 0 deletions confidenceinterval/classification_report.py
@@ -0,0 +1,116 @@
""" Classification report similar to https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html
"""

from typing import List, Dict, Optional

import pandas as pd
import numpy as np
from confidenceinterval.takahashi_methods import precision_score, recall_score, f1_score

def round_tuple(t, decimals=3):
    return tuple(round(num, decimals) for num in t)


def classification_report_with_ci(y_true: List[int], y_pred: List[int],
                                  binary_method: str = 'wilson',
                                  round_ndigits: int = 3,
                                  numerical_to_label_map: Optional[Dict[int, str]] = None,
                                  confidence_level: float = 0.95) -> pd.DataFrame:
    """Build a classification report with per-class and micro/macro averaged
    precision, recall and F1 scores, together with their confidence intervals.

    Parameters
    ----------
    y_true : List[int]
        The ground truth labels.
    y_pred : List[int]
        The predicted categories.
    binary_method: str = 'wilson'
        The method to calculate the CI for binary proportions.
    round_ndigits: int = 3
        Number of digits to return after the decimal point.
    numerical_to_label_map: Optional[Dict[int, str]]
        Mapping from class indices to descriptive names.
    confidence_level: float, optional
        The confidence level, by default 0.95

    Returns
    -------
    pd.DataFrame
        A DataFrame containing precision, recall, F1-score, and their confidence intervals for each class,
        as well as micro and macro averages.
    """

    # Unique classes in the dataset
    classes = np.unique(y_true)

    # Validate that all unique classes are covered in the numerical_to_label_map if provided
    if numerical_to_label_map is not None:
        missing_labels = [cls for cls in classes if cls not in numerical_to_label_map]
        if missing_labels:
            raise ValueError(f'Missing labels for classes: {missing_labels}')

    data = []  # List to store row dictionaries

    # Calculate precision, recall, f1 for each class treated as binary (one-vs-rest)
    for class_ in classes:
        y_true_binary = [1 if y == class_ else 0 for y in y_true]
        y_pred_binary = [1 if y == class_ else 0 for y in y_pred]

        # Calculate metrics
        precision, precision_ci = precision_score(y_true_binary, y_pred_binary, average='binary', method=binary_method)
        recall, recall_ci = recall_score(y_true_binary, y_pred_binary, average='binary', method=binary_method)
        binary_f1, binary_f1_ci = f1_score(y_true_binary, y_pred_binary, confidence_level=confidence_level, average='binary')

        class_name = numerical_to_label_map[class_] if (
            numerical_to_label_map and class_ in numerical_to_label_map) else f'Class {class_}'
        support = sum(y_true_binary)

        # Append the per-class metrics as a new row
        data.append({
            'Class': class_name,
            'Precision': round(precision, round_ndigits),
            'Recall': round(recall, round_ndigits),
            'F1-Score': round(binary_f1, round_ndigits),
            'Precision CI': round_tuple(precision_ci, round_ndigits),
            'Recall CI': round_tuple(recall_ci, round_ndigits),
            'F1-Score CI': round_tuple(binary_f1_ci, round_ndigits),
            'Support': support
        })

    precision_micro, p_ci_micro = precision_score(y_true, y_pred, average='micro')
    precision_macro, p_ci_macro = precision_score(y_true, y_pred, average='macro')

    recall_micro, r_ci_micro = recall_score(y_true, y_pred, average='micro')
    recall_macro, r_ci_macro = recall_score(y_true, y_pred, average='macro')

    f1_micro, f1_ci_micro = f1_score(y_true, y_pred, average='micro')
    f1_macro, f1_ci_macro = f1_score(y_true, y_pred, average='macro')

    data.append({
        'Class': 'micro',
        'Precision': round(precision_micro, round_ndigits),
        'Recall': round(recall_micro, round_ndigits),
        'F1-Score': round(f1_micro, round_ndigits),
        'Precision CI': round_tuple(p_ci_micro, round_ndigits),
        'Recall CI': round_tuple(r_ci_micro, round_ndigits),
        'F1-Score CI': round_tuple(f1_ci_micro, round_ndigits),
        'Support': len(y_true)
    })

    data.append({
        'Class': 'macro',
        'Precision': round(precision_macro, round_ndigits),
        'Recall': round(recall_macro, round_ndigits),
        'F1-Score': round(f1_macro, round_ndigits),
        'Precision CI': round_tuple(p_ci_macro, round_ndigits),
        'Recall CI': round_tuple(r_ci_macro, round_ndigits),
        'F1-Score CI': round_tuple(f1_ci_macro, round_ndigits),
        'Support': len(y_true)
    })

    df = pd.DataFrame(data)

    return df
10 changes: 6 additions & 4 deletions requirements.txt
@@ -1,4 +1,6 @@
statsmodels
scikit-learn
scipy
numpy
statsmodels~=0.14.2
scikit-learn~=1.4.2
scipy~=1.13.0
numpy~=1.26.4
setuptools~=68.2.0
pandas~=2.2.2