working modeling code

facebookexperimental · Dec 1, 2024 · 0d5974a · 0d5974a
1 parent 95cb36f
commit 0d5974a
Showing 5 changed files with 802,275 additions and 985 deletions.
diff --git a/python/src/robyn/modeling/ridge/ridge_evaluate_model.py b/python/src/robyn/modeling/ridge/ridge_evaluate_model.py
@@ -223,6 +223,15 @@ def _evaluate_model(
         self.logger.debug(f"Sample of X values: {X.head()}")
         self.logger.debug(f"Sample of y values: {y.head()}")
 
+        # Debug is True by default now
+        debug = True
+
+        if debug and (iter_ng == 0 or iter_ng % 25 == 0):
+            print(f"\nEvaluation Debug (trial {trial}, iteration {iter_ng}):")
+            print(f"X shape: {X.shape}")
+            print(f"y shape: {y.shape}")
+            print("Parameters:", params)
+
         # Split data using R's approach
         train_size = params.get("train_size", 1.0) if ts_validation else 1.0
         train_idx = int(len(X) * train_size)
@@ -246,7 +255,7 @@ def _evaluate_model(
         # Calculate lambda using R-matching helper function
         lambda_hp = params.get("lambda", 1.0)
         lambda_, lambda_max = self.ridge_metrics_calculator._calculate_lambda(
-            x_norm, y_norm, lambda_hp
+            x_norm, y_norm, lambda_hp, debug=debug, iteration=iter_ng
         )
         # After calculating lambda
         self.logger.debug(f"Lambda calculation debug:")
@@ -264,7 +273,7 @@ def _evaluate_model(
             y_norm, y_train_pred, x_norm.shape[1]
         )
         metrics["nrmse_train"] = self.ridge_metrics_calculator.calculate_nrmse(
-            y_norm, y_train_pred
+            y_norm, y_train_pred, debug=debug, iteration=iter_ng
         )
 
         # Validation and test metrics
@@ -299,7 +308,12 @@ def _evaluate_model(
             if col in self.mmm_data.mmmdata_spec.paid_media_spends
         ]
         decomp_rssd = self.ridge_metrics_calculator._calculate_rssd(
-            model, X_train, paid_media_cols, rssd_zero_penalty
+            model,
+            X_train,
+            paid_media_cols,
+            rssd_zero_penalty,
+            debug=debug,
+            iteration=iter_ng,
         )
 
         elapsed_time = time.time() - start_time

diff --git a/python/src/robyn/modeling/ridge/ridge_metrics_calculator.py b/python/src/robyn/modeling/ridge/ridge_metrics_calculator.py
@@ -157,38 +157,52 @@ def _calculate_decomp_spend_dist(
         return df[required_cols]
 
     def _calculate_lambda(
-        self, x_norm: np.ndarray, y_norm: np.ndarray, lambda_hp: float
+        self,
+        x_norm: np.ndarray,
+        y_norm: np.ndarray,
+        lambda_hp: float,
+        debug: bool = True,
+        iteration: int = 0,
     ) -> Tuple[float, float]:
         """Match R's glmnet lambda calculation exactly"""
         n_samples = len(y_norm)
 
-        # Scale to original magnitude like R
-        scale_factor = np.max(np.abs(x_norm)) * np.max(np.abs(y_norm))
-
-        # R-style standardization
+        # Standardize first before scale factor calculation
         def r_scale(x):
             mean = np.mean(x)
-            var = np.sum((x - mean) ** 2) / (len(x) - 1)
-            sd = np.sqrt(var)
-            return (x - mean) / sd if sd > 1e-10 else np.zeros_like(x)
+            sd = np.sqrt(np.sum((x - mean) ** 2) / (len(x) - 1))
+            return (x - mean) / (sd if sd > 1e-10 else 1.0)
 
-        # Scale X and y
+        # Scale X and y first
         x_scaled = np.apply_along_axis(r_scale, 0, x_norm)
         y_scaled = r_scale(y_norm)
 
-        # R's lambda calculation with original magnitude
+        # Now calculate scale factor using scaled data
+        scale_factor = np.mean(np.abs(x_scaled)) * np.mean(np.abs(y_scaled))
+
+        if debug and (iteration == 0 or iteration % 25 == 0):
+            print(f"\nLambda Calculation Debug (iteration {iteration}):")
+            print(f"n_samples: {n_samples}")
+            print(f"x_scaled mean abs: {np.mean(np.abs(x_scaled)):.6f}")
+            print(f"y_scaled mean abs: {np.mean(np.abs(y_scaled)):.6f}")
+            print(f"Scale factor: {scale_factor:.6f}")
+            print(f"lambda_hp: {lambda_hp:.6f}")
+
+        # R's lambda calculation
         alpha = 0.001
         gram = x_scaled.T @ x_scaled / n_samples
         ctx = np.abs(x_scaled.T @ y_scaled) / n_samples
 
-        # Rescale to match R's magnitude
-        lambda_max = np.max(ctx) * scale_factor / alpha
-
-        # R's lambda interpolation
-        lambda_min_ratio = 0.0001
-        lambda_min = lambda_min_ratio * lambda_max
+        lambda_max = np.max(ctx) / alpha
+        lambda_min = lambda_max * 0.0001
         lambda_ = lambda_min + lambda_hp * (lambda_max - lambda_min)
 
+        if debug and (iteration == 0 or iteration % 25 == 0):
+            print(f"max ctx: {np.max(ctx):.6f}")
+            print(f"lambda_max: {lambda_max:.6f}")
+            print(f"lambda_min: {lambda_min:.6f}")
+            print(f"final lambda_: {lambda_:.6f}")
+
         return lambda_, lambda_max
 
     def _calculate_rssd(
@@ -197,61 +211,60 @@ def _calculate_rssd(
         X: pd.DataFrame,
         paid_media_cols: List[str],
         rssd_zero_penalty: bool,
+        debug: bool = True,
+        iteration: int = 0,
     ) -> float:
-        """Calculate RSSD exactly like R's implementation"""
-        self.logger.debug(f"\nRSSD Calculation Debug:")
-        self.logger.debug(f"Paid media columns: {paid_media_cols}")
-
-        total_spend = np.abs(X[paid_media_cols].sum()).sum()
+        """Calculate RSSD with proper normalization"""
+        total_raw_spend = np.sum(np.abs(X[paid_media_cols].sum()))
         effects = []
         spends = []
 
-        self.logger.debug(f"Effect calculations per channel:")
+        # First collect all values
         for col in paid_media_cols:
             idx = list(X.columns).index(col)
             coef = model.coef_[idx]
-            spend = np.abs(X[col].sum())
-            effect = coef * spend
+            raw_spend = np.abs(X[col].sum())
+            effect = np.abs(coef * raw_spend)  # Keep absolute effect
             effects.append(effect)
-            spends.append(spend)
-            self.logger.debug(
-                f"{col}: coef={coef:.6f}, spend={spend:.6f}, effect={effect:.6f}"
-            )
+            spends.append(raw_spend)
+
+            if debug and (iteration == 0 or iteration % 25 == 0):
+                print(f"{col}:")
+                print(f"  coefficient: {coef:.6f}")
+                print(f"  raw spend: {raw_spend:.6f}")
+                print(f"  effect: {effect:.6f}")
 
+        # Convert to numpy arrays
         effects = np.array(effects)
         spends = np.array(spends)
 
-        # Calculate normalized effects and spends
-        total_effect = np.sum(np.abs(effects))
-        self.logger.debug(
-            f"Totals - spend: {total_spend:.6f}, effect: {total_effect:.6f}"
-        )
+        # Calculate totals for normalization
+        total_effect = np.sum(effects)
+        total_spend = np.sum(spends)
 
         if total_effect > 0 and total_spend > 0:
-            effects_norm = np.abs(effects) / total_effect
+            # Normalize proportionally
+            effects_norm = effects / total_effect
             spends_norm = spends / total_spend
-            self.logger.debug(f"Normalized values:")
-            self.logger.debug(f"Effects: {effects_norm}")
-            self.logger.debug(f"Spends: {spends_norm}")
 
-            rssd = np.sqrt(np.mean((effects_norm - spends_norm) ** 2))
+            if debug and (iteration == 0 or iteration % 25 == 0):
+                print("\nNormalized values:")
+                print("Effects:", effects_norm)
+                print("Spends:", spends_norm)
+                print("Effect total (check=1):", np.sum(effects_norm))
+                print("Spend total (check=1):", np.sum(spends_norm))
+
+            # Calculate RSSD
+            squared_diff = (effects_norm - spends_norm) ** 2
+            rssd = np.sqrt(np.mean(squared_diff))
 
             if rssd_zero_penalty:
                 zero_effects = sum(1 for e in effects if abs(e) < 1e-10)
                 if zero_effects > 0:
-                    rssd_original = rssd
                     rssd *= 1 + zero_effects / len(effects)
-                    self.logger.debug(
-                        f"Applied zero penalty: {zero_effects} zero effects found"
-                    )
-                    self.logger.debug(
-                        f"RSSD adjusted from {rssd_original:.6f} to {rssd:.6f}"
-                    )
 
-            self.logger.debug(f"Final RSSD: {rssd:.6f}")
             return float(rssd)
 
-        self.logger.debug("RSSD calculation returned infinity")
         return float(np.inf)
 
     def _calculate_mape(
@@ -439,15 +452,45 @@ def calculate_r2_score(
 
         return float(adj_r2)
 
-    def calculate_nrmse(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
-        """Match R's NRMSE calculation exactly"""
+    def calculate_nrmse(
+        self,
+        y_true: np.ndarray,
+        y_pred: np.ndarray,
+        debug: bool = True,
+        iteration: int = 0,
+    ) -> float:
+        """Calculate NRMSE with detailed debugging"""
         n = len(y_true)
-        rss = np.sum((y_true - y_pred) ** 2)
-        scale = np.max(y_true) - np.min(y_true)
-
-        if scale > 0:
-            nrmse = np.sqrt(rss / n) / scale
-        else:
-            nrmse = np.sqrt(rss / n)
+        y_true = np.asarray(y_true)
+        y_pred = np.asarray(y_pred)
+
+        # Calculate residuals and RSS
+        residuals = y_true - y_pred
+        rss = np.sum(residuals**2)
+
+        # Calculate range from true values
+        y_min = np.min(y_true)
+        y_max = np.max(y_true)
+        scale = y_max - y_min
+
+        # Calculate RMSE first
+        rmse = np.sqrt(rss / n)
+
+        if debug and (iteration == 0 or iteration % 25 == 0):
+            print(f"\nNRMSE Calculation Debug (iteration {iteration}):")
+            print(f"n: {n}")
+            print(f"RSS: {rss:.6f}")
+            print(f"RMSE: {rmse:.6f}")
+            print(f"y_true range: [{y_min:.6f}, {y_max:.6f}]")
+            print(f"scale: {scale:.6f}")
+            print("First 5 pairs (true, pred, residual):")
+            for i in range(min(5, len(y_true))):
+                print(f"  {y_true[i]:.6f}, {y_pred[i]:.6f}, {residuals[i]:.6f}")
+
+        # Calculate final NRMSE
+        nrmse = rmse / scale if scale > 0 else rmse
+
+        if debug and (iteration == 0 or iteration % 25 == 0):
+            print(f"Final NRMSE: {nrmse:.6f}")
 
         return float(nrmse)