diff --git a/docs/conf.py b/docs/conf.py
index 2e3450689..f8b074636 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -25,7 +25,6 @@
 
 
 def configureDoxyfile(input_dir, output_dir):
-
     with open("Doxyfile.in", "r") as file:
         filedata = file.read()
 
diff --git a/fbgemm_gpu/bench/histogram_binning_calibration_benchmark.py b/fbgemm_gpu/bench/histogram_binning_calibration_benchmark.py
index e089e0888..1f7d29fbc 100644
--- a/fbgemm_gpu/bench/histogram_binning_calibration_benchmark.py
+++ b/fbgemm_gpu/bench/histogram_binning_calibration_benchmark.py
@@ -178,9 +178,9 @@ def fbgemm_generic_hbc_by_feature_cpu(input: Tensor) -> Tuple[Tensor, Tensor]:
             if step >= warmup_runs:
                 total_time["hbc"]["cpu"][data_type] += hbc_time
                 total_time["hbc_by_feature"]["cpu"][data_type] += hbc_by_feature_time
-                total_time["generic_hbc_by_feature"]["cpu"][
-                    data_type
-                ] += generic_hbc_by_feature_time
+                total_time["generic_hbc_by_feature"]["cpu"][data_type] += (
+                    generic_hbc_by_feature_time
+                )
 
         if torch.cuda.is_available():
             bin_num_examples_gpu: Tensor = bin_num_examples.cuda()
@@ -260,12 +260,12 @@ def fbgemm_generic_hbc_by_feature_gpu(
                 )
                 if step >= warmup_runs:
                     total_time["hbc"]["gpu"][data_type] += hbc_time
-                    total_time["hbc_by_feature"]["gpu"][
-                        data_type
-                    ] += hbc_by_feature_time
-                    total_time["generic_hbc_by_feature"]["gpu"][
-                        data_type
-                    ] += generic_hbc_by_feature_time
+                    total_time["hbc_by_feature"]["gpu"][data_type] += (
+                        hbc_by_feature_time
+                    )
+                    total_time["generic_hbc_by_feature"]["gpu"][data_type] += (
+                        generic_hbc_by_feature_time
+                    )
 
     for op, curr_items in total_time.items():
         for platform, data_items in curr_items.items():
diff --git a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py
index 207fa350b..b2d4a911b 100644
--- a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py
+++ b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py
@@ -586,8 +586,9 @@ def uvm(
         assert (
             use_cache
         ), "--use-cache is required for --no-conflict-misses or all-conflict-misses"
-        assert (no_conflict_misses and not all_conflict_misses) or (
-            not no_conflict_misses and all_conflict_misses
+        assert (
+            (no_conflict_misses and not all_conflict_misses)
+            or (not no_conflict_misses and all_conflict_misses)
         ), "Cannot use both --no-conflict-misses and --all-conflict-misses at the same time!"
         logging.info(
             "Evaluate {}: Cache shape {}".format(
diff --git a/fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py
index a35fcdddf..4baf64461 100644
--- a/fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py
+++ b/fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py
@@ -524,7 +524,6 @@ def nbit_ssd(
     enforce_hbm: bool,
     ssd_cache_loc: str,
 ) -> None:
-
     np.random.seed(42)
     torch.manual_seed(42)
     B = batch_size
diff --git a/fbgemm_gpu/experimental/example/test/triton_example_test.py b/fbgemm_gpu/experimental/example/test/triton_example_test.py
index b4a744e0f..ebc2285c9 100644
--- a/fbgemm_gpu/experimental/example/test/triton_example_test.py
+++ b/fbgemm_gpu/experimental/example/test/triton_example_test.py
@@ -15,8 +15,10 @@
 
 @triton.jit
 # fmt: off
-def triton_add_kernel(x_ptr, y_ptr, z_ptr, n_elements, BLOCK_SIZE: tl.constexpr) -> None:
-# fmt: on  # noqa E115
+def triton_add_kernel(
+    x_ptr, y_ptr, z_ptr, n_elements, BLOCK_SIZE: tl.constexpr
+) -> None:
+    # fmt: on  # noqa E115
 
     # We use a 1D launch grid so axis is 0.
     pid = tl.program_id(axis=0)
diff --git a/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py b/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py
index ebcff2265..878cb56b1 100644
--- a/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py
+++ b/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py
@@ -908,7 +908,6 @@ def _kernel_matmul_fp8_row_tma_persistent(
 
 
 class TmaAutoTuneHelper:
-
     # duck typing wrapper to implement the same interface as TmaDescKernelParam in Triton PR #4498
     class KernelParamWrapper:
         def __init__(self, desc):
@@ -1452,7 +1451,6 @@ def _kernel_matmul_fp8_block_fastacc(
     k_multiple = scale_block_k // BLOCK_K
 
     for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)):
-
         k_remaining = K - k * (BLOCK_K * SPLIT_K)
 
         if EVEN_K:
@@ -2336,8 +2334,8 @@ def triton_quantize_fp8_block(
         torch.Tensor : [M, K] fp8 scaled tensor.
         torch.Tensor: [cdiv(M, block_m), cdiv(K, block_k)] reciprocal scale tensor per block.
     """
-    assert x.device != torch.device(
-        "cpu"
+    assert (
+        x.device != torch.device("cpu")
     ), "Blockwise quantization not support on cpu, please use row-wise quantization instead."
     x_shape = x.shape
     x = x.view(-1, x.size(-1))
diff --git a/fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py b/fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py
index 65b34a956..feaf088ea 100644
--- a/fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py
+++ b/fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py
@@ -91,7 +91,7 @@ def benchmark(
         self,
         *args,
         bench_quantize: bool = False,
-        use_rotating_buffer_bench: bool = False
+        use_rotating_buffer_bench: bool = False,
     ) -> float:
         """Benchmark runtime of this operator."""
         if bench_quantize:
diff --git a/fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py b/fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py
index 8446e34e3..87d00006b 100755
--- a/fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py
+++ b/fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py
@@ -335,7 +335,6 @@ def mqa_reference(
             q[0][0][0][0] = 1000
         # cache_x_ref is for input to reference implementation
         if dtype in ["fp8", "int4"]:
-
             if dtype == "fp8":
                 num_groups = 1
                 qparam_offset = 4 * num_groups
diff --git a/fbgemm_gpu/experimental/gen_ai/test/kv_cache/kv_cache_test.py b/fbgemm_gpu/experimental/gen_ai/test/kv_cache/kv_cache_test.py
index 17a58c620..62c81db76 100644
--- a/fbgemm_gpu/experimental/gen_ai/test/kv_cache/kv_cache_test.py
+++ b/fbgemm_gpu/experimental/gen_ai/test/kv_cache/kv_cache_test.py
@@ -328,7 +328,6 @@ def test_positional_encoding_with_paged_attention(
         B: int,
         BLOCK_N: int,
     ) -> None:
-
         N_H_L = 1
         N_KVH_L = 8
         D_H = 128
diff --git a/fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py b/fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py
index 500157b4c..f9858837f 100644
--- a/fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py
+++ b/fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py
@@ -119,9 +119,7 @@ def _process_split_embs(self, model: torch.nn.Module) -> None:
                             f"Embedding dim {D} couldn't be divided by align size {weights_ty.align_size()}!"
                         )
                         assert D % 4 == 0
-                        weights_ty = (
-                            SparseType.FP16
-                        )  # fall back to FP16 if dimension couldn't be aligned with the required size
+                        weights_ty = SparseType.FP16  # fall back to FP16 if dimension couldn't be aligned with the required size
                     embedding_specs.append(("", E, D, weights_ty))
 
                 weight_lists = []
diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py
index 8eb048377..ea1c19e7c 100644
--- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py
+++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py
@@ -2817,9 +2817,7 @@ def _recording_to_timer(
         if self.stats_reporter is not None and self.stats_reporter.should_report(
             self.step
         ):
-            assert (
-                timer
-            ), "We shouldn't be here, async timer must have been initiated if reporter is present."
+            assert timer, "We shouldn't be here, async timer must have been initiated if reporter is present."
             return timer.recording(**kwargs)
         # No-Op context manager
         return contextlib.nullcontext()
diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py b/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
index c1f13bd74..afa3de987 100644
--- a/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
+++ b/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
@@ -1436,7 +1436,6 @@ def forward(
         )
 
         if len(self.timesteps_prefetched) == 0:
-
             with self._recording_to_timer(
                 self.ssd_prefetch_read_timer,
                 context=self.step,
@@ -1776,7 +1775,8 @@ def _report_ssd_io_stats(self) -> None:
         this function fetch the stats from EmbeddingRocksDB and report it with stats_reporter
         """
         ssd_io_duration = self.ssd_db.get_rocksdb_io_duration(
-            self.step, self.stats_reporter.report_interval  # pyre-ignore
+            self.step,
+            self.stats_reporter.report_interval,  # pyre-ignore
         )
 
         if len(ssd_io_duration) != 5:
@@ -1878,7 +1878,8 @@ def _report_l2_cache_perf_stats(self) -> None:
             return
 
         l2_cache_perf_stats = self.ssd_db.get_l2cache_perf(
-            self.step, stats_reporter.report_interval  # pyre-ignore
+            self.step,
+            stats_reporter.report_interval,  # pyre-ignore
         )
 
         if len(l2_cache_perf_stats) != 15:
@@ -2001,9 +2002,7 @@ def _recording_to_timer(
         if self.stats_reporter is not None and self.stats_reporter.should_report(
             self.step
         ):
-            assert (
-                timer
-            ), "We shouldn't be here, async timer must have been initiated if reporter is present."
+            assert timer, "We shouldn't be here, async timer must have been initiated if reporter is present."
             return timer.recording(**kwargs)
         # No-Op context manager
         return contextlib.nullcontext()
diff --git a/fbgemm_gpu/test/lint/check_meta_header.py b/fbgemm_gpu/test/lint/check_meta_header.py
index 5fd5e41f6..eaa3755de 100644
--- a/fbgemm_gpu/test/lint/check_meta_header.py
+++ b/fbgemm_gpu/test/lint/check_meta_header.py
@@ -6,8 +6,7 @@
 # LICENSE file in the root directory of this source tree.
 
 
-"""Check Python source code contains Meta copyright header
-"""
+"""Check Python source code contains Meta copyright header"""
 
 from __future__ import annotations
 
diff --git a/fbgemm_gpu/test/quantize/fused_8bit_rowwise_test.py b/fbgemm_gpu/test/quantize/fused_8bit_rowwise_test.py
index a2183fcd9..ccf6fdd1a 100644
--- a/fbgemm_gpu/test/quantize/fused_8bit_rowwise_test.py
+++ b/fbgemm_gpu/test/quantize/fused_8bit_rowwise_test.py
@@ -271,9 +271,7 @@ def test_quantize_and_dequantize_op(  # noqa: C901
                 reference = torch.from_numpy(
                     fused_rowwise_8bit_dequantize_2bytes_padding_scale_bias_first_reference(
                         quantize_data_numpy
-                    )[
-                        :, :ncols
-                    ]
+                    )[:, :ncols]
                 )
             if output_dtype == SparseType.FP32:
                 torch.testing.assert_close(
diff --git a/fbgemm_gpu/test/quantize/mx4_test.py b/fbgemm_gpu/test/quantize/mx4_test.py
index 592f3e15d..b3c3e8526 100644
--- a/fbgemm_gpu/test/quantize/mx4_test.py
+++ b/fbgemm_gpu/test/quantize/mx4_test.py
@@ -145,7 +145,6 @@ def fake_quantize_mx(
 
 # @optests.generate_opcheck_tests()
 class TestMXQuantizationConversion(unittest.TestCase):
-
     @unittest.skipIf(*gpu_unavailable)
     # pyre-fixme[56]:
     @given(
diff --git a/fbgemm_gpu/test/tbe/inference/nbit_forward_autovec_test.py b/fbgemm_gpu/test/tbe/inference/nbit_forward_autovec_test.py
index 920a86cbd..2d46c9263 100644
--- a/fbgemm_gpu/test/tbe/inference/nbit_forward_autovec_test.py
+++ b/fbgemm_gpu/test/tbe/inference/nbit_forward_autovec_test.py
@@ -296,9 +296,7 @@ def execute_nbit_forward_(  # noqa C901
                 scale_shift[:, :] = torch.tensor(
                     # pyre-fixme[61]: `scales` is undefined, or not always defined.
                     # pyre-fixme[61]: `shifts` is undefined, or not always defined.
-                    np.stack([scales, shifts], axis=1)
-                    .astype(np.float16)
-                    .view(np.uint8)
+                    np.stack([scales, shifts], axis=1).astype(np.float16).view(np.uint8)
                 )
 
             fake_quantize_embs(
diff --git a/fbgemm_gpu/test/tbe/inference/nbit_forward_test.py b/fbgemm_gpu/test/tbe/inference/nbit_forward_test.py
index 8f4c32eea..2334ff88c 100644
--- a/fbgemm_gpu/test/tbe/inference/nbit_forward_test.py
+++ b/fbgemm_gpu/test/tbe/inference/nbit_forward_test.py
@@ -523,9 +523,7 @@ def execute_nbit_forward_(  # noqa C901
                 scale_shift[:, :] = torch.tensor(
                     # pyre-fixme[61]: `scales` is undefined, or not always defined.
                     # pyre-fixme[61]: `shifts` is undefined, or not always defined.
-                    np.stack([scales, shifts], axis=1)
-                    .astype(np.float16)
-                    .view(np.uint8)
+                    np.stack([scales, shifts], axis=1).astype(np.float16).view(np.uint8)
                 )
 
             fake_quantize_embs(
diff --git a/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_inference_test.py b/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_inference_test.py
index 1df34d30a..f760d1faf 100644
--- a/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_inference_test.py
+++ b/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_inference_test.py
@@ -329,9 +329,7 @@ def test_nbit_ssd_cache(
                 scale_shift[:, :] = torch.tensor(
                     # pyre-fixme[61]: `scales` is undefined, or not always defined.
                     # pyre-fixme[61]: `shifts` is undefined, or not always defined.
-                    np.stack([scales, shifts], axis=1)
-                    .astype(np.float16)
-                    .view(np.uint8)
+                    np.stack([scales, shifts], axis=1).astype(np.float16).view(np.uint8)
                 )
 
             D_bytes = rounded_row_size_in_bytes(
diff --git a/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_training_test.py b/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_training_test.py
index 5adf7c89b..371b6253e 100644
--- a/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_training_test.py
+++ b/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_training_test.py
@@ -925,10 +925,7 @@ def _prefetch(b_it: int) -> int:
                 # pyre-fixme[16]: Optional type has no attribute `float`.
                 optim_state_r.add_(
                     # pyre-fixme[16]: `Optional` has no attribute `float`.
-                    emb_r.weight.grad.float()
-                    .to_dense()
-                    .pow(2)
-                    .mean(dim=1)
+                    emb_r.weight.grad.float().to_dense().pow(2).mean(dim=1)
                 )
                 torch.testing.assert_close(
                     optim_state_t.float(),
diff --git a/fbgemm_gpu/test/tbe/ssd/ssd_utils_test.py b/fbgemm_gpu/test/tbe/ssd/ssd_utils_test.py
index 5d94ae233..7cf03dea8 100644
--- a/fbgemm_gpu/test/tbe/ssd/ssd_utils_test.py
+++ b/fbgemm_gpu/test/tbe/ssd/ssd_utils_test.py
@@ -263,7 +263,6 @@ def test_scratch_pad_indices_queue(
         for indices, lookup_indices, count, lookup_count in zip(
             all_indices, all_lookup_indices, all_counts, all_lookup_counts
         ):
-
             # Run reference
             # Prepare inputs for the reference run
             sp_prev_curr_map_ref = torch.zeros_like(lookup_indices)
diff --git a/fbgemm_gpu/test/test_utils.py b/fbgemm_gpu/test/test_utils.py
index 5de322041..f1b67b0e6 100644
--- a/fbgemm_gpu/test/test_utils.py
+++ b/fbgemm_gpu/test/test_utils.py
@@ -212,7 +212,9 @@ def use_cpu_strategy() -> st.SearchStrategy[bool]:
         st.booleans()
         if (gpu_available and not TEST_WITH_ROCM)
         # fmt: off
-        else st.just(False) if (gpu_available and TEST_WITH_ROCM) else st.just(True)
+        else st.just(False)
+        if (gpu_available and TEST_WITH_ROCM)
+        else st.just(True)
         # fmt: on
     )