diff --git a/docs/conf.py b/docs/conf.py index 2e3450689..f8b074636 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -25,7 +25,6 @@ def configureDoxyfile(input_dir, output_dir): - with open("Doxyfile.in", "r") as file: filedata = file.read() diff --git a/fbgemm_gpu/bench/histogram_binning_calibration_benchmark.py b/fbgemm_gpu/bench/histogram_binning_calibration_benchmark.py index e089e0888..1f7d29fbc 100644 --- a/fbgemm_gpu/bench/histogram_binning_calibration_benchmark.py +++ b/fbgemm_gpu/bench/histogram_binning_calibration_benchmark.py @@ -178,9 +178,9 @@ def fbgemm_generic_hbc_by_feature_cpu(input: Tensor) -> Tuple[Tensor, Tensor]: if step >= warmup_runs: total_time["hbc"]["cpu"][data_type] += hbc_time total_time["hbc_by_feature"]["cpu"][data_type] += hbc_by_feature_time - total_time["generic_hbc_by_feature"]["cpu"][ - data_type - ] += generic_hbc_by_feature_time + total_time["generic_hbc_by_feature"]["cpu"][data_type] += ( + generic_hbc_by_feature_time + ) if torch.cuda.is_available(): bin_num_examples_gpu: Tensor = bin_num_examples.cuda() @@ -260,12 +260,12 @@ def fbgemm_generic_hbc_by_feature_gpu( ) if step >= warmup_runs: total_time["hbc"]["gpu"][data_type] += hbc_time - total_time["hbc_by_feature"]["gpu"][ - data_type - ] += hbc_by_feature_time - total_time["generic_hbc_by_feature"]["gpu"][ - data_type - ] += generic_hbc_by_feature_time + total_time["hbc_by_feature"]["gpu"][data_type] += ( + hbc_by_feature_time + ) + total_time["generic_hbc_by_feature"]["gpu"][data_type] += ( + generic_hbc_by_feature_time + ) for op, curr_items in total_time.items(): for platform, data_items in curr_items.items(): diff --git a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py index 207fa350b..b2d4a911b 100644 --- a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py +++ b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py @@ -586,8 +586,9 @@ def uvm( assert ( use_cache ), "--use-cache is required for --no-conflict-misses or all-conflict-misses" - assert (no_conflict_misses and not all_conflict_misses) or ( - not no_conflict_misses and all_conflict_misses + assert ( + (no_conflict_misses and not all_conflict_misses) + or (not no_conflict_misses and all_conflict_misses) ), "Cannot use both --no-conflict-misses and --all-conflict-misses at the same time!" logging.info( "Evaluate {}: Cache shape {}".format( diff --git a/fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py index a35fcdddf..4baf64461 100644 --- a/fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py +++ b/fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py @@ -524,7 +524,6 @@ def nbit_ssd( enforce_hbm: bool, ssd_cache_loc: str, ) -> None: - np.random.seed(42) torch.manual_seed(42) B = batch_size diff --git a/fbgemm_gpu/experimental/example/test/triton_example_test.py b/fbgemm_gpu/experimental/example/test/triton_example_test.py index b4a744e0f..ebc2285c9 100644 --- a/fbgemm_gpu/experimental/example/test/triton_example_test.py +++ b/fbgemm_gpu/experimental/example/test/triton_example_test.py @@ -15,8 +15,10 @@ @triton.jit # fmt: off -def triton_add_kernel(x_ptr, y_ptr, z_ptr, n_elements, BLOCK_SIZE: tl.constexpr) -> None: -# fmt: on # noqa E115 +def triton_add_kernel( + x_ptr, y_ptr, z_ptr, n_elements, BLOCK_SIZE: tl.constexpr +) -> None: + # fmt: on # noqa E115 # We use a 1D launch grid so axis is 0. pid = tl.program_id(axis=0) diff --git a/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py b/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py index ebcff2265..878cb56b1 100644 --- a/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +++ b/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py @@ -908,7 +908,6 @@ def _kernel_matmul_fp8_row_tma_persistent( class TmaAutoTuneHelper: - # duck typing wrapper to implement the same interface as TmaDescKernelParam in Triton PR #4498 class KernelParamWrapper: def __init__(self, desc): @@ -1452,7 +1451,6 @@ def _kernel_matmul_fp8_block_fastacc( k_multiple = scale_block_k // BLOCK_K for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): - k_remaining = K - k * (BLOCK_K * SPLIT_K) if EVEN_K: @@ -2336,8 +2334,8 @@ def triton_quantize_fp8_block( torch.Tensor : [M, K] fp8 scaled tensor. torch.Tensor: [cdiv(M, block_m), cdiv(K, block_k)] reciprocal scale tensor per block. """ - assert x.device != torch.device( - "cpu" + assert ( + x.device != torch.device("cpu") ), "Blockwise quantization not support on cpu, please use row-wise quantization instead." x_shape = x.shape x = x.view(-1, x.size(-1)) diff --git a/fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py b/fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py index 65b34a956..feaf088ea 100644 --- a/fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +++ b/fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py @@ -91,7 +91,7 @@ def benchmark( self, *args, bench_quantize: bool = False, - use_rotating_buffer_bench: bool = False + use_rotating_buffer_bench: bool = False, ) -> float: """Benchmark runtime of this operator.""" if bench_quantize: diff --git a/fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py b/fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py index 8446e34e3..87d00006b 100755 --- a/fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py +++ b/fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py @@ -335,7 +335,6 @@ def mqa_reference( q[0][0][0][0] = 1000 # cache_x_ref is for input to reference implementation if dtype in ["fp8", "int4"]: - if dtype == "fp8": num_groups = 1 qparam_offset = 4 * num_groups diff --git a/fbgemm_gpu/experimental/gen_ai/test/kv_cache/kv_cache_test.py b/fbgemm_gpu/experimental/gen_ai/test/kv_cache/kv_cache_test.py index 17a58c620..62c81db76 100644 --- a/fbgemm_gpu/experimental/gen_ai/test/kv_cache/kv_cache_test.py +++ b/fbgemm_gpu/experimental/gen_ai/test/kv_cache/kv_cache_test.py @@ -328,7 +328,6 @@ def test_positional_encoding_with_paged_attention( B: int, BLOCK_N: int, ) -> None: - N_H_L = 1 N_KVH_L = 8 D_H = 128 diff --git a/fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py b/fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py index 500157b4c..f9858837f 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py +++ b/fbgemm_gpu/fbgemm_gpu/split_embedding_inference_converter.py @@ -119,9 +119,7 @@ def _process_split_embs(self, model: torch.nn.Module) -> None: f"Embedding dim {D} couldn't be divided by align size {weights_ty.align_size()}!" ) assert D % 4 == 0 - weights_ty = ( - SparseType.FP16 - ) # fall back to FP16 if dimension couldn't be aligned with the required size + weights_ty = SparseType.FP16 # fall back to FP16 if dimension couldn't be aligned with the required size embedding_specs.append(("", E, D, weights_ty)) weight_lists = [] diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py index 8eb048377..ea1c19e7c 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py @@ -2817,9 +2817,7 @@ def _recording_to_timer( if self.stats_reporter is not None and self.stats_reporter.should_report( self.step ): - assert ( - timer - ), "We shouldn't be here, async timer must have been initiated if reporter is present." + assert timer, "We shouldn't be here, async timer must have been initiated if reporter is present." return timer.recording(**kwargs) # No-Op context manager return contextlib.nullcontext() diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py b/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py index c1f13bd74..afa3de987 100644 --- a/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py +++ b/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py @@ -1436,7 +1436,6 @@ def forward( ) if len(self.timesteps_prefetched) == 0: - with self._recording_to_timer( self.ssd_prefetch_read_timer, context=self.step, @@ -1776,7 +1775,8 @@ def _report_ssd_io_stats(self) -> None: this function fetch the stats from EmbeddingRocksDB and report it with stats_reporter """ ssd_io_duration = self.ssd_db.get_rocksdb_io_duration( - self.step, self.stats_reporter.report_interval # pyre-ignore + self.step, + self.stats_reporter.report_interval, # pyre-ignore ) if len(ssd_io_duration) != 5: @@ -1878,7 +1878,8 @@ def _report_l2_cache_perf_stats(self) -> None: return l2_cache_perf_stats = self.ssd_db.get_l2cache_perf( - self.step, stats_reporter.report_interval # pyre-ignore + self.step, + stats_reporter.report_interval, # pyre-ignore ) if len(l2_cache_perf_stats) != 15: @@ -2001,9 +2002,7 @@ def _recording_to_timer( if self.stats_reporter is not None and self.stats_reporter.should_report( self.step ): - assert ( - timer - ), "We shouldn't be here, async timer must have been initiated if reporter is present." + assert timer, "We shouldn't be here, async timer must have been initiated if reporter is present." return timer.recording(**kwargs) # No-Op context manager return contextlib.nullcontext() diff --git a/fbgemm_gpu/test/lint/check_meta_header.py b/fbgemm_gpu/test/lint/check_meta_header.py index 5fd5e41f6..eaa3755de 100644 --- a/fbgemm_gpu/test/lint/check_meta_header.py +++ b/fbgemm_gpu/test/lint/check_meta_header.py @@ -6,8 +6,7 @@ # LICENSE file in the root directory of this source tree. -"""Check Python source code contains Meta copyright header -""" +"""Check Python source code contains Meta copyright header""" from __future__ import annotations diff --git a/fbgemm_gpu/test/quantize/fused_8bit_rowwise_test.py b/fbgemm_gpu/test/quantize/fused_8bit_rowwise_test.py index a2183fcd9..ccf6fdd1a 100644 --- a/fbgemm_gpu/test/quantize/fused_8bit_rowwise_test.py +++ b/fbgemm_gpu/test/quantize/fused_8bit_rowwise_test.py @@ -271,9 +271,7 @@ def test_quantize_and_dequantize_op( # noqa: C901 reference = torch.from_numpy( fused_rowwise_8bit_dequantize_2bytes_padding_scale_bias_first_reference( quantize_data_numpy - )[ - :, :ncols - ] + )[:, :ncols] ) if output_dtype == SparseType.FP32: torch.testing.assert_close( diff --git a/fbgemm_gpu/test/quantize/mx4_test.py b/fbgemm_gpu/test/quantize/mx4_test.py index 592f3e15d..b3c3e8526 100644 --- a/fbgemm_gpu/test/quantize/mx4_test.py +++ b/fbgemm_gpu/test/quantize/mx4_test.py @@ -145,7 +145,6 @@ def fake_quantize_mx( # @optests.generate_opcheck_tests() class TestMXQuantizationConversion(unittest.TestCase): - @unittest.skipIf(*gpu_unavailable) # pyre-fixme[56]: @given( diff --git a/fbgemm_gpu/test/tbe/inference/nbit_forward_autovec_test.py b/fbgemm_gpu/test/tbe/inference/nbit_forward_autovec_test.py index 920a86cbd..2d46c9263 100644 --- a/fbgemm_gpu/test/tbe/inference/nbit_forward_autovec_test.py +++ b/fbgemm_gpu/test/tbe/inference/nbit_forward_autovec_test.py @@ -296,9 +296,7 @@ def execute_nbit_forward_( # noqa C901 scale_shift[:, :] = torch.tensor( # pyre-fixme[61]: `scales` is undefined, or not always defined. # pyre-fixme[61]: `shifts` is undefined, or not always defined. - np.stack([scales, shifts], axis=1) - .astype(np.float16) - .view(np.uint8) + np.stack([scales, shifts], axis=1).astype(np.float16).view(np.uint8) ) fake_quantize_embs( diff --git a/fbgemm_gpu/test/tbe/inference/nbit_forward_test.py b/fbgemm_gpu/test/tbe/inference/nbit_forward_test.py index 8f4c32eea..2334ff88c 100644 --- a/fbgemm_gpu/test/tbe/inference/nbit_forward_test.py +++ b/fbgemm_gpu/test/tbe/inference/nbit_forward_test.py @@ -523,9 +523,7 @@ def execute_nbit_forward_( # noqa C901 scale_shift[:, :] = torch.tensor( # pyre-fixme[61]: `scales` is undefined, or not always defined. # pyre-fixme[61]: `shifts` is undefined, or not always defined. - np.stack([scales, shifts], axis=1) - .astype(np.float16) - .view(np.uint8) + np.stack([scales, shifts], axis=1).astype(np.float16).view(np.uint8) ) fake_quantize_embs( diff --git a/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_inference_test.py b/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_inference_test.py index 1df34d30a..f760d1faf 100644 --- a/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_inference_test.py +++ b/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_inference_test.py @@ -329,9 +329,7 @@ def test_nbit_ssd_cache( scale_shift[:, :] = torch.tensor( # pyre-fixme[61]: `scales` is undefined, or not always defined. # pyre-fixme[61]: `shifts` is undefined, or not always defined. - np.stack([scales, shifts], axis=1) - .astype(np.float16) - .view(np.uint8) + np.stack([scales, shifts], axis=1).astype(np.float16).view(np.uint8) ) D_bytes = rounded_row_size_in_bytes( diff --git a/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_training_test.py b/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_training_test.py index 5adf7c89b..371b6253e 100644 --- a/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_training_test.py +++ b/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_training_test.py @@ -925,10 +925,7 @@ def _prefetch(b_it: int) -> int: # pyre-fixme[16]: Optional type has no attribute `float`. optim_state_r.add_( # pyre-fixme[16]: `Optional` has no attribute `float`. - emb_r.weight.grad.float() - .to_dense() - .pow(2) - .mean(dim=1) + emb_r.weight.grad.float().to_dense().pow(2).mean(dim=1) ) torch.testing.assert_close( optim_state_t.float(), diff --git a/fbgemm_gpu/test/tbe/ssd/ssd_utils_test.py b/fbgemm_gpu/test/tbe/ssd/ssd_utils_test.py index 5d94ae233..7cf03dea8 100644 --- a/fbgemm_gpu/test/tbe/ssd/ssd_utils_test.py +++ b/fbgemm_gpu/test/tbe/ssd/ssd_utils_test.py @@ -263,7 +263,6 @@ def test_scratch_pad_indices_queue( for indices, lookup_indices, count, lookup_count in zip( all_indices, all_lookup_indices, all_counts, all_lookup_counts ): - # Run reference # Prepare inputs for the reference run sp_prev_curr_map_ref = torch.zeros_like(lookup_indices) diff --git a/fbgemm_gpu/test/test_utils.py b/fbgemm_gpu/test/test_utils.py index 5de322041..f1b67b0e6 100644 --- a/fbgemm_gpu/test/test_utils.py +++ b/fbgemm_gpu/test/test_utils.py @@ -212,7 +212,9 @@ def use_cpu_strategy() -> st.SearchStrategy[bool]: st.booleans() if (gpu_available and not TEST_WITH_ROCM) # fmt: off - else st.just(False) if (gpu_available and TEST_WITH_ROCM) else st.just(True) + else st.just(False) + if (gpu_available and TEST_WITH_ROCM) + else st.just(True) # fmt: on )