diff --git a/.flake8 b/.flake8 index 299ca9aa354c2..2e601fbe11088 100644 --- a/.flake8 +++ b/.flake8 @@ -2,26 +2,8 @@ max-line-length = 120 per-file-ignores = __init__.py:F401 -format = [flake8 PEP8 ERROR] %(path)s:%(row)d:%(col)d: %(code)s %(text)s -exclude = - # ignore the .git directory - ./.git, - # ignore default build directory - ./build, - # ignore external dependency files - ./cmake/external, - # TODO enable - ./docs/python, - # ignore generated flatbuffers code - ./onnxruntime/core/flatbuffers/ort_flatbuffers_py, - # TODO enable - ./onnxruntime/python/tools, - # ignore test code for now - ./onnxruntime/test, - # TODO enable - ./orttraining, - # ignore server code for now - ./server, - # ignore issues from different git branches - ./.git, -ignore = W503, E203 +# NOTE: Edit exclude list in .lintrunner.toml + +# Ignored codes: +# DUO102: We use random only for math +ignore = W503, E203, E501, DUO102 diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 9944cd29ca152..a21f2d03cbf9e 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -8,27 +8,11 @@ on: pull_request: jobs: - lint-python: - name: Lint Python + optional-lint: + name: Optional Lint runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - name: flake8 - uses: reviewdog/action-flake8@v3 - with: - github_token: ${{ secrets.github_token }} - # Change reviewdog reporter if you need [github-pr-check, github-check, github-pr-review]. - reporter: github-pr-check - # Change reporter level if you need. - # GitHub Status Check won't become failure with a warning. - level: error - filter_mode: file - - name: pyflakes - uses: reviewdog/action-pyflakes@v1 - with: - github_token: ${{ secrets.github_token }} - reporter: github-pr-check - level: warning + - uses: actions/checkout@v3 - name: misspell # Check spellings as well uses: reviewdog/action-misspell@v1 with: @@ -63,24 +47,41 @@ jobs: glob_pattern: "**/*.py" lint-python-format: - # Separated black/isort from other Python linters because we want this job to - # fail and not affect other linters - # According to https://black.readthedocs.io/en/stable/integrations/github_actions.html: - # We recommend the use of the @stable tag, but per version tags also exist if you prefer that. - # Note that the action’s version you select is independent of the version of Black the action will use. - # The version of Black the action will use can be configured via version. + # Required workflow name: Python format runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 with: + # Version range or exact version of Python to use, using SemVer's version range syntax. Reads from .python-version if unset. python-version: "3.10" - - uses: psf/black@stable + - name: Install dependencies + run: | + python -m pip install -r requirements-dev.txt + lintrunner init + - name: Run lintrunner on all files + run: | + set +e + if ! 
lintrunner --force-color --all-files --tee-json=lint.json; then + echo "" + echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m" + exit 1 + fi + - name: Produce SARIF + if: always() + run: | + python -m lintrunner_adapters to-sarif lint.json lintrunner.sarif + - name: Upload SARIF file + if: always() + continue-on-error: true + uses: github/codeql-action/upload-sarif@v2 with: - options: "--check --diff --color" - version: "22.12.0" - - uses: isort/isort-action@master + # Path to SARIF file relative to the root of the repository + sarif_file: lintrunner.sarif + category: lintrunner + checkout_path: ${{ github.workspace }} lint-cpp: name: Lint C++ @@ -117,7 +118,7 @@ jobs: name: Lint JavaScript runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: reviewdog/action-eslint@v1 with: reporter: github-pr-check diff --git a/.lintrunner.toml b/.lintrunner.toml new file mode 100644 index 0000000000000..bfa0276e5cfce --- /dev/null +++ b/.lintrunner.toml @@ -0,0 +1,139 @@ +# Configuration for lintrunner https://github.com/suo/lintrunner +# You can install the dependencies and initialize with +# +# ```sh +# pip install lintrunner lintrunner-adapters +# lintrunner init +# ``` +# +# This will install lintrunner on your system and download all the necessary +# dependencies to run linters locally. +# If you want to see what lintrunner init will install, run +# `lintrunner init --dry-run`. +# +# To lint local changes: +# +# ```bash +# lintrunner -m main +# ``` +# +# To lint all files: +# +# ```bash +# lintrunner --all-files +# ``` +# +# To format files: +# +# ```bash +# lintrunner f --all-files +# ``` +# +# To read more about lintrunner, see [wiki](https://github.com/pytorch/pytorch/wiki/lintrunner). +# To update an existing linting rule or create a new one, modify this file or create a +# new adapter following examples in https://github.com/justinchuby/lintrunner-adapters. 
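+#
+# The entries below all follow the same shape. As a sketch only (the
+# 'EXAMPLE' code and 'example_linter' adapter names here are hypothetical,
+# shown purely for illustration), a new linter entry would look like:
+#
+#   [[linter]]
+#   code = 'EXAMPLE'
+#   include_patterns = ['**/*.py']
+#   exclude_patterns = []
+#   command = [
+#       'python3', '-m', 'lintrunner_adapters', 'run', 'example_linter',
+#       '--', '@{{PATHSFILE}}',
+#   ]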
+ +[[linter]] +code = 'FLAKE8' +include_patterns = [ + '**/*.py', +] +exclude_patterns = [ + 'build/**', + 'cmake/external/**', + # TODO enable + 'docs/python/**', + # ignore generated flatbuffers code + 'onnxruntime/core/flatbuffers/ort_flatbuffers_py/**', + # TODO enable + './onnxruntime/python/tools/**', + # FIXME(#7032): ignore test code for now + 'onnxruntime/test/**', + # TODO enable + 'orttraining/**', + # FIXME: DUO106 + 'tools/nuget/generate_nuspec_for_native_nuget.py', + # FIXME: DUO116 + 'js/scripts/build_web.py', + # FIXME: Too many errors + 'onnxruntime/python/tools/tensorrt/perf/**', +] +command = [ + 'python3', + '-m', + 'lintrunner_adapters', + 'run', + 'flake8_linter', + '--', + '@{{PATHSFILE}}' +] +init_command = [ + 'python3', + '-m', + 'lintrunner_adapters', + 'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + 'flake8==5.0.4', + 'flake8-bugbear==22.10.27', + 'flake8-pyi==22.10.0', + 'dlint==0.13.0', +] + +[[linter]] +code = 'BLACK-ISORT' +include_patterns = [ + '**/*.py', +] +exclude_patterns = [ + 'cmake/**', + 'orttraining/*', + 'onnxruntime/core/flatbuffers/**', +] +command = [ + 'python3', + '-m', + 'lintrunner_adapters', + 'run', + 'black_isort_linter', + '--', + '@{{PATHSFILE}}' +] +init_command = [ + 'python3', + '-m', + 'lintrunner_adapters', + 'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + 'black==22.10.0', + 'isort==5.10.1', +] +is_formatter = true + +[[linter]] +code = 'PYLINT' +include_patterns = [ + # TODO: Opt in to pylint by adding paths here +] +exclude_patterns = [ +] +command = [ + 'python3', + '-m', + 'lintrunner_adapters', + 'run', + 'pylint_linter', + '--rcfile=pyproject.toml', + '--', + '@{{PATHSFILE}}' +] +init_command = [ + 'python3', + '-m', + 'lintrunner_adapters', + 'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + 'pylint==2.15.5', +] diff --git a/docs/Coding_Conventions_and_Standards.md b/docs/Coding_Conventions_and_Standards.md index f8bc60ba152a2..1051e8a3e3ac6 100644 --- a/docs/Coding_Conventions_and_Standards.md +++ b/docs/Coding_Conventions_and_Standards.md @@ -112,15 +112,15 @@ void foo(gsl::span names) { * The following C++ warnings should never be disabled in onnxruntime VC++ projects(Required by [Binskim](https://github.com/microsoft/binskim/blob/d9afb65c89a621411efded74c27999281d87867e/src/BinSkim.Rules/PERules/BA2007.EnableCriticalCompilerWarnings.cs)). 1. [4018](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-3-c4018) 'token' : signed/unsigned mismatch 2. [4146](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-2-c4146?view=msvc-160) unary minus operator applied to unsigned type, result still unsigned - 3. [4244](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-2-c4244?view=msvc-160) 'argument' : conversion from 'type1' to 'type2', possible loss of data. For example, casting a int64_t to size_t. + 3. [4244](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-2-c4244?view=msvc-160) 'argument' : conversion from 'type1' to 'type2', possible loss of data. For example, casting a int64_t to size_t. 4. [4267](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-3-c4267?view=msvc-160) 'var' : conversion from 'size_t' to 'type', possible loss of data. 5. [4302](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-2-c4302?view=msvc-160) 'conversion' : truncation from 'type 1' to 'type 2' - 6. 
[4308](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-2-c4308?view=msvc-160) negative integral constant converted to unsigned type + 6. [4308](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-2-c4308?view=msvc-160) negative integral constant converted to unsigned type 7. [4532](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-1-c4532?view=msvc-160) 'continue' : jump out of \_\_finally/finally block has undefined behavior during termination handling 8. [4533](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-1-c4533?view=msvc-160) initialization of 'variable' is skipped by 'instruction' 9. [4700](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-1-and-level-4-c4700?view=msvc-160) uninitialized local variable 'name' used 10. [4789](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-1-c4789?view=msvc-160) buffer 'identifier' of size N bytes will be overrun; M bytes will be written starting at offset L - 11. [4995](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-3-c4995?view=msvc-160) 'function': name was marked as #pragma deprecated + 11. [4995](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-3-c4995?view=msvc-160) 'function': name was marked as #pragma deprecated 12. [4996](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-3-c4996?view=msvc-160) Your code uses a function, class member, variable, or typedef that's marked deprecated #### Clang-format @@ -150,6 +150,42 @@ There is a configuration file in `onnxruntime/VSCodeCoverage.runsettings` that c Using `Show Code Coverage Coloring` will allow you to visually inspect which lines were hit by the tests. See . +## Linting + +This project uses [lintrunner](https://github.com/suo/lintrunner) for linting. It provides a consistent linting experience locally and in CI. You can install the dependencies and initialize with + +```sh +pip install lintrunner lintrunner-adapters +lintrunner init +``` + +This will install lintrunner on your system and download all the necessary +dependencies to run linters locally. +If you want to see what lintrunner init will install, run +`lintrunner init --dry-run`. + +To lint local changes: + +```bash +lintrunner -m main +``` + +To lint all files: + +```bash +lintrunner --all-files +``` + +To format files: + +```bash +lintrunner -a --all-files +``` + +To read more about lintrunner, see [wiki](https://github.com/pytorch/pytorch/wiki/lintrunner). +To update an existing linting rule or create a new one, modify `.lintrunner.toml` or create a +new adapter following examples in https://github.com/justinchuby/lintrunner-adapters. + ## Python Code Style Follow the [Black formatter](https://black.readthedocs.io)'s coding style when possible. A maximum line length of 120 characters is allowed for consistency with the C++ code. @@ -160,11 +196,10 @@ Code can be validated with [flake8](https://pypi.org/project/flake8/) using the Use `pyright`, which is provided as a component of the `pylance` extension in VS Code for static type checking. -Auto-formatting is done with `black` and `isort`. The tools are configured in `pyproject.toml`. From anywhere in the repository, you can run +Auto-formatting is done with `black` and `isort`. 
The tools are configured in `pyproject.toml`. From the root of the repository, you can run ```sh -black . -isort . +lintrunner f --all-files ``` to format Python files. diff --git a/onnxruntime/ReformatSourcePython.bat b/onnxruntime/ReformatSourcePython.bat deleted file mode 100644 index ca92964b934ac..0000000000000 --- a/onnxruntime/ReformatSourcePython.bat +++ /dev/null @@ -1,14 +0,0 @@ -:: Copyright (c) Microsoft Corporation. All rights reserved. -:: Licensed under the MIT License. - -:: Before running this, please make sure python.exe is in path, and black is installed like the following -:: pip install --upgrade black isort - -:: For more info about black, see https://github.com/psf/black - -python -m isort ./python -python -m isort ./test -python -m black ./python -python -m black ./test - -if errorlevel 1 echo please install python, then pip install --upgrade black isort diff --git a/onnxruntime/__init__.py b/onnxruntime/__init__.py index 5e28b60faa5c4..4eeda2ad16c6e 100644 --- a/onnxruntime/__init__.py +++ b/onnxruntime/__init__.py @@ -61,6 +61,8 @@ OrtValue, SparseTensor, ) + +# FIXME: Remove star imports from onnxruntime.capi.training import * # noqa: F403 # TODO: thiagofc: Temporary experimental namespace for new PyTorch front-end diff --git a/onnxruntime/python/backend/backend.py b/onnxruntime/python/backend/backend.py index 99592e3d5cf3e..a473011c64ffc 100644 --- a/onnxruntime/python/backend/backend.py +++ b/onnxruntime/python/backend/backend.py @@ -9,7 +9,7 @@ import unittest import packaging.version -from onnx import ModelProto, helper, version +from onnx import helper, version from onnx.backend.base import Backend from onnx.checker import check_model @@ -59,7 +59,7 @@ def is_opset_supported(cls, model): domain = opset.domain if opset.domain else "ai.onnx" try: key = (domain, opset.version) - if not (key in helper.OP_SET_ID_VERSION_MAP): + if key not in helper.OP_SET_ID_VERSION_MAP: error_message = ( "Skipping this test as only released onnx opsets are supported." "To run this test set env variable ALLOW_RELEASED_ONNX_OPSET_ONLY to 0." @@ -124,7 +124,7 @@ def prepare(cls, model, device=None, **kwargs): raise RuntimeError("Incompatible device expected '{0}', got '{1}'".format(device, get_device())) return cls.prepare(inf, device, **kwargs) else: - # type: ModelProto + # ModelProto # check_model serializes the model anyways, so serialize the model once here # and reuse it below in the cls.prepare call to avoid an additional serialization # only works with onnx >= 1.10.0 hence the version check diff --git a/onnxruntime/python/backend/backend_rep.py b/onnxruntime/python/backend/backend_rep.py index 6dced3aba7f80..177af5aa6d8e3 100644 --- a/onnxruntime/python/backend/backend_rep.py +++ b/onnxruntime/python/backend/backend_rep.py @@ -5,7 +5,6 @@ """ Implements ONNX's backend API. """ -from typing import Any, Tuple from onnx.backend.base import BackendRep @@ -24,7 +23,7 @@ def __init__(self, session): """ self._session = session - def run(self, inputs, **kwargs): # type: (Any, **Any) -> Tuple[Any, ...] + def run(self, inputs, **kwargs): """ Computes the prediction. See :meth:`onnxruntime.InferenceSession.run`. 
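For reference, the `backend_rep.py` hunk above drops the comment-style annotation `# type: (Any, **Any) -> Tuple[Any, ...]` from `run` without a replacement. A minimal sketch of the equivalent inline annotations (not part of this patch; the class name `_BackendRepSketch` and the method body are stand-ins for illustration only):

```python
from typing import Any, Tuple


class _BackendRepSketch:
    """Stand-in for the backend_rep class shown in the hunk above."""

    def __init__(self, session):
        self._session = session

    def run(self, inputs: Any, **kwargs: Any) -> Tuple[Any, ...]:
        """Computes the prediction. See :meth:`onnxruntime.InferenceSession.run`."""
        # Plausible delegation to InferenceSession.run (fetch all outputs);
        # the real body is not shown in the hunk above, and kwargs are ignored here.
        return tuple(self._session.run(None, inputs))
```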
diff --git a/onnxruntime/python/onnxruntime_collect_build_info.py b/onnxruntime/python/onnxruntime_collect_build_info.py index 6cd67938dd0ba..07ac21a11eb04 100644 --- a/onnxruntime/python/onnxruntime_collect_build_info.py +++ b/onnxruntime/python/onnxruntime_collect_build_info.py @@ -35,7 +35,7 @@ def get_cudart_version(find_cudart_version=None): status = cudart.cudaRuntimeGetVersion(ctypes.byref(version)) if status != 0: return None - except: # noqa + except Exception: return None return version.value @@ -93,7 +93,7 @@ def get_cudnn_supported_cuda_version(find_cudnn_version=None): # cudnn_ver = cudnn.cudnnGetVersion() cuda_ver = cudnn.cudnnGetCudartVersion() return cuda_ver - except: # noqa + except Exception: return None # use set to avoid duplications diff --git a/onnxruntime/python/onnxruntime_validation.py b/onnxruntime/python/onnxruntime_validation.py index 8b313635527ac..5b0dd3198aa62 100644 --- a/onnxruntime/python/onnxruntime_validation.py +++ b/onnxruntime/python/onnxruntime_validation.py @@ -102,7 +102,7 @@ def validate_build_package_info(): try: from .build_and_package_info import cuda_version - except: # noqa + except Exception: pass if cuda_version: @@ -110,7 +110,7 @@ def validate_build_package_info(): # when the build environment has none or multiple libraries installed try: from .build_and_package_info import cudart_version - except: # noqa + except Exception: warnings.warn("WARNING: failed to get cudart_version from onnxruntime build info.") cudart_version = None diff --git a/onnxruntime/python/tools/microbench/benchmark.py b/onnxruntime/python/tools/microbench/benchmark.py index fcf8c6f23f362..323717441674f 100644 --- a/onnxruntime/python/tools/microbench/benchmark.py +++ b/onnxruntime/python/tools/microbench/benchmark.py @@ -76,7 +76,7 @@ def get_default_provider(): class Benchmark: def __init__(self, model, inputs, outputs, args): - self.provider = get_default_provider() if args.provider == None else provider_name(args.provider) + self.provider = get_default_provider() if args.provider is None else provider_name(args.provider) logger.info(f"Execution provider: {self.provider}") self.profiling = args.profiling self.model = model @@ -126,13 +126,13 @@ def benchmark(self): io_binding = self.create_io_binding(sess, input_tensors, output_tensors) # warm up - for iter in range(10): + for _ in range(10): sess.run_with_iobinding(io_binding) # measure max_iters = 100 start_time = time.time() - for iter in range(max_iters): + for _ in range(max_iters): sess.run_with_iobinding(io_binding) # time is in milliseconds diff --git a/onnxruntime/python/tools/onnxruntime_test.py b/onnxruntime/python/tools/onnxruntime_test.py index 11759f3ad17d5..a437a2e9aca7b 100644 --- a/onnxruntime/python/tools/onnxruntime_test.py +++ b/onnxruntime/python/tools/onnxruntime_test.py @@ -29,7 +29,8 @@ } -def generate_feeds(sess, symbolic_dims={}): +def generate_feeds(sess, symbolic_dims=None): + symbolic_dims = symbolic_dims or {} feeds = {} for input_meta in sess.get_inputs(): # replace any symbolic dimensions @@ -67,10 +68,11 @@ def run_model( num_iters=1, debug=None, profile=None, - symbolic_dims={}, + symbolic_dims=None, feeds=None, override_initializers=True, ): + symbolic_dims = symbolic_dims or {} if debug: print("Pausing execution ready for debugger to attach to pid: {}".format(os.getpid())) print("Press key to continue.") @@ -111,7 +113,7 @@ def run_model( sys.exit(-1) start = timer() - for i in range(num_iters): + for _ in range(num_iters): outputs = sess.run([], feeds) # fetch all outputs end = 
timer() diff --git a/onnxruntime/python/tools/profile_explorer/profile_explorer.py b/onnxruntime/python/tools/profile_explorer/profile_explorer.py index f3430a89e7a34..7012d6163dc66 100644 --- a/onnxruntime/python/tools/profile_explorer/profile_explorer.py +++ b/onnxruntime/python/tools/profile_explorer/profile_explorer.py @@ -13,7 +13,7 @@ def _demangle(name, demangler="c++filt"): with sp.Popen([demangler, name], stdin=sp.PIPE, stdout=sp.PIPE) as proc: out, _ = proc.communicate() return out.decode("utf-8").strip() - except: + except Exception: return name diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index b431647313ad4..a7feba05cef43 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -81,7 +81,7 @@ def __init__( self.infer_session = None self.execution_providers = ["CPUExecutionProvider"] - def set_execution_providers(self, execution_providers=["CPUExecutionProvider"]): + def set_execution_providers(self, execution_providers=("CPUExecutionProvider",)): """ reset the execution providers to execute the collect_data. It triggers to re-creating inference session. """ @@ -847,9 +847,9 @@ def create_calibrator( augmented_model_path="augmented_model.onnx", calibrate_method=CalibrationMethod.MinMax, use_external_data_format=False, - extra_options={}, + extra_options=None, ): - + extra_options = extra_options or {} calibrator = None if calibrate_method == CalibrationMethod.MinMax: # default settings for min-max algorithm diff --git a/onnxruntime/python/tools/quantization/onnx_quantizer.py b/onnxruntime/python/tools/quantization/onnx_quantizer.py index 3c54748ea9df0..30c5cf8cd7287 100644 --- a/onnxruntime/python/tools/quantization/onnx_quantizer.py +++ b/onnxruntime/python/tools/quantization/onnx_quantizer.py @@ -114,7 +114,7 @@ def __init__( self.opset_version = self.check_opset_version() - if not self.mode in QuantizationMode: + if self.mode not in QuantizationMode: raise ValueError("unsupported quantization mode {}".format(self.mode)) self.quantization_params = self.calculate_quantization_params() diff --git a/onnxruntime/python/tools/quantization/operators/activation.py b/onnxruntime/python/tools/quantization/operators/activation.py index 1029e7b679b60..1655ac416606c 100644 --- a/onnxruntime/python/tools/quantization/operators/activation.py +++ b/onnxruntime/python/tools/quantization/operators/activation.py @@ -1,5 +1,4 @@ import onnx -from onnx import onnx_pb as onnx_proto from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain from .base_operator import QuantOperatorBase diff --git a/onnxruntime/python/tools/quantization/operators/attention.py b/onnxruntime/python/tools/quantization/operators/attention.py index 36428b860d060..495cd97d81ee6 100644 --- a/onnxruntime/python/tools/quantization/operators/attention.py +++ b/onnxruntime/python/tools/quantization/operators/attention.py @@ -1,5 +1,4 @@ import onnx -from onnx import onnx_pb as onnx_proto from ..quant_utils import attribute_to_kwarg, ms_domain from .base_operator import QuantOperatorBase diff --git a/onnxruntime/python/tools/quantization/operators/binary_op.py b/onnxruntime/python/tools/quantization/operators/binary_op.py index 3beb96aabe575..c7d529c83553b 100644 --- a/onnxruntime/python/tools/quantization/operators/binary_op.py +++ b/onnxruntime/python/tools/quantization/operators/binary_op.py @@ -1,5 +1,4 @@ import onnx -from onnx 
import onnx_pb as onnx_proto from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain from .base_operator import QuantOperatorBase diff --git a/onnxruntime/python/tools/quantization/operators/concat.py b/onnxruntime/python/tools/quantization/operators/concat.py index 998ca5c558743..47329c178620a 100644 --- a/onnxruntime/python/tools/quantization/operators/concat.py +++ b/onnxruntime/python/tools/quantization/operators/concat.py @@ -1,8 +1,7 @@ import onnx -from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain +from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, attribute_to_kwarg, ms_domain from .base_operator import QuantOperatorBase -from .qdq_base_operator import QDQOperatorBase class QLinearConcat(QuantOperatorBase): diff --git a/onnxruntime/python/tools/quantization/operators/conv.py b/onnxruntime/python/tools/quantization/operators/conv.py index 0d137ab2eff14..62b7ce2afdf9d 100644 --- a/onnxruntime/python/tools/quantization/operators/conv.py +++ b/onnxruntime/python/tools/quantization/operators/conv.py @@ -4,7 +4,6 @@ from ..quant_utils import ( TENSOR_NAME_QUANT_SUFFIX, - BiasToQuantize, QuantizedValue, QuantizedValueType, attribute_to_kwarg, diff --git a/onnxruntime/python/tools/quantization/operators/embed_layernorm.py b/onnxruntime/python/tools/quantization/operators/embed_layernorm.py index 01b5fad1c3c75..2aacd306ecaea 100644 --- a/onnxruntime/python/tools/quantization/operators/embed_layernorm.py +++ b/onnxruntime/python/tools/quantization/operators/embed_layernorm.py @@ -1,7 +1,6 @@ import logging import onnx -from onnx import onnx_pb as onnx_proto from ..quant_utils import attribute_to_kwarg, ms_domain from .base_operator import QuantOperatorBase diff --git a/onnxruntime/python/tools/quantization/operators/gemm.py b/onnxruntime/python/tools/quantization/operators/gemm.py index 07e7678a34957..98ba0f7493429 100644 --- a/onnxruntime/python/tools/quantization/operators/gemm.py +++ b/onnxruntime/python/tools/quantization/operators/gemm.py @@ -1,19 +1,9 @@ import logging -import numpy as np import onnx from onnx import onnx_pb as onnx_proto -from ..quant_utils import ( - TENSOR_NAME_QUANT_SUFFIX, - QuantizedValue, - QuantizedValueType, - attribute_to_kwarg, - find_by_name, - get_mul_node, - ms_domain, -) -from .base_operator import QuantOperatorBase +from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain from .matmul import QOpMatMul from .qdq_base_operator import QDQOperatorBase diff --git a/onnxruntime/python/tools/quantization/operators/lstm.py b/onnxruntime/python/tools/quantization/operators/lstm.py index 87552a18a037e..263b69a5fa371 100644 --- a/onnxruntime/python/tools/quantization/operators/lstm.py +++ b/onnxruntime/python/tools/quantization/operators/lstm.py @@ -2,7 +2,7 @@ import onnx from onnx import onnx_pb as onnx_proto -from ..quant_utils import QuantType, attribute_to_kwarg, ms_domain +from ..quant_utils import attribute_to_kwarg, ms_domain from .base_operator import QuantOperatorBase """ diff --git a/onnxruntime/python/tools/quantization/operators/pad.py b/onnxruntime/python/tools/quantization/operators/pad.py index e7eeac2cec3ef..2d1690e545263 100644 --- a/onnxruntime/python/tools/quantization/operators/pad.py +++ b/onnxruntime/python/tools/quantization/operators/pad.py @@ -1,4 +1,3 @@ -import numpy as np import onnx from ..quant_utils import ( diff --git 
a/onnxruntime/python/tools/quantization/operators/qdq_base_operator.py b/onnxruntime/python/tools/quantization/operators/qdq_base_operator.py index 0fe05df5191fa..73256e9115769 100644 --- a/onnxruntime/python/tools/quantization/operators/qdq_base_operator.py +++ b/onnxruntime/python/tools/quantization/operators/qdq_base_operator.py @@ -1,8 +1,5 @@ import itertools -from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray -from .base_operator import QuantOperatorBase - class QDQOperatorBase: def __init__(self, onnx_quantizer, onnx_node): diff --git a/onnxruntime/python/tools/quantization/qdq_loss_debug.py b/onnxruntime/python/tools/quantization/qdq_loss_debug.py index a3adf675d890c..5eed354a05f89 100644 --- a/onnxruntime/python/tools/quantization/qdq_loss_debug.py +++ b/onnxruntime/python/tools/quantization/qdq_loss_debug.py @@ -10,7 +10,7 @@ A use case is to debug quantization induced accuracy drop. An AI engineer can run the original float32 model and the quantized model with the same inputs, then compare the corresponding activations between the two models to find -where the divergence is. +where the divergence is. Example Usage: diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index 2ceefeadcd1e5..662739c4ed832 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -245,8 +245,8 @@ def __init__( rmaxs, zero_points, scales, - data=[], - quantized_data=[], + data=None, + quantized_data=None, axis=None, ): self.name = name @@ -256,8 +256,8 @@ def __init__( # 1D tensor of zero points computed for each axis. scalar if axis is empty self.zero_points = zero_points self.scales = scales # 1D tensor of scales computed for each axis. scalar if axis is empty - self.data = data # original data from initializer TensorProto - self.quantized_data = quantized_data # weight-packed data from data + self.data = data or [] # original data from initializer TensorProto + self.quantized_data = quantized_data or [] # weight-packed data from data # Scalar to specify which dimension in the initializer to weight pack. self.axis = axis # If empty, single zero point and scales computed from a single rmin and rmax @@ -265,7 +265,7 @@ def __init__( class QuantizedValue: """ - Represents a linearly quantized value (input\output\intializer) + Represents a linearly quantized value (input/output/initializer) """ def __init__( diff --git a/onnxruntime/python/tools/quantization/shape_inference.py b/onnxruntime/python/tools/quantization/shape_inference.py index 7df2dec59bf42..9aaac95a8dc5b 100644 --- a/onnxruntime/python/tools/quantization/shape_inference.py +++ b/onnxruntime/python/tools/quantization/shape_inference.py @@ -89,7 +89,7 @@ def quant_pre_process( sess_option.optimized_model_filepath = opt_model_path sess_option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC _ = onnxruntime.InferenceSession(input_model_path, sess_option, providers=["CPUExecutionProvider"]) - except Exception as e: + except Exception: logger.error( "ONNX Runtime Model Optimization Failed! Consider rerun with option `--skip_optimization'."
) diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index bf7f7d86690ac..d504efa6cc58a 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -391,7 +391,7 @@ def _get_sympy_shape(self, node, idx): else sympy.Symbol(d, integer=True, nonnegative=True) ) else: - assert None != d + assert None is not d sympy_shape.append(d) return sympy_shape @@ -418,7 +418,7 @@ def _update_computed_dims(self, new_sympy_shape): new_sympy_shape[i] = self.symbolic_dims_[self.suggested_merge_[str_dim]] else: # add new_dim if it's a computational expression - if not str(new_dim) in self.symbolic_dims_: + if str(new_dim) not in self.symbolic_dims_: self.symbolic_dims_[str(new_dim)] = new_dim def _onnx_infer_single_node(self, node): @@ -494,7 +494,7 @@ def _onnx_infer_subgraph(self, node, subgraph, use_node_input=True, inc_subgraph # for example, with Scan/Loop, subgraph input shape would be trimmed from node input shape # besides, inputs in subgraph could shadow implicit inputs subgraph_inputs = set([i.name for i in list(subgraph.initializer) + list(subgraph.input)]) - subgraph_implicit_input = set([name for name in self.known_vi_.keys() if not name in subgraph_inputs]) + subgraph_implicit_input = set([name for name in self.known_vi_.keys() if name not in subgraph_inputs]) tmp_graph = helper.make_graph( list(subgraph.node), "tmp", @@ -515,11 +515,10 @@ def _onnx_infer_subgraph(self, node, subgraph, use_node_input=True, inc_subgraph if inc_subgraph_id: self.subgraph_id_ += 1 - all_shapes_inferred = False symbolic_shape_inference._preprocess(self.tmp_mp_) symbolic_shape_inference.suggested_merge_ = self.suggested_merge_.copy() while symbolic_shape_inference.run_: - all_shapes_inferred = symbolic_shape_inference._infer_impl(self.sympy_data_.copy()) + symbolic_shape_inference._infer_impl(self.sympy_data_.copy()) symbolic_shape_inference._update_output_from_vi() if use_node_input: # if subgraph uses node input, it needs to update to merged dims @@ -534,7 +533,7 @@ def _onnx_infer_subgraph(self, node, subgraph, use_node_input=True, inc_subgraph # for new symbolic dims from subgraph output, add to main graph symbolic dims subgraph_shapes = [get_shape_from_value_info(o) for o in symbolic_shape_inference.out_mp_.graph.output] subgraph_new_symbolic_dims = set( - [d for s in subgraph_shapes if s for d in s if type(d) == str and not d in self.symbolic_dims_] + [d for s in subgraph_shapes if s for d in s if type(d) == str and d not in self.symbolic_dims_] ) new_dims = {} for d in subgraph_new_symbolic_dims: @@ -810,7 +809,7 @@ def _infer_Compress(self, node): # create a new symbolic dimension for Compress output compress_len = str(self._new_symbolic_dim_from_output(node)) axis = get_attribute(node, "axis") - if axis == None: + if axis is None: # when axis is not specified, input is flattened before compress so output is 1D output_shape = [compress_len] else: @@ -1057,7 +1056,6 @@ def _infer_GatherND(self, node): data_shape = self._get_shape(node, 0) data_rank = len(data_shape) indices_shape = self._get_shape(node, 1) - indices_rank = len(indices_shape) last_index_dimension = indices_shape[-1] assert is_literal(last_index_dimension) and last_index_dimension <= data_rank new_shape = indices_shape[:-1] + data_shape[last_index_dimension:] @@ -1910,7 +1908,7 @@ def _infer_TopK(self, node): else: k = self._get_int_values(node)[1] - if k == None: + if k is None: k = self._new_symbolic_dim_from_output(node) 
else: k = as_scalar(k) @@ -2419,7 +2417,7 @@ def get_prereq(node): self.run_ = False # create new dynamic dims for ops not handled by symbolic shape inference - if self.run_ == False and not node.op_type in self.dispatcher_ and not known_aten_op: + if self.run_ is False and node.op_type not in self.dispatcher_ and not known_aten_op: is_unknown_op = out_type_undefined and (out_shape is None or len(out_shape) == 0) if is_unknown_op: # unknown op to ONNX, maybe from higher opset or other domain diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index 7bb23084e1ca9..6441278ea249f 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -205,7 +205,7 @@ def run_trt_standalone(trtexec, model_name, model_path, test_data_dir, all_input avg_latency_match = re.search("mean = (.*?) ms", target) if avg_latency_match: result["average_latency_ms"] = avg_latency_match.group(1) # extract number - percentile_match = re.search("percentile\(90%\) = (.*?) ms", target) + percentile_match = re.search(r"percentile\(90%\) = (.*?) ms", target) if percentile_match: result["latency_90_percentile"] = percentile_match.group(1) # extract number if mem_usage: diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py index 918add64ce5f3..c6f334de266b1 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py @@ -1,13 +1,9 @@ -import argparse -import copy -import csv import json -import logging import os import pprint import re -import coloredlogs +# FIXME: Remove star imports from benchmark import * from perf_utils import * diff --git a/onnxruntime/python/tools/tensorrt/perf/build/ort_build_latest.py b/onnxruntime/python/tools/tensorrt/perf/build/ort_build_latest.py index 2efacd1965f40..a1d33a7425b28 100755 --- a/onnxruntime/python/tools/tensorrt/perf/build/ort_build_latest.py +++ b/onnxruntime/python/tools/tensorrt/perf/build/ort_build_latest.py @@ -1,8 +1,9 @@ +import argparse import os import subprocess -import argparse import tarfile + def parse_arguments(): parser = argparse.ArgumentParser() @@ -15,52 +16,73 @@ def parse_arguments(): args = parser.parse_args() return args + def archive_wheel_file(save_path, ort_wheel_file): if not os.path.exists(save_path): os.mkdir(save_path) subprocess.run(["cp", ort_wheel_file, save_path], check=True) + def install_new_ort_wheel(ort_master_path): - ort_wheel_path = os.path.join(ort_master_path, "build", "Linux", "Release", "dist") + ort_wheel_path = os.path.join(ort_master_path, "build", "Linux", "Release", "dist") p1 = subprocess.run(["find", ort_wheel_path, "-name", "*.whl"], stdout=subprocess.PIPE, check=True) stdout = p1.stdout.decode("utf-8").strip() ort_wheel = stdout.split("\n")[0] subprocess.run(["python3", "-m", "pip", "install", "--force-reinstall", ort_wheel], check=True) return ort_wheel + def main(): args = parse_arguments() - cmake_tar = "cmake-3.18.4-Linux-x86_64.tar.gz" + cmake_tar = "cmake-3.18.4-Linux-x86_64.tar.gz" if not os.path.exists(cmake_tar): p = subprocess.run(["wget", "-c", "https://cmake.org/files/v3.18/" + cmake_tar], check=True) tar = tarfile.open(cmake_tar) tar.extractall() tar.close() - + os.environ["PATH"] = os.path.join(os.path.abspath("cmake-3.18.4-Linux-x86_64"), "bin") + ":" + os.environ["PATH"] - os.environ["CUDACXX"] = os.path.join(args.cuda_home, 
"bin", "nvcc") + os.environ["CUDACXX"] = os.path.join(args.cuda_home, "bin", "nvcc") - ort_master_path = args.ort_master_path + ort_master_path = args.ort_master_path pwd = os.getcwd() os.chdir(ort_master_path) if args.use_archived: ort_wheel_file = args.use_archived subprocess.run(["python3", "-m", "pip", "install", "--force-reinstall", ort_wheel_file], check=True) - + else: subprocess.run(["git", "fetch"], check=True) subprocess.run(["git", "checkout", args.branch], check=True) subprocess.run(["git", "pull", "origin", args.branch], check=True) - subprocess.run(["./build.sh", "--config", "Release", "--use_tensorrt", "--tensorrt_home", args.tensorrt_home, "--cuda_home", args.cuda_home, "--cudnn", "/usr/lib/x86_64-linux-gnu", "--build_wheel", "--skip_tests", "--parallel"], check=True) + subprocess.run( + [ + "./build.sh", + "--config", + "Release", + "--use_tensorrt", + "--tensorrt_home", + args.tensorrt_home, + "--cuda_home", + args.cuda_home, + "--cudnn", + "/usr/lib/x86_64-linux-gnu", + "--build_wheel", + "--skip_tests", + "--parallel", + ], + check=True, + ) ort_wheel_file = install_new_ort_wheel(ort_master_path) - + if args.save: archive_wheel_file(args.save, ort_wheel_file) os.chdir(pwd) + if __name__ == "__main__": main() diff --git a/onnxruntime/python/tools/tensorrt/perf/comparison_scripts/compare_latency.py b/onnxruntime/python/tools/tensorrt/perf/comparison_scripts/compare_latency.py index 93df53c9825db..b44a672e7723b 100644 --- a/onnxruntime/python/tools/tensorrt/perf/comparison_scripts/compare_latency.py +++ b/onnxruntime/python/tools/tensorrt/perf/comparison_scripts/compare_latency.py @@ -48,7 +48,7 @@ def main(): condition_fp16 = get_table_condition(common, "fp16", args.ep, args.tolerance) common["greater"] = np.where((condition_fp32 | condition_fp16), True, False) - greater = common[common["greater"] == True].drop(["greater"], axis=1) + greater = common[common["greater"] is True].drop(["greater"], axis=1) # arrange columns keys = list(greater.keys().sort_values()) diff --git a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py index 61cac72b271c1..755d3f00a40cf 100644 --- a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py +++ b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py @@ -1,12 +1,9 @@ import json -import logging import pprint import re import subprocess import sys -import coloredlogs - debug = False debug_verbose = False @@ -122,7 +119,7 @@ def parse_single_file(f): try: data = json.load(f) - except Exception as e: + except Exception: return None model_run_flag = False @@ -131,7 +128,7 @@ def parse_single_file(f): provider_op_map_first_run = {} # ep -> map of operator to duration for row in data: - if not "cat" in row: + if "cat" not in row: continue if row["cat"] == "Session": @@ -146,7 +143,7 @@ def parse_single_file(f): if "name" in row and "args" in row and re.search(".*kernel_time", row["name"]): args = row["args"] - if not "op_name" in args or not "provider" in args: + if "op_name" not in args or "provider" not in args: continue provider = args["provider"] @@ -172,7 +169,7 @@ def parse_single_file(f): op_map = provider_op_map[provider] # avoid duplicated metrics - if not row["name"] in op_map: + if row["name"] not in op_map: op_map[row["name"]] = row["dur"] provider_op_map[provider] = op_map diff --git a/onnxruntime/python/tools/transformers/__init__.py b/onnxruntime/python/tools/transformers/__init__.py index 4200447eefee5..9bf76fc38153c 100644 --- 
a/onnxruntime/python/tools/transformers/__init__.py +++ b/onnxruntime/python/tools/transformers/__init__.py @@ -8,9 +8,9 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "models", "gpt2")) -import convert_to_onnx +import convert_to_onnx # noqa: E402 # added for backward compatible -import gpt2_helper +import gpt2_helper # noqa: E402 sys.path.append(os.path.join(os.path.dirname(__file__), "models", "t5")) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 23f1be3eeed2f..3e56d119703e8 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -43,18 +43,16 @@ import argparse import logging import os +import random import timeit from datetime import datetime -from enum import Enum import numpy -import onnx import psutil from benchmark_helper import ( ConfigModifier, OptimizerInfo, Precision, - allocateOutputBuffers, create_onnxruntime_session, get_latency_result, inference_ort, @@ -76,7 +74,7 @@ logger = logging.getLogger("") -from huggingface_models import MODEL_CLASSES, MODELS +from huggingface_models import MODEL_CLASSES, MODELS # noqa: E402 cpu_count = psutil.cpu_count(logical=False) @@ -84,8 +82,8 @@ if "OMP_NUM_THREADS" not in os.environ: os.environ["OMP_NUM_THREADS"] = str(cpu_count) -import torch -from transformers import AutoConfig, AutoModel, AutoTokenizer, GPT2Model, LxmertConfig +import torch # noqa: E402 +from transformers import AutoConfig, AutoTokenizer, LxmertConfig # noqa: E402 def run_onnxruntime( @@ -373,7 +371,7 @@ def run_pytorch( ) inference(input_ids) - runtimes = timeit.repeat(lambda: inference(input_ids), repeat=repeat_times, number=1) + runtimes = timeit.repeat(lambda: inference(input_ids), repeat=repeat_times, number=1) # noqa: B023 result = { "engine": "torchscript" if torchscript else "torch2" if torch2 else "torch", @@ -495,8 +493,6 @@ def run_tensorflow( "Run Tensorflow on {} with input shape {}".format(model_name, [batch_size, sequence_length]) ) - import random - rng = random.Random() values = [rng.randint(0, config.vocab_size - 1) for i in range(batch_size * sequence_length)] input_ids = tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32) @@ -505,18 +501,18 @@ def run_tensorflow( # Disable both for better inference perf @run_with_tf_optimizations(do_eager_mode=False, use_xla=False) def encoder_forward(): - return model(input_ids, training=False) + return model(input_ids, training=False) # noqa: B023 @run_with_tf_optimizations(do_eager_mode=False, use_xla=False) def encoder_decoder_forward(): - return model(input_ids, decoder_input_ids=input_ids, training=False) + return model(input_ids, decoder_input_ids=input_ids, training=False) # noqa: B023 @run_with_tf_optimizations(do_eager_mode=False, use_xla=False) def lxmert_forward(): - feats = tf.random.normal([1, 1, config.visual_feat_dim]) - pos = tf.random.normal([1, 1, config.visual_pos_dim]) - return model( - input_ids, + feats = tf.random.normal([1, 1, config.visual_feat_dim]) # noqa: B023 + pos = tf.random.normal([1, 1, config.visual_pos_dim]) # noqa: B023 + return model( # noqa: B023 + input_ids, # noqa: B023 visual_feats=feats, visual_pos=pos, training=False, @@ -530,7 +526,7 @@ def lxmert_forward(): inference() - runtimes = timeit.repeat(lambda: inference(), repeat=repeat_times, number=1) + runtimes = timeit.repeat(lambda: inference(), repeat=repeat_times, number=1) # noqa: B023 result = { "engine": "tensorflow", @@ -891,8 +887,8 @@ def 
main(): args.model_source, args, ) - except: - logger.error(f"Exception", exc_info=True) + except Exception: + logger.error("Exception", exc_info=True) time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S") if model_fusion_statistics: diff --git a/onnxruntime/python/tools/transformers/benchmark_helper.py b/onnxruntime/python/tools/transformers/benchmark_helper.py index 2b5c53b867257..440aa15215fc0 100644 --- a/onnxruntime/python/tools/transformers/benchmark_helper.py +++ b/onnxruntime/python/tools/transformers/benchmark_helper.py @@ -81,8 +81,9 @@ def create_onnxruntime_session( num_threads=-1, enable_profiling=False, verbose=False, - provider_options={}, # map execution provider name to its option + provider_options: Optional[dict] = None, # map execution provider name to its option ): + provider_options = provider_options or {} session = None try: sess_options = onnxruntime.SessionOptions() @@ -133,7 +134,7 @@ def create_onnxruntime_session( providers = [(name, provider_options[name]) if name in provider_options else name for name in providers] session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=providers) - except: + except Exception: logger.error("Exception", exc_info=True) return session diff --git a/onnxruntime/python/tools/transformers/bert_perf_test.py b/onnxruntime/python/tools/transformers/bert_perf_test.py index 022ee076770be..16f376bafeeae 100644 --- a/onnxruntime/python/tools/transformers/bert_perf_test.py +++ b/onnxruntime/python/tools/transformers/bert_perf_test.py @@ -173,7 +173,7 @@ def onnxruntime_inference_with_io_binding(session, all_inputs, output_names, tes results = [] latency_list = [] device = "cuda" if test_setting.use_gpu else "cpu" - for test_case_id, inputs in enumerate(all_inputs): + for inputs in all_inputs: result = session.run(output_names, inputs) results.append(result) outputs = {} @@ -201,7 +201,7 @@ def onnxruntime_inference(session, all_inputs, output_names): results = [] latency_list = [] - for test_case_id, inputs in enumerate(all_inputs): + for inputs in all_inputs: start_time = timeit.default_timer() result = session.run(output_names, inputs) latency = timeit.default_timer() - start_time @@ -240,14 +240,12 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op all_latency_list = [] if test_setting.use_io_binding: - for i in range(test_setting.test_times): - results, latency_list = onnxruntime_inference_with_io_binding( - session, all_inputs, output_names, test_setting - ) + for _ in range(test_setting.test_times): + _, latency_list = onnxruntime_inference_with_io_binding(session, all_inputs, output_names, test_setting) all_latency_list.extend(latency_list) else: - for i in range(test_setting.test_times): - results, latency_list = onnxruntime_inference(session, all_inputs, output_names) + for _ in range(test_setting.test_times): + _, latency_list = onnxruntime_inference(session, all_inputs, output_names) all_latency_list.extend(latency_list) # latency in miliseconds diff --git a/onnxruntime/python/tools/transformers/bert_test_data.py b/onnxruntime/python/tools/transformers/bert_test_data.py index 12c2145fe3eb0..88b0f2f9bafdd 100644 --- a/onnxruntime/python/tools/transformers/bert_test_data.py +++ b/onnxruntime/python/tools/transformers/bert_test_data.py @@ -175,7 +175,7 @@ def fake_test_data( random.seed(random_seed) all_inputs = [] - for test_case in range(test_cases): + for _ in range(test_cases): input_1 = fake_input_ids_data(input_ids, batch_size, sequence_length, dictionary_size) inputs = 
{input_ids.name: input_1} @@ -317,7 +317,7 @@ def find_bert_inputs( if "mask" in input_name_lower: input_mask = input if input_mask is None: - raise ValueError(f"Failed to find attention mask input") + raise ValueError("Failed to find attention mask input") return input_ids, segment_ids, input_mask diff --git a/onnxruntime/python/tools/transformers/convert_generation.py b/onnxruntime/python/tools/transformers/convert_generation.py index a106d906d052d..543abddf794be 100644 --- a/onnxruntime/python/tools/transformers/convert_generation.py +++ b/onnxruntime/python/tools/transformers/convert_generation.py @@ -64,10 +64,10 @@ from models.gpt2.convert_to_onnx import main as convert_gpt2_to_onnx # noqa: E402 sys.path.append(os.path.join(os.path.dirname(__file__), "models", "t5")) -from benchmark_helper import setup_logger +from benchmark_helper import setup_logger # noqa: E402 from models.t5.convert_to_onnx import export_onnx_models as export_t5_onnx_models # noqa: E402 from models.t5.t5_helper import PRETRAINED_MT5_MODELS, PRETRAINED_T5_MODELS # noqa: E402 -from onnx_model import OnnxModel +from onnx_model import OnnxModel # noqa: E402 logger = logging.getLogger("") @@ -1457,7 +1457,7 @@ def convert_generation_model(args: argparse.Namespace, generation_type: Generati args.decoder_onnx, args.use_external_data_format ): # Can't proceed further - better to raise an exception - raise ValueError(f"Could not update the input shapes for the non-initial decoder subgraph.") + raise ValueError("Could not update the input shapes for the non-initial decoder subgraph.") # If the user explicitly requests running shape inference or if we padded/mutated # weight(s)/input shape(s) in the decoder, we want to run shape inference to capture the new diff --git a/onnxruntime/python/tools/transformers/convert_tf_models_to_pytorch.py b/onnxruntime/python/tools/transformers/convert_tf_models_to_pytorch.py index a035790b50954..fb2acad9e5096 100644 --- a/onnxruntime/python/tools/transformers/convert_tf_models_to_pytorch.py +++ b/onnxruntime/python/tools/transformers/convert_tf_models_to_pytorch.py @@ -90,7 +90,7 @@ def download_tf_checkpoint(model_name, tf_models_dir="tf_models"): import re - if re.search(".zip$", tf_ckpt_url) != None: + if re.search(".zip$", tf_ckpt_url) is not None: zip_dir = download_compressed_file(tf_ckpt_url, ckpt_dir) # unzip file @@ -102,7 +102,7 @@ def download_tf_checkpoint(model_name, tf_models_dir="tf_models"): return get_ckpt_prefix_path(ckpt_dir) - elif re.search(".tar.gz$", tf_ckpt_url) != None: + elif re.search(".tar.gz$", tf_ckpt_url) is not None: tar_dir = download_compressed_file(tf_ckpt_url, ckpt_dir) # untar file diff --git a/onnxruntime/python/tools/transformers/float16.py b/onnxruntime/python/tools/transformers/float16.py index a7904c39f8491..dbfa688b3ec44 100644 --- a/onnxruntime/python/tools/transformers/float16.py +++ b/onnxruntime/python/tools/transformers/float16.py @@ -8,7 +8,7 @@ import itertools import logging -from typing import Dict, List +from typing import Dict import numpy as np import onnx @@ -334,7 +334,7 @@ def convert_float_to_float16( queue = next_level - for key, value in fp32_initializers.items(): + for value in fp32_initializers.values(): # By default, to avoid precision loss, do not convert an initializer to fp16 when it is used only by fp32 nodes. 
if force_fp16_initializers or value.fp16_nodes: value.initializer = convert_tensor_float_to_float16(value.initializer, min_positive_val, max_finite_val) diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py index 342d43306e699..aa98e14f0937b 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_attention.py @@ -2,10 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -from enum import Enum from logging import getLogger -from os import name -from sys import path from typing import Tuple, Union import numpy as np @@ -14,7 +11,6 @@ from fusion_utils import FusionUtils, NumpyHelper from onnx import NodeProto, TensorProto, helper, numpy_helper from onnx_model import OnnxModel -from shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto logger = getLogger(__name__) @@ -420,7 +416,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): return other_inputs = [] - for i, input in enumerate(start_node.input): + for input in start_node.input: if input not in output_name_to_node: continue diff --git a/onnxruntime/python/tools/transformers/fusion_embedlayer.py b/onnxruntime/python/tools/transformers/fusion_embedlayer.py index f4ae184bdf825..42c7faa37d649 100644 --- a/onnxruntime/python/tools/transformers/fusion_embedlayer.py +++ b/onnxruntime/python/tools/transformers/fusion_embedlayer.py @@ -112,7 +112,7 @@ def check_attention_subgraph( logger.debug("No Attention like subgraph in children of LayerNormalization") return False else: - if children_types != ["Add", "MatMul", "MatMul", "MatMul",] and children_types != [ + if children_types != ["Add", "MatMul", "MatMul", "MatMul"] and children_types != [ "MatMul", "MatMul", "MatMul", @@ -233,7 +233,7 @@ def match_position_embedding_roberta(self, position_embedding_gather, input_ids, return False def match_position_embedding_bert(self, position_embedding_gather, input_ids, output_name_to_node): - """ Match position embedding path from input_ids to Gather for BERT. + r"""Match position embedding path from input_ids to Gather for BERT. BERT Embedding Layer Pattern: (input_ids) diff --git a/onnxruntime/python/tools/transformers/fusion_gelu_approximation.py b/onnxruntime/python/tools/transformers/fusion_gelu_approximation.py index ba231e9e05ea4..fc9013fff0a80 100644 --- a/onnxruntime/python/tools/transformers/fusion_gelu_approximation.py +++ b/onnxruntime/python/tools/transformers/fusion_gelu_approximation.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- -from logging import getLogger from fusion_base import Fusion from onnx import helper diff --git a/onnxruntime/python/tools/transformers/fusion_gpt_attention.py b/onnxruntime/python/tools/transformers/fusion_gpt_attention.py index 7fe3257950568..9af34b23d79a2 100644 --- a/onnxruntime/python/tools/transformers/fusion_gpt_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_gpt_attention.py @@ -7,7 +7,7 @@ import numpy as np from fusion_base import Fusion from fusion_utils import FusionUtils -from onnx import TensorProto, helper, numpy_helper +from onnx import helper, numpy_helper from onnx_model import OnnxModel logger = getLogger(__name__) @@ -335,7 +335,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): # (2) SkipLayerNorm fusion was turned ON but upstream layer's LayerNorm + Add was not # fused into a SkipLayerNorm. This can happen if the shapes to the Add node are different. # So, keep the following check if SkipLayerNorm fusion is turned ON or OFF. - if another_input is not None and not another_input in layernorm_before_attention.input: + if another_input is not None and another_input not in layernorm_before_attention.input: logger.debug("Upstream Add and (Skip)LayerNormalization shall have one same input") return diff --git a/onnxruntime/python/tools/transformers/fusion_gpt_attention_megatron.py b/onnxruntime/python/tools/transformers/fusion_gpt_attention_megatron.py index 1c0b0b7074745..052dd243fd788 100644 --- a/onnxruntime/python/tools/transformers/fusion_gpt_attention_megatron.py +++ b/onnxruntime/python/tools/transformers/fusion_gpt_attention_megatron.py @@ -5,10 +5,8 @@ from logging import getLogger import numpy as np -from fusion_base import Fusion from fusion_gpt_attention import FusionGptAttentionPastBase -from fusion_utils import FusionUtils -from onnx import TensorProto, helper, numpy_helper +from onnx import helper from onnx_model import OnnxModel logger = getLogger(__name__) diff --git a/onnxruntime/python/tools/transformers/fusion_gpt_attention_no_past.py b/onnxruntime/python/tools/transformers/fusion_gpt_attention_no_past.py index 8176be523bcca..83fa51dcfafa6 100644 --- a/onnxruntime/python/tools/transformers/fusion_gpt_attention_no_past.py +++ b/onnxruntime/python/tools/transformers/fusion_gpt_attention_no_past.py @@ -4,10 +4,8 @@ # -------------------------------------------------------------------------- from logging import getLogger -import numpy as np from fusion_base import Fusion -from fusion_utils import FusionUtils -from onnx import TensorProto, helper, numpy_helper +from onnx import helper from onnx_model import OnnxModel logger = getLogger(__name__) @@ -146,9 +144,9 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): # fused into a SkipLayerNorm. This can happen if the shapes to the Add node are different. # So, keep the following check if SkipLayerNorm fusion is turned ON or OFF. 
if another_input is not None: - if not another_input in layernorm_before_attention.input: + if another_input not in layernorm_before_attention.input: # match openai-gpt - if not another_input in layernorm_before_attention.output: + if another_input not in layernorm_before_attention.output: logger.debug("Add and (Skip)LayerNormalization shall have one same input") return diff --git a/onnxruntime/python/tools/transformers/fusion_layernorm.py b/onnxruntime/python/tools/transformers/fusion_layernorm.py index 893d3283691be..3a1629197f6de 100644 --- a/onnxruntime/python/tools/transformers/fusion_layernorm.py +++ b/onnxruntime/python/tools/transformers/fusion_layernorm.py @@ -106,7 +106,7 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): input_name_to_nodes, output_name_to_node, ): - logger.debug(f"It is not safe to fuse LayerNormalization node. Skip") + logger.debug("It is not safe to fuse LayerNormalization node. Skip") return weight_input = mul_node.input[1 - self.model.input_index(div_node.output[0], mul_node)] diff --git a/onnxruntime/python/tools/transformers/fusion_qordered_attention.py b/onnxruntime/python/tools/transformers/fusion_qordered_attention.py index b3d8743414b91..cac79ebd3e327 100644 --- a/onnxruntime/python/tools/transformers/fusion_qordered_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_qordered_attention.py @@ -128,7 +128,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): # Identify the root input to the Attention node other_inputs = [] - for i, input in enumerate(start_node.input): + for input in start_node.input: if input not in output_name_to_node: continue diff --git a/onnxruntime/python/tools/transformers/fusion_qordered_gelu.py b/onnxruntime/python/tools/transformers/fusion_qordered_gelu.py index a92c8f94d49af..6c44bb11e24dc 100644 --- a/onnxruntime/python/tools/transformers/fusion_qordered_gelu.py +++ b/onnxruntime/python/tools/transformers/fusion_qordered_gelu.py @@ -81,7 +81,7 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): input_name_to_nodes, output_name_to_node, ): - logger.debug(f"It is not safe to fuse QOrderedGelu node. Skip") + logger.debug("It is not safe to fuse QOrderedGelu node. Skip") return self.nodes_to_remove.extend(subgraph_nodes) diff --git a/onnxruntime/python/tools/transformers/fusion_qordered_layernorm.py b/onnxruntime/python/tools/transformers/fusion_qordered_layernorm.py index f8198bcaa1419..cf2b357721757 100644 --- a/onnxruntime/python/tools/transformers/fusion_qordered_layernorm.py +++ b/onnxruntime/python/tools/transformers/fusion_qordered_layernorm.py @@ -83,7 +83,7 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): input_name_to_nodes, output_name_to_node, ): - logger.debug(f"It is not safe to fuse QOrderedLayerNormalization node. Skip") + logger.debug("It is not safe to fuse QOrderedLayerNormalization node. 
Skip") return self.nodes_to_remove.extend(subgraph_nodes) diff --git a/onnxruntime/python/tools/transformers/fusion_qordered_matmul.py b/onnxruntime/python/tools/transformers/fusion_qordered_matmul.py index 2fbd3262684ce..681160479faef 100644 --- a/onnxruntime/python/tools/transformers/fusion_qordered_matmul.py +++ b/onnxruntime/python/tools/transformers/fusion_qordered_matmul.py @@ -170,7 +170,7 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): if not self.model.is_safe_to_fuse_nodes( subgraph_nodes, downstream_quantize_node.output, input_name_to_nodes, output_name_to_node ): - logger.debug(f"It is not safe to fuse QOrderedMatMul node. Skip") + logger.debug("It is not safe to fuse QOrderedMatMul node. Skip") return # Deal with the case where-in the Attention subgraph is not fused diff --git a/onnxruntime/python/tools/transformers/fusion_shape.py b/onnxruntime/python/tools/transformers/fusion_shape.py index a6a74719b9c42..f7f6d0ac4ab4b 100644 --- a/onnxruntime/python/tools/transformers/fusion_shape.py +++ b/onnxruntime/python/tools/transformers/fusion_shape.py @@ -48,7 +48,7 @@ def fuse( input_name_to_nodes: Dict[str, List[NodeProto]], output_name_to_node: Dict[str, NodeProto], ): - """ + r""" Smplify subgraph like (2d_input) diff --git a/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py b/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py index 7c54649553168..0c49087642d34 100644 --- a/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py +++ b/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py @@ -42,7 +42,7 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): return for add_input in add.input: - if self.model.get_initializer(add_input) != None: + if self.model.get_initializer(add_input) is not None: return # The number of input node of add should be 2 @@ -158,15 +158,15 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): bias_weight = NumpyHelper.to_array(initializer) break if bias_weight is None: - logger.debug(f"Bias weight not found") + logger.debug("Bias weight not found") return if len(bias_weight.shape) != 1: - logger.debug(f"Bias weight is not 1D") + logger.debug("Bias weight is not 1D") return subgraph_nodes = [node, add] if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, node.output, input_name_to_nodes, output_name_to_node): - logger.debug(f"Skip fusing SkipLayerNormalization with Bias since it is not safe") + logger.debug("Skip fusing SkipLayerNormalization with Bias since it is not safe") return self.nodes_to_remove.extend(subgraph_nodes) diff --git a/onnxruntime/python/tools/transformers/fusion_utils.py b/onnxruntime/python/tools/transformers/fusion_utils.py index 0945be6cc6898..d4d2edf0a8c12 100644 --- a/onnxruntime/python/tools/transformers/fusion_utils.py +++ b/onnxruntime/python/tools/transformers/fusion_utils.py @@ -172,7 +172,7 @@ def check_qdq_node_for_fusion(node: NodeProto, model: OnnxModel, allow_per_tenso Returns: bool: whether the check is passed or not """ - if not node.op_type in {"QuantizeLinear", "DequantizeLinear"}: + if node.op_type not in {"QuantizeLinear", "DequantizeLinear"}: logger.debug(f"Provided node is not a Q/DQ node. 
Op Type: {node.op_type}") scale = model.get_constant_value(node.input[1]) diff --git a/onnxruntime/python/tools/transformers/io_binding_helper.py b/onnxruntime/python/tools/transformers/io_binding_helper.py index 3182107cd8050..84295bb205321 100644 --- a/onnxruntime/python/tools/transformers/io_binding_helper.py +++ b/onnxruntime/python/tools/transformers/io_binding_helper.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, List, Union +from typing import Dict, List import numpy import torch @@ -12,14 +12,14 @@ class TypeHelper: @staticmethod def get_input_type(ort_session: InferenceSession, name: str) -> str: - for i, input in enumerate(ort_session.get_inputs()): + for input in ort_session.get_inputs(): if input.name == name: return input.type raise ValueError(f"input name {name} not found") @staticmethod def get_output_type(ort_session, name: str) -> str: - for i, output in enumerate(ort_session.get_outputs()): + for output in ort_session.get_outputs(): if output.name == name: return output.type diff --git a/onnxruntime/python/tools/transformers/machine_info.py b/onnxruntime/python/tools/transformers/machine_info.py index e872e2a6c00c6..72f6a26153564 100644 --- a/onnxruntime/python/tools/transformers/machine_info.py +++ b/onnxruntime/python/tools/transformers/machine_info.py @@ -9,9 +9,8 @@ import json import logging import platform -import sys from os import environ -from typing import Dict, List, Tuple, Union +from typing import Dict, List import cpuinfo import psutil diff --git a/onnxruntime/python/tools/transformers/models/bart/export.py b/onnxruntime/python/tools/transformers/models/bart/export.py index c1e0f3224a445..8cddaea822139 100644 --- a/onnxruntime/python/tools/transformers/models/bart/export.py +++ b/onnxruntime/python/tools/transformers/models/bart/export.py @@ -87,14 +87,14 @@ def user_command(): ) if not args.no_encoder: - logger.info(f"========== EXPORTING ENCODER ==========") + logger.info("========== EXPORTING ENCODER ==========") export_summarization_edinit.export_encoder(args) if not args.no_decoder: - logger.info(f"========== EXPORTING DECODER ==========") + logger.info("========== EXPORTING DECODER ==========") export_summarization_enc_dec_past.export_decoder(args) if not args.no_chain: - logger.info(f"========== CONVERTING MODELS ==========") + logger.info("========== CONVERTING MODELS ==========") chain_enc_dec_with_beamsearch.convert_model(args) if not args.no_inference: - logger.info(f"========== INFERENCING WITH ONNX MODEL ==========") + logger.info("========== INFERENCING WITH ONNX MODEL ==========") onnx_inference.run_inference(args) diff --git a/onnxruntime/python/tools/transformers/models/bart/utils/chain_enc_dec_with_beamsearch.py b/onnxruntime/python/tools/transformers/models/bart/utils/chain_enc_dec_with_beamsearch.py index 4230684e5e7ee..e729b07013774 100644 --- a/onnxruntime/python/tools/transformers/models/bart/utils/chain_enc_dec_with_beamsearch.py +++ b/onnxruntime/python/tools/transformers/models/bart/utils/chain_enc_dec_with_beamsearch.py @@ -88,7 +88,7 @@ def convert_model(args): ] outputs = ["sequences"] - node = helper.make_node("BeamSearch", inputs=inputs, outputs=outputs, name=f"BeamSearch_zcode") + node = helper.make_node("BeamSearch", inputs=inputs, outputs=outputs, name="BeamSearch_zcode") node.domain = "com.microsoft" # NOTE: take value from args or config node.attribute.extend( diff --git a/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py b/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py 
index 01a5e5d8883d7..e8553e2cae0f7 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py @@ -21,14 +21,14 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from benchmark_helper import ( +from benchmark_helper import ( # noqa: E402 Precision, create_onnxruntime_session, get_ort_environment_variables, prepare_environment, setup_logger, ) -from quantize_helper import QuantizeHelper +from quantize_helper import QuantizeHelper # noqa: E402 logger = logging.getLogger("") @@ -404,8 +404,8 @@ def main(args): "onnxruntime_latency": f"{ort_latency:.2f}", } csv_writer.writerow(row) - except: - logger.error(f"Exception", exc_info=True) + except Exception: + logger.error("Exception", exc_info=True) return None logger.info(f"Results are saved to file {csv_filename}") diff --git a/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py index 78e718e6e80c4..cb8e1a337de68 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py @@ -30,14 +30,14 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from benchmark_helper import ( +from benchmark_helper import ( # noqa: E402 Precision, create_onnxruntime_session, get_ort_environment_variables, prepare_environment, setup_logger, ) -from quantize_helper import QuantizeHelper +from quantize_helper import QuantizeHelper # noqa: E402 logger = logging.getLogger("") diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py index f5de69e8f0524..95f6a7dcaa000 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py @@ -13,7 +13,7 @@ import tempfile import time from pathlib import Path -from typing import Dict, List, Tuple, Union +from typing import Collection, Dict, List, Tuple, Union import numpy import onnx @@ -22,12 +22,12 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from benchmark_helper import Precision -from float16 import float_to_float16_max_diff -from fusion_options import AttentionMaskFormat -from io_binding_helper import IOBindingHelper -from onnx_model import OnnxModel -from torch_onnx_export_helper import torch_onnx_export +from benchmark_helper import Precision # noqa: E402 +from float16 import float_to_float16_max_diff # noqa: E402 +from fusion_options import AttentionMaskFormat # noqa: E402 +from io_binding_helper import IOBindingHelper # noqa: E402 +from onnx_model import OnnxModel # noqa: E402 +from torch_onnx_export_helper import torch_onnx_export # noqa: E402 logger = logging.getLogger(__name__) @@ -551,24 +551,26 @@ def optimize_onnx( @staticmethod def auto_mixed_precision( onnx_model: OnnxModel, - op_block_list: List[str] = [ - "Add", - "LayerNormalization", - "SkipLayerNormalization", - "FastGelu", - "EmbedLayerNormalization", - ], + op_block_list: Collection[str] = frozenset( + ( + "Add", + "LayerNormalization", + "SkipLayerNormalization", + "FastGelu", + "EmbedLayerNormalization", + ) + ), ): """Convert GPT-2 model to mixed precision. It detects whether original model has fp16 weights, and set parameters for float16 conversion automatically. 
Args: - onnx_model (OnnxModel): optimized ONNX model - op_block_list (List[str], optional): operators to compute in fp32. Defaults to ["Add", "LayerNormalization", - "SkipLayerNormalization", "FastGelu", "EmbedLayerNormalization"] + onnx_model: Optimized ONNX model + op_block_list: Operators to compute in fp32. Defaults to {"Add", "LayerNormalization", + "SkipLayerNormalization", "FastGelu", "EmbedLayerNormalization"}. Returns: - parameters(dict): a dictionary of parameters used in float16 conversion + A dictionary of parameters used in float16 conversion. """ - op_full_set = set([node.op_type for node in onnx_model.nodes()]) + op_full_set = set(node.op_type for node in onnx_model.nodes()) fp32_op_set = set(op_block_list) fp16_op_set = op_full_set.difference(fp32_op_set) logger.info(f"fp32 op: {fp32_op_set} fp16 op: {fp16_op_set}") @@ -647,7 +649,7 @@ def pytorch_inference(model, inputs: Gpt2Inputs, total_runs: int = 0): @staticmethod def onnxruntime_inference(ort_session, inputs: Gpt2Inputs, total_runs: int = 0): """Run inference of ONNX model, and returns average latency in ms when total_runs > 0 besides outputs.""" - logger.debug(f"start onnxruntime_inference") + logger.debug("start onnxruntime_inference") ort_inputs = {"input_ids": numpy.ascontiguousarray(inputs.input_ids.cpu().numpy())} @@ -715,7 +717,7 @@ def onnxruntime_inference_with_binded_io( include_copy_output_latency: bool = False, ): """Inference with IO binding. Returns outputs, and optional latency when total_runs > 0.""" - logger.debug(f"start onnxruntime_inference_with_binded_io") + logger.debug("start onnxruntime_inference_with_binded_io") # Bind inputs and outputs to onnxruntime session io_binding = Gpt2Helper.prepare_io_binding( @@ -986,7 +988,7 @@ def get_onnx_paths( model_class: str = "GPT2LMHeadModel", has_past=True, new_folder=False, - remove_existing=["raw", "fp32", "fp16", "int8"], + remove_existing=frozenset(["raw", "fp32", "fp16", "int8"]), ): """Build a path name for given model based on given attributes.""" model_name = model_name_or_path diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_parity.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_parity.py index e48fcc1cfc119..265e76cf6e4d0 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_parity.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_parity.py @@ -26,7 +26,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from benchmark_helper import get_ort_environment_variables, setup_logger +from benchmark_helper import get_ort_environment_variables, setup_logger # noqa: E402 logger = logging.getLogger("") @@ -120,7 +120,7 @@ def run(self, argv, experiment_name): ) if result: self.results.append(result) - except: + except Exception: logger.exception(f"Failed to run experiment {experiment_name}") result = None @@ -330,8 +330,9 @@ def run_candidate( task: ParityTask, args, last_matmul_node_name, - op_block_list=["FastGelu", "LayerNormalization"], + op_block_list=("FastGelu", "LayerNormalization"), ): + op_block_list = list(op_block_list) parameters = get_mixed_precision_parameters(args, last_matmul_node_name, op_block_list) op_block_list_str = ",".join(sorted(op_block_list)) @@ -450,7 +451,8 @@ def run_parity(task: ParityTask, args): # Mixed precision baseline run_candidate(task, args, last_matmul_node_name, op_block_list=[]) - get_fp32_ops = lambda x: [op for op in x if op in all_ops] + def get_fp32_ops(x): + return [op for op in x if op in all_ops] if args.all: 
run_tuning_step0(task, fp16_baseline, all_ops, optimized_ops) @@ -509,7 +511,7 @@ def run_parity(task: ParityTask, args): try: rows = load_results_from_csv(task.csv_path) - except: + except Exception: logger.exception(f"Failed to load csv {task.csv_path}") rows = task.results diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py index be303b4e188bf..b6e9a1406fb5c 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py @@ -17,7 +17,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from benchmark_helper import Precision +from benchmark_helper import Precision # noqa: E402 logger = logging.getLogger(__name__) @@ -151,7 +151,7 @@ def __init__( 0, hidden_size // num_attention_heads, ] - for i in range(num_layer): + for _ in range(num_layer): empty_past = torch.empty(past_shape).type(torch.float16 if is_fp16 else torch.float32) self.past.append(empty_past.to(device)) @@ -194,7 +194,7 @@ def add_tensor(input_tensors, torch_tensor, name): f.write(tensor.SerializeToString()) output_names = [output.name for output in session.get_outputs()] - for i, name in enumerate(output_names): + for i, _ in enumerate(output_names): tensor = numpy_helper.from_array( output[i] if isinstance(output[i], numpy.ndarray) else output[i].clone().cpu().numpy() ) diff --git a/onnxruntime/python/tools/transformers/models/gpt2/parity_check_helper.py b/onnxruntime/python/tools/transformers/models/gpt2/parity_check_helper.py index c122e243293aa..87dc766628aab 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/parity_check_helper.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/parity_check_helper.py @@ -19,7 +19,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from benchmark_helper import create_onnxruntime_session +from benchmark_helper import create_onnxruntime_session # noqa: E402 NON_ZERO_VALUE = str(1) ZERO_VALUE = str(0) @@ -107,7 +107,7 @@ def post_processing(outputs_path, outputs_path_other): record[Path(filename).name.split(".")[0]] = diff if_close[Path(filename).name.split(".")[0]] = numpy.allclose(array, array_other, rtol=1e-04, atol=1e-04) - results = [f"Node\tDiff\tClose"] + results = ["Node\tDiff\tClose"] for k, v in sorted(record.items(), key=lambda x: x[1], reverse=True): results.append(f"{k}\t{v}\t{if_close[k]}") for line in results: @@ -137,8 +137,8 @@ def post_processing(outputs_path, outputs_path_other): dummy_inputs_fp32 = dummy_inputs_fp16.to_fp32() # Get GPT-2 model from huggingface using convert_to_onnx.py - os.system("python convert_to_onnx.py -m gpt2 --output gpt2_fp32.onnx -o -p fp32 --use_gpu") - os.system("python convert_to_onnx.py -m gpt2 --output gpt2_fp16.onnx -o -p fp16 --use_gpu") + os.system("python convert_to_onnx.py -m gpt2 --output gpt2_fp32.onnx -o -p fp32 --use_gpu") # noqa: DUO106 + os.system("python convert_to_onnx.py -m gpt2 --output gpt2_fp16.onnx -o -p fp16 --use_gpu") # noqa: DUO106 # Specify the directory to dump the node's I/O outputs_path_fp32_gpu = "./fp32_gpu" diff --git a/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py b/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py index 679004c6ea89c..fe62d0889a25f 100644 --- a/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py +++ 
b/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py @@ -51,7 +51,7 @@ import onnxruntime sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -import benchmark_helper +import benchmark_helper # noqa: E402 logger = logging.getLogger("") @@ -80,7 +80,7 @@ def test_torch_latency( input_list = inputs.to_list() _ = model(*input_list) - runtimes = timeit.repeat(lambda: model(*input_list), repeat=test_times, number=1) + runtimes = timeit.repeat(lambda: model(*input_list), repeat=test_times, number=1) # noqa: B023 result = { "engine": "torch", # TODO: test torchscript "version": torch.__version__, @@ -647,7 +647,7 @@ def run_tests( latency_results = launch_test(args) except KeyboardInterrupt as exc: raise RuntimeError("Keyboard Interrupted") from exc - except: + except Exception: traceback.print_exc() continue diff --git a/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py index 7427b65a2bf36..39bfb38db9d6b 100644 --- a/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py @@ -47,8 +47,8 @@ from transformers import LongformerModel, LongformerSelfAttention sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from onnx_model_bert import BertOnnxModel -from torch_onnx_export_helper import torch_onnx_export +from onnx_model_bert import BertOnnxModel # noqa: E402 +from torch_onnx_export_helper import torch_onnx_export # noqa: E402 # Supports format 0 or 1 weight_bias_format = 0 @@ -266,7 +266,7 @@ def my_longformer_self_attention_forward_4_3( is_global_attn=None, output_attentions=False, ): - assert output_attentions == False + assert output_attentions is False return my_longformer_self_attention_forward_4( self, hidden_states, @@ -288,7 +288,7 @@ def my_longformer_self_attention_forward_4_3_2( is_global_attn=None, output_attentions=False, ): - assert output_attentions == False + assert output_attentions is False assert layer_head_mask is None return my_longformer_self_attention_forward_4( self, diff --git a/onnxruntime/python/tools/transformers/models/longformer/generate_test_data.py b/onnxruntime/python/tools/transformers/models/longformer/generate_test_data.py index 379efce27b27a..109f739667f6d 100644 --- a/onnxruntime/python/tools/transformers/models/longformer/generate_test_data.py +++ b/onnxruntime/python/tools/transformers/models/longformer/generate_test_data.py @@ -12,11 +12,11 @@ from pathlib import Path import numpy as np -from onnx import ModelProto, TensorProto, numpy_helper +from onnx import ModelProto, TensorProto sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from bert_test_data import fake_input_ids_data, fake_input_mask_data, output_test_data -from onnx_model import OnnxModel +from bert_test_data import fake_input_ids_data, fake_input_mask_data, output_test_data # noqa: E402 +from onnx_model import OnnxModel # noqa: E402 def parse_arguments(): diff --git a/onnxruntime/python/tools/transformers/models/t5/past_helper.py b/onnxruntime/python/tools/transformers/models/t5/past_helper.py index fe113491067fd..d6b1d50f5a47f 100644 --- a/onnxruntime/python/tools/transformers/models/t5/past_helper.py +++ b/onnxruntime/python/tools/transformers/models/t5/past_helper.py @@ -38,7 +38,7 @@ def group_by_self_or_cross(present_key_values): """ present_self = [] present_cross = [] - for i, present_layer_i in 
enumerate(present_key_values): + for present_layer_i in present_key_values: assert len(present_layer_i) == 4, f"Expected to have four items. Got {len(present_layer_i)}" ( present_key_self, diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_helper.py b/onnxruntime/python/tools/transformers/models/t5/t5_helper.py index c91c0da178e13..bc7c2e5cc4878 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_helper.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_helper.py @@ -8,7 +8,7 @@ import os import sys from pathlib import Path -from typing import Dict, List, Union +from typing import Collection, Dict, Union import torch from t5_decoder import T5Decoder, T5DecoderHelper, T5DecoderInit @@ -19,9 +19,9 @@ from onnxruntime import InferenceSession sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from float16 import float_to_float16_max_diff -from onnx_model import OnnxModel -from optimizer import optimize_model +from float16 import float_to_float16_max_diff # noqa: E402 +from onnx_model import OnnxModel # noqa: E402 +from optimizer import optimize_model # noqa: E402 logger = logging.getLogger(__name__) @@ -150,26 +150,31 @@ def export_onnx( @staticmethod def auto_mixed_precision( onnx_model: OnnxModel, - op_block_list: List[str] = [ - "Pow", - "ReduceMean", - "Add", - "Sqrt", - "Div", - "Mul", - "Softmax", - "Relu", - ], + op_block_list: Collection[str] = frozenset( + ( + "Pow", + "ReduceMean", + "Add", + "Sqrt", + "Div", + "Mul", + "Softmax", + "Relu", + ) + ), ): """Convert model to mixed precision. - It detects whether original model has fp16 precision weights, and set parameters for float16 conversion automatically. + + It detects whether original model has fp16 precision weights, + and set parameters for float16 conversion automatically. + Args: - onnx_model (OnnxModel): optimized ONNX model - op_block_list (List[str], optional): . 
Defaults to ["Pow", "ReduceMean", "Add", "Sqrt", "Div", "Mul", "Softmax", "Relu"] + onnx_model: Optimized ONNX model + op_block_list: Defaults to {"Pow", "ReduceMean", "Add", "Sqrt", "Div", "Mul", "Softmax", "Relu"} Returns: - parameters(dict): a dictionary of parameters used in float16 conversion + parameters: A dictionary of parameters used in float16 conversion """ - op_full_set = set([node.op_type for node in onnx_model.nodes()]) + op_full_set = set(node.op_type for node in onnx_model.nodes()) fp32_op_set = set(op_block_list) fp16_op_set = op_full_set.difference(fp32_op_set) logger.info(f"fp32 op: {fp32_op_set} fp16 op: {fp16_op_set}") diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index c4dda99496ebe..edb10a17832ba 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -19,7 +19,7 @@ from transformers import AutoConfig, AutoTokenizer, LxmertConfig, TransfoXLConfig sys.path.append(os.path.join(os.path.dirname(__file__), "models", "gpt2")) -from gpt2_helper import PRETRAINED_GPT2_MODELS, GPT2ModelNoPastState, TFGPT2ModelNoPastState +from gpt2_helper import PRETRAINED_GPT2_MODELS, GPT2ModelNoPastState, TFGPT2ModelNoPastState # noqa: E402 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" @@ -172,7 +172,7 @@ def get_onnx_file_path( filename = f"{normalized_model_name}_{input_count}_{precision}_{device}" if optimized_by_onnxruntime: - filename += f"_ort" + filename += "_ort" directory = onnx_dir # ONNXRuntime will not write external data so the raw and optimized models shall be in same directory. @@ -268,7 +268,7 @@ def optimize_onnx_model( def modelclass_dispatcher(model_name, custom_model_class): - if custom_model_class != None: + if custom_model_class is not None: if custom_model_class in MODEL_CLASSES: return custom_model_class else: @@ -279,11 +279,11 @@ def modelclass_dispatcher(model_name, custom_model_class): import re - if re.search("-squad$", model_name) != None: + if re.search("-squad$", model_name) is not None: return "AutoModelForQuestionAnswering" - elif re.search("-mprc$", model_name) != None: + elif re.search("-mprc$", model_name) is not None: return "AutoModelForSequenceClassification" - elif re.search("gpt2", model_name) != None: + elif re.search("gpt2", model_name) is not None: return "AutoModelWithLMHead" return "AutoModel" @@ -600,7 +600,7 @@ def export_onnx_model_from_tf( # Use no past state for these models if config.use_cache: config.use_cache = False - except: + except Exception: pass example_outputs = model(example_inputs, training=False) diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py index dd58f82171a70..37145c4632189 100644 --- a/onnxruntime/python/tools/transformers/onnx_model.py +++ b/onnxruntime/python/tools/transformers/onnx_model.py @@ -31,7 +31,8 @@ def initialize(self, model): def disable_shape_inference(self): self.enable_shape_infer = False - def infer_runtime_shape(self, dynamic_axis_mapping={}, update=False): + def infer_runtime_shape(self, dynamic_axis_mapping=None, update=False): + dynamic_axis_mapping = dynamic_axis_mapping or {} if self.enable_shape_infer: if self.shape_infer_helper is None or update: self.shape_infer_helper = SymbolicShapeInferenceHelper(self.model) @@ -39,7 +40,7 @@ def infer_runtime_shape(self, dynamic_axis_mapping={}, update=False): try: if self.shape_infer_helper.infer(dynamic_axis_mapping): return 
self.shape_infer_helper - except: # noqa + except Exception: self.enable_shape_infer = False # disable shape inference to suppress same error message. print("failed in shape inference", sys.exc_info()[0]) @@ -241,7 +242,7 @@ def get_parent(self, node, i, output_name_to_node=None): return output_name_to_node[input] - def match_first_parent(self, node, parent_op_type, output_name_to_node, exclude=[]): + def match_first_parent(self, node, parent_op_type, output_name_to_node, exclude=()): """ Find parent node based on constraints on op_type. @@ -270,7 +271,7 @@ def match_parent( parent_op_type, input_index=None, output_name_to_node=None, - exclude=[], + exclude=None, return_indice=None, ): """ @@ -289,6 +290,7 @@ def match_parent( Returns: parent: The matched parent node. """ + exclude = exclude or [] assert node is not None assert input_index is None or input_index >= 0 diff --git a/onnxruntime/python/tools/transformers/onnx_model_bart.py b/onnxruntime/python/tools/transformers/onnx_model_bart.py index 33db231c52332..df5c841938b90 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_bart.py +++ b/onnxruntime/python/tools/transformers/onnx_model_bart.py @@ -97,7 +97,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): return other_inputs = [] - for i, input in enumerate(normalize_node.input): + for input in normalize_node.input: if input not in output_name_to_node: continue if input == qkv_nodes[0].output[0]: diff --git a/onnxruntime/python/tools/transformers/onnx_model_bert.py b/onnxruntime/python/tools/transformers/onnx_model_bert.py index 81c83d222529f..bb3218a5c0af0 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_bert.py +++ b/onnxruntime/python/tools/transformers/onnx_model_bert.py @@ -33,7 +33,7 @@ class BertOptimizationOptions(FusionOptions): """This class is deprecated""" def __init__(self, model_type): - logger.warning(f"BertOptimizationOptions is depreciated. Please use FusionOptions instead.") + logger.warning("BertOptimizationOptions is depreciated. Please use FusionOptions instead.") super().__init__(model_type) @@ -235,7 +235,6 @@ def use_dynamic_axes(self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_ casted=True ) + self.get_graph_inputs_from_fused_nodes(casted=False) - dynamic_batch_inputs = {} for input in self.model.graph.input: if input.name in bert_graph_inputs: dim_proto = input.type.tensor_type.shape.dim[0] diff --git a/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py b/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py index 33bb1d66a7528..3d88b5a775bc3 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py +++ b/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py @@ -3,14 +3,10 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- -import argparse import logging -import sys -from collections import deque -import numpy as np import onnx -from onnx import ModelProto, TensorProto, numpy_helper +from onnx import numpy_helper from onnx_model_bert_tf import BertOnnxModelTF logger = logging.getLogger(__name__) @@ -61,7 +57,7 @@ def check_attention_input(self, matmul_q, matmul_k, matmul_v, parent, output_nam return True, reshape_nodes def fuse_attention(self): - input_name_to_nodes = self.input_name_to_nodes() + self.input_name_to_nodes() output_name_to_node = self.output_name_to_node() nodes_to_remove = [] @@ -227,11 +223,8 @@ def preprocess(self): self.skip_reshape() def skip_reshape(self): - input_name_to_nodes = self.input_name_to_nodes() - output_name_to_node = self.output_name_to_node() - - nodes_to_remove = [] - attention_count = 0 + self.input_name_to_nodes() + self.output_name_to_node() count = 0 reshape_nodes = self.get_nodes_by_op_type("Reshape") diff --git a/onnxruntime/python/tools/transformers/onnx_model_bert_tf.py b/onnxruntime/python/tools/transformers/onnx_model_bert_tf.py index 7455777273846..6d0c7cf055d27 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_bert_tf.py +++ b/onnxruntime/python/tools/transformers/onnx_model_bert_tf.py @@ -3,14 +3,11 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -import argparse import logging -import sys -from collections import deque import numpy as np import onnx -from onnx import ModelProto, TensorProto, helper, numpy_helper +from onnx import TensorProto, helper, numpy_helper from onnx_model_bert import BertOnnxModel logger = logging.getLogger(__name__) diff --git a/onnxruntime/python/tools/transformers/onnx_model_gpt2.py b/onnxruntime/python/tools/transformers/onnx_model_gpt2.py index 92197e7e4f09f..263857ffbc130 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_gpt2.py +++ b/onnxruntime/python/tools/transformers/onnx_model_gpt2.py @@ -31,7 +31,7 @@ def postprocess(self): """ Remove extra reshape nodes. 
""" - logger.debug(f"start postprocessing...") + logger.debug("start postprocessing...") input_name_to_nodes = self.input_name_to_nodes() output_name_to_node = self.output_name_to_node() @@ -42,7 +42,6 @@ def postprocess(self): gemm_node, "Reshape", input_name_to_nodes, recursive=False ) - return_indice = [] nodes = self.match_parent_path(gemm_node, ["Reshape", "FastGelu"], [0, 0], output_name_to_node) if nodes is None: nodes = self.match_parent_path( diff --git a/onnxruntime/python/tools/transformers/onnx_model_tnlr.py b/onnxruntime/python/tools/transformers/onnx_model_tnlr.py index 85e510a828990..9379f62db95b9 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_tnlr.py +++ b/onnxruntime/python/tools/transformers/onnx_model_tnlr.py @@ -5,6 +5,7 @@ import logging from typing import Union +import numpy as np from fusion_attention import AttentionMask, FusionAttention from fusion_utils import NumpyHelper from onnx import NodeProto, TensorProto, helper, numpy_helper @@ -118,12 +119,12 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): [1, 1, 1, 0, 0, 0], ) if qkv_nodes is not None: - (_, _, matmul_below, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes + (_, _, _, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes else: return other_inputs = [] - for i, input in enumerate(start_node.input): + for input in start_node.input: if input not in output_name_to_node: continue diff --git a/onnxruntime/python/tools/transformers/optimizer.py b/onnxruntime/python/tools/transformers/optimizer.py index 65de1bf770992..978f68c789cfa 100644 --- a/onnxruntime/python/tools/transformers/optimizer.py +++ b/onnxruntime/python/tools/transformers/optimizer.py @@ -64,7 +64,7 @@ def optimize_by_onnxruntime( use_gpu: bool = False, optimized_model_path: Optional[str] = None, opt_level: Optional[int] = 99, - disabled_optimizers=[], + disabled_optimizers=None, ) -> str: """ Use onnxruntime to optimize model. 
@@ -108,9 +108,9 @@ def optimize_by_onnxruntime( kwargs["disabled_optimizers"] = disabled_optimizers if not use_gpu: - session = onnxruntime.InferenceSession( - onnx_model_path, sess_options, providers=["CPUExecutionProvider"], **kwargs - ) + onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=["CPUExecutionProvider"], **kwargs) else: gpu_ep = [] @@ -120,7 +118,7 @@ def optimize_by_onnxruntime( gpu_ep.append("MIGraphXExecutionProvider") gpu_ep.append("ROCMExecutionProvider") - session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=gpu_ep, **kwargs) + onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=gpu_ep, **kwargs) assert not set(onnxruntime.get_available_providers()).isdisjoint( ["CUDAExecutionProvider", "ROCMExecutionProvider", "MIGraphXExecutionProvider"] ) diff --git a/onnxruntime/python/tools/transformers/profiler.py b/onnxruntime/python/tools/transformers/profiler.py index 9f41654af3533..047c1e007d2b1 100644 --- a/onnxruntime/python/tools/transformers/profiler.py +++ b/onnxruntime/python/tools/transformers/profiler.py @@ -256,7 +256,7 @@ def parse_kernel_results(sess_time, threshold=0): else: op_time[op_name] = duration - lines.append(f"\nGroup kernel time by operator:") + lines.append("\nGroup kernel time by operator:") lines.append("-" * 64) lines.append("Total(μs)\tTime%\tOperator") for op_name, duration in sorted(op_time.items(), key=lambda x: x[1], reverse=True): diff --git a/onnxruntime/python/tools/transformers/quantize_helper.py b/onnxruntime/python/tools/transformers/quantize_helper.py index d7e9eb9718a9e..eb30ba7dc3b31 100644 --- a/onnxruntime/python/tools/transformers/quantize_helper.py +++ b/onnxruntime/python/tools/transformers/quantize_helper.py @@ -7,7 +7,6 @@ import logging import os -import onnx import torch from transformers.modeling_utils import Conv1D diff --git a/onnxruntime/python/tools/transformers/shape_infer_helper.py b/onnxruntime/python/tools/transformers/shape_infer_helper.py index e877497ffb1cb..f8a5464d8af78 100644 --- a/onnxruntime/python/tools/transformers/shape_infer_helper.py +++ b/onnxruntime/python/tools/transformers/shape_infer_helper.py @@ -15,7 +15,7 @@ else: sys.path.append(os.path.join(file_path, "..")) -from symbolic_shape_infer import SymbolicShapeInference, get_shape_from_type_proto, sympy +from symbolic_shape_infer import SymbolicShapeInference, get_shape_from_type_proto, sympy # noqa: E402 logger = logging.getLogger(__name__) diff --git a/onnxruntime/python/tools/transformers/shape_optimizer.py b/onnxruntime/python/tools/transformers/shape_optimizer.py index 7174af0ac9ba0..0fc4f448a0b28 100644 --- a/onnxruntime/python/tools/transformers/shape_optimizer.py +++ b/onnxruntime/python/tools/transformers/shape_optimizer.py @@ -10,12 +10,9 @@ import argparse import logging import os -import re import sys import tempfile -from collections import deque from datetime import datetime -from pathlib import Path from typing import List import numpy as np @@ -73,7 +70,7 @@ def get_reshape_shape_inputs(self): """ Returns a list of shape input names of Reshape nodes. 
""" - output_name_to_node = self.output_name_to_node() + self.output_name_to_node() shape_inputs = [] for node in self.model.graph.node: diff --git a/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py b/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py index 119455684cea1..997461befd198 100644 --- a/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py +++ b/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py @@ -4,9 +4,9 @@ # -------------------------------------------------------------------------- import torch +from packaging.version import Version TrainingMode = torch.onnx.TrainingMode -from packaging.version import Version def torch_onnx_export( diff --git a/onnxruntime/test/onnx/gen_test_models.py b/onnxruntime/test/onnx/gen_test_models.py index 509c27ec4efea..e83da7bcb7b44 100644 --- a/onnxruntime/test/onnx/gen_test_models.py +++ b/onnxruntime/test/onnx/gen_test_models.py @@ -7,7 +7,7 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, TensorProto, helper, numpy_helper, utils +from onnx import TensorProto, helper, numpy_helper def parse_arguments(): diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py b/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py index 235b4111bbcb0..4cefe1439fcbc 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py +++ b/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import os import numpy as np diff --git a/onnxruntime/test/providers/cpu/rnn/GRU.py b/onnxruntime/test/providers/cpu/rnn/GRU.py index 3fee29e9928f0..bb016ee540a65 100644 --- a/onnxruntime/test/providers/cpu/rnn/GRU.py +++ b/onnxruntime/test/providers/cpu/rnn/GRU.py @@ -29,7 +29,6 @@ def __init__(self, **params): assert i in params, "Missing Required Input: {0}".format(i) num_directions = params["W"].shape[0] - sequence_length = params["X"].shape[0] hidden_size = params["R"].shape[-1] batch_size = params["X"].shape[1] @@ -138,7 +137,6 @@ def execute(self): # print_with_shape("r_br", r_br) # print_with_shape("r_bh", r_bh) - seq_len = self.X.shape[0] num_directions = 1 hidden_size = self.R.shape[-1] batch_size = self.X.shape[1] @@ -249,8 +247,6 @@ def ReverseDefaultActivationsSimpleWeightsNoBiasTwoRows(): print(GRU_ONNXRuntimeUnitTests.ReverseDefaultActivationsSimpleWeightsNoBiasTwoRows.__name__) - seq_length = 2 - batch_size = 2 input_size = 1 hidden_size = 3 input = np.array([[[1.0], [2.0]], [[10.0], [11.0]]]).astype(np.float32) @@ -273,8 +269,6 @@ def BidirectionalDefaultActivationsSimpleWeightsNoBias(linear_before_reset=0): + str(linear_before_reset) ) - seq_length = 2 - batch_size = 3 if linear_before_reset else 2 input_size = 1 hidden_size = 3 diff --git a/onnxruntime/test/providers/cpu/rnn/LSTM.py b/onnxruntime/test/providers/cpu/rnn/LSTM.py index 039a419552586..11402936752f2 100644 --- a/onnxruntime/test/providers/cpu/rnn/LSTM.py +++ b/onnxruntime/test/providers/cpu/rnn/LSTM.py @@ -3,8 +3,6 @@ from __future__ import absolute_import, division, print_function, unicode_literals -from typing import Any, Tuple - import numpy as np # type: ignore # import onnx @@ -42,7 +40,7 @@ def __init__(self, **params): # type: (*Any) -> None R = params["R"] num_directions = W.shape[0] - sequence_length = X.shape[0] + X.shape[0] batch_size = X.shape[1] hidden_size = R.shape[-1] @@ -256,8 +254,6 @@ 
def SimpleWeightsNoBiasTwoRows(direction): # type: () -> None print(LSTM.SimpleWeightsNoBiasTwoRows.__name__ + " direction=" + direction) - seq_length = 2 - batch_size = 2 input_size = 1 hidden_size = 3 number_of_gates = 4 @@ -395,7 +391,7 @@ def export_peepholes(): # type: () -> None W = weight_scale * np.ones((1, number_of_gates * hidden_size, input_size)).astype(np.float32) R = weight_scale * np.ones((1, number_of_gates * hidden_size, hidden_size)).astype(np.float32) B = np.zeros((1, 2 * number_of_gates * hidden_size)).astype(np.float32) - seq_lens = np.repeat(input.shape[0], input.shape[1]).astype(np.int32) + np.repeat(input.shape[0], input.shape[1]).astype(np.int32) init_h = np.zeros((1, input.shape[1], hidden_size)).astype(np.float32) init_c = np.zeros((1, input.shape[1], hidden_size)).astype(np.float32) P = weight_scale * np.ones((1, number_of_peepholes * hidden_size)).astype(np.float32) diff --git a/onnxruntime/test/python/onnxruntime_test_ort_trainer.py b/onnxruntime/test/python/onnxruntime_test_ort_trainer.py index 30e2dc62e16da..f60e80d60d47c 100644 --- a/onnxruntime/test/python/onnxruntime_test_ort_trainer.py +++ b/onnxruntime/test/python/onnxruntime_test_ort_trainer.py @@ -3,17 +3,15 @@ import copy import os -import sys import unittest import numpy as np import onnx -import pytest import torch import torch.nn as nn import torch.nn.functional as F from helper import get_name -from numpy.testing import assert_allclose, assert_array_equal +from numpy.testing import assert_allclose from torchvision import datasets, transforms import onnxruntime @@ -603,7 +601,6 @@ def testMNISTStateDict(self): assert state_dict == {} learningRate = 0.02 - epoch = 0 data, target = next(iter(train_loader)) data, target = data.to(device), target.to(device) @@ -636,7 +633,6 @@ def testMNISTSaveAsONNX(self): assert not os.path.exists(onnx_file_name) learningRate = 0.02 - epoch = 0 data, target = next(iter(train_loader)) data, target = data.to(device), target.to(device) @@ -659,7 +655,6 @@ def testMNISTDevice(self): model.to(model_device) trainer = mnist.get_trainer(model, model_desc, device) learningRate = 0.02 - epoch = 0 data, target = next(iter(train_loader)) data, target = data.to(device), target.to(device) @@ -677,7 +672,6 @@ def testMNISTInitializerNames(self): trainer = mnist.get_trainer(model, model_desc, device) learningRate = 0.02 - epoch = 0 data, target = next(iter(train_loader)) data, target = data.to(device), target.to(device) @@ -708,7 +702,6 @@ def get_lr_this_step(global_step): internal_loss_fn=True, get_lr_this_step=get_lr_this_step, ) - epoch = 0 data, target = next(iter(train_loader)) data, target = data.to(device), target.to(device) @@ -731,7 +724,6 @@ def testMNISTFrozenWeight(self): trainer = mnist.get_trainer(model, model_desc, device, frozen_weights=["fc1.weight"]) learningRate = 0.02 - epoch = 0 data, target = next(iter(train_loader)) data, target = data.to(device), target.to(device) @@ -759,7 +751,6 @@ def testMNISTTorchBuffer(self): trainer = mnist.get_trainer(model, model_desc, device) learningRate = 0.02 - epoch = 0 data, target = next(iter(train_loader)) data, target = data.to(device), target.to(device) @@ -789,7 +780,6 @@ def testMNISTFrozenWeightCheckpoint(self): trainer = mnist.get_trainer(model, model_desc, device, frozen_weights=["fc1.weight"]) learningRate = 0.02 - epoch = 0 # do one train step data, target = next(iter(train_loader)) @@ -835,7 +825,6 @@ def testMNISTTrainingCheckpoint(self): ) learningRate = 0.02 - epoch = 0 # do 5 train step for i in range(5): diff 
--git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index 8232044a29a59..762912b36752b 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -170,7 +170,6 @@ def testSetProvidersWithOptions(self): if "CUDAExecutionProvider" in onnxrt.get_available_providers(): import ctypes - import sys CUDA_SUCCESS = 0 @@ -746,7 +745,7 @@ def testRaiseWrongNumInputs(self): with self.assertRaises(ValueError) as context: sess = onnxrt.InferenceSession(get_name("logicaland.onnx"), providers=onnxrt.get_available_providers()) a = np.array([[True, True], [False, False]], dtype=bool) - res = sess.run([], {"input:0": a}) + sess.run([], {"input:0": a}) self.assertTrue("Model requires 2 inputs" in str(context.exception)) @@ -817,7 +816,7 @@ def testGraphOptimizationLevel(self): a = np.array([[True, True], [False, False]], dtype=bool) b = np.array([[True, False], [True, False]], dtype=bool) - res = sess.run([], {"input1:0": a, "input:0": b}) + sess.run([], {"input1:0": a, "input:0": b}) def testSequenceLength(self): sess = onnxrt.InferenceSession(get_name("sequence_length.onnx"), providers=available_providers_without_tvm) @@ -1067,14 +1066,14 @@ def testRegisterCustomOpsLibrary(self): so2 = so1 # Model loading successfully indicates that the custom op node could be resolved successfully - sess2 = onnxrt.InferenceSession( + onnxrt.InferenceSession( custom_op_model, sess_options=so2, providers=available_providers_without_tvm_and_tensorrt ) # Create another SessionOptions instance with the same shared library referenced so3 = onnxrt.SessionOptions() so3.register_custom_ops_library(shared_library) - sess3 = onnxrt.InferenceSession( + onnxrt.InferenceSession( custom_op_model, sess_options=so3, providers=available_providers_without_tvm_and_tensorrt ) @@ -1242,7 +1241,7 @@ def testSparseTensorCsrFormat(self): def testRunModelWithCudaCopyStream(self): available_providers = onnxrt.get_available_providers() - if not "CUDAExecutionProvider" in available_providers: + if "CUDAExecutionProvider" not in available_providers: print("Skipping testRunModelWithCudaCopyStream when CUDA is not available") else: # adapted from issue #4829 for a race condition when copy is not on default stream @@ -1259,7 +1258,7 @@ def testRunModelWithCudaCopyStream(self): session = onnxrt.InferenceSession(get_name("issue4829.onnx"), providers=providers) shape = np.array([2, 2], dtype=np.int64) for iteration in range(100000): - result = session.run(output_names=["output"], input_feed={"shape": shape}) + session.run(output_names=["output"], input_feed={"shape": shape}) def testSharedAllocatorUsingCreateAndRegisterAllocator(self): # Create and register an arena based allocator diff --git a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py index b71b3a07cd41f..eecc31edb78d1 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py @@ -1,11 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-import gc -import os -import sys -import threading -import time # -*- coding: UTF-8 -*- import unittest @@ -14,7 +9,6 @@ from helper import get_name import onnxruntime as onnxrt -from onnxruntime.capi.onnxruntime_pybind11_state import Fail class TestInferenceSessionWithCudaGraph(unittest.TestCase): diff --git a/onnxruntime/test/python/onnxruntime_test_python_iobinding.py b/onnxruntime/test/python/onnxruntime_test_python_iobinding.py index ff1c0d17fd3ec..03eb64e88af90 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_iobinding.py +++ b/onnxruntime/test/python/onnxruntime_test_python_iobinding.py @@ -43,7 +43,7 @@ def create_expected_output_alternate(self): return np.array([[2.0, 8.0], [18.0, 32.0], [50.0, 72.0]], dtype=np.float32) def test_bind_input_to_cpu_arr(self): - input = self.create_numpy_input() + self.create_numpy_input() session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) io_binding = session.io_binding() diff --git a/onnxruntime/test/python/onnxruntime_test_python_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_mlops.py index b6604a6d51e8a..217361f7a880f 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_mlops.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import os # -*- coding: UTF-8 -*- import unittest diff --git a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py index fed6892f13f4e..8c75ce331a4de 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py +++ b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py @@ -5,7 +5,7 @@ # -*- coding: UTF-8 -*- import onnx -from onnx import AttributeProto, GraphProto, TensorProto, helper +from onnx import TensorProto, helper if os.path.exists( os.path.join( diff --git a/onnxruntime/test/python/quantization/test_conv_dynamic.py b/onnxruntime/test/python/quantization/test_conv_dynamic.py index 045bddccbfbb2..08f329cdf3735 100644 --- a/onnxruntime/test/python/quantization/test_conv_dynamic.py +++ b/onnxruntime/test/python/quantization/test_conv_dynamic.py @@ -11,13 +11,7 @@ import numpy as np import onnx from onnx import TensorProto, helper, numpy_helper -from op_test_utils import ( - TestDataFeeds, - check_model_correctness, - check_op_type_count, - check_op_type_order, - check_qtype_by_node_type, -) +from op_test_utils import check_model_correctness, check_op_type_count, check_qtype_by_node_type from onnxruntime.quantization import DynamicQuantConfig, QuantType, quantize, quantize_dynamic diff --git a/onnxruntime/test/python/quantization/test_onnx_model.py b/onnxruntime/test/python/quantization/test_onnx_model.py index fc29810e9b97d..d2c34f159a97c 100644 --- a/onnxruntime/test/python/quantization/test_onnx_model.py +++ b/onnxruntime/test/python/quantization/test_onnx_model.py @@ -11,9 +11,8 @@ import numpy as np import onnx from onnx import TensorProto, helper, numpy_helper -from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_type_order +from op_test_utils import check_op_type_order -import onnxruntime from onnxruntime.quantization.onnx_model import ONNXModel diff --git a/onnxruntime/test/python/quantization/test_op_gavgpool.py b/onnxruntime/test/python/quantization/test_op_gavgpool.py index a34c52f912ced..eea115b6f6847 100644 --- 
a/onnxruntime/test/python/quantization/test_op_gavgpool.py +++ b/onnxruntime/test/python/quantization/test_op_gavgpool.py @@ -13,7 +13,7 @@ from onnx import TensorProto, helper from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type -from onnxruntime.quantization import QuantFormat, QuantType, quantize_dynamic, quantize_static +from onnxruntime.quantization import QuantFormat, QuantType, quantize_static class TestOpGlobalAveragePool(unittest.TestCase): diff --git a/onnxruntime/test/python/quantization/test_op_pooling.py b/onnxruntime/test/python/quantization/test_op_pooling.py index b0561bd79f8e1..f18ae564c4730 100644 --- a/onnxruntime/test/python/quantization/test_op_pooling.py +++ b/onnxruntime/test/python/quantization/test_op_pooling.py @@ -11,13 +11,7 @@ import numpy as np import onnx from onnx import TensorProto, helper -from op_test_utils import ( - TestDataFeeds, - check_model_correctness, - check_op_nodes, - check_op_type_count, - check_qtype_by_node_type, -) +from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type from onnxruntime.quantization import QuantFormat, QuantType, quantize_static diff --git a/onnxruntime/test/python/quantization/test_symmetric_flag.py b/onnxruntime/test/python/quantization/test_symmetric_flag.py index 26f7ba6ce59b3..c3aa94db99685 100644 --- a/onnxruntime/test/python/quantization/test_symmetric_flag.py +++ b/onnxruntime/test/python/quantization/test_symmetric_flag.py @@ -48,7 +48,7 @@ def perform_quantization(self, activations, weight, act_sym, wgt_sym): # One-layer convolution model act = helper.make_tensor_value_info("ACT", TensorProto.FLOAT, activations[0].shape) - wgt = helper.make_tensor_value_info("WGT", TensorProto.FLOAT, weight.shape) + helper.make_tensor_value_info("WGT", TensorProto.FLOAT, weight.shape) res = helper.make_tensor_value_info("RES", TensorProto.FLOAT, [None, None, None, None]) wgt_init = numpy_helper.from_array(weight, "WGT") conv_node = onnx.helper.make_node("Conv", ["ACT", "WGT"], ["RES"]) diff --git a/onnxruntime/test/python/transformers/gpt2_model_generator.py b/onnxruntime/test/python/transformers/gpt2_model_generator.py index 2fe24739fcffb..aabe4ae391bb6 100644 --- a/onnxruntime/test/python/transformers/gpt2_model_generator.py +++ b/onnxruntime/test/python/transformers/gpt2_model_generator.py @@ -5,7 +5,6 @@ # -------------------------------------------------------------------------- import math -from typing import List import numpy import onnx @@ -494,9 +493,6 @@ def create_gpt2_attention(hidden_size=64, num_heads=4, max_seq_len=32, switch_ad ) initializers.append(helper.make_tensor("axes_1", TensorProto.INT64, [1], [1])) - batch_size = 1 - sequence_length = 3 - past_sequence_length = 2 graph = helper.make_graph( [node for node in nodes if node], "GPT2", # name diff --git a/onnxruntime/test/python/transformers/model_loader.py b/onnxruntime/test/python/transformers/model_loader.py index 126df89240c70..2d871123ec8bb 100644 --- a/onnxruntime/test/python/transformers/model_loader.py +++ b/onnxruntime/test/python/transformers/model_loader.py @@ -5,7 +5,6 @@ # -------------------------------------------------------------------------- import os -import unittest from onnx import ModelProto, TensorProto, external_data_helper, load_model, numpy_helper from parity_utilities import find_transformers_source diff --git a/onnxruntime/test/python/transformers/parity_utilities.py 
b/onnxruntime/test/python/transformers/parity_utilities.py index f77c5a41b2fa3..4dee20e4fc140 100644 --- a/onnxruntime/test/python/transformers/parity_utilities.py +++ b/onnxruntime/test/python/transformers/parity_utilities.py @@ -48,7 +48,7 @@ def export_onnx(model, onnx_model_path, float16, hidden_size, device): input_hidden_states = create_inputs(hidden_size=hidden_size, float16=float16, device=device) with torch.no_grad(): - outputs = model(input_hidden_states) + model(input_hidden_states) dynamic_axes = { "input": {0: "batch_size", 1: "seq_len"}, @@ -132,7 +132,6 @@ def compare_outputs(torch_outputs, ort_outputs, atol=1e-06, verbose=True): def create_ort_session(onnx_model_path, use_gpu=True): from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions - from onnxruntime import __version__ as onnxruntime_version sess_options = SessionOptions() sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL diff --git a/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py b/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py index 47145fc213a0d..0ffd70d01815d 100644 --- a/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py +++ b/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py @@ -26,7 +26,6 @@ import argparse import os import random -import sys import timeit from pathlib import Path @@ -302,7 +301,6 @@ def use_dynamic_axes(self, dynamic_batch_dim="batch_size", seq_len=7): """ Update input and output shape to use dynamic axes. 
""" - dynamic_batch_inputs = {} for input in self.model.graph.input: dim_proto = input.type.tensor_type.shape.dim[0] dim_proto.dim_param = dynamic_batch_dim @@ -355,7 +353,7 @@ def generate_test_data( sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL sess = onnxruntime.InferenceSession(onnx_file, sess_options, providers=["CPUExecutionProvider"]) - input1_name = sess.get_inputs()[0].name + sess.get_inputs()[0].name output_names = [output.name for output in sess.get_outputs()] inputs = { "input_ids": input_1, diff --git a/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py b/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py index 7f613a8674989..afa683751cf96 100644 --- a/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py +++ b/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py @@ -7,9 +7,6 @@ import argparse import os -import random -import sys -import timeit from pathlib import Path import numpy as np diff --git a/onnxruntime/test/python/transformers/test_optimizer.py b/onnxruntime/test/python/transformers/test_optimizer.py index 2a02ce277fb67..36def7d183925 100644 --- a/onnxruntime/test/python/transformers/test_optimizer.py +++ b/onnxruntime/test/python/transformers/test_optimizer.py @@ -310,7 +310,7 @@ def test_huggingface_bart_fusion(self): class TestTensorflowModelOptimization(unittest.TestCase): def Setup(self): try: - import tf2onnx + pass except ImportError: self.skipTest("skip TestBertOptimizationTF since tf2onnx not installed") diff --git a/onnxruntime/test/python/transformers/test_parity_decoder_attention.py b/onnxruntime/test/python/transformers/test_parity_decoder_attention.py index 6c05e321f7618..710013e4e8bab 100644 --- a/onnxruntime/test/python/transformers/test_parity_decoder_attention.py +++ b/onnxruntime/test/python/transformers/test_parity_decoder_attention.py @@ -10,9 +10,7 @@ # license information. # ------------------------------------------------------------------------- -import math -import os -from typing import Dict, List, Optional, Tuple +from typing import List, Optional, Tuple import numpy import torch diff --git a/onnxruntime/test/python/transformers/test_parity_gelu.py b/onnxruntime/test/python/transformers/test_parity_gelu.py index 7fe42dc76f193..791d2702a1959 100644 --- a/onnxruntime/test/python/transformers/test_parity_gelu.py +++ b/onnxruntime/test/python/transformers/test_parity_gelu.py @@ -27,8 +27,8 @@ import os import unittest +import parity_utilities import torch -from parity_utilities import * from torch import nn @@ -36,6 +36,7 @@ class Gelu(nn.Module): def __init__(self, formula=4, fp32_gelu_op=False): super().__init__() self.formula = formula + # FIXME(justinchuby): fp32_gelu_op is always True self.fp32_gelu_op = True def gelu(self, x): @@ -97,12 +98,12 @@ def run( # Do not re-use onnx file from previous test since weights of model are random. 
onnx_model_path = "./temp/gelu_{}_{}.onnx".format(formula, "fp16" if float16 else "fp32") - export_onnx(model, onnx_model_path, float16, hidden_size, device) + parity_utilities.export_onnx(model, onnx_model_path, float16, hidden_size, device) if optimized: optimized_onnx_path = "./temp/gelu_{}_opt_{}.onnx".format(formula, "fp16" if float16 else "fp32") use_gpu = float16 and not fp32_gelu_op - optimize_onnx( + parity_utilities.optimize_onnx( onnx_model_path, optimized_onnx_path, Gelu.get_fused_op(formula), @@ -113,7 +114,7 @@ def run( else: onnx_path = onnx_model_path - num_failure = run_parity( + num_failure = parity_utilities.run_parity( model, onnx_path, batch_size, @@ -217,9 +218,7 @@ def test_cpu(self): def test_cuda(self): if not torch.cuda.is_available(): - import pytest - - pytest.skip("test requires GPU and torch+cuda") + self.skipTest("test requires GPU and torch+cuda") else: gpu = torch.device("cuda") for i in self.formula_to_test: diff --git a/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py b/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py index c29cf969734c4..a3b7efc08fd82 100644 --- a/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py +++ b/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py @@ -19,7 +19,7 @@ import pytest import torch from onnx import helper -from parity_utilities import compare_outputs, create_ort_session, diff_outputs +from parity_utilities import compare_outputs, create_ort_session from torch import nn from transformers.modeling_utils import Conv1D @@ -219,7 +219,7 @@ def export_onnx(model, onnx_model_path, float16, hidden_size, num_attention_head ) with torch.no_grad(): - outputs = model(input_hidden_states, attention_mask=attention_mask, layer_past=layer_past) + model(input_hidden_states, attention_mask=attention_mask, layer_past=layer_past) dynamic_axes = { "input_hidden_states": {0: "batch_size", 1: "seq_len"}, diff --git a/onnxruntime/test/python/transformers/test_parity_layernorm.py b/onnxruntime/test/python/transformers/test_parity_layernorm.py index 01122b4830bfa..d0ddfe23fac73 100644 --- a/onnxruntime/test/python/transformers/test_parity_layernorm.py +++ b/onnxruntime/test/python/transformers/test_parity_layernorm.py @@ -8,11 +8,11 @@ import unittest import onnx +import parity_utilities import torch -from parity_utilities import * from torch import nn -if find_transformers_source(): +if parity_utilities.find_transformers_source(): from onnx_model import OnnxModel else: from onnxruntime.transformers.onnx_model import OnnxModel @@ -150,12 +150,12 @@ def run( # Do not re-use onnx file from previous test since weights of model are random. 
onnx_model_path = "./temp/layer_norm_{}_formula{}.onnx".format("fp16" if float16 else "fp32", formula) - export_onnx(model, onnx_model_path, float16, hidden_size, device) + parity_utilities.export_onnx(model, onnx_model_path, float16, hidden_size, device) if optimized: optimized_onnx_path = "./temp/layer_norm_{}_formula{}_opt.onnx".format("fp16" if float16 else "fp32", formula) if (not float16) or cast_fp16: - optimize_onnx( + parity_utilities.optimize_onnx( onnx_model_path, optimized_onnx_path, expected_op=LayerNorm.get_fused_op(), @@ -170,7 +170,7 @@ def run( else: onnx_path = onnx_model_path - num_failure = run_parity( + num_failure = parity_utilities.run_parity( model, onnx_path, batch_size, @@ -295,9 +295,7 @@ def test_cpu(self): def test_cuda(self): if not torch.cuda.is_available(): - import pytest - - pytest.skip("test requires GPU and torch+cuda") + self.skipTest("test requires GPU and torch+cuda") else: gpu = torch.device("cuda") self.run_one(self.optimized, gpu, hidden_size=self.hidden_size, run_extra_tests=True) diff --git a/onnxruntime/test/testdata/dynamic_quantize_matmul_test.py b/onnxruntime/test/testdata/dynamic_quantize_matmul_test.py index d681723810e65..136db2949b101 100644 --- a/onnxruntime/test/testdata/dynamic_quantize_matmul_test.py +++ b/onnxruntime/test/testdata/dynamic_quantize_matmul_test.py @@ -1,5 +1,3 @@ -from enum import Enum - import onnx from onnx import TensorProto, helper diff --git a/onnxruntime/test/testdata/ep_partitioning_tests.py b/onnxruntime/test/testdata/ep_partitioning_tests.py index a85b9bda6c187..367cafb795bad 100644 --- a/onnxruntime/test/testdata/ep_partitioning_tests.py +++ b/onnxruntime/test/testdata/ep_partitioning_tests.py @@ -1,4 +1,3 @@ -import numpy as np import onnx from onnx import TensorProto, helper diff --git a/onnxruntime/test/testdata/matmul_integer_to_float.py b/onnxruntime/test/testdata/matmul_integer_to_float.py index 6b126fb3a2a1f..4b1dc90f6f468 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float.py +++ b/onnxruntime/test/testdata/matmul_integer_to_float.py @@ -1,5 +1,3 @@ -from enum import Enum - import onnx from onnx import TensorProto, helper diff --git a/onnxruntime/test/testdata/model_with_external_initializers.py b/onnxruntime/test/testdata/model_with_external_initializers.py index 8b591549963fd..0413efc7e1017 100644 --- a/onnxruntime/test/testdata/model_with_external_initializers.py +++ b/onnxruntime/test/testdata/model_with_external_initializers.py @@ -1,14 +1,17 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
+import os +from typing import Any, List + import numpy as np import onnx -from onnx import TensorProto, helper +from onnx import helper from onnx.external_data_helper import set_external_data from onnx.numpy_helper import from_array -def create_external_data_tensor(value, tensor_name): # type: (List[Any], Text) -> TensorProto +def create_external_data_tensor(value, tensor_name): # type: (List[Any], str) -> onnx.TensorProto tensor = from_array(np.array(value)) tensor.name = tensor_name tensor_filename = "{}.bin".format(tensor_name) @@ -23,13 +26,13 @@ def create_external_data_tensor(value, tensor_name): # type: (List[Any], Text) def GenerateModel(model_name): # Create one input (ValueInfoProto) - X = helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 2]) + X = helper.make_tensor_value_info("X", onnx.TensorProto.FLOAT, [1, 2]) # Create second input (ValueInfoProto) - Pads = helper.make_tensor_value_info("Pads", TensorProto.INT64, [4]) + Pads = helper.make_tensor_value_info("Pads", onnx.TensorProto.INT64, [4]) # Create one output (ValueInfoProto) - Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 4]) + Y = helper.make_tensor_value_info("Y", onnx.TensorProto.FLOAT, [1, 4]) # Create a node (NodeProto) node_def = helper.make_node( diff --git a/onnxruntime/test/testdata/sparse_initializer_as_output.py b/onnxruntime/test/testdata/sparse_initializer_as_output.py index 741ed6439e815..1c5cc8f783340 100644 --- a/onnxruntime/test/testdata/sparse_initializer_as_output.py +++ b/onnxruntime/test/testdata/sparse_initializer_as_output.py @@ -1,22 +1,10 @@ import argparse -import os import sys import traceback -from typing import Any, Callable, Dict, List, Optional, Sequence, Text, Tuple, TypeVar, Union, cast import numpy as np import onnx -from onnx import ( - AttributeProto, - GraphProto, - SparseTensorProto, - TensorProto, - ValueInfoProto, - helper, - mapping, - numpy_helper, - utils, -) +from onnx import TensorProto, ValueInfoProto, helper from onnx.helper import make_opsetid diff --git a/onnxruntime/test/testdata/sparse_to_dense_matmul.py b/onnxruntime/test/testdata/sparse_to_dense_matmul.py index 26fb426968c39..f5c9fb347baef 100644 --- a/onnxruntime/test/testdata/sparse_to_dense_matmul.py +++ b/onnxruntime/test/testdata/sparse_to_dense_matmul.py @@ -1,22 +1,9 @@ import argparse -import os import sys import traceback -from typing import Any, Callable, Dict, List, Optional, Sequence, Text, Tuple, TypeVar, Union, cast -import numpy as np import onnx -from onnx import ( - AttributeProto, - GraphProto, - SparseTensorProto, - TensorProto, - ValueInfoProto, - helper, - mapping, - numpy_helper, - utils, -) +from onnx import TensorProto, ValueInfoProto, helper from onnx.helper import make_opsetid diff --git a/onnxruntime/test/testdata/test_data_generation/lr_scheduler/lr_scheduler_test_data_generator.py b/onnxruntime/test/testdata/test_data_generation/lr_scheduler/lr_scheduler_test_data_generator.py index d54b15c276e28..7df02979f2bd9 100644 --- a/onnxruntime/test/testdata/test_data_generation/lr_scheduler/lr_scheduler_test_data_generator.py +++ b/onnxruntime/test/testdata/test_data_generation/lr_scheduler/lr_scheduler_test_data_generator.py @@ -42,7 +42,6 @@ def lr_lambda(self, step): def main(): """Main entry.""" num_training_steps = 100 - seed = 8888 device = "cuda" batch_size, dimension_in, dimension_hidden = 2, 2, 3 diff --git a/onnxruntime/test/testdata/test_data_generation/sgd_test/sgd_test_data_generator.py 
b/onnxruntime/test/testdata/test_data_generation/sgd_test/sgd_test_data_generator.py index ac064963b5e43..a3d7946d63214 100644 --- a/onnxruntime/test/testdata/test_data_generation/sgd_test/sgd_test_data_generator.py +++ b/onnxruntime/test/testdata/test_data_generation/sgd_test/sgd_test_data_generator.py @@ -136,7 +136,7 @@ def _data_func(): target = torch.randn(batch_size, dimension_hidden, device=device, dtype=torch.float32) return input, target - json_file_name = f"sgd_test_single_weight.json" + json_file_name = "sgd_test_single_weight.json" generate_sgd_test_data(seed, _model_setup_func, _data_func, run_step_count, json_file_name, device) @@ -154,7 +154,7 @@ def data_func(): target = torch.randn(batch_size, dim_out, device=device, dtype=torch.float32) return input, target - json_file_name = f"sgd_test_multiple_weights.json" + json_file_name = "sgd_test_multiple_weights.json" generate_sgd_test_data(seed, _model_setup_func, data_func, run_step_count, json_file_name, device) diff --git a/onnxruntime/test/testdata/transform/cast_elimination.py b/onnxruntime/test/testdata/transform/cast_elimination.py index fbf0932dcaa0d..466221bcf7aac 100644 --- a/onnxruntime/test/testdata/transform/cast_elimination.py +++ b/onnxruntime/test/testdata/transform/cast_elimination.py @@ -1,6 +1,5 @@ -import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper X1 = helper.make_tensor_value_info("x1", TensorProto.INT64, [4, 4]) X2 = helper.make_tensor_value_info("x2", TensorProto.INT64, [4, 1]) diff --git a/onnxruntime/test/testdata/transform/computation_reduction.py b/onnxruntime/test/testdata/transform/computation_reduction.py index 7d33c9cc66c89..9f2e6ac7c07f0 100644 --- a/onnxruntime/test/testdata/transform/computation_reduction.py +++ b/onnxruntime/test/testdata/transform/computation_reduction.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper vocab_size = 256 # 30258 diff --git a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_add.py b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_add.py index ec0fdc888bed8..216e0d2a05d5f 100755 --- a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_add.py +++ b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_add.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper X = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["batch", "seqlen", 128]) unsqueezed_masked_lm_positions = helper.make_tensor_value_info( diff --git a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_div.py b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_div.py index d14f8a71adfc5..29abfc18e5d0e 100755 --- a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_div.py +++ b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_div.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper, 
numpy_helper X = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["batch", "seqlen", 128]) unsqueezed_masked_lm_positions = helper.make_tensor_value_info( diff --git a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_gelu.py b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_gelu.py index eade1b868ba84..b8b81a747b118 100755 --- a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_gelu.py +++ b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_gelu.py @@ -1,6 +1,5 @@ -import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper X = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["batch", "seqlen", 128]) unsqueezed_masked_lm_positions = helper.make_tensor_value_info( diff --git a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_layernormalization.py b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_layernormalization.py index 9473d05010129..f403b093b16b6 100755 --- a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_layernormalization.py +++ b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_layernormalization.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper X = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["batch", "seqlen", 128]) unsqueezed_masked_lm_positions = helper.make_tensor_value_info( diff --git a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_matmul.py b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_matmul.py index 50167bbd0a3a3..65767a8986746 100755 --- a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_matmul.py +++ b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_matmul.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper X = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["batch", "seqlen", 128]) unsqueezed_masked_lm_positions = helper.make_tensor_value_info( diff --git a/onnxruntime/test/testdata/transform/concat_slice_elimination.py b/onnxruntime/test/testdata/transform/concat_slice_elimination.py index 88a1236922a19..97f0c6f243f60 100644 --- a/onnxruntime/test/testdata/transform/concat_slice_elimination.py +++ b/onnxruntime/test/testdata/transform/concat_slice_elimination.py @@ -1,8 +1,6 @@ -import random - import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper batch = 3 hidden_size = 4 diff --git a/onnxruntime/test/testdata/transform/cse/generate.py b/onnxruntime/test/testdata/transform/cse/generate.py index 1cd1b54b09a53..01d62422983b5 100644 --- a/onnxruntime/test/testdata/transform/cse/generate.py +++ b/onnxruntime/test/testdata/transform/cse/generate.py @@ -1,7 +1,7 @@ import os import onnx -from onnx import AttributeProto, GraphProto, TensorProto, helper, shape_inference +from onnx import TensorProto, helper, 
shape_inference _this_dir = os.path.abspath(os.path.dirname(__file__)) diff --git a/onnxruntime/test/testdata/transform/expand_elimination.py b/onnxruntime/test/testdata/transform/expand_elimination.py index da1530876348e..226c23fa66389 100644 --- a/onnxruntime/test/testdata/transform/expand_elimination.py +++ b/onnxruntime/test/testdata/transform/expand_elimination.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper X1 = helper.make_tensor_value_info("input1", TensorProto.FLOAT, [2, 1]) X2 = helper.make_tensor_value_info("input2", TensorProto.FLOAT, ["dynamic", 4]) diff --git a/onnxruntime/test/testdata/transform/fusion/attention_gen.py b/onnxruntime/test/testdata/transform/fusion/attention_gen.py index cd1569ae5cd2a..888242a1ba9ba 100644 --- a/onnxruntime/test/testdata/transform/fusion/attention_gen.py +++ b/onnxruntime/test/testdata/transform/fusion/attention_gen.py @@ -1,5 +1,4 @@ import sys -from enum import Enum import onnx from onnx import TensorProto, helper diff --git a/onnxruntime/test/testdata/transform/fusion/constant_folding_with_shape_to_initializer.py b/onnxruntime/test/testdata/transform/fusion/constant_folding_with_shape_to_initializer.py index 6cc5cdeb79f4a..65b37a8ed9dab 100644 --- a/onnxruntime/test/testdata/transform/fusion/constant_folding_with_shape_to_initializer.py +++ b/onnxruntime/test/testdata/transform/fusion/constant_folding_with_shape_to_initializer.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper X = helper.make_tensor_value_info("input", TensorProto.FLOAT, [2, 4, 8]) Y = helper.make_tensor_value_info("output", TensorProto.FLOAT, [2, 4, 16]) diff --git a/onnxruntime/test/testdata/transform/fusion/div_mul.py b/onnxruntime/test/testdata/transform/fusion/div_mul.py index 7263a986d40ca..480db8967ba57 100644 --- a/onnxruntime/test/testdata/transform/fusion/div_mul.py +++ b/onnxruntime/test/testdata/transform/fusion/div_mul.py @@ -1,5 +1,3 @@ -from enum import Enum - import onnx from onnx import OperatorSetIdProto, TensorProto, helper diff --git a/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.py b/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.py index 6eff2e01ec8bf..2957f06b5b4eb 100644 --- a/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.py +++ b/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.py @@ -1,5 +1,3 @@ -from enum import Enum - import onnx from onnx import TensorProto, helper diff --git a/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_gen.py b/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_gen.py index cc1058c37e31f..90f436d4e67b2 100644 --- a/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_gen.py +++ b/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_gen.py @@ -1,5 +1,3 @@ -from enum import Enum - import onnx from onnx import TensorProto, helper from packaging import version diff --git a/onnxruntime/test/testdata/transform/fusion/fast_gelu.py b/onnxruntime/test/testdata/transform/fusion/fast_gelu.py index aaaffa4ab398a..90d1231093f1a 100644 --- a/onnxruntime/test/testdata/transform/fusion/fast_gelu.py +++ b/onnxruntime/test/testdata/transform/fusion/fast_gelu.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx 
import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper # Gelu formula: x * 0.5 * (1.0 + tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x))) diff --git a/onnxruntime/test/testdata/transform/fusion/fast_gelu2.py b/onnxruntime/test/testdata/transform/fusion/fast_gelu2.py index 5ff752afa7e6a..48483ba50fe9e 100644 --- a/onnxruntime/test/testdata/transform/fusion/fast_gelu2.py +++ b/onnxruntime/test/testdata/transform/fusion/fast_gelu2.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper # Gelu formula: x * 0.5 * (1.0 + tanh((sqrt(2 / pi) * (x + 0.044715 * pow(x, 3))))) has_bias = False # change it to True to generate fast_gelu_openai_with_bias.onnx diff --git a/onnxruntime/test/testdata/transform/fusion/fast_gelu3_with_casts.py b/onnxruntime/test/testdata/transform/fusion/fast_gelu3_with_casts.py index 5220751a3e364..d91e186296137 100644 --- a/onnxruntime/test/testdata/transform/fusion/fast_gelu3_with_casts.py +++ b/onnxruntime/test/testdata/transform/fusion/fast_gelu3_with_casts.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper # Gelu formula: x * 0.5 * (1.0 + tanh((sqrt(2 / pi) * (x + 0.044715 * pow(x, 3))))) diff --git a/onnxruntime/test/testdata/transform/fusion/gelu_gen.py b/onnxruntime/test/testdata/transform/fusion/gelu_gen.py index 45f546a04635e..22c214032b6a0 100644 --- a/onnxruntime/test/testdata/transform/fusion/gelu_gen.py +++ b/onnxruntime/test/testdata/transform/fusion/gelu_gen.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper """ Generate test model for Gelu subgraph pattern 2: diff --git a/onnxruntime/test/testdata/transform/fusion/isinf_reducesum.py b/onnxruntime/test/testdata/transform/fusion/isinf_reducesum.py index 447b873f01c6e..ba3c9fadbc9c6 100644 --- a/onnxruntime/test/testdata/transform/fusion/isinf_reducesum.py +++ b/onnxruntime/test/testdata/transform/fusion/isinf_reducesum.py @@ -1,5 +1,3 @@ -from enum import Enum - import onnx from onnx import OperatorSetIdProto, TensorProto, helper diff --git a/onnxruntime/test/testdata/transform/fusion/layer_norm_t5_gen.py b/onnxruntime/test/testdata/transform/fusion/layer_norm_t5_gen.py index eb184fef5e59d..4580ec68aecd1 100644 --- a/onnxruntime/test/testdata/transform/fusion/layer_norm_t5_gen.py +++ b/onnxruntime/test/testdata/transform/fusion/layer_norm_t5_gen.py @@ -1,5 +1,3 @@ -from enum import Enum - import onnx from onnx import OperatorSetIdProto, TensorProto, helper diff --git a/onnxruntime/test/testdata/transform/fusion/layer_norm_with_cast_2.py b/onnxruntime/test/testdata/transform/fusion/layer_norm_with_cast_2.py index 091d38d9e6797..7068fb02f2821 100644 --- a/onnxruntime/test/testdata/transform/fusion/layer_norm_with_cast_2.py +++ b/onnxruntime/test/testdata/transform/fusion/layer_norm_with_cast_2.py @@ -1,6 +1,3 @@ -from enum import Enum - -import numpy as np import onnx from onnx import OperatorSetIdProto, TensorProto, helper diff --git 
a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py index 7bba71723b2c8..56556fe327a63 100644 --- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py +++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py @@ -1,5 +1,3 @@ -from enum import Enum - import onnx from onnx import TensorProto, helper diff --git a/onnxruntime/test/testdata/transform/fusion/not_where.py b/onnxruntime/test/testdata/transform/fusion/not_where.py index 7e48164d5161a..28a4eb914f0a3 100644 --- a/onnxruntime/test/testdata/transform/fusion/not_where.py +++ b/onnxruntime/test/testdata/transform/fusion/not_where.py @@ -1,5 +1,3 @@ -from enum import Enum - import onnx from onnx import OperatorSetIdProto, TensorProto, helper diff --git a/onnxruntime/test/testdata/transform/id-elim.py b/onnxruntime/test/testdata/transform/id-elim.py index 838fbb1f4a798..eef8011e7fe23 100644 --- a/onnxruntime/test/testdata/transform/id-elim.py +++ b/onnxruntime/test/testdata/transform/id-elim.py @@ -1,6 +1,5 @@ -import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper X1 = helper.make_tensor_value_info("x1", TensorProto.INT64, [4, 4]) X2 = helper.make_tensor_value_info("x2", TensorProto.INT64, [4, 4]) diff --git a/onnxruntime/test/testdata/transform/id-scan9_sum.py b/onnxruntime/test/testdata/transform/id-scan9_sum.py index f2a7de656c8ee..c813bbfc18d8e 100644 --- a/onnxruntime/test/testdata/transform/id-scan9_sum.py +++ b/onnxruntime/test/testdata/transform/id-scan9_sum.py @@ -1,6 +1,5 @@ -import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper initial = helper.make_tensor_value_info("initial", TensorProto.FLOAT, [2]) x = helper.make_tensor_value_info("x", TensorProto.FLOAT, [3, 2]) diff --git a/onnxruntime/test/testdata/transform/model_parallel/bart_mlp_megatron_basic_test.py b/onnxruntime/test/testdata/transform/model_parallel/bart_mlp_megatron_basic_test.py index 323ebf08e4acd..7879bb4d4e0ff 100644 --- a/onnxruntime/test/testdata/transform/model_parallel/bart_mlp_megatron_basic_test.py +++ b/onnxruntime/test/testdata/transform/model_parallel/bart_mlp_megatron_basic_test.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper hidden_size = 4 weight_dim_to_split = 16 diff --git a/onnxruntime/test/testdata/transform/model_parallel/bart_self_attention_megatron_basic_test.py b/onnxruntime/test/testdata/transform/model_parallel/bart_self_attention_megatron_basic_test.py index 596b294ca27ae..886cd5c25fb08 100644 --- a/onnxruntime/test/testdata/transform/model_parallel/bart_self_attention_megatron_basic_test.py +++ b/onnxruntime/test/testdata/transform/model_parallel/bart_self_attention_megatron_basic_test.py @@ -1,8 +1,6 @@ -import random - import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper batch = 6 hidden_size = 4 diff --git a/onnxruntime/test/testdata/transform/model_parallel/mlp_megatron_basic_test.py b/onnxruntime/test/testdata/transform/model_parallel/mlp_megatron_basic_test.py index 
b26d384cbb4c9..5dec4899d59af 100644 --- a/onnxruntime/test/testdata/transform/model_parallel/mlp_megatron_basic_test.py +++ b/onnxruntime/test/testdata/transform/model_parallel/mlp_megatron_basic_test.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper hidden_size = 4 weight_dim_to_split = 16 diff --git a/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py b/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py index 5083ceeb434db..30e0a58a53d2d 100644 --- a/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py +++ b/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper hidden_size = 4 attention_head = 2 diff --git a/orttraining/orttraining/eager/opgen/onnxgen.py b/orttraining/orttraining/eager/opgen/onnxgen.py index 87c4036f48b0a..e42956551ca79 100755 --- a/orttraining/orttraining/eager/opgen/onnxgen.py +++ b/orttraining/orttraining/eager/opgen/onnxgen.py @@ -5,6 +5,7 @@ import os.path as path from sys import argv + from onnx import defs out_file = path.join(path.dirname(path.realpath(__file__)), "opgen", "onnxops.py") @@ -46,7 +47,7 @@ def write(s): def writeline(s=""): fp.write(s + "\n") - writeline(f"# AUTO-GENERATED CODE! - DO NOT EDIT!") + writeline("# AUTO-GENERATED CODE! - DO NOT EDIT!") writeline(f'# $ python {" ".join(argv)}') writeline() @@ -55,11 +56,11 @@ def writeline(s=""): for op_name, schema in sorted(onnx_ops.items()): writeline(f"class {schema.name}(ONNXOp):") - writeline(f' """') + writeline(' """') doc_str = schema.doc.strip("\r\n") for doc_line in str.splitlines(doc_str, keepends=False): writeline(f" {doc_line}") - writeline(f' """') + writeline(' """') writeline() write(" def __init__(self") diff --git a/orttraining/orttraining/eager/opgen/opgen/ast.py b/orttraining/orttraining/eager/opgen/opgen/ast.py index f41a93712aa51..9c73322c361fe 100644 --- a/orttraining/orttraining/eager/opgen/opgen/ast.py +++ b/orttraining/orttraining/eager/opgen/opgen/ast.py @@ -2,7 +2,8 @@ # Licensed under the MIT License. 
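The onnxgen.py hunk above removes the `f` prefix from string literals that contain no placeholders (flake8 F541). The rule in one line: keep the prefix only when something is actually interpolated.

```python
op_name = "Abs"

header = f"class {op_name}(ONNXOp):"  # placeholder present: f-string is needed
docstring = ' """'                    # no placeholder: plain string (was f'...')
```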
import io -from typing import TextIO, List, Union +from typing import List, TextIO, Union + from opgen.lexer import Token diff --git a/orttraining/orttraining/eager/opgen/opgen/atenops.py b/orttraining/orttraining/eager/opgen/opgen/atenops.py index b8b54b5d6ce84..10fc971894302 100644 --- a/orttraining/orttraining/eager/opgen/opgen/atenops.py +++ b/orttraining/orttraining/eager/opgen/opgen/atenops.py @@ -1,5 +1,3 @@ -from copy import deepcopy - import torch from opgen.generator import MakeTorchFallback, ONNXOp, SignatureOnly from opgen.onnxops import * diff --git a/orttraining/orttraining/eager/opgen/opgen/custom_ops.py b/orttraining/orttraining/eager/opgen/opgen/custom_ops.py index a8031fe7d8635..0c303780a1e7f 100644 --- a/orttraining/orttraining/eager/opgen/opgen/custom_ops.py +++ b/orttraining/orttraining/eager/opgen/opgen/custom_ops.py @@ -1,4 +1,4 @@ -from opgen.onnxops import BatchNormalization, Gemm, Concat +from opgen.onnxops import BatchNormalization, Concat, Gemm ops = { "gemm": Gemm("A", "B", "C", "alpha", "beta", "transA", "transB"), diff --git a/orttraining/orttraining/eager/opgen/opgen/generator.py b/orttraining/orttraining/eager/opgen/opgen/generator.py index 8813ad15ba483..9926891f395ed 100644 --- a/orttraining/orttraining/eager/opgen/opgen/generator.py +++ b/orttraining/orttraining/eager/opgen/opgen/generator.py @@ -17,7 +17,7 @@ def __init__(self, count: int): self.name = None def __str__(self): - return self.name if self.name else f"" + return self.name if self.name else "" class AttrType: @@ -282,7 +282,7 @@ def _write_function_body_onnx_op_node_attributes(self, writer, onnx_op, attrs, a if attr.type.startswith("at::ScalarType::"): writer.write(f", {attr.type}") elif attr.type == AttrType.TENSOR: - writer.write(f", true") + writer.write(", true") elif attr.type != AttrType.STRING: raise FunctionGenerationError( cpp_func, @@ -432,7 +432,7 @@ def _write_function_body_return_multiple(self, writer, cpp_func, in_place_params isinstance(cpp_func.return_type, ast.TemplateType) and cpp_func.return_type.identifier_tokens[-1].value == "std::tuple" ): - raise Exception(f"") + raise Exception("") tensorRef = "Tensor&," * len(in_place_params) tensorRef = tensorRef[: len(tensorRef) - 1] writer.write(f"return std::tuple<{tensorRef}>(") @@ -776,7 +776,7 @@ def _parse_mapped_function_decls(self, cpp_parser: parser.CPPParser): try: op_namespace = op_name[0 : op_name.index("::")] op_namewithoutnamespace = op_name[len(op_namespace) + 2 :] - except: + except Exception: op_namespace = None op_namewithoutnamespace = op_name diff --git a/orttraining/orttraining/eager/opgen/opgen/lexer.py b/orttraining/orttraining/eager/opgen/opgen/lexer.py index 661d646350f53..5d2737574aa92 100644 --- a/orttraining/orttraining/eager/opgen/opgen/lexer.py +++ b/orttraining/orttraining/eager/opgen/opgen/lexer.py @@ -1,9 +1,9 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -from enum import Enum from abc import ABC -from typing import List, Optional, Union, Tuple +from enum import Enum +from typing import List, Optional, Tuple, Union class SourceLocation(object): diff --git a/orttraining/orttraining/eager/opgen/opgen/onnxops.py b/orttraining/orttraining/eager/opgen/opgen/onnxops.py index 98a2dd4d5997e..ee91dbc1c5748 100644 --- a/orttraining/orttraining/eager/opgen/opgen/onnxops.py +++ b/orttraining/orttraining/eager/opgen/opgen/onnxops.py @@ -1,7 +1,7 @@ # AUTO-GENERATED CODE! - DO NOT EDIT! 
# $ python onnxgen.py -from opgen.generator import ONNXAttr, ONNXOp, AttrType +from opgen.generator import AttrType, ONNXAttr, ONNXOp class Abs(ONNXOp): diff --git a/orttraining/orttraining/eager/opgen/opgen/parser.py b/orttraining/orttraining/eager/opgen/opgen/parser.py index c1ba7e8378c5b..6fd27655104b6 100644 --- a/orttraining/orttraining/eager/opgen/opgen/parser.py +++ b/orttraining/orttraining/eager/opgen/opgen/parser.py @@ -1,9 +1,10 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -from opgen.lexer import * +from typing import List, Optional, Tuple, Union + from opgen.ast import * -from typing import List, Tuple, Union, Optional +from opgen.lexer import * class UnexpectedTokenError(RuntimeError): diff --git a/orttraining/orttraining/eager/opgen/opgen/writer.py b/orttraining/orttraining/eager/opgen/opgen/writer.py index 460a29a879dfc..b5281e1843ed8 100644 --- a/orttraining/orttraining/eager/opgen/opgen/writer.py +++ b/orttraining/orttraining/eager/opgen/opgen/writer.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -from typing import TextIO, List +from typing import List, TextIO class SourceWriter: diff --git a/orttraining/orttraining/eager/opgen/opgen_test/lexer_test.py b/orttraining/orttraining/eager/opgen/opgen_test/lexer_test.py index 30e78377b2445..dbe9289fdb21e 100644 --- a/orttraining/orttraining/eager/opgen/opgen_test/lexer_test.py +++ b/orttraining/orttraining/eager/opgen/opgen_test/lexer_test.py @@ -3,7 +3,7 @@ import unittest -from opgen.lexer import StringReader, Lexer, Token, TokenKind, SourceLocation +from opgen.lexer import Lexer, StringReader, Token, TokenKind class LexerTestCase(unittest.TestCase): diff --git a/orttraining/orttraining/eager/test/__main__.py b/orttraining/orttraining/eager/test/__main__.py index f188f3c1fc3c3..cd381c050ec00 100644 --- a/orttraining/orttraining/eager/test/__main__.py +++ b/orttraining/orttraining/eager/test/__main__.py @@ -3,8 +3,8 @@ import glob import os -import sys import subprocess +import sys selfdir = os.path.dirname(os.path.realpath(__file__)) diff --git a/orttraining/orttraining/eager/test/linux_only_ortmodule_eager_test.py b/orttraining/orttraining/eager/test/linux_only_ortmodule_eager_test.py index 17318710850ed..fa9cd109e8eac 100644 --- a/orttraining/orttraining/eager/test/linux_only_ortmodule_eager_test.py +++ b/orttraining/orttraining/eager/test/linux_only_ortmodule_eager_test.py @@ -1,7 +1,5 @@ -import os import unittest -import numpy as np import torch import torch.nn as nn import torch.nn.functional as F @@ -62,7 +60,7 @@ def test_ortmodule_inference(self): with torch.no_grad(): data = torch.rand(batch_size, input_size) - y = model(data.to(device)) + model(data.to(device)) print("Done") @unittest.skip("Test fails with newest pytorch version.") diff --git a/orttraining/orttraining/eager/test/ort_eps_test.py b/orttraining/orttraining/eager/test/ort_eps_test.py index 7a4c8de5c5d25..bdfcb68d01efa 100644 --- a/orttraining/orttraining/eager/test/ort_eps_test.py +++ b/orttraining/orttraining/eager/test/ort_eps_test.py @@ -1,18 +1,17 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
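The generator.py hunk above narrows a bare `except:` to `except Exception:` (flake8 E722); a bare except also swallows `SystemExit` and `KeyboardInterrupt`. The same logic, restated as a self-contained snippet:

```python
op_name = "aten::add"
try:
    op_namespace = op_name[0 : op_name.index("::")]
    op_namewithoutnamespace = op_name[len(op_namespace) + 2 :]
except Exception:  # was a bare "except:", which would also catch SystemExit/KeyboardInterrupt
    op_namespace = None
    op_namewithoutnamespace = op_name
```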
+import os import unittest -import torch + import onnxruntime_pybind11_state as torch_ort -import os -import sys +import torch def is_windows(): return sys.platform.startswith("win") -from io import StringIO import sys import threading import time @@ -107,22 +106,22 @@ def test_import_custom_eps(self): # capture std out with OutputGrabber() as out: torch_ort.set_device(1, "TestExecutionProvider", {"device_id": "0", "some_config": "val"}) - ort_device = torch_ort.device(1) + torch_ort.device(1) assert "My EP provider created, with device id: 0, some_option: val" in out.capturedtext with OutputGrabber() as out: torch_ort.set_device(2, "TestExecutionProvider", {"device_id": "1", "some_config": "val"}) - ort_device = torch_ort.device(1) + torch_ort.device(1) assert "My EP provider created, with device id: 1, some_option: val" in out.capturedtext # test the reusing EP instance with OutputGrabber() as out: torch_ort.set_device(3, "TestExecutionProvider", {"device_id": "0", "some_config": "val"}) - ort_device = torch_ort.device(1) + torch_ort.device(1) assert "My EP provider created, with device id: 0, some_option: val" not in out.capturedtext # test clear training ep instance pool torch_ort.clear_training_ep_instances() with OutputGrabber() as out: torch_ort.set_device(3, "TestExecutionProvider", {"device_id": "0", "some_config": "val"}) - ort_device = torch_ort.device(1) + torch_ort.device(1) assert "My EP provider created, with device id: 0, some_option: val" in out.capturedtext @unittest.skip("Test fails with newest pytorch version.") diff --git a/orttraining/orttraining/eager/test/ort_init.py b/orttraining/orttraining/eager/test/ort_init.py index 43602cc6a5fdb..e8f1e7f2bf88a 100644 --- a/orttraining/orttraining/eager/test/ort_init.py +++ b/orttraining/orttraining/eager/test/ort_init.py @@ -8,6 +8,7 @@ # after the import, hence this test is isolated from the others. 
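Several hunks above (parity_utilities.py, ort_eps_test.py, and others) drop a name binding that nothing reads while keeping the call for its side effect, which is how flake8 F841 ("local variable assigned but never used") is usually resolved when the call itself still matters. A generic sketch of the pattern, with illustrative names:

```python
def warm_up(model, example_input):
    # Before: outputs = model(example_input)  -> "outputs" was never read (F841)
    # After: run the forward pass for its side effects only (tracing, EP creation, ...)
    model(example_input)
```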
import unittest + import torch @@ -22,8 +23,6 @@ def ort_alloc(): with self.assertRaises(BaseException): ort_alloc() - import onnxruntime_pybind11_state as torch_ort - ort_alloc() self.assertIn(config_match, torch._C._show_config()) diff --git a/orttraining/orttraining/eager/test/ort_ops.py b/orttraining/orttraining/eager/test/ort_ops.py index 9f5fdfdf2413c..e9ea3822c3fee 100644 --- a/orttraining/orttraining/eager/test/ort_ops.py +++ b/orttraining/orttraining/eager/test/ort_ops.py @@ -5,10 +5,9 @@ import unittest -import numpy as np import onnxruntime_pybind11_state as torch_ort import torch -from parameterized import parameterized, param +from parameterized import param, parameterized class OrtOpTests(unittest.TestCase): @@ -643,7 +642,7 @@ def test_op_out(self, test_name, tensor_test=torch.rand(6)): self.skipTest(f" {test_name}_output Fails - skipping for now") device = self.get_device() cpu_tensor = tensor_test - ort_tensor = cpu_tensor.to(device) + cpu_tensor.to(device) cpu_out_tensor = torch.tensor([], dtype=tensor_test.dtype) ort_out_tensor = cpu_out_tensor.to(device) @@ -663,9 +662,9 @@ def test_op_out(self, test_name, tensor_test=torch.rand(6)): def test_op_tensor(self, math_sign_ops): device = self.get_device() cpu_a = torch.Tensor([1.0, 1.5, 2.0, 3.5]) - ort_a = cpu_a.to(device) + cpu_a.to(device) cpu_b = torch.Tensor([1.0, 1.4, 2.1, 2.4]) - ort_b = cpu_b.to(device) + cpu_b.to(device) for tensor_type in {torch.float, torch.bool}: cpu_out_tensor = torch.tensor([], dtype=tensor_type) @@ -687,13 +686,11 @@ def test_op_scalar(self, math_sign_ops): cpu_scalar_int_lt = torch.scalar_tensor(2, dtype=torch.int) cpu_scalar_int_gt = torch.scalar_tensor(0, dtype=torch.int) cpu_tensor_float = torch.tensor([1.1, 1.1], dtype=torch.float32) - float_lt = 1.0 - float_gt = 1.2 - ort_tensor_int = cpu_tensor_int.to(device) - ort_scalar_int_lt = cpu_scalar_int_lt.to(device) - ort_scalar_int_gt = cpu_scalar_int_gt.to(device) - ort_tensor_float = cpu_tensor_float.to(device) + cpu_tensor_int.to(device) + cpu_scalar_int_lt.to(device) + cpu_scalar_int_gt.to(device) + cpu_tensor_float.to(device) # compare int to int, float to float - ort only supports same type at the moment cpu_out_tensor = torch.tensor([], dtype=torch.bool) @@ -746,9 +743,9 @@ def test_op_scalar(self, math_sign_ops): def test_op_binary_tensor(self, binary_op, op_sign, alpha_supported): device = self.get_device() cpu_input = torch.rand(3, 1) # use broadcasting in the second dim. - ort_input = cpu_input.to(device) + cpu_input.to(device) cpu_other = torch.rand(3, 3) - ort_other = cpu_other.to(device) + cpu_other.to(device) # verify op_sign works cpu_result = eval(compile("cpu_input " + op_sign + " cpu_other", "", "eval")) @@ -785,9 +782,7 @@ def test_op_binary_tensor(self, binary_op, op_sign, alpha_supported): def test_op_binary_scalar(self, binary_op, op_sign, alpha_supported): device = self.get_device() cpu_input = torch.ones(3, 3) - ort_input = cpu_input.to(device) - cpu_other = 3.1 - ort_other = 3.1 + cpu_input.to(device) # verify op_sign works cpu_result = eval(compile("cpu_input " + op_sign + " cpu_other", "", "eval")) diff --git a/orttraining/orttraining/eager/test/ort_tensor.py b/orttraining/orttraining/eager/test/ort_tensor.py index a0cfdaa2cd0d6..e4d94f137da77 100644 --- a/orttraining/orttraining/eager/test/ort_tensor.py +++ b/orttraining/orttraining/eager/test/ort_tensor.py @@ -2,8 +2,8 @@ # Licensed under the MIT License. 
import unittest + import torch -import onnxruntime_pybind11_state as torch_ort class OrtTensorTests(unittest.TestCase): @@ -49,7 +49,7 @@ def test_stride(self): ort_ones = cpu_ones.to("ort") y = torch.as_strided(ort_ones, (2, 2), (1, 2)) assert y.size() == (2, 2) - assert y.is_contiguous() == False + assert y.is_contiguous() is False contiguous_y = y.contiguous() w = torch.ones((2, 3)) ort_w = w.to("ort") @@ -65,7 +65,7 @@ def test_slice(self): ort_ones = cpu_ones.to("ort") y_cpu = cpu_ones[0:128, :128] y = ort_ones[0:128, :128] - assert y.is_contiguous() == False + assert y.is_contiguous() is False assert y.size() == (128, 128) assert torch.allclose(y.cpu(), y_cpu) diff --git a/orttraining/orttraining/eager/test_model_OrtModule/mnist_fc_training.py b/orttraining/orttraining/eager/test_model_OrtModule/mnist_fc_training.py index 505fdf24933de..b40cc228a4e5f 100644 --- a/orttraining/orttraining/eager/test_model_OrtModule/mnist_fc_training.py +++ b/orttraining/orttraining/eager/test_model_OrtModule/mnist_fc_training.py @@ -4,17 +4,17 @@ ## Model testing is not complete. -from __future__ import print_function + import argparse + import torch -from onnxruntime.training import ORTModule -from onnxruntime.capi import _pybind_state as torch_ort_eager import torch.nn as nn import torch.nn.functional as F import torch.optim as optim from torchvision import datasets, transforms -import numpy as np -import os + +from onnxruntime.capi import _pybind_state as torch_ort_eager +from onnxruntime.training import ORTModule class NeuralNet(nn.Module): @@ -82,7 +82,7 @@ def main(): ) args = parser.parse_args() - use_cuda = not args.no_cuda and torch.cuda.is_available() + not args.no_cuda and torch.cuda.is_available() torch.manual_seed(args.seed) @@ -98,7 +98,7 @@ def main(): shuffle=True, **kwargs, ) - test_loader = torch.utils.data.DataLoader( + torch.utils.data.DataLoader( datasets.MNIST( "./data", train=False, diff --git a/orttraining/orttraining/eager/test_models/mnist_fc.py b/orttraining/orttraining/eager/test_models/mnist_fc.py index 0f0b3bb604149..6a2c03785b4cb 100644 --- a/orttraining/orttraining/eager/test_models/mnist_fc.py +++ b/orttraining/orttraining/eager/test_models/mnist_fc.py @@ -1,11 +1,6 @@ -from __future__ import print_function -import argparse +import onnxruntime_pybind11_state as torch_ort import torch import torch.nn as nn -import torch.nn.functional as F -import numpy as np -import os -import onnxruntime_pybind11_state as torch_ort class NeuralNet(nn.Module): diff --git a/orttraining/orttraining/eager/test_models/mnist_fc_training.py b/orttraining/orttraining/eager/test_models/mnist_fc_training.py index 95ba3bf060332..744a264e87cfb 100644 --- a/orttraining/orttraining/eager/test_models/mnist_fc_training.py +++ b/orttraining/orttraining/eager/test_models/mnist_fc_training.py @@ -4,7 +4,6 @@ # pylint: disable=missing-docstring # pylint: disable=C0103 -from __future__ import print_function import argparse import os @@ -99,7 +98,7 @@ def main(): shuffle=True, **kwargs, ) - test_loader = torch.utils.data.DataLoader( + torch.utils.data.DataLoader( datasets.MNIST( dataset_root_dir, train=False, diff --git a/orttraining/orttraining/eager/test_models/scratchpad.py b/orttraining/orttraining/eager/test_models/scratchpad.py index 049aa859c842c..01237d0cd029d 100644 --- a/orttraining/orttraining/eager/test_models/scratchpad.py +++ b/orttraining/orttraining/eager/test_models/scratchpad.py @@ -1,8 +1,8 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-import torch import onnxruntime_pybind11_state as torch_ort +import torch device = torch_ort.device() diff --git a/orttraining/orttraining/python/checkpointing_utils.py b/orttraining/orttraining/python/checkpointing_utils.py index 359f6a8c53552..b7c055eaba51b 100644 --- a/orttraining/orttraining/python/checkpointing_utils.py +++ b/orttraining/orttraining/python/checkpointing_utils.py @@ -1,4 +1,5 @@ import os + import torch diff --git a/orttraining/orttraining/python/deprecated/training_session.py b/orttraining/orttraining/python/deprecated/training_session.py index b6a63dbee35d2..37ec8552a2acf 100644 --- a/orttraining/orttraining/python/deprecated/training_session.py +++ b/orttraining/orttraining/python/deprecated/training_session.py @@ -3,14 +3,11 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -import sys -import os from onnxruntime.capi import _pybind_state as C from onnxruntime.capi.onnxruntime_inference_collection import ( - Session, InferenceSession, - IOBinding, + Session, check_and_normalize_provider_args, ) diff --git a/orttraining/orttraining/python/ort_trainer.py b/orttraining/orttraining/python/ort_trainer.py index 5434edd7d4439..8c4fe6d8004ac 100644 --- a/orttraining/orttraining/python/ort_trainer.py +++ b/orttraining/orttraining/python/ort_trainer.py @@ -1,7 +1,6 @@ import io import os import warnings -from packaging.version import Version as LooseVersion import numpy as np import onnx @@ -9,6 +8,7 @@ import torch.nn import torch.onnx from onnx import helper, numpy_helper +from packaging.version import Version as LooseVersion import onnxruntime as ort import onnxruntime.capi.pt_patch @@ -264,7 +264,6 @@ def forward(self, *inputs): # *inputs is given by torch trace. It is in the order of input_names. # model_ takes input in a order (which can be obtained via inspect.signature(model.forward)) different than input_names. sig = inspect.signature(self.model_.forward) - ordered_list_keys = list(sig.parameters.keys()) input_dict = {} for key in sig.parameters.keys(): @@ -556,7 +555,7 @@ def create_ort_training_session_with_optimizer( def save_checkpoint( model, checkpoint_dir, checkpoint_prefix="ORT_checkpoint", checkpoint_state_dict=None, include_optimizer_state=True ): - if checkpoint_state_dict == None: + if checkpoint_state_dict is None: checkpoint_state_dict = {"model": model.state_dict(include_optimizer_state)} else: checkpoint_state_dict.update({"model": model.state_dict(include_optimizer_state)}) diff --git a/orttraining/orttraining/python/pt_patch.py b/orttraining/orttraining/python/pt_patch.py index b524a286c9de7..5c5d205b21318 100644 --- a/orttraining/orttraining/python/pt_patch.py +++ b/orttraining/orttraining/python/pt_patch.py @@ -1,9 +1,7 @@ import torch - -from torch.onnx import symbolic_opset10 -from torch.onnx import symbolic_opset12 -from torch.onnx.symbolic_helper import parse_args import torch.onnx.symbolic_helper as sym_help +from torch.onnx import symbolic_opset10, symbolic_opset12 +from torch.onnx.symbolic_helper import parse_args @parse_args("v", "v", "v", "v", "i", "none") diff --git a/orttraining/orttraining/python/training/__init__.py b/orttraining/orttraining/python/training/__init__.py index 4a69f1439c656..3b28514819b2e 100644 --- a/orttraining/orttraining/python/training/__init__.py +++ b/orttraining/orttraining/python/training/__init__.py @@ -2,13 +2,14 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
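ort_trainer.py above replaces `checkpoint_state_dict == None` with `is None` (flake8 E711), and ort_tensor.py replaces `== False` with `is False` (E712). Identity comparisons are the idiomatic form for singletons; for booleans, plain truthiness is often clearer where it reads the same:

```python
checkpoint_state_dict = None
if checkpoint_state_dict is None:   # was: == None (E711)
    checkpoint_state_dict = {}

is_contiguous = False
assert is_contiguous is False       # was: == False (E712)
assert not is_contiguous            # equivalent, and usually the preferred spelling
```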
# -------------------------------------------------------------------------- - +# isort:skip_file from onnxruntime.capi._pybind_state import PropagateCastOpsStrategy, TrainingParameters from onnxruntime.capi.training.training_session import TrainingSession # Options need to be imported before `ORTTrainer`. from .orttrainer_options import ORTTrainerOptions from .orttrainer import ORTTrainer, TrainStepInfo + from . import amp, checkpoint, model_desc_validation, optim diff --git a/orttraining/orttraining/python/training/_checkpoint_storage.py b/orttraining/orttraining/python/training/_checkpoint_storage.py index 461daa57134c0..b5c03ee3c1102 100644 --- a/orttraining/orttraining/python/training/_checkpoint_storage.py +++ b/orttraining/orttraining/python/training/_checkpoint_storage.py @@ -3,9 +3,10 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -import h5py -from collections.abc import Mapping import pickle +from collections.abc import Mapping + +import h5py def _dfs_save(group, save_obj): diff --git a/orttraining/orttraining/python/training/_utils.py b/orttraining/orttraining/python/training/_utils.py index 099a29764839f..657cdd8e9937e 100644 --- a/orttraining/orttraining/python/training/_utils.py +++ b/orttraining/orttraining/python/training/_utils.py @@ -6,11 +6,9 @@ import importlib.util import os import sys -from functools import wraps import numpy as np import torch -from onnx import TensorProto from packaging.version import Version diff --git a/orttraining/orttraining/python/training/amp/__init__.py b/orttraining/orttraining/python/training/amp/__init__.py index 33274a8d5e10d..dec2c8e3b868a 100644 --- a/orttraining/orttraining/python/training/amp/__init__.py +++ b/orttraining/orttraining/python/training/amp/__init__.py @@ -1 +1 @@ -from .loss_scaler import LossScaler, DynamicLossScaler +from .loss_scaler import DynamicLossScaler, LossScaler diff --git a/orttraining/orttraining/python/training/checkpoint.py b/orttraining/orttraining/python/training/checkpoint.py index e4a2f1230b7a4..2fbd402410016 100644 --- a/orttraining/orttraining/python/training/checkpoint.py +++ b/orttraining/orttraining/python/training/checkpoint.py @@ -1,12 +1,13 @@ -import numpy as np -import onnx import os -import torch -import warnings import tempfile +import warnings from enum import Enum -from . import _checkpoint_storage, _utils +import numpy as np +import onnx +import torch + +from . import _checkpoint_storage, _utils ################################################################################ # Experimental Checkpoint APIs @@ -422,7 +423,7 @@ def _aggregate_over_ranks( assert ( ranks[i] == rank_state_dict[_utils.state_dict_trainer_options_key()][world_rank] ), "Unexpected rank in file at path {}. 
Expected {}, got {}".format( - path, rank, rank_state_dict[_utils.state_dict_trainer_options_key()][world_rank] + path, ranks[i], rank_state_dict[_utils.state_dict_trainer_options_key()][world_rank] ) if loaded_mixed_precision is None: loaded_mixed_precision = rank_state_dict[_utils.state_dict_trainer_options_key()][mixed_precision] diff --git a/orttraining/orttraining/python/training/experimental/exporter.py b/orttraining/orttraining/python/training/experimental/exporter.py index 8c5ccd1119576..8a779cafabec1 100644 --- a/orttraining/orttraining/python/training/experimental/exporter.py +++ b/orttraining/orttraining/python/training/experimental/exporter.py @@ -4,7 +4,7 @@ def _export_jit_graph_to_onnx_model_proto(graph: torch._C.Graph, operator_export_type: int): - from torch.onnx.symbolic_helper import _set_onnx_shape_inference, _set_operator_export_type, _set_opset_version + from torch.onnx.symbolic_helper import _set_onnx_shape_inference, _set_operator_export_type _set_onnx_shape_inference(True) _set_operator_export_type(operator_export_type) diff --git a/orttraining/orttraining/python/training/experimental/gradient_graph/_gradient_graph_tools.py b/orttraining/orttraining/python/training/experimental/gradient_graph/_gradient_graph_tools.py index 91c656619f621..3f163e2417a29 100644 --- a/orttraining/orttraining/python/training/experimental/gradient_graph/_gradient_graph_tools.py +++ b/orttraining/orttraining/python/training/experimental/gradient_graph/_gradient_graph_tools.py @@ -1,11 +1,12 @@ import io from pathlib import Path -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Union import torch -from onnxruntime.capi._pybind_state import GradientGraphBuilder from torch.onnx import TrainingMode +from onnxruntime.capi._pybind_state import GradientGraphBuilder + from ...ortmodule._custom_op_symbolic_registry import CustomOpSymbolicRegistry diff --git a/orttraining/orttraining/python/training/model_desc_validation.py b/orttraining/orttraining/python/training/model_desc_validation.py index e9181f732cb32..4cfc46c4ae8b5 100644 --- a/orttraining/orttraining/python/training/model_desc_validation.py +++ b/orttraining/orttraining/python/training/model_desc_validation.py @@ -1,8 +1,9 @@ -import cerberus from collections import namedtuple + +import cerberus import torch -from ._utils import static_vars +from ._utils import static_vars LEARNING_RATE_IO_DESCRIPTION_NAME = "__learning_rate" ALL_FINITE_IO_DESCRIPTION_NAME = "__all_finite" diff --git a/orttraining/orttraining/python/training/onnxblock/loss/__init__.py b/orttraining/orttraining/python/training/onnxblock/loss/__init__.py index ac21bb0f42438..5cb9ead2d2019 100644 --- a/orttraining/orttraining/python/training/onnxblock/loss/__init__.py +++ b/orttraining/orttraining/python/training/onnxblock/loss/__init__.py @@ -2,4 +2,4 @@ # Licensed under the MIT License. 
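The checkpoint.py hunk above fixes the arguments passed to an assertion message: the condition compares against `ranks[i]`, so the message should interpolate `ranks[i]` as well rather than a different variable, `rank`. A stripped-down sketch of the pattern with illustrative names (`check_ranks`, `loaded_ranks`, and `paths` stand in for the real state-dict lookups):

```python
def check_ranks(ranks, loaded_ranks, paths):
    for i, path in enumerate(paths):
        # The message reports the same per-iteration value the condition checks.
        assert ranks[i] == loaded_ranks[i], "Unexpected rank in file at path {}. Expected {}, got {}".format(
            path, ranks[i], loaded_ranks[i]
        )
```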
# __init__.py -from .loss import MSELoss, CrossEntropyLoss, BCEWithLogitsLoss +from .loss import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss diff --git a/orttraining/orttraining/python/training/optim/__init__.py b/orttraining/orttraining/python/training/optim/__init__.py index f74fe08202397..5e57d7df3a864 100644 --- a/orttraining/orttraining/python/training/optim/__init__.py +++ b/orttraining/orttraining/python/training/optim/__init__.py @@ -1,11 +1,10 @@ -from .config import _OptimizerConfig, AdamConfig, LambConfig, SGDConfig +from .config import AdamConfig, LambConfig, SGDConfig, _OptimizerConfig +from .fp16_optimizer import FP16_Optimizer +from .fused_adam import AdamWMode, FusedAdam from .lr_scheduler import ( - _LRScheduler, ConstantWarmupLRScheduler, CosineWarmupLRScheduler, LinearWarmupLRScheduler, PolyWarmupLRScheduler, + _LRScheduler, ) - -from .fused_adam import FusedAdam, AdamWMode -from .fp16_optimizer import FP16_Optimizer diff --git a/orttraining/orttraining/python/training/optim/_apex_amp_modifier.py b/orttraining/orttraining/python/training/optim/_apex_amp_modifier.py index 1b91ec2bf3594..64c30ab6618a1 100644 --- a/orttraining/orttraining/python/training/optim/_apex_amp_modifier.py +++ b/orttraining/orttraining/python/training/optim/_apex_amp_modifier.py @@ -8,6 +8,7 @@ import types import warnings + from ._modifier import FP16OptimizerModifier @@ -23,6 +24,7 @@ def can_be_modified(self): def override_function(m_self): from apex import amp as apex_amp + from onnxruntime.training.ortmodule.torch_cpp_extensions import fused_ops warnings.warn("Apex AMP fp16_optimizer functions are overrided with faster implementation.", UserWarning) diff --git a/orttraining/orttraining/python/training/optim/_modifier.py b/orttraining/orttraining/python/training/optim/_modifier.py index 952e90ee431ee..b2ca6c9ec8c8b 100644 --- a/orttraining/orttraining/python/training/optim/_modifier.py +++ b/orttraining/orttraining/python/training/optim/_modifier.py @@ -34,7 +34,7 @@ def check_requirements(self, required_funcs, require_apex=False, require_torch_n from apex import amp if require_torch_non_finite_check is True: _ = torch._amp_foreach_non_finite_check_and_unscale_ - except Exception as _: + except Exception: warnings.warn("Skip modifying optimizer because of Apex or torch_non_finite_check not found.", UserWarning) return False diff --git a/orttraining/orttraining/python/training/optim/_modifier_registry.py b/orttraining/orttraining/python/training/optim/_modifier_registry.py index 4291b792a4607..4a3a33ecc0513 100644 --- a/orttraining/orttraining/python/training/optim/_modifier_registry.py +++ b/orttraining/orttraining/python/training/optim/_modifier_registry.py @@ -3,9 +3,9 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- +from ._apex_amp_modifier import ApexAMPModifier from ._ds_modifier import DeepSpeedZeROModifier from ._megatron_modifier import LegacyMegatronLMModifier -from ._apex_amp_modifier import ApexAMPModifier OptimizerModifierTypeRegistry = { "megatron.fp16.fp16.FP16_Optimizer": LegacyMegatronLMModifier, diff --git a/orttraining/orttraining/python/training/optim/fused_adam.py b/orttraining/orttraining/python/training/optim/fused_adam.py index 30ebcf30e4844..4de467f9d16eb 100644 --- a/orttraining/orttraining/python/training/optim/fused_adam.py +++ b/orttraining/orttraining/python/training/optim/fused_adam.py @@ -10,9 +10,11 @@ This file is adapted from fused adam in NVIDIA/apex, commit a109f85 """ +from enum import IntEnum + import torch + from ._multi_tensor_apply import MultiTensorApply -from enum import IntEnum class AdamWMode(IntEnum): diff --git a/orttraining/orttraining/python/training/optim/lr_scheduler.py b/orttraining/orttraining/python/training/optim/lr_scheduler.py index cbe013d32f310..17b024693babd 100644 --- a/orttraining/orttraining/python/training/optim/lr_scheduler.py +++ b/orttraining/orttraining/python/training/optim/lr_scheduler.py @@ -276,7 +276,7 @@ def _warmup_poly(self, train_step_info): assert ( train_step_info.optimizer_config.lr > self.lr_end - ), f"lr_end ({lr_end}) must be be smaller than initial lr ({train_step_info.optimizer_config.lr})" + ), f"lr_end ({self.lr_end}) must be be smaller than initial lr ({train_step_info.optimizer_config.lr})" if train_step_info.optimization_step < self._num_warmup_steps: return float(train_step_info.optimization_step) / float(max(1, self._num_warmup_steps)) diff --git a/orttraining/orttraining/python/training/ortmodule/__init__.py b/orttraining/orttraining/python/training/ortmodule/__init__.py index f6ed8827bded3..88aa71387b821 100644 --- a/orttraining/orttraining/python/training/ortmodule/__init__.py +++ b/orttraining/orttraining/python/training/ortmodule/__init__.py @@ -13,7 +13,8 @@ from onnxruntime import set_seed from onnxruntime.capi import build_and_package_info as ort_info -from ._fallback import ORTModuleFallbackException, ORTModuleInitException, _FallbackPolicy, wrap_exception +from ._fallback import _FallbackPolicy +from ._fallback_exceptions import ORTModuleFallbackException, ORTModuleInitException, wrap_exception from .torch_cpp_extensions import is_installed as is_torch_cpp_extensions_installed @@ -86,7 +87,7 @@ def _defined_from_envvar(name, default_value, warn=True): ), ) -# Initalized ORT's random seed with pytorch's initial seed +# Initialize ORT's random seed with pytorch's initial seed # in case user has set pytorch seed before importing ORTModule set_seed((torch.initial_seed() % sys.maxsize)) diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function.py b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function.py index 1c2fce2b1a80e..e18a46ec16fe4 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function.py @@ -66,7 +66,7 @@ def enable_custom_autograd_support(to_enable=True): # This is for the latest Pytorch nightly after this commit: # https://github.com/pytorch/pytorch/commit/11bc435622e6b7207bbf37ed1aafe999e1f296ec register_custom_op_symbolic("prim::PythonOp", _export, 1) - except: + except Exception: # This applies to Pytorch 1.9 and 1.9.1. 
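The lr_scheduler.py hunk above changes the assertion message from `{lr_end}` to `{self.lr_end}`. Because an f-string in an assert message is only evaluated when the assertion fails, a name that is not in scope stays hidden until the exact moment a diagnostic is needed and then surfaces as a NameError instead. A minimal sketch of the fixed shape:

```python
class SchedulerSketch:  # illustrative class, not the real warmup scheduler
    def __init__(self, lr_end):
        self.lr_end = lr_end

    def check(self, initial_lr):
        # The message is only built on failure, so every name it uses must be in scope.
        assert initial_lr > self.lr_end, (
            f"lr_end ({self.lr_end}) must be smaller than initial lr ({initial_lr})"
        )
```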
register_custom_op_symbolic("::prim_PythonOp", _export, 1) @@ -78,7 +78,7 @@ def enable_custom_autograd_support(to_enable=True): # This is for the latest Pytorch nightly after this commit: # https://github.com/pytorch/pytorch/commit/11bc435622e6b7207bbf37ed1aafe999e1f296ec unregister_custom_op_symbolic("prim::PythonOp", 1) - except: + except Exception: # This applies to Pytorch 1.9 and 1.9.1. unregister_custom_op_symbolic("::prim_PythonOp", 1) diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py index 887066d2a3dbc..250bb1c251fba 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py @@ -15,7 +15,7 @@ from onnxruntime.training import ortmodule from . import _logger -from ._fallback import ORTModuleONNXModelException, wrap_exception +from ._fallback_exceptions import ORTModuleONNXModelException, wrap_exception # Some autograd.Function's shouldn't be exported as PythonOp. # If CheckpointFunction is exported as PythonOp, the checkpointed computation diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py index 63daf53266291..5b57417c42cef 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py @@ -4,14 +4,14 @@ # -------------------------------------------------------------------------- import sys -import torch +import torch from torch.utils.dlpack import from_dlpack, to_dlpack -from ._fallback import _FallbackManager, ORTModuleFallbackException, ORTModuleIOError, wrap_exception - from onnxruntime.training.ortmodule.torch_cpp_extensions import torch_interop_utils +from ._fallback_exceptions import ORTModuleFallbackException, ORTModuleIOError, wrap_exception + def wrap_as_dlpack_or_not(grad_flag, tensor_flag, inplace_flag, training_mode_flag, arg): """ diff --git a/orttraining/orttraining/python/training/ortmodule/_execution_agent.py b/orttraining/orttraining/python/training/ortmodule/_execution_agent.py index be2cf01b1f33b..dcdc854e8d59e 100644 --- a/orttraining/orttraining/python/training/ortmodule/_execution_agent.py +++ b/orttraining/orttraining/python/training/ortmodule/_execution_agent.py @@ -6,7 +6,7 @@ import onnxruntime from onnxruntime.capi import _pybind_state as C from onnxruntime.capi._pybind_state import TrainingAgent as C_TrainingAgent -from onnxruntime.capi.onnxruntime_inference_collection import IOBinding, OrtValue +from onnxruntime.capi.onnxruntime_inference_collection import IOBinding class ExecutionAgentOutput: # pylint: disable=R0903 diff --git a/orttraining/orttraining/python/training/ortmodule/_fallback.py b/orttraining/orttraining/python/training/ortmodule/_fallback.py index 7129e522b8c49..a31a14dfe2e73 100644 --- a/orttraining/orttraining/python/training/ortmodule/_fallback.py +++ b/orttraining/orttraining/python/training/ortmodule/_fallback.py @@ -3,24 +3,22 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -from . import _logger - import os -import torch import warnings - from enum import IntFlag from typing import Optional + +import torch + +from . 
import _logger, _utils from ._fallback_exceptions import ( + ORTModuleDeviceException, ORTModuleFallbackException, ORTModuleInitException, - ORTModuleDeviceException, ORTModuleIOError, - ORTModuleTorchModelException, ORTModuleONNXModelException, - wrap_exception, + ORTModuleTorchModelException, ) -from . import _utils class _FallbackPolicy(IntFlag): diff --git a/orttraining/orttraining/python/training/ortmodule/_gradient_accumulation_manager.py b/orttraining/orttraining/python/training/ortmodule/_gradient_accumulation_manager.py index 501a559a6ee0c..dbb3507856b2e 100644 --- a/orttraining/orttraining/python/training/ortmodule/_gradient_accumulation_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_gradient_accumulation_manager.py @@ -2,9 +2,10 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -from . import _utils from onnxruntime.capi import _pybind_state as C +from . import _utils + class GradientAccumulationManager(object): """Handles Gradient accumulation optimization during training diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index c48d16636d43b..fe4bf43d94c56 100644 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -8,7 +8,7 @@ import io import os import warnings -from abc import ABC, abstractmethod +from abc import abstractmethod from enum import IntFlag from functools import reduce @@ -23,11 +23,11 @@ from . import _are_deterministic_algorithms_enabled, _io, _logger, _onnx_models, _utils from ._custom_autograd_function_exporter import _post_process_after_export -from ._fallback import ( +from ._fallback import _FallbackManager +from ._fallback_exceptions import ( ORTModuleDeviceException, ORTModuleONNXModelException, ORTModuleTorchModelException, - _FallbackManager, wrap_exception, ) from ._gradient_accumulation_manager import GradientAccumulationManager @@ -236,7 +236,7 @@ def execution_session_run_forward(execution_session, onnx_model, device, *inputs run_info: A _RunStateInfo which contains extra information about the execution of the graph """ - raise NotImplemented + raise NotImplementedError @abstractmethod def forward(self): diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager_factory.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager_factory.py index 216417249bd20..a902f511713ad 100644 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager_factory.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager_factory.py @@ -3,10 +3,10 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- -from ._training_manager import TrainingManager +from ._fallback import _FallbackManager from ._inference_manager import InferenceManager +from ._training_manager import TrainingManager from .debug_options import DebugOptions -from ._fallback import _FallbackManager class GraphExecutionManagerFactory(object): diff --git a/orttraining/orttraining/python/training/ortmodule/_inference_manager.py b/orttraining/orttraining/python/training/ortmodule/_inference_manager.py index e72601efdd431..fb4a3ea9580aa 100644 --- a/orttraining/orttraining/python/training/ortmodule/_inference_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_inference_manager.py @@ -12,7 +12,8 @@ from . import _are_deterministic_algorithms_enabled, _io, _logger, _use_deterministic_algorithms, _utils from ._execution_agent import InferenceAgent -from ._fallback import ORTModuleFallbackException, _FallbackManager, _FallbackPolicy +from ._fallback import _FallbackManager, _FallbackPolicy +from ._fallback_exceptions import ORTModuleFallbackException from ._graph_execution_manager import GraphExecutionManager, _RunStateInfo, _SkipCheck from .debug_options import DebugOptions diff --git a/orttraining/orttraining/python/training/ortmodule/_io.py b/orttraining/orttraining/python/training/ortmodule/_io.py index 41987208738c9..ddb98b104a50d 100644 --- a/orttraining/orttraining/python/training/ortmodule/_io.py +++ b/orttraining/orttraining/python/training/ortmodule/_io.py @@ -11,7 +11,7 @@ import torch -from ._fallback import ORTModuleIOError, ORTModuleONNXModelException, _FallbackManager, wrap_exception +from ._fallback_exceptions import ORTModuleIOError, ORTModuleONNXModelException, wrap_exception from ._utils import warn_of_constant_inputs @@ -298,7 +298,7 @@ def __eq__(self, other): if not other: return False elif not isinstance(other, _TensorStub): - raise NotImplemented("_TensorStub must only be compared to another _TensorStub instance!") + raise NotImplementedError("_TensorStub must only be compared to another _TensorStub instance!") elif self.name != other.name: return False elif self.dtype != other.dtype: diff --git a/orttraining/orttraining/python/training/ortmodule/_logger.py b/orttraining/orttraining/python/training/ortmodule/_logger.py index 66e1cb556538f..f3d4d930746b6 100644 --- a/orttraining/orttraining/python/training/ortmodule/_logger.py +++ b/orttraining/orttraining/python/training/ortmodule/_logger.py @@ -3,12 +3,13 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -from onnxruntime.capi._pybind_state import Severity -from contextlib import contextmanager -from enum import IntEnum import io import sys import warnings +from contextlib import contextmanager +from enum import IntEnum + +from onnxruntime.capi._pybind_state import Severity class LogLevel(IntEnum): diff --git a/orttraining/orttraining/python/training/ortmodule/_torch_module_factory.py b/orttraining/orttraining/python/training/ortmodule/_torch_module_factory.py index 41d82eded40c1..d2954a287e804 100644 --- a/orttraining/orttraining/python/training/ortmodule/_torch_module_factory.py +++ b/orttraining/orttraining/python/training/ortmodule/_torch_module_factory.py @@ -2,9 +2,9 @@ # Licensed under the MIT License. 
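Two hunks in this stretch replace `raise NotImplemented` with `raise NotImplementedError`. `NotImplemented` is a singleton constant intended to be returned from binary and comparison dunders, not an exception: raising it fails with a TypeError, and `NotImplemented("...")` fails even earlier because the singleton is not callable. A minimal sketch of the distinction, using hypothetical classes rather than the ORT ones:

```python
class Base:
    def run(self):
        # Correct way to mark an unimplemented method: raise the exception type.
        raise NotImplementedError("subclasses must override run()")


class Point:
    def __init__(self, x: int) -> None:
        self.x = x

    def __eq__(self, other):
        # Correct way to refuse a comparison: return the NotImplemented
        # singleton so Python can try the reflected operation or fall back
        # to the default identity-based comparison.
        if not isinstance(other, Point):
            return NotImplemented
        return self.x == other.x


try:
    Base().run()
except NotImplementedError as err:
    print(err)

assert Point(1) == Point(1)
assert Point(1) != "not a point"

try:
    raise NotImplemented
except TypeError as err:
    # Raising the singleton itself is rejected:
    # "exceptions must derive from BaseException".
    print(err)
```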
# _torch_module_factory.py +from ._fallback import _FallbackManager from ._torch_module_ort import TorchModuleORT from .debug_options import DebugOptions -from ._fallback import _FallbackManager class TorchModuleFactory: diff --git a/orttraining/orttraining/python/training/ortmodule/_torch_module_interface.py b/orttraining/orttraining/python/training/ortmodule/_torch_module_interface.py index 6d7a9db2433a0..d7e369613e6bc 100644 --- a/orttraining/orttraining/python/training/ortmodule/_torch_module_interface.py +++ b/orttraining/orttraining/python/training/ortmodule/_torch_module_interface.py @@ -3,9 +3,9 @@ # _torch_module_interface.py from collections import OrderedDict -import torch -from typing import Iterator, Optional, Tuple, TypeVar, Callable +from typing import Callable, Iterator, Optional, Tuple, TypeVar +import torch T = TypeVar("T", bound="torch.nn.Module") diff --git a/orttraining/orttraining/python/training/ortmodule/_torch_module_ort.py b/orttraining/orttraining/python/training/ortmodule/_torch_module_ort.py index ea0676b12587c..fd03ede82e104 100644 --- a/orttraining/orttraining/python/training/ortmodule/_torch_module_ort.py +++ b/orttraining/orttraining/python/training/ortmodule/_torch_module_ort.py @@ -2,15 +2,17 @@ # Licensed under the MIT License. # _torch_module_ort.py -from . import _io, _utils -from .debug_options import DebugOptions -from ._graph_execution_manager_factory import GraphExecutionManagerFactory -from ._torch_module_interface import TorchModuleInterface -from ._fallback import _FallbackManager, ORTModuleTorchModelException, wrap_exception from collections import OrderedDict +from typing import Callable, Iterator, Optional, Tuple, TypeVar + import torch -from typing import Iterator, Optional, Tuple, TypeVar, Callable +from . import _io, _utils +from ._fallback import _FallbackManager +from ._fallback_exceptions import ORTModuleTorchModelException, wrap_exception +from ._graph_execution_manager_factory import GraphExecutionManagerFactory +from ._torch_module_interface import TorchModuleInterface +from .debug_options import DebugOptions T = TypeVar("T", bound="torch.nn.Module") @@ -145,7 +147,7 @@ def _replicate_for_data_parallel(self): ), ) - def add_module(self, name: str, module: Optional["Module"]) -> None: + def add_module(self, name: str, module: Optional["torch.nn.Module"]) -> None: raise wrap_exception( ORTModuleTorchModelException, NotImplementedError("ORTModule does not support adding modules to it.") ) diff --git a/orttraining/orttraining/python/training/ortmodule/_torch_module_pytorch.py b/orttraining/orttraining/python/training/ortmodule/_torch_module_pytorch.py index 44a43b2429e1c..5335f34172436 100644 --- a/orttraining/orttraining/python/training/ortmodule/_torch_module_pytorch.py +++ b/orttraining/orttraining/python/training/ortmodule/_torch_module_pytorch.py @@ -2,12 +2,12 @@ # Licensed under the MIT License. 
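Several annotations in these files change `Optional["Module"]` to `Optional["torch.nn.Module"]`. A quoted annotation is a forward reference that still has to name something resolvable in the module's namespace; these files import `torch` but never appear to bind a bare `Module`, so the old strings could not be resolved by tools such as `typing.get_type_hints`. A minimal sketch with a hypothetical function, assuming `torch` is importable:

```python
from typing import Optional, get_type_hints

import torch


def attach(module: Optional["torch.nn.Module"] = None) -> None:
    """The quoted annotation resolves because `torch` is imported in this module."""


# Resolves to Optional[torch.nn.Module] without error.
print(get_type_hints(attach))


def attach_broken(module: Optional["Module"] = None) -> None:
    """`Module` is not bound anywhere in this module, so resolution fails."""


try:
    get_type_hints(attach_broken)
except NameError as err:
    print(err)  # name 'Module' is not defined
```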
# _torch_module_pytorch.py -from ._torch_module_interface import TorchModuleInterface - from collections import OrderedDict +from typing import Callable, Iterator, Optional, Tuple, TypeVar + import torch -from typing import Iterator, Optional, Tuple, TypeVar, Callable +from ._torch_module_interface import TorchModuleInterface T = TypeVar("T", bound="torch.nn.Module") diff --git a/orttraining/orttraining/python/training/ortmodule/_training_manager.py b/orttraining/orttraining/python/training/ortmodule/_training_manager.py index 6c73866aa6bc6..01717a1ad7328 100644 --- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py @@ -13,7 +13,8 @@ from . import _are_deterministic_algorithms_enabled, _io, _logger, _use_deterministic_algorithms, _utils from ._execution_agent import TrainingAgent -from ._fallback import ORTModuleFallbackException, _FallbackManager, _FallbackPolicy +from ._fallback import _FallbackManager, _FallbackPolicy +from ._fallback_exceptions import ORTModuleFallbackException from ._graph_execution_manager import GraphExecutionManager, _RunStateInfo, _SkipCheck from .debug_options import DebugOptions diff --git a/orttraining/orttraining/python/training/ortmodule/_utils.py b/orttraining/orttraining/python/training/ortmodule/_utils.py index a43f0c3e66c7d..fcdfb83b89b2b 100644 --- a/orttraining/orttraining/python/training/ortmodule/_utils.py +++ b/orttraining/orttraining/python/training/ortmodule/_utils.py @@ -12,11 +12,10 @@ import traceback import types import warnings -from typing import List -from packaging.version import Version as LooseVersion import numpy as np import torch +from packaging.version import Version as LooseVersion from torch._C import _from_dlpack from torch.utils.dlpack import to_dlpack @@ -316,7 +315,7 @@ def get_exception_as_string(exception): try: raise exception - except: + except Exception: return traceback.format_exc() diff --git a/orttraining/orttraining/python/training/ortmodule/experimental/json_config/_load_config_from_json.py b/orttraining/orttraining/python/training/ortmodule/experimental/json_config/_load_config_from_json.py index f251df27360ee..2f1451497ffcd 100644 --- a/orttraining/orttraining/python/training/ortmodule/experimental/json_config/_load_config_from_json.py +++ b/orttraining/orttraining/python/training/ortmodule/experimental/json_config/_load_config_from_json.py @@ -3,17 +3,18 @@ # _load_config_from_json.py import json -import os import logging +import os +from functools import reduce from types import SimpleNamespace from onnxruntime.capi import _pybind_state as C -from functools import reduce -from . import JSON_PATH_ENVIRONMENT_KEY +from onnxruntime.training import ortmodule + from ..._fallback import _FallbackPolicy from ..._graph_execution_manager import _SkipCheck from ...debug_options import DebugOptions, LogLevel, _SaveOnnxOptions -from onnxruntime.training import ortmodule +from . import JSON_PATH_ENVIRONMENT_KEY log = logging.getLogger(__name__) diff --git a/orttraining/orttraining/python/training/ortmodule/ortmodule.py b/orttraining/orttraining/python/training/ortmodule/ortmodule.py index 18000e0462d00..f57940637468b 100644 --- a/orttraining/orttraining/python/training/ortmodule/ortmodule.py +++ b/orttraining/orttraining/python/training/ortmodule/ortmodule.py @@ -3,24 +3,24 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- -from ._torch_module_factory import TorchModuleFactory -from ._torch_module_pytorch import TorchModulePytorch -from ._torch_module_ort import TorchModuleORT -from ._custom_op_symbolic_registry import CustomOpSymbolicRegistry -from ._custom_gradient_registry import CustomGradientRegistry -from . import _utils -from .debug_options import DebugOptions -from ._fallback import _FallbackManager, _FallbackPolicy, ORTModuleFallbackException -from onnxruntime.training import ortmodule - -from onnxruntime.tools import pytorch_export_contrib_ops +from typing import Callable, Iterator, Optional, OrderedDict, Tuple, TypeVar import torch -from typing import Iterator, Optional, Tuple, TypeVar, Callable +from onnxruntime.tools import pytorch_export_contrib_ops +from onnxruntime.training import ortmodule + +from . import _utils +from ._custom_gradient_registry import CustomGradientRegistry +from ._custom_op_symbolic_registry import CustomOpSymbolicRegistry +from ._fallback import _FallbackManager, _FallbackPolicy +from ._fallback_exceptions import ORTModuleFallbackException +from ._torch_module_factory import TorchModuleFactory +from ._torch_module_ort import TorchModuleORT +from .debug_options import DebugOptions # Needed to override PyTorch methods -T = TypeVar("T", bound="Module") +T = TypeVar("T", bound="torch.nn.Module") class ORTModule(torch.nn.Module): @@ -145,7 +145,7 @@ def _replicate_for_data_parallel(self): return self._torch_module._replicate_for_data_parallel() - def add_module(self, name: str, module: Optional["Module"]) -> None: + def add_module(self, name: str, module: Optional["torch.nn.Module"]) -> None: """Raises a ORTModuleTorchModelException exception since ORTModule does not support adding modules to it""" self._torch_module.add_module(name, module) @@ -176,7 +176,7 @@ def _apply(self, fn): self._torch_module._apply(fn) return self - def apply(self: T, fn: Callable[["Module"], None]) -> T: + def apply(self: T, fn: Callable[["torch.nn.Module"], None]) -> T: """Override :meth:`~torch.nn.Module.apply` to delegate execution to ONNX Runtime""" self._torch_module.apply(fn) @@ -203,7 +203,7 @@ def state_dict(self, destination=None, prefix="", keep_vars=False): return self._torch_module.state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) - def load_state_dict(self, state_dict: "OrderedDict[str, Tensor]", strict: bool = True): + def load_state_dict(self, state_dict: OrderedDict[str, torch.Tensor], strict: bool = True): """Override :meth:`~torch.nn.Module.load_state_dict` to delegate execution to ONNX Runtime""" return self._torch_module.load_state_dict(state_dict, strict=strict) @@ -257,12 +257,12 @@ def _load_from_state_dict( state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ) - def named_children(self) -> Iterator[Tuple[str, "Module"]]: + def named_children(self) -> Iterator[Tuple[str, "torch.nn.Module"]]: """Override :meth:`~torch.nn.Module.named_children`""" yield from self._torch_module.named_children() - def modules(self) -> Iterator["Module"]: + def modules(self) -> Iterator["torch.nn.Module"]: """Override :meth:`~torch.nn.Module.modules`""" yield from self._torch_module.modules() diff --git a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/setup.py b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/setup.py index 0ab8a0c1899e2..9b705f514e1a1 100644 --- 
a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/setup.py +++ b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/setup.py @@ -5,7 +5,7 @@ import os -from setuptools import Extension, setup +from setuptools import setup from torch.utils import cpp_extension filename = os.path.join(os.path.dirname(__file__), "torch_interop_utils.cc") diff --git a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/setup.py b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/setup.py index b73623c430525..7fd5a236b18d2 100644 --- a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/setup.py +++ b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/setup.py @@ -3,9 +3,7 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -import fileinput import os -import sys from setuptools import setup from torch.utils import cpp_extension diff --git a/orttraining/orttraining/python/training/orttrainer.py b/orttraining/orttraining/python/training/orttrainer.py index bdf6a1e9e1ea1..8f7b0842f1ce6 100644 --- a/orttraining/orttraining/python/training/orttrainer.py +++ b/orttraining/orttraining/python/training/orttrainer.py @@ -298,7 +298,7 @@ def save_as_onnx(self, path): def _check_model_export(self, input): from numpy.testing import assert_allclose - from onnx import TensorProto, helper, numpy_helper + from onnx import numpy_helper onnx_model_copy = copy.deepcopy(self._onnx_model) @@ -729,7 +729,7 @@ def get_providers(provider_options): if gpu_ep_name not in providers: raise RuntimeError( "ORTTrainer options specify a CUDA device but the {} provider is unavailable.".format( - cuda_ep_name + gpu_ep_name ) ) @@ -930,8 +930,8 @@ def _training_session_run_helper(self, is_train, inputs, inputs_desc, outputs_de # to move the data between device and host. # so output will be on the same device as input. 
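The orttrainer.py hunk above fixes another latent NameError: the RuntimeError message formatted `cuda_ep_name`, a name that does not exist in that scope, where `gpu_ep_name` was intended. Because the mistake sits on an error path, it only surfaces when the provider really is missing, but flake8 flags it statically as an undefined name (F821). A minimal standalone re-sketch of the shape of the fix, not the ORTTrainer code itself (the helper name and default provider string are illustrative):

```python
def require_gpu_provider(providers, gpu_ep_name="CUDAExecutionProvider"):
    if gpu_ep_name not in providers:
        # Before the fix this message referenced `cuda_ep_name`, which is not
        # defined here; the intended RuntimeError would have been replaced by
        # a NameError the first time this branch was hit.
        raise RuntimeError(
            f"ORTTrainer options specify a CUDA device but the {gpu_ep_name} provider is unavailable."
        )


require_gpu_provider(["CUDAExecutionProvider", "CPUExecutionProvider"])  # passes silently
```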
try: - test_pt_device = torch.device(target_device) - except: + torch.device(target_device) + except Exception: # in this case, input/output must on CPU assert input.device.type == "cpu" target_device = "cpu" diff --git a/orttraining/orttraining/python/training/orttrainer_options.py b/orttraining/orttraining/python/training/orttrainer_options.py index 9e7a2bde4dfa0..c4c54a57a4565 100644 --- a/orttraining/orttraining/python/training/orttrainer_options.py +++ b/orttraining/orttraining/python/training/orttrainer_options.py @@ -1,5 +1,4 @@ import cerberus -import torch import onnxruntime as ort @@ -536,7 +535,7 @@ def _check_is_callable(field, value, error): try: # Python 3 result = value is None or callable(value) - except: + except Exception: # Python 3 but < 3.2 if hasattr(value, "__call__"): result = True diff --git a/orttraining/orttraining/python/training/postprocess.py b/orttraining/orttraining/python/training/postprocess.py index ff77a05e41e31..12ae8ed34181f 100644 --- a/orttraining/orttraining/python/training/postprocess.py +++ b/orttraining/orttraining/python/training/postprocess.py @@ -1,12 +1,7 @@ -import sys -import os.path -from onnx import * -import onnx -import numpy as np import struct +import onnx from onnx import helper -from onnx import numpy_helper def run_postprocess(model): @@ -169,7 +164,7 @@ def fix_expand_shape_pt_1_5(model): if n_shape.op_type != "Shape" or n_constant_g.op_type != "Constant": break n_input = n_shape.input[0] - if not n_input in model_inputs_names: + if n_input not in model_inputs_names: break n_input_candidates.append(n_input) diff --git a/orttraining/orttraining/python/training/torchdynamo/register_backend.py b/orttraining/orttraining/python/training/torchdynamo/register_backend.py index 2830a10b4feb7..852cf7e89500c 100644 --- a/orttraining/orttraining/python/training/torchdynamo/register_backend.py +++ b/orttraining/orttraining/python/training/torchdynamo/register_backend.py @@ -7,6 +7,7 @@ from torch._dynamo.backends.common import aot_autograd from .ort_backend import OrtBackend +from .ort_backend import OrtBackend # This should be the underlying compiler for ALL graphs if # the user uses ORT to accelerate PyTorch via Dynamo. diff --git a/orttraining/orttraining/python/training/utils/data/__init__.py b/orttraining/orttraining/python/training/utils/data/__init__.py index 91207012216d3..ea1195f247d90 100644 --- a/orttraining/orttraining/python/training/utils/data/__init__.py +++ b/orttraining/orttraining/python/training/utils/data/__init__.py @@ -2,4 +2,4 @@ # Licensed under the MIT License. # __init__.py -from .sampler import LoadBalancingDistributedSampler, LoadBalancingDistributedBatchSampler +from .sampler import LoadBalancingDistributedBatchSampler, LoadBalancingDistributedSampler diff --git a/orttraining/orttraining/python/training/utils/data/sampler.py b/orttraining/orttraining/python/training/utils/data/sampler.py index 932f9e76dc13c..2fab9a11d95e9 100644 --- a/orttraining/orttraining/python/training/utils/data/sampler.py +++ b/orttraining/orttraining/python/training/utils/data/sampler.py @@ -2,13 +2,14 @@ # Licensed under the MIT License. 
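A recurring fix in these files replaces bare `except:` clauses with `except Exception:` (flake8 E722). A bare clause also swallows `KeyboardInterrupt` and `SystemExit`, which derive from `BaseException` but not `Exception`, so a Ctrl-C or an explicit `sys.exit()` inside the guarded block would be silently absorbed. A minimal sketch with hypothetical helpers, not code from this diff:

```python
import sys


def shutdown_with_bare_except():
    try:
        sys.exit(1)
    except:  # noqa: E722 -- catches SystemExit too, so the exit request is lost
        print("exit request was silently absorbed")


def shutdown_with_narrow_except():
    try:
        sys.exit(1)
    except Exception:  # SystemExit is not an Exception, so it propagates
        print("never reached")


shutdown_with_bare_except()       # prints the message and returns normally
# shutdown_with_narrow_except()   # would actually terminate the interpreter
```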
# sampler.py -import torch import math +from typing import Callable, Iterator, Optional + +import numpy as np +import torch import torch.distributed as dist -from torch.utils.data.sampler import Sampler from torch.utils.data.dataset import Dataset -from typing import Optional, Iterator, Callable -import numpy as np +from torch.utils.data.sampler import Sampler def _shard_wrapped_indices_across_workers(dataset_index_list, num_shards, num_samples_per_shard): diff --git a/orttraining/orttraining/test/external_custom_ops/setup.py b/orttraining/orttraining/test/external_custom_ops/setup.py index 57ba10b91ad2d..6179a746493e4 100644 --- a/orttraining/orttraining/test/external_custom_ops/setup.py +++ b/orttraining/orttraining/test/external_custom_ops/setup.py @@ -1,13 +1,14 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import sys import os import subprocess -from setuptools import setup, Extension -from setuptools.command.build_ext import build_ext -from subprocess import CalledProcessError -import pybind11 +import sys + import onnx +import pybind11 +from setuptools import Extension, setup +from setuptools.command.build_ext import build_ext + import onnxruntime diff --git a/orttraining/orttraining/test/external_custom_ops/test.py b/orttraining/orttraining/test/external_custom_ops/test.py index 7d3e4edf48bd8..18b37f9305e20 100644 --- a/orttraining/orttraining/test/external_custom_ops/test.py +++ b/orttraining/orttraining/test/external_custom_ops/test.py @@ -1,18 +1,17 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import os -import sys + import numpy as np +# Restore dlopen flags. +import orttraining_external_custom_ops + # Expose available (onnx::* and protobuf::*) symbols from onnxruntime to resolve references in # the custom ops shared library. Deepbind flag is required to avoid conflicts with other # instances of onnx/protobuf libraries. import onnxruntime -# Restore dlopen flags. -import orttraining_external_custom_ops - so = onnxruntime.SessionOptions() sess = onnxruntime.InferenceSession("testdata/model.onnx", so) input = np.random.rand(2, 2).astype(np.float32) diff --git a/orttraining/orttraining/test/external_transformer/test/external_transformers_test.py b/orttraining/orttraining/test/external_transformer/test/external_transformers_test.py index a1377d2448bfd..7c749135decb3 100644 --- a/orttraining/orttraining/test/external_transformer/test/external_transformers_test.py +++ b/orttraining/orttraining/test/external_transformer/test/external_transformers_test.py @@ -78,14 +78,14 @@ def readOutput(self): self.capturedtext += char +import os +import unittest + import torch -from onnxruntime.capi import _pybind_state as torch_ort_eager import torch.nn as nn import torch.nn.functional as F -import numpy as np -import os -from onnxruntime.training import optim, orttrainer, orttrainer_options -import unittest + +from onnxruntime.training import optim, orttrainer def my_loss(x, target): @@ -134,7 +134,7 @@ def test_external_graph_transformer_triggering(self): target = torch.randint(0, 10, (batch_size,)) with OutputGrabber() as out: - loss = model.train_step(data, target) + model.train_step(data, target) assert "******************Trigger Customized Graph Transformer: MyGraphTransformer!" 
in out.capturedtext diff --git a/orttraining/orttraining/test/python/_test_helpers.py b/orttraining/orttraining/test/python/_test_helpers.py index 95c3b58521a56..8b13bf14afd55 100644 --- a/orttraining/orttraining/test/python/_test_helpers.py +++ b/orttraining/orttraining/test/python/_test_helpers.py @@ -11,7 +11,6 @@ try: from onnxruntime.training.ortmodule import ORTModule from onnxruntime.training.ortmodule._fallback import ORTModuleInitException - from onnxruntime.training.ortmodule._graph_execution_manager_factory import GraphExecutionManagerFactory except ImportError: # Some pipelines do not contain ORTModule pass @@ -61,7 +60,7 @@ def assert_model_outputs(output_a, output_b, verbose=False, rtol=1e-7, atol=0): ) # for idx in range(len(output_a)): - assert_allclose(output_a, output_b, rtol=rtol, atol=atol, err_msg=f"Model output value mismatch") + assert_allclose(output_a, output_b, rtol=rtol, atol=atol, err_msg="Model output value mismatch") def assert_onnx_weights(model_a, model_b, verbose=False, rtol=1e-7, atol=0): diff --git a/orttraining/orttraining/test/python/launch_test.py b/orttraining/orttraining/test/python/launch_test.py index d183f3189511c..d3427b00a3061 100755 --- a/orttraining/orttraining/test/python/launch_test.py +++ b/orttraining/orttraining/test/python/launch_test.py @@ -2,14 +2,12 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import os -import sys import argparse +import logging +import sys from _test_commons import run_subprocess -import logging - logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.DEBUG) log = logging.getLogger("Build") diff --git a/orttraining/orttraining/test/python/onnxruntime_test_postprocess.py b/orttraining/orttraining/test/python/onnxruntime_test_postprocess.py index 83bd524e7d6f3..99e7698f07cb0 100644 --- a/orttraining/orttraining/test/python/onnxruntime_test_postprocess.py +++ b/orttraining/orttraining/test/python/onnxruntime_test_postprocess.py @@ -1,21 +1,15 @@ -import copy import os -import sys import unittest -import onnx -import pytest import torch import torch.nn as nn -import torch.nn.functional as F -from numpy.testing import assert_allclose, assert_array_equal from orttraining_test_bert_postprocess import postprocess_model from orttraining_test_data_loader import create_ort_test_dataloader from orttraining_test_transformers import BertForPreTraining, BertModelTest from orttraining_test_utils import map_optimizer_attributes import onnxruntime -from onnxruntime.capi.ort_trainer import IODescription, LossScaler, ModelDescription, ORTTrainer, generate_sample +from onnxruntime.capi.ort_trainer import IODescription, ModelDescription, ORTTrainer torch.manual_seed(1) onnxruntime.set_seed(1) @@ -47,7 +41,7 @@ def get_onnx_model( _extra_postprocess=_extra_postprocess, ) - train_output = model.train_step(*inputs) + model.train_step(*inputs) return model.onnx_model_ def count_all_nodes(self, model): diff --git a/orttraining/orttraining/test/python/onnxruntime_test_register_ep.py b/orttraining/orttraining/test/python/onnxruntime_test_register_ep.py index 5f71125cff413..e3030bced8439 100644 --- a/orttraining/orttraining/test/python/onnxruntime_test_register_ep.py +++ b/orttraining/orttraining/test/python/onnxruntime_test_register_ep.py @@ -1,6 +1,7 @@ +import os import unittest + import onnxruntime_pybind11_state as C -import os class EPRegistrationTests(unittest.TestCase): diff --git 
a/orttraining/orttraining/test/python/orttraining_ortmodule_distributed_tests.py b/orttraining/orttraining/test/python/orttraining_ortmodule_distributed_tests.py index 4f778444b88f0..08b304cb0e3b2 100644 --- a/orttraining/orttraining/test/python/orttraining_ortmodule_distributed_tests.py +++ b/orttraining/orttraining/test/python/orttraining_ortmodule_distributed_tests.py @@ -2,13 +2,12 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import sys import argparse +import logging +import sys from _test_commons import run_subprocess -import logging - logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.DEBUG) log = logging.getLogger("ORTModuleDistributedTests") diff --git a/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py b/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py index a087a97da5a54..17b25290c9b7f 100644 --- a/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py +++ b/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py @@ -1,37 +1,27 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - # ================== -import os -import shutil -import logging -import random -import h5py -from tqdm import tqdm -import datetime -import numpy as np import dataclasses -from dataclasses import dataclass, field -from typing import Optional, Any, Dict -import json +import datetime import glob - +import json +import logging +import os +import random +import shutil import unittest +from concurrent.futures import ProcessPoolExecutor +from dataclasses import dataclass, field +from typing import Any, Dict, Optional +import h5py +import numpy as np import torch -from torch.utils.data import DataLoader, RandomSampler, Dataset import torch.distributed as dist +from torch.utils.data import DataLoader, Dataset, RandomSampler from torch.utils.tensorboard import SummaryWriter - -from transformers import BertForPreTraining, BertConfig, HfArgumentParser - -from concurrent.futures import ProcessPoolExecutor +from tqdm import tqdm +from transformers import BertConfig, BertForPreTraining, HfArgumentParser import onnxruntime as ort -from onnxruntime.training import amp, optim, orttrainer -from onnxruntime.training.optim import PolyWarmupLRScheduler, LinearWarmupLRScheduler -from onnxruntime.training.checkpoint import aggregate_checkpoints # need to override torch.onnx.symbolic_opset12.nll_loss to handle ignore_index == -100 cases. # the fix for ignore_index == -100 cases is already in pytorch master. @@ -39,6 +29,9 @@ # eventually we will use pytorch with fixed nll_loss once computation # issues are understood and solved. import onnxruntime.capi.pt_patch +from onnxruntime.training import amp, optim, orttrainer +from onnxruntime.training.checkpoint import aggregate_checkpoints +from onnxruntime.training.optim import LinearWarmupLRScheduler # we cannot make full convergence run in nightly pipeling because of its timeout limit, # max_steps is still needed to calculate learning rate. 
force_to_stop_max_steps is used to @@ -493,7 +486,6 @@ def do_pretrain(args): logger.info("Running training: Batch size = %d, initial LR = %f", args.train_batch_size, args.learning_rate) - most_recent_ckpts_paths = [] average_loss = 0.0 epoch = 0 training_steps = 0 @@ -696,7 +688,7 @@ def test_pretrain_zero(self): deepspeed_zero_stage=self.deepspeed_zero_stage, save_checkpoint=True, ) - train_loss = do_pretrain(args) + do_pretrain(args) # ensure all workers reach this point before loading the checkpointed state torch.distributed.barrier() @@ -733,12 +725,7 @@ def test_pretrain_zero(self): # calling unpublished get_mpi_context_xxx to get rank/size numbers. try: # In case ORT is not built with MPI/NCCL, there are no get_mpi_context_xxx internal apis. - from onnxruntime.capi._pybind_state import ( - get_mpi_context_local_rank, - get_mpi_context_local_size, - get_mpi_context_world_rank, - get_mpi_context_world_size, - ) + from onnxruntime.capi._pybind_state import get_mpi_context_local_rank, get_mpi_context_world_size has_get_mpi_context_internal_api = True except ImportError: diff --git a/orttraining/orttraining/test/python/orttraining_run_frontend_batch_size_test.py b/orttraining/orttraining/test/python/orttraining_run_frontend_batch_size_test.py index db03a636d046e..e96b90138c3d5 100644 --- a/orttraining/orttraining/test/python/orttraining_run_frontend_batch_size_test.py +++ b/orttraining/orttraining/test/python/orttraining_run_frontend_batch_size_test.py @@ -1,6 +1,6 @@ -import sys import collections import subprocess +import sys Config = collections.namedtuple( "Config", diff --git a/orttraining/orttraining/test/python/orttraining_run_glue.py b/orttraining/orttraining/test/python/orttraining_run_glue.py index a9b514599fb78..a40d97e00be50 100644 --- a/orttraining/orttraining/test/python/orttraining_run_glue.py +++ b/orttraining/orttraining/test/python/orttraining_run_glue.py @@ -1,14 +1,13 @@ # adapted from run_glue.py of huggingface transformers -import dataclasses import logging import os +import unittest from dataclasses import dataclass, field from typing import Dict, Optional -import unittest + import numpy as np from numpy.testing import assert_allclose - from transformers import ( AutoConfig, AutoModelForSequenceClassification, @@ -24,15 +23,9 @@ ) import onnxruntime -from onnxruntime.capi.ort_trainer import ORTTrainer, LossScaler, ModelDescription, IODescription try: - from onnxruntime.capi._pybind_state import ( - get_mpi_context_local_rank, - get_mpi_context_local_size, - get_mpi_context_world_rank, - get_mpi_context_world_size, - ) + from onnxruntime.capi._pybind_state import get_mpi_context_local_rank, get_mpi_context_world_size has_get_mpi_context_internal_api = True except ImportError: @@ -42,8 +35,6 @@ from orttraining_transformer_trainer import ORTTransformerTrainer -import torch - logger = logging.getLogger(__name__) diff --git a/orttraining/orttraining/test/python/orttraining_run_multiple_choice.py b/orttraining/orttraining/test/python/orttraining_run_multiple_choice.py index a4c069c683e1c..d7154de44a7c3 100644 --- a/orttraining/orttraining/test/python/orttraining_run_multiple_choice.py +++ b/orttraining/orttraining/test/python/orttraining_run_multiple_choice.py @@ -1,35 +1,25 @@ # adapted from run_multiple_choice.py of huggingface transformers # https://github.com/huggingface/transformers/blob/master/examples/multiple-choice/run_multiple_choice.py -import dataclasses import logging import os +import unittest from dataclasses import dataclass, field from typing import 
Dict, Optional -import unittest -import numpy as np -from numpy.testing import assert_allclose +import numpy as np +from orttraining_transformer_trainer import ORTTransformerTrainer from transformers import ( AutoConfig, AutoModelForMultipleChoice, AutoTokenizer, EvalPrediction, - HfArgumentParser, - Trainer, TrainingArguments, set_seed, ) +from utils_multiple_choice import MultipleChoiceDataset, Split, SwagProcessor import onnxruntime -from onnxruntime.capi.ort_trainer import ORTTrainer, LossScaler, ModelDescription, IODescription - -from orttraining_transformer_trainer import ORTTransformerTrainer - -import torch - -from utils_multiple_choice import MultipleChoiceDataset, Split, SwagProcessor -from orttraining_run_glue import verify_old_and_new_api_are_equal logger = logging.getLogger(__name__) diff --git a/orttraining/orttraining/test/python/orttraining_test_bert_postprocess.py b/orttraining/orttraining/test/python/orttraining_test_bert_postprocess.py index 66de14dce6852..57238d35d51f6 100644 --- a/orttraining/orttraining/test/python/orttraining_test_bert_postprocess.py +++ b/orttraining/orttraining/test/python/orttraining_test_bert_postprocess.py @@ -1,5 +1,4 @@ -from orttraining_test_model_transform import add_name, fix_transpose, add_expand_shape -from orttraining_test_layer_norm_transform import layer_norm_transform +from orttraining_test_model_transform import add_name def postprocess_model(model): diff --git a/orttraining/orttraining/test/python/orttraining_test_checkpoint_storage.py b/orttraining/orttraining/test/python/orttraining_test_checkpoint_storage.py index 2ef8322bd9cfd..87eef2c8c40fe 100644 --- a/orttraining/orttraining/test/python/orttraining_test_checkpoint_storage.py +++ b/orttraining/orttraining/test/python/orttraining_test_checkpoint_storage.py @@ -2,12 +2,13 @@ # Licensed under the MIT License. 
# orttraining_test_checkpoint_storage.py -import pytest -import torch -import numpy as np import os -import shutil import pickle +import shutil + +import numpy as np +import pytest +import torch from onnxruntime.training import _checkpoint_storage diff --git a/orttraining/orttraining/test/python/orttraining_test_data_loader.py b/orttraining/orttraining/test/python/orttraining_test_data_loader.py index 2df5a3964bc94..cfe44b83883b6 100644 --- a/orttraining/orttraining/test/python/orttraining_test_data_loader.py +++ b/orttraining/orttraining/test/python/orttraining_test_data_loader.py @@ -1,7 +1,9 @@ -from enum import Enum import random +from enum import Enum + import torch -from torch.utils.data import Dataset, DataLoader +from torch.utils.data import DataLoader, Dataset + from onnxruntime.capi.ort_trainer import generate_sample global_rng = random.Random() diff --git a/orttraining/orttraining/test/python/orttraining_test_debuggability.py b/orttraining/orttraining/test/python/orttraining_test_debuggability.py index d3d6987f47c2a..499f0ba7a1ff5 100644 --- a/orttraining/orttraining/test/python/orttraining_test_debuggability.py +++ b/orttraining/orttraining/test/python/orttraining_test_debuggability.py @@ -1,33 +1,9 @@ -import inspect -import onnx -import os import pytest import torch -import torchvision - -from numpy.testing import assert_allclose - -from onnxruntime import set_seed -from onnxruntime.capi.ort_trainer import ( - IODescription as Legacy_IODescription, - ModelDescription as Legacy_ModelDescription, - LossScaler as Legacy_LossScaler, - ORTTrainer as Legacy_ORTTrainer, -) -from onnxruntime.training import ( - _utils, - amp, - optim, - orttrainer, - TrainStepInfo, - model_desc_validation as md_val, - orttrainer_options as orttrainer_options, -) - from _test_commons import _load_pytorch_transformer_model -import _test_helpers - +from onnxruntime import set_seed +from onnxruntime.training import optim, orttrainer ############################################################################### # Testing starts here ######################################################### diff --git a/orttraining/orttraining/test/python/orttraining_test_dort.py b/orttraining/orttraining/test/python/orttraining_test_dort.py index 3c2166a735e8f..ae6d1ac3c46f4 100644 --- a/orttraining/orttraining/test/python/orttraining_test_dort.py +++ b/orttraining/orttraining/test/python/orttraining_test_dort.py @@ -7,7 +7,7 @@ from torch import nn from torch.nn import functional as F -from onnxruntime.training.torchdynamo.register_backend import ort, aot_ort +from onnxruntime.training.torchdynamo.register_backend import aot_ort, ort class TestTorchDynamoOrt(unittest.TestCase): diff --git a/orttraining/orttraining/test/python/orttraining_test_experimental_gradient_graph.py b/orttraining/orttraining/test/python/orttraining_test_experimental_gradient_graph.py index c67de052753ad..8f81d03dbae55 100644 --- a/orttraining/orttraining/test/python/orttraining_test_experimental_gradient_graph.py +++ b/orttraining/orttraining/test/python/orttraining_test_experimental_gradient_graph.py @@ -4,8 +4,9 @@ import numpy as np import onnx -import onnxruntime import torch + +import onnxruntime from onnxruntime.training.experimental import export_gradient_graph diff --git a/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py b/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py index 42daff79bd7d1..1e7fc2df57895 100644 --- 
a/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py +++ b/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py @@ -8,7 +8,6 @@ import torch.nn.functional as F from torch.utils.checkpoint import checkpoint -from onnxruntime.training.ortmodule import ORTModule from onnxruntime.training.ortmodule.experimental.hierarchical_ortmodule import HierarchicalORTModule diff --git a/orttraining/orttraining/test/python/orttraining_test_layer_norm_transform.py b/orttraining/orttraining/test/python/orttraining_test_layer_norm_transform.py index 241a963e28498..370fa52e6d2cd 100644 --- a/orttraining/orttraining/test/python/orttraining_test_layer_norm_transform.py +++ b/orttraining/orttraining/test/python/orttraining_test_layer_norm_transform.py @@ -14,10 +14,10 @@ def find_node(graph_proto, op_type): def gen_attribute(key, value): - attr = AttributeProto() + attr = onnx.AttributeProto() attr.name = key attr.ints.extend(int(v) for v in value) - attr.type = AttributeProto.INTS + attr.type = onnx.AttributeProto.INTS return attr diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index 1a386579f8b69..97599639f6be5 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -557,7 +557,7 @@ def test_forward_call_positional_and_keyword_arguments(): ], ) def test_compare_pytorch_forward_call_positional_and_keyword_arguments(forward_statement): - one = torch.FloatTensor([1]) + torch.FloatTensor([1]) model = NeuralNetSimplePositionalAndKeywordArguments() pytorch_result = eval(forward_statement + ".item()") @@ -679,7 +679,7 @@ def test_model_with_different_devices_same_session(): model.to(device) x = torch.randn(N, D_in, device=device) - y = model(x) + model(x) del os.environ["ORTMODULE_SKIPCHECK_POLICY"] @@ -1661,8 +1661,6 @@ def run_step(model, input): @pytest.mark.parametrize("input_shape", ([4, 2],)) def test_aten_argmax(input_shape): - import torch.nn.functional as F - class TopKGate(torch.nn.Module): def forward(self, input: torch.Tensor): indices = torch.argmax(input, dim=1) @@ -2074,7 +2072,7 @@ def run_step(backbone_layers, task_layers, x): _test_helpers.assert_gradients_match_and_reset_gradient(ort_model1, pt_model1) # Run task 2 - x2 = torch.randn(N, D_in, device=device) + torch.randn(N, D_in, device=device) pt_prediction = run_step(pt_model0, pt_model2, x1) ort_prediction = run_step(ort_model0, ort_model2, x1) @@ -2314,11 +2312,11 @@ def run_step(model, x1): ort_x1 = pt_x1.clone() with pytest.raises(Exception) as ex_info: - pt_y1 = run_step(pt_model, pt_x1) + run_step(pt_model, pt_x1) assert "modified by an inplace operation" in str(ex_info.value) with pytest.raises(Exception) as ex_info: - ort_y1 = run_step(ort_model, ort_x1) + run_step(ort_model, ort_x1) assert "modified by an inplace operation" in str(ex_info.value) @@ -2671,7 +2669,7 @@ def forward(self, x): ort_model = ORTModule(copy.deepcopy(pt_model)) with pytest.raises(RuntimeError) as runtime_error: ort_model(x) - assert f"Expected all tensors to be on the same device, but found at least two devices" in str( + assert "Expected all tensors to be on the same device, but found at least two devices" in str( runtime_error.value ) else: @@ -2702,7 +2700,7 @@ def forward(self, x): with pytest.raises(RuntimeError) as runtime_error: ort_model = ORTModule(copy.deepcopy(pt_model)) ort_model(x) - assert 
f"Expected all tensors to be on the same device, but found at least two devices" in str( + assert "Expected all tensors to be on the same device, but found at least two devices" in str( runtime_error.value ) else: @@ -2733,7 +2731,7 @@ def forward(self, x): ort_model = ORTModule(copy.deepcopy(pt_model)) with pytest.raises(RuntimeError) as runtime_error: ort_model(x) - assert f"Expected all tensors to be on the same device, but found at least two devices" in str( + assert "Expected all tensors to be on the same device, but found at least two devices" in str( runtime_error.value ) else: @@ -2764,7 +2762,7 @@ def forward(self, x): ort_model = ORTModule(copy.deepcopy(pt_model)) with pytest.raises(RuntimeError) as runtime_error: ort_model(x) - assert f"Expected all tensors to be on the same device, but found at least two devices" in str( + assert "Expected all tensors to be on the same device, but found at least two devices" in str( runtime_error.value ) else: @@ -2941,7 +2939,7 @@ def test_forward_data_and_model_on_different_devices(data_device, model_device): ort_model = ORTModule(model) # When exporting the model, ensure device is same between input data and model (else pytorch will raise while exporting) x = torch.randn(N, D_in, device=model_device) - output = ort_model(x) + ort_model(x) # Now that the model has been exported, feed in data from device other than the model device x = torch.randn(N, D_in, device=data_device) @@ -2951,7 +2949,7 @@ def test_forward_data_and_model_on_different_devices(data_device, model_device): # Fallback with pytest.raises(RuntimeError) as runtime_error: ort_model(x) - assert f"Expected all tensors to be on the same device, but found at least two devices" in str( + assert "Expected all tensors to be on the same device, but found at least two devices" in str( runtime_error.value ) else: @@ -3063,7 +3061,7 @@ def test_model_wrapped_inside_torch_no_grad(): # Make sure no exception is raised with torch.no_grad(): - output = model(x) + model(x) def test_model_initializer_requires_grad_changes_from_one_forward_to_next(): @@ -3207,7 +3205,7 @@ def test_state_dict(): pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = torch.randn(N, D_in, device=device) - y = x.clone() + x.clone() state_dict_ort = ort_model.state_dict() state_dict_pt = pt_model.state_dict() @@ -3236,7 +3234,7 @@ def test_load_state_dict(): pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = torch.randn(N, D_in, device=device) - y = x.clone() + x.clone() state_dict_pt = pt_model.state_dict() list(next(iter(state_dict_pt.items())))[1] += 10 @@ -3635,14 +3633,6 @@ def forward(self, pos_0, pos_1, *args, kw_0=None, kw_1=None, **kwargs): model = KwargsNet(input_size=D_in, hidden_size=H, num_classes=D_out).to(device) model = ORTModule(model) - # Dummy inputs used - pos_0 = torch.randn(N, D_in, device=device) - pos_1 = torch.randn(N, D_in, device=device) - kw_0 = torch.randn(N, D_in, device=device) - kw_1 = torch.randn(N, D_in, device=device) - args = [torch.randn(N, D_in, device=device)] * 2 - kwargs = {"kwargs_0": torch.randn(N, D_in, device=device), "kwargs_1": torch.randn(D_in, D_in, device=device)} - # Training step prediction = eval(forward_statement) assert prediction is not None @@ -4017,7 +4007,7 @@ def forward(self, bool_argument, input1): x = torch.randn(N, D_in, device=device) # Ensure that no exceptions are raised - out = model(bool_argument, x) + 
model(bool_argument, x) @pytest.mark.parametrize( @@ -4280,7 +4270,6 @@ def __init__(self): self.dummy = torch.nn.Parameter(torch.FloatTensor([0])) def forward(self, batch): - a = batch[0] b = batch[1] return self.dummy + b @@ -4412,7 +4401,7 @@ def test_debug_options_save_onnx_models_validate_fail_on_non_str_prefix(): def test_debug_options_save_onnx_models_validate_fail_on_no_prefix(): with pytest.raises(Exception) as ex_info: _ = DebugOptions(save_onnx=True) - assert f"onnx_prefix must be provided when save_onnx is set." in str(ex_info.value) + assert "onnx_prefix must be provided when save_onnx is set." in str(ex_info.value) def test_debug_options_log_level(): @@ -4535,7 +4524,6 @@ def __init__(self): self.dummy = torch.nn.Parameter(torch.FloatTensor([0])) def forward(self, batch): - b = batch["b"] a = batch["a"] return self.dummy + a @@ -4788,7 +4776,7 @@ def forward(self, a): assert not hasattr(pt_model, "_torch_module") assert "_torch_module" in ort_model.__dict__ - assert ort_model._torch_module == True + assert ort_model._torch_module is True def test_ortmodule_setattr_signals_model_changed(): @@ -4816,11 +4804,11 @@ def forward(self, a): exported_model1 = ort_model._torch_module._execution_manager(True)._onnx_models.exported_model for training_mode in [False, True]: - assert ort_model._torch_module._execution_manager(training_mode)._original_model_has_changed == False + assert ort_model._torch_module._execution_manager(training_mode)._original_model_has_changed is False ort_model.input_flag = False for training_mode in [False, True]: - assert ort_model._torch_module._execution_manager(training_mode)._original_model_has_changed == True + assert ort_model._torch_module._execution_manager(training_mode)._original_model_has_changed is True _ = ort_model(torch.randn(N, D_in, device=device)) exported_model2 = ort_model._torch_module._execution_manager(True)._onnx_models.exported_model @@ -4846,7 +4834,7 @@ def load_state_dict(self): device = "cuda" pt_model = UserNet().to(device) with pytest.warns(UserWarning) as warning_record: - ort_model = ORTModule(pt_model) + ORTModule(pt_model) # FutureWarning('The first argument to symbolic functions is deprecated in 1.13 and will be removed in the future. # Please annotate treat the first argument (g) as GraphContext and use context information from the object diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py index a625735c8c039..a760b1ed168a1 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py @@ -5,11 +5,12 @@ # pylint: disable=C0103 # pylint: disable=W0212 -import pytest -import torch +import os # Import ORT modules. -from _test_helpers import * +import _test_helpers +import pytest +import torch from packaging.version import Version from torch.nn.parameter import Parameter @@ -86,7 +87,7 @@ def input_generator(): # generate a label that have same shape as forward output. label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input) + _test_helpers.run_training_test_and_compare(model_builder, input_generator, label_input) def test_GeLU_custom_func_rets_not_as_module_output(): @@ -144,7 +145,7 @@ def input_generator(): # generate a label that have same shape as forward output. 
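The test hunks above also switch `== True` / `== False` comparisons to `is True` / `is False` (flake8 E712). For a genuine boolean flag, an identity check (or a plain truth test) states the intent exactly, whereas `== True` also accepts any value whose `__eq__` happens to agree. A small sketch with a hypothetical flag check:

```python
def is_ready(flag):
    # E712: `flag == True` would also be satisfied by the integer 1;
    # the identity check only accepts the bool singleton itself.
    return flag is True


assert is_ready(True)
assert not is_ready(1)      # 1 == True, but 1 is not the True singleton
assert not is_ready("yes")  # truthy, yet clearly not a bool flag
```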
label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input) + _test_helpers.run_training_test_and_compare(model_builder, input_generator, label_input) def test_GeLU_multiple_forward_runs(): @@ -196,7 +197,7 @@ def input_generator(): # generate a label that have same shape as forward output. label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input, run_forward_twice=True) + _test_helpers.run_training_test_and_compare(model_builder, input_generator, label_input, run_forward_twice=True) def test_MegatronF(): @@ -236,7 +237,7 @@ def input_generator(): # generate a label that have same shape as forward output. label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input) + _test_helpers.run_training_test_and_compare(model_builder, input_generator, label_input) def test_ScalarAndTuple(): @@ -283,7 +284,7 @@ def input_generator(): # generate a label that have same shape as forward output. label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input) + _test_helpers.run_training_test_and_compare(model_builder, input_generator, label_input) def test_ScalarAndTupleReordered(): @@ -330,7 +331,7 @@ def input_generator(): # generate a label that have same shape as forward output. label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input) + _test_helpers.run_training_test_and_compare(model_builder, input_generator, label_input) @pytest.mark.skip( @@ -380,7 +381,7 @@ def input_generator(): label_input = torch.ones([output_size]) # Test when input is in-place updated, but does not require gradient. - run_training_test_and_compare(model_builder, input_generator, label_input, ignore_grad_compare=True) + _test_helpers.run_training_test_and_compare(model_builder, input_generator, label_input, ignore_grad_compare=True) @pytest.mark.skip( @@ -429,7 +430,7 @@ def input_generator(): # which is a duplicated computation with the PythonOp. # So for the weights that are used twice BUT SHOULD only used once, the gradients are almost 2x than PyTorch's grad, # this is the reason we ignore the gradient compare here. - run_training_test_and_compare(model_builder, input_generator, label_input, ignore_grad_compare=True) + _test_helpers.run_training_test_and_compare(model_builder, input_generator, label_input, ignore_grad_compare=True) @pytest.mark.skip(reason="disable due to exporter bug https://github.com/microsoft/onnx-converters-private/issues/37.") @@ -476,7 +477,7 @@ def input_generator(): # generate a label that have same shape as forward output. label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input) + _test_helpers.run_training_test_and_compare(model_builder, input_generator, label_input) @pytest.mark.skip( @@ -528,7 +529,7 @@ def input_generator(): # duplicated computation with the PythonOp. Thus, for the weights that are used twice BUT SHOULD # only used once, the gradients are almost 2x than PyTorch's grad, this is the reason we # ignore the gradient compare here. 
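The autograd test file in these hunks stops doing `from _test_helpers import *` and instead imports the module and qualifies every call (`_test_helpers.run_training_test_and_compare(...)`). Star imports hide where a name comes from and defeat flake8's undefined-name checks (F403/F405); importing the module keeps call sites explicit and greppable. A minimal sketch of the same shape using a standard-library module instead of the test helpers:

```python
# Star-importing pulls many names into the namespace and makes it impossible
# for a linter or a reader to tell where `sqrt` came from:
#   from math import *
#   print(sqrt(2))
#
# Importing the module and qualifying the call keeps the origin explicit,
# which is the pattern the hunks above apply to _test_helpers.
import math

print(math.sqrt(2))
```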
- run_training_test_and_compare(model_builder, input_generator, label_input, ignore_grad_compare=True) + _test_helpers.run_training_test_and_compare(model_builder, input_generator, label_input, ignore_grad_compare=True) @pytest.mark.skip( @@ -580,7 +581,7 @@ def input_generator(): # should reuse the input torch tensor @140214095996104, 140212816617984 but actually not." It seems # if we don't have mark_dirty() in auto grad forward, the result is not using the input_, # (maybe a view of it, because data address is same) - run_training_test_and_compare(model_builder, input_generator, label_input, ignore_grad_compare=True) + _test_helpers.run_training_test_and_compare(model_builder, input_generator, label_input, ignore_grad_compare=True) ########################################################################################## @@ -630,7 +631,7 @@ def input_generator(): # generate a label that have same shape as forward output. label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input) + _test_helpers.run_training_test_and_compare(model_builder, input_generator, label_input) def test_EvalTest(): @@ -643,7 +644,7 @@ def forward(ctx, x): @staticmethod def backward(ctx, grad_output): - x = ctx.saved_tensors + ctx.saved_tensors return None class EvalTestModel(torch.nn.Module): @@ -671,8 +672,8 @@ def input_generator(): # generate a label that have same shape as forward output. label_input = torch.ones([output_size]) - # Test pure inferencing scenarios, when inputs don't requires_grad. - run_evaluate_test_and_compare(model_builder, input_generator, label_input) + # Test pure inference scenarios, when inputs don't requires_grad. + _test_helpers.run_evaluate_test_and_compare(model_builder, input_generator, label_input) @pytest.mark.skipif( @@ -736,7 +737,7 @@ def input_generator(): label_input = torch.ones([output_size]) # Test multi-input and multi-output custom function. - run_training_test_and_compare(model_builder, input_generator, label_input) + _test_helpers.run_training_test_and_compare(model_builder, input_generator, label_input) def test_InnerModuleCall(): @@ -802,12 +803,12 @@ def get_inner_module_call_result(x, device, use_ort): # Test indirect ORTModule call from custom function result_pth = get_inner_module_call_result(x.detach(), "cuda:0", False) result_ort = get_inner_module_call_result(x.detach(), "cuda:0", True) - compare_tensor_list(result_ort, result_pth) + _test_helpers.compare_tensor_list(result_ort, result_pth) # Test indirect ORTModule call from custom function result_ort = get_inner_module_call_result(x.detach(), "cpu", True) result_pth = get_inner_module_call_result(x.detach(), "cpu", False) - compare_tensor_list(result_ort, result_pth) + _test_helpers.compare_tensor_list(result_ort, result_pth) @pytest.mark.skipif( @@ -860,9 +861,9 @@ def input_generator_with_requires_grad(): label_input = torch.ones([output_size]) # Test multi-input and multi-output custom function. - run_training_test_and_compare(model_builder, input_generator, label_input) + _test_helpers.run_training_test_and_compare(model_builder, input_generator, label_input) - run_training_test_and_compare(model_builder, input_generator_with_requires_grad, label_input) + _test_helpers.run_training_test_and_compare(model_builder, input_generator_with_requires_grad, label_input) def test_MultipleStream_InForwardFunction(): @@ -908,7 +909,7 @@ def input_generator(): label_input = torch.ones([output_size]) # Test multi-input and multi-output custom function. 
- run_training_test_and_compare( + _test_helpers.run_training_test_and_compare( model_builder, input_generator, label_input, expected_outputs=[torch.tensor([0.224, 0.272])] ) @@ -956,7 +957,7 @@ def input_generator(): label_input = torch.ones([output_size]) # Test multi-input and multi-output custom function. - run_training_test_and_compare( + _test_helpers.run_training_test_and_compare( model_builder, input_generator, label_input, expected_outputs=[torch.tensor([0.224, 0.272])] ) @@ -1003,7 +1004,7 @@ def input_generator(): label_input = torch.ones([output_size]) # Test multi-input and multi-output custom function. - run_training_test_and_compare( + _test_helpers.run_training_test_and_compare( model_builder, input_generator, label_input, expected_outputs=[torch.tensor([0.224, 0.272])] ) @@ -1052,7 +1053,7 @@ def input_generator(): label_input = torch.ones([output_size]) # Test multi-input and multi-output custom function. - run_training_test_and_compare( + _test_helpers.run_training_test_and_compare( model_builder, input_generator, label_input, expected_outputs=[torch.tensor([0.224, 0.272])] ) @@ -1176,7 +1177,7 @@ def forward(ctx, x): @staticmethod def backward(ctx, grad_output): - x = ctx.saved_tensors + ctx.saved_tensors return None class TestSkippedModel(torch.nn.Module): @@ -1333,4 +1334,4 @@ def input_generator(): return torch.randn(output_size, output_size, dtype=torch.float).requires_grad_() label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input) + _test_helpers.run_training_test_and_compare(model_builder, input_generator, label_input) diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd_dist.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd_dist.py index 188e053e4711e..1f9e3d4584483 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd_dist.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd_dist.py @@ -3,18 +3,17 @@ import copy -import onnxruntime import os -import sys + +import _test_helpers import torch import torch.distributed as dist import torch.multiprocessing as mp -from onnxruntime.training.ortmodule import ORTModule -from onnxruntime.training.ortmodule._graph_execution_manager_factory import GraphExecutionManagerFactory from torch.nn.parallel import DistributedDataParallel as DDP from torch.nn.parameter import Parameter -import _test_helpers +import onnxruntime +from onnxruntime.training.ortmodule import ORTModule torch.manual_seed(1) onnxruntime.set_seed(1) @@ -123,7 +122,7 @@ def run_with_ort_on_gpu(model, args, rank, device): size = 2 try: mp.spawn(test_Distributed_ReduceWithMarkDirtyModel, nprocs=size, args=(size,)) - except: + except Exception: import sys sys.stdout.flush() diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py index 8f1d57ff138a8..f186214bebb51 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py @@ -1,23 +1,27 @@ -import logging import argparse -import torch -import wget +import datetime +import logging import os -import pandas as pd -import zipfile -from transformers import BertTokenizer, AutoConfig -from sklearn.model_selection import train_test_split -from torch.utils.data import TensorDataset, DataLoader, 
RandomSampler, SequentialSampler -from transformers import BertForSequenceClassification, AdamW, BertConfig -from transformers import get_linear_schedule_with_warmup -import numpy as np import random import time -import datetime +import zipfile +import numpy as np +import pandas as pd +import torch +import wget +from sklearn.model_selection import train_test_split +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset +from transformers import ( + AdamW, + AutoConfig, + BertForSequenceClassification, + BertTokenizer, + get_linear_schedule_with_warmup, +) import onnxruntime -from onnxruntime.training.ortmodule import ORTModule, DebugOptions +from onnxruntime.training.ortmodule import DebugOptions, ORTModule def train(model, optimizer, scheduler, train_dataloader, epoch, device, args): diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py index 42697766c9815..ab1be7c90f869 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py @@ -1,23 +1,21 @@ -import logging import argparse -import torch -import wget +import datetime +import logging import os -import pandas as pd -import zipfile -from transformers import BertTokenizer, AutoConfig -from sklearn.model_selection import train_test_split -from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler -from transformers import BertForSequenceClassification, AdamW, BertConfig -from transformers import get_linear_schedule_with_warmup -import numpy as np import random import time -import datetime +import zipfile +import numpy as np +import pandas as pd +import torch +import wget +from sklearn.model_selection import train_test_split +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset +from transformers import AutoConfig, BertForSequenceClassification, BertTokenizer, get_linear_schedule_with_warmup import onnxruntime -from onnxruntime.training.ortmodule import ORTModule, DebugOptions +from onnxruntime.training.ortmodule import DebugOptions, ORTModule def train(model, optimizer, scaler, scheduler, train_dataloader, epoch, device, args): diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_pipeline_parallel.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_pipeline_parallel.py index 21ebdb52037d4..ee31ac302640f 100755 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_pipeline_parallel.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_pipeline_parallel.py @@ -1,13 +1,12 @@ +import argparse + +import deepspeed import torch -from torch import nn, optim import torch.distributed as dist -import deepspeed -from deepspeed.pipe import PipelineModule, LayerSpec -from deepspeed.utils import RepeatingLoader - -from onnxruntime.training.ortmodule import ORTModule, _utils +from deepspeed.pipe import PipelineModule +from torch import nn -import argparse +from onnxruntime.training.ortmodule import ORTModule # USAGE: # pip install deepspeed diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py index 037558663b428..91c48f4b0edd3 
100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py @@ -9,15 +9,15 @@ ``` """ import argparse -import torch import time -from torchvision import datasets, transforms + +import deepspeed +import torch import torch.distributed as dist +from torchvision import datasets, transforms import onnxruntime -from onnxruntime.training.ortmodule import ORTModule, DebugOptions, LogLevel - -import deepspeed +from onnxruntime.training.ortmodule import DebugOptions, LogLevel, ORTModule class NeuralNet(torch.nn.Module): diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_experimental_json_config.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_experimental_json_config.py index 7b2e08dc9ed6c..2998d0afb8336 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_experimental_json_config.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_experimental_json_config.py @@ -1,7 +1,9 @@ import os + import torch -from onnxruntime.training import ortmodule + from onnxruntime.capi import _pybind_state as C +from onnxruntime.training import ortmodule from onnxruntime.training.ortmodule.experimental.json_config import load_from_json @@ -41,30 +43,30 @@ def test_load_config_from_json_1(): assert ort_model_attributes._propagate_cast_ops_allow == ["ABC", "DEF"] # test use external gpu allocator - assert ort_model_attributes._use_external_gpu_allocator == False + assert ort_model_attributes._use_external_gpu_allocator is False # test enable custom autograd function - assert ort_model_attributes._enable_custom_autograd_function == True + assert ort_model_attributes._enable_custom_autograd_function is True # test use static shape - assert ort_model_attributes._use_static_shape == True + assert ort_model_attributes._use_static_shape is True # test run symbolic shape inference - assert ort_model_attributes._run_symbolic_shape_infer == False + assert ort_model_attributes._run_symbolic_shape_infer is False # test enable grad acc optimization - assert ort_model_attributes._enable_grad_acc_optimization == True + assert ort_model_attributes._enable_grad_acc_optimization is True # test skip check assert ort_model_attributes._skip_check.value == 14 # test debug options - assert ort_model_attributes._debug_options.save_onnx_models.save == True + assert ort_model_attributes._debug_options.save_onnx_models.save is True assert ort_model_attributes._debug_options.save_onnx_models.name_prefix == "my_model" assert ort_model_attributes._debug_options.logging.log_level.name == "VERBOSE" # test use memory aware gradient builder. 
- assert ort_model_attributes._use_memory_efficient_gradient == False + assert ort_model_attributes._use_memory_efficient_gradient is False # test fallback policy assert ort_model_attributes._fallback_manager.policy.value == 1 @@ -94,30 +96,30 @@ def test_load_config_from_json_2(): assert ort_model_attributes._propagate_cast_ops_allow == ["XYZ", "PQR"] # test use external gpu allocator - assert ort_model_attributes._use_external_gpu_allocator == True + assert ort_model_attributes._use_external_gpu_allocator is True # test enable custom autograd function - assert ort_model_attributes._enable_custom_autograd_function == False + assert ort_model_attributes._enable_custom_autograd_function is False # test use static shape - assert ort_model_attributes._use_static_shape == False + assert ort_model_attributes._use_static_shape is False # test run symbolic shape inference - assert ort_model_attributes._run_symbolic_shape_infer == True + assert ort_model_attributes._run_symbolic_shape_infer is True # test enable grad acc optimization - assert ort_model_attributes._enable_grad_acc_optimization == False + assert ort_model_attributes._enable_grad_acc_optimization is False # test skip check assert ort_model_attributes._skip_check.value == 10 # test debug options - assert ort_model_attributes._debug_options.save_onnx_models.save == True + assert ort_model_attributes._debug_options.save_onnx_models.save is True assert ort_model_attributes._debug_options.save_onnx_models.name_prefix == "my_other_model" assert ort_model_attributes._debug_options.logging.log_level.name == "INFO" # test use memory aware gradient builder. - assert ort_model_attributes._use_memory_efficient_gradient == True + assert ort_model_attributes._use_memory_efficient_gradient is True # test fallback policy assert ort_model_attributes._fallback_manager.policy.value == 250 diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_fairscale_sharded_optimizer.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_fairscale_sharded_optimizer.py index e1a7dd591ec36..672abce394a9b 100755 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_fairscale_sharded_optimizer.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_fairscale_sharded_optimizer.py @@ -1,16 +1,18 @@ import argparse +import os +import time + +import numpy as np import torch import torch.distributed as dist import torch.multiprocessing as mp -from fairscale.optim.oss import OSS -from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP import torchvision -from torchvision import datasets, transforms -import time +from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP +from fairscale.optim.oss import OSS from torch.nn.parallel import DistributedDataParallel as DDP -import os -from onnxruntime.training.ortmodule import ORTModule, DebugOptions -import numpy as np +from torchvision import datasets, transforms + +from onnxruntime.training.ortmodule import DebugOptions, ORTModule # Usage : # pip install fairscale @@ -201,9 +203,7 @@ def train(rank: int, args, world_size: int, epochs: int): train_dataloader, test_dataloader = get_dataloader(args, rank, args.batch_size) loss_fn = my_loss base_optimizer = torch.optim.SGD # pick any pytorch compliant optimizer here - base_optimizer_arguments = ( - {} - ) # pass any optimizer specific arguments here, or directly below when instantiating OSS + # pass any optimizer specific arguments here, or directly below when instantiating OSS if 
args.use_sharded_optimizer: # Wrap the optimizer in its state sharding brethren optimizer = OSS(params=model.parameters(), optim=base_optimizer, lr=args.lr) diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_fallback.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_fallback.py index 6cde304a6570b..e58b903013ecd 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_fallback.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_fallback.py @@ -440,7 +440,7 @@ def test_ortmodule_fallback_init__missing_cpp_extensions( if is_torch_cpp_extensions_installed(ORTMODULE_TORCH_CPP_DIR): warnings.warn( "Skipping test_ortmodule_fallback_init__missing_cpp_extensions." - f" It requires PyTorch CPP extensions to be missing" + " It requires PyTorch CPP extensions to be missing" ) else: @@ -647,7 +647,7 @@ def get_batch(source, i): for epoch in range(1, 2): model.train() # turn on train mode - num_batches = len(train_data) // bptt + len(train_data) // bptt for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)): data, targets = get_batch(train_data, i) batch_size = data.size(0) diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py index bb94a6c514977..6cc060e10665c 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py @@ -1,11 +1,12 @@ import argparse import logging -import torch import time + +import torch from torchvision import datasets, transforms import onnxruntime -from onnxruntime.training.ortmodule import ORTModule, DebugOptions +from onnxruntime.training.ortmodule import DebugOptions, ORTModule class NeuralNet(torch.nn.Module): diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_pytorch_ddp.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_pytorch_ddp.py index 93426659991fe..fd9384f41652c 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_pytorch_ddp.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_pytorch_ddp.py @@ -1,19 +1,16 @@ # This test script is a modified version of Pytorch's tutorial. # For details, see https://pytorch.org/tutorials/intermediate/ddp_tutorial.html. 
+import argparse import os -import sys import tempfile -import torch -import argparse +import torch import torch.distributed as dist +import torch.multiprocessing as mp import torch.nn as nn import torch.optim as optim -import torch.multiprocessing as mp - from torch.nn.parallel import DistributedDataParallel as DDP -import onnxruntime from onnxruntime.training.ortmodule import ORTModule diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_torch_lightning_basic.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_torch_lightning_basic.py index 9f8f273837d85..626bc4c946ed1 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_torch_lightning_basic.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_torch_lightning_basic.py @@ -1,13 +1,13 @@ import argparse from multiprocessing import cpu_count +import pytorch_lightning as pl import torch -from torch import nn import torch.nn.functional as F +from torch import nn +from torch.utils.data import DataLoader from torchvision import transforms from torchvision.datasets import MNIST -from torch.utils.data import DataLoader -import pytorch_lightning as pl import onnxruntime from onnxruntime.training.ortmodule import ORTModule diff --git a/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py b/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py index 531085f21ce61..feabea05b8e79 100644 --- a/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py +++ b/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py @@ -1,33 +1,19 @@ -import copy +import os from functools import partial -import inspect -import math -import numpy as np -from numpy.testing import assert_allclose + +import _test_commons +import _test_helpers import onnx -import os import pytest import torch +from numpy.testing import assert_allclose import onnxruntime -from onnxruntime.capi.ort_trainer import ( - IODescription as Legacy_IODescription, - ModelDescription as Legacy_ModelDescription, - LossScaler as Legacy_LossScaler, - ORTTrainer as Legacy_ORTTrainer, -) -from onnxruntime.training import ( - _utils, - amp, - checkpoint, - optim, - orttrainer, - TrainStepInfo, - model_desc_validation as md_val, - orttrainer_options as orttrainer_options, -) - -import _test_commons, _test_helpers +from onnxruntime.capi.ort_trainer import IODescription as Legacy_IODescription +from onnxruntime.capi.ort_trainer import LossScaler as Legacy_LossScaler +from onnxruntime.capi.ort_trainer import ModelDescription as Legacy_ModelDescription +from onnxruntime.capi.ort_trainer import ORTTrainer as Legacy_ORTTrainer +from onnxruntime.training import amp, optim, orttrainer ############################################################################### # Helper functions ############################################################ @@ -191,7 +177,6 @@ def legacy_ort_trainer_learning_rate_description(): def legacy_bert_model_description(): - vocab_size = 30528 input_ids_desc = Legacy_IODescription("input_ids", ["batch", "max_seq_len_in_batch"]) segment_ids_desc = Legacy_IODescription("segment_ids", ["batch", "max_seq_len_in_batch"]) input_mask_desc = Legacy_IODescription("input_mask", ["batch", "max_seq_len_in_batch"]) @@ -264,7 +249,7 @@ def testToyBERTDeterministicCheck(expected_losses): # Modeling model_desc = bert_model_description() model = load_bert_onnx_model() - params = optimizer_parameters(model) + 
optimizer_parameters(model) optim_config = optim.LambConfig() opts = orttrainer.ORTTrainerOptions( { @@ -727,7 +712,6 @@ def testToyBertCheckpointFrozenWeights(): ) def testToyBertLoadOptimState(optimizer, mixedprecision_enabled): # Common setup - rtol = 1e-03 device = "cuda" seed = 1 torch.manual_seed(seed) diff --git a/orttraining/orttraining/test/python/orttraining_test_orttrainer_checkpoint_functions.py b/orttraining/orttraining/test/python/orttraining_test_orttrainer_checkpoint_functions.py index 99606d923e1d2..5848682a108c8 100644 --- a/orttraining/orttraining/test/python/orttraining_test_orttrainer_checkpoint_functions.py +++ b/orttraining/orttraining/test/python/orttraining_test_orttrainer_checkpoint_functions.py @@ -1,10 +1,12 @@ -import pytest -from unittest.mock import patch, Mock -from _test_commons import _load_pytorch_transformer_model -from onnxruntime.training import amp, checkpoint, optim, orttrainer, _checkpoint_storage +from unittest.mock import Mock, patch + import numpy as np import onnx +import pytest import torch +from _test_commons import _load_pytorch_transformer_model + +from onnxruntime.training import _checkpoint_storage, checkpoint, optim, orttrainer # Helper functions @@ -625,7 +627,7 @@ def test_checkpoint_aggregation(load_mock): assert (state_dict["optimizer"]["non_sharded"]["Moment_2"] == np.array([6666, 5555, 4444])).all() assert (state_dict["optimizer"]["non_sharded"]["Step"] == np.array([55])).all() - assert state_dict["trainer_options"]["mixed_precision"] == False + assert state_dict["trainer_options"]["mixed_precision"] is False assert state_dict["trainer_options"]["world_rank"] == 0 assert state_dict["trainer_options"]["world_size"] == 1 assert state_dict["trainer_options"]["horizontal_parallel_size"] == 1 @@ -711,7 +713,7 @@ def test_checkpoint_aggregation_mixed_precision(load_mock): assert (state_dict["optimizer"]["non_sharded"]["Moment_2"] == np.array([6666, 5555, 4444])).all() assert (state_dict["optimizer"]["non_sharded"]["Step"] == np.array([55])).all() - assert state_dict["trainer_options"]["mixed_precision"] == True + assert state_dict["trainer_options"]["mixed_precision"] is True assert state_dict["trainer_options"]["world_rank"] == 0 assert state_dict["trainer_options"]["world_size"] == 1 assert state_dict["trainer_options"]["horizontal_parallel_size"] == 1 diff --git a/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py b/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py index 57b5af656eb66..7d788d1308cd9 100644 --- a/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py +++ b/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py @@ -2,7 +2,7 @@ import os import tempfile from functools import partial -from packaging.version import Version as StrictVersion + import _test_commons import _test_helpers import onnx @@ -10,16 +10,14 @@ import torch import torch.nn.functional as F from numpy.testing import assert_allclose +from packaging.version import Version as StrictVersion from onnxruntime import SessionOptions, set_seed -from onnxruntime.capi.ort_trainer import IODescription as Legacy_IODescription from onnxruntime.capi.ort_trainer import LossScaler as Legacy_LossScaler -from onnxruntime.capi.ort_trainer import ModelDescription as Legacy_ModelDescription from onnxruntime.capi.ort_trainer import ORTTrainer as Legacy_ORTTrainer -from onnxruntime.training import PropagateCastOpsStrategy, TrainStepInfo, _utils, amp, checkpoint +from onnxruntime.training 
import PropagateCastOpsStrategy, TrainStepInfo, _utils, amp from onnxruntime.training import model_desc_validation as md_val -from onnxruntime.training import optim, orttrainer -from onnxruntime.training import orttrainer_options as orttrainer_options +from onnxruntime.training import optim, orttrainer, orttrainer_options ############################################################################### # Testing starts here ######################################################### @@ -315,7 +313,7 @@ def testDynamicLossScalerCustomValues(): scaler = amp.loss_scaler.DynamicLossScaler( automatic_update=False, loss_scale=3, up_scale_window=7, min_loss_scale=5, max_loss_scale=10 ) - assert scaler.automatic_update == False + assert scaler.automatic_update is False assert_allclose(scaler.loss_scale, 3, rtol=rtol, err_msg="loss scale mismatch") assert_allclose(scaler.min_loss_scale, 5, rtol=rtol, err_msg="min loss scale mismatch") assert_allclose(scaler.max_loss_scale, 10, rtol=rtol, err_msg="max loss scale mismatch") @@ -331,14 +329,14 @@ def testTrainStepInfo(): optimizer_config=optimizer_config, all_finite=False, fetches=fetches, optimization_step=123, step=456 ) assert step_info.optimizer_config == optimizer_config - assert step_info.all_finite == False + assert step_info.all_finite is False assert step_info.fetches == fetches assert step_info.optimization_step == 123 assert step_info.step == 456 step_info = orttrainer.TrainStepInfo(optimizer_config) assert step_info.optimizer_config == optimizer_config - assert step_info.all_finite == True + assert step_info.all_finite is True assert step_info.fetches == [] assert step_info.optimization_step == 0 assert step_info.step == 0 @@ -458,7 +456,7 @@ def testOptimizerConfigAdam(): assert_allclose(0.0, cfg.lambda_coef, rtol=rtol, err_msg="lambda_coef mismatch") assert_allclose(1e-8, cfg.epsilon, rtol=rtol, err_msg="epsilon mismatch") assert_allclose(1.0, cfg.max_norm_clip, rtol=rtol, err_msg="max_norm_clip mismatch") - assert cfg.do_bias_correction == True, "lambda_coef mismatch" + assert cfg.do_bias_correction is True, "lambda_coef mismatch" assert cfg.weight_decay_mode == optim.AdamConfig.DecayMode.BEFORE_WEIGHT_UPDATE, "weight_decay_mode mismatch" @@ -475,7 +473,7 @@ def testOptimizerConfigLamb(): assert cfg.ratio_max == float("inf"), "ratio_max mismatch" assert_allclose(1e-6, cfg.epsilon, rtol=rtol, err_msg="epsilon mismatch") assert_allclose(1.0, cfg.max_norm_clip, rtol=rtol, err_msg="max_norm_clip mismatch") - assert cfg.do_bias_correction == False, "do_bias_correction mismatch" + assert cfg.do_bias_correction is False, "do_bias_correction mismatch" @pytest.mark.parametrize("optim_name", [("Adam"), ("Lamb")]) @@ -1044,7 +1042,7 @@ def testORTTrainerInternalUseContribOps(enable_onnx_contrib_ops): # Training loop data, targets = batcher_fn(train_data, 0) if not enable_onnx_contrib_ops and not pytorch_110: - with pytest.raises(Exception) as e_info: + with pytest.raises(Exception): _, _ = trainer.train_step(data, targets) else: _, _ = trainer.train_step(data, targets) @@ -1591,7 +1589,7 @@ def testORTTrainerLegacyAndExperimentalLRScheduler(seed, device, optimizer_confi def testLossScalerLegacyAndExperimentalFullCycle(): - info = orttrainer.TrainStepInfo( + orttrainer.TrainStepInfo( optimizer_config=optim.LambConfig(lr=0.001), all_finite=True, fetches=[], optimization_step=0, step=0 ) new_ls = amp.DynamicLossScaler() @@ -1757,7 +1755,7 @@ def testORTTrainerOptionsEnabledAdasumFlag(test_input): """Test the enabled_adasum flag values when set enabled""" 
actual_values = orttrainer_options.ORTTrainerOptions(test_input) - assert actual_values.distributed.enable_adasum == True + assert actual_values.distributed.enable_adasum is True @pytest.mark.parametrize( @@ -1774,7 +1772,7 @@ def testORTTrainerOptionsDisabledAdasumFlag(test_input): """Test the enabled_adasum flag values when set disabled""" actual_values = orttrainer_options.ORTTrainerOptions(test_input) - assert actual_values.distributed.enable_adasum == False + assert actual_values.distributed.enable_adasum is False def testORTTrainerUnusedInput(): diff --git a/orttraining/orttraining/test/python/orttraining_test_ortvalue.py b/orttraining/orttraining/test/python/orttraining_test_ortvalue.py index cfdc52a9f0848..26792f7491384 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortvalue.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortvalue.py @@ -3,20 +3,22 @@ # Licensed under the MIT License. # pylint: disable=W0212,C0114,C0116 -import unittest import copy import sys +import unittest + +import _test_helpers import numpy as np -from numpy.testing import assert_almost_equal -import onnxruntime as onnxrt -from onnxruntime.capi.onnxruntime_pybind11_state import OrtValue as C_OrtValue, OrtValueVector -from onnxruntime.training.ortmodule import ORTModule, _utils -from onnxruntime.capi import _pybind_state as C import torch +from numpy.testing import assert_almost_equal from torch._C import _from_dlpack from torch.utils.dlpack import from_dlpack -import _test_helpers +import onnxruntime as onnxrt +from onnxruntime.capi import _pybind_state as C +from onnxruntime.capi.onnxruntime_pybind11_state import OrtValue as C_OrtValue +from onnxruntime.capi.onnxruntime_pybind11_state import OrtValueVector +from onnxruntime.training.ortmodule import ORTModule, _utils has_cuda = torch.cuda.is_available() diff --git a/orttraining/orttraining/test/python/orttraining_test_sampler.py b/orttraining/orttraining/test/python/orttraining_test_sampler.py index c47b721b7d100..a1281fbf286f2 100644 --- a/orttraining/orttraining/test/python/orttraining_test_sampler.py +++ b/orttraining/orttraining/test/python/orttraining_test_sampler.py @@ -2,9 +2,11 @@ # Licensed under the MIT License. 
# orttraining_test_sampler.py +import random + import torch + from onnxruntime.training.utils.data import sampler -import random class MyDataset(torch.utils.data.Dataset): diff --git a/orttraining/orttraining/test/python/orttraining_test_transformers.py b/orttraining/orttraining/test/python/orttraining_test_transformers.py index 1e73da0f65b3f..0d41105cc34be 100644 --- a/orttraining/orttraining/test/python/orttraining_test_transformers.py +++ b/orttraining/orttraining/test/python/orttraining_test_transformers.py @@ -1,23 +1,15 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import unittest -import shutil -import pytest -import os import random +import unittest + import numpy as np +import torch from numpy.testing import assert_allclose -from transformers import BertConfig, BertForPreTraining, BertModel - -from orttraining_test_data_loader import ids_tensor, BatchArgsOption -from orttraining_test_utils import run_test, get_lr +from orttraining_test_data_loader import BatchArgsOption, ids_tensor +from orttraining_test_utils import get_lr, run_test +from transformers import BertConfig, BertForPreTraining import onnxruntime -from onnxruntime.capi.ort_trainer import ORTTrainer, IODescription, ModelDescription, LossScaler - -import torch +from onnxruntime.capi.ort_trainer import IODescription, LossScaler, ModelDescription class BertModelTest(unittest.TestCase): diff --git a/orttraining/orttraining/test/python/orttraining_test_utils.py b/orttraining/orttraining/test/python/orttraining_test_utils.py index 7397cc9d517b9..af1022c4f0e6b 100644 --- a/orttraining/orttraining/test/python/orttraining_test_utils.py +++ b/orttraining/orttraining/test/python/orttraining_test_utils.py @@ -1,12 +1,10 @@ +import math + import torch -from orttraining_test_bert_postprocess import postprocess_model from orttraining_test_data_loader import BatchArgsOption, create_ort_test_dataloader, split_batch from onnxruntime.capi.ort_trainer import IODescription, ORTTrainer -from onnxruntime.training import TrainStepInfo, _utils, amp -from onnxruntime.training import model_desc_validation as md_val -from onnxruntime.training import optim, orttrainer -from onnxruntime.training import orttrainer_options as orttrainer_options +from onnxruntime.training import amp, optim, orttrainer from onnxruntime.training.optim import _LRScheduler diff --git a/orttraining/orttraining/test/python/orttraining_transformer_trainer.py b/orttraining/orttraining/test/python/orttraining_transformer_trainer.py index 0185670dac79f..b80c3ef5bba38 100644 --- a/orttraining/orttraining/test/python/orttraining_transformer_trainer.py +++ b/orttraining/orttraining/test/python/orttraining_transformer_trainer.py @@ -4,36 +4,21 @@ import logging import os import random - -from typing import Callable, Dict, List, NamedTuple, Optional, Tuple +from typing import Callable, Dict, List, NamedTuple, Optional import numpy as np import torch -from torch import nn from torch.utils.data.dataloader import DataLoader from torch.utils.data.dataset import Dataset from torch.utils.data.distributed import DistributedSampler -from torch.utils.data.sampler import RandomSampler, SequentialSampler +from torch.utils.data.sampler import SequentialSampler from tqdm import tqdm, trange - -from transformers.data.data_collator import DataCollator, DefaultDataCollator +from transformers.data.data_collator import DefaultDataCollator from transformers.modeling_utils import PreTrainedModel from transformers.training_args 
import TrainingArguments import onnxruntime -from orttraining_test_bert_postprocess import postprocess_model -from onnxruntime.capi.ort_trainer import ORTTrainer, LossScaler, ModelDescription, IODescription - -from onnxruntime.training import ( - _utils, - amp, - optim, - orttrainer, - TrainStepInfo, - model_desc_validation as md_val, - orttrainer_options as orttrainer_options, -) -from onnxruntime.training.optim import LinearWarmupLRScheduler, _LRScheduler +from onnxruntime.training import amp, optim, orttrainer try: from torch.utils.tensorboard import SummaryWriter diff --git a/orttraining/orttraining/test/python/perf_log/ort_module_perf_test_tools.py b/orttraining/orttraining/test/python/perf_log/ort_module_perf_test_tools.py index b7b619a92e53b..a71ed93001230 100644 --- a/orttraining/orttraining/test/python/perf_log/ort_module_perf_test_tools.py +++ b/orttraining/orttraining/test/python/perf_log/ort_module_perf_test_tools.py @@ -1,13 +1,13 @@ # https://docs.microsoft.com/en-us/azure/mysql/connect-python -import mysql.connector -from mysql.connector import errorcode -import git -import os - import argparse +import os from datetime import datetime +import git +import mysql.connector +from mysql.connector import errorcode + def get_repo_commit(repo_path): repo = git.Repo(repo_path, search_parent_directories=True) diff --git a/orttraining/orttraining/test/python/utils_multiple_choice.py b/orttraining/orttraining/test/python/utils_multiple_choice.py index 562ecbf8c496d..04aee10c45303 100644 --- a/orttraining/orttraining/test/python/utils_multiple_choice.py +++ b/orttraining/orttraining/test/python/utils_multiple_choice.py @@ -2,21 +2,17 @@ # https://github.com/huggingface/transformers/blob/master/examples/multiple-choice/utils_multiple_choice.py import csv -import glob -import json import logging import os from dataclasses import dataclass from enum import Enum from typing import List, Optional +import torch import tqdm from filelock import FileLock - -from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available - -import torch from torch.utils.data.dataset import Dataset +from transformers import PreTrainedTokenizer logger = logging.getLogger(__name__) diff --git a/orttraining/pytorch_frontend_examples/mnist_training.py b/orttraining/pytorch_frontend_examples/mnist_training.py index afab8a3bf7ec2..b71a9d75921e8 100644 --- a/orttraining/pytorch_frontend_examples/mnist_training.py +++ b/orttraining/pytorch_frontend_examples/mnist_training.py @@ -4,18 +4,16 @@ ## Model testing is not complete. 
-from __future__ import print_function import argparse +import os + import torch import torch.nn as nn import torch.nn.functional as F -import torch.optim as optim +from mpi4py import MPI from torchvision import datasets, transforms -import numpy as np -import os from onnxruntime.capi.ort_trainer import IODescription, ModelDescription, ORTTrainer -from mpi4py import MPI try: from onnxruntime.capi._pybind_state import set_cuda_device_id @@ -193,7 +191,6 @@ def main(): for epoch in range(1, args.epochs + 1): train_with_trainer(args, trainer, device, train_loader, epoch) - import pdb test_with_trainer(args, trainer, device, test_loader) diff --git a/orttraining/tools/amdgpu/script/rocprof.py b/orttraining/tools/amdgpu/script/rocprof.py index dc91d13606fb0..baafdafc98578 100644 --- a/orttraining/tools/amdgpu/script/rocprof.py +++ b/orttraining/tools/amdgpu/script/rocprof.py @@ -1,6 +1,4 @@ import argparse -import numpy as np -import os import csv parser = argparse.ArgumentParser() diff --git a/orttraining/tools/ci_test/compare_huggingface.py b/orttraining/tools/ci_test/compare_huggingface.py index c484cfb56adcb..fd7244a0cf0b7 100755 --- a/orttraining/tools/ci_test/compare_huggingface.py +++ b/orttraining/tools/ci_test/compare_huggingface.py @@ -1,6 +1,6 @@ -import sys -import json import collections +import json +import sys actual = sys.argv[1] expect = sys.argv[2] diff --git a/orttraining/tools/ci_test/compare_results.py b/orttraining/tools/ci_test/compare_results.py index ba76b9eaf414c..1c302f7dcd07b 100644 --- a/orttraining/tools/ci_test/compare_results.py +++ b/orttraining/tools/ci_test/compare_results.py @@ -1,10 +1,8 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import argparse import collections import csv -import re import sys Comparison = collections.namedtuple("Comparison", ["name", "fn"]) diff --git a/orttraining/tools/ci_test/download_azure_blob_archive.py b/orttraining/tools/ci_test/download_azure_blob_archive.py index 6fa875a1d2373..dea1964cc0f66 100755 --- a/orttraining/tools/ci_test/download_azure_blob_archive.py +++ b/orttraining/tools/ci_test/download_azure_blob_archive.py @@ -9,8 +9,6 @@ import subprocess import sys import tempfile -import urllib.request -import zipfile SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) REPO_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "..", "..", "..")) diff --git a/orttraining/tools/ci_test/run_batch_size_test.py b/orttraining/tools/ci_test/run_batch_size_test.py index 4a7ec51062914..cd93c44cf73b6 100755 --- a/orttraining/tools/ci_test/run_batch_size_test.py +++ b/orttraining/tools/ci_test/run_batch_size_test.py @@ -4,9 +4,9 @@ import argparse import collections +import os import subprocess import sys -import os def parse_args(): diff --git a/orttraining/tools/ci_test/run_bert_perf_test.py b/orttraining/tools/ci_test/run_bert_perf_test.py index 8f6a59c1fd883..27cd9fb01c99b 100644 --- a/orttraining/tools/ci_test/run_bert_perf_test.py +++ b/orttraining/tools/ci_test/run_bert_perf_test.py @@ -3,10 +3,10 @@ # Licensed under the MIT License. 
import argparse +import json +import os import subprocess import sys -import os -import json from collections import namedtuple SCRIPT_DIR = os.path.realpath(os.path.dirname(__file__)) diff --git a/orttraining/tools/ci_test/run_convergence_test.py b/orttraining/tools/ci_test/run_convergence_test.py index 568e3c4cd9c4c..bdea6ad95c944 100755 --- a/orttraining/tools/ci_test/run_convergence_test.py +++ b/orttraining/tools/ci_test/run_convergence_test.py @@ -3,12 +3,12 @@ # Licensed under the MIT License. import argparse +import os import subprocess import sys import tempfile -import os -from compare_results import compare_results_files, Comparisons +from compare_results import Comparisons, compare_results_files SCRIPT_DIR = os.path.realpath(os.path.dirname(__file__)) diff --git a/orttraining/tools/ci_test/run_gpt2_perf_test.py b/orttraining/tools/ci_test/run_gpt2_perf_test.py index 8c0ac1953feed..e64fc3c7812e3 100644 --- a/orttraining/tools/ci_test/run_gpt2_perf_test.py +++ b/orttraining/tools/ci_test/run_gpt2_perf_test.py @@ -3,9 +3,9 @@ # Licensed under the MIT License. import argparse +import os import subprocess import sys -import os from collections import namedtuple SCRIPT_DIR = os.path.realpath(os.path.dirname(__file__)) diff --git a/orttraining/tools/scripts/experiment.py b/orttraining/tools/scripts/experiment.py index 0e3e2ceead465..1841a89d91849 100644 --- a/orttraining/tools/scripts/experiment.py +++ b/orttraining/tools/scripts/experiment.py @@ -1,20 +1,14 @@ import argparse -import os import re -import sys from azure.common.client_factory import get_client_from_cli_profile from azure.mgmt.containerregistry import ContainerRegistryManagementClient - -from azureml.core import Workspace, Experiment, Run, Datastore -from azureml.core.compute import ComputeTarget, AmlCompute - +from azureml.core import Datastore, Experiment, Workspace +from azureml.core.compute import ComputeTarget from azureml.core.container_registry import ContainerRegistry +from azureml.core.runconfig import MpiConfiguration from azureml.train.estimator import Estimator -from azureml.data.azure_storage_datastore import AzureFileDatastore, AzureBlobDatastore -from azureml.core.runconfig import MpiConfiguration, RunConfiguration - parser = argparse.ArgumentParser() parser.add_argument( "--subscription", type=str, default="ea482afa-3a32-437c-aa10-7de928a9e793" diff --git a/orttraining/tools/scripts/gpt2_model_transform.py b/orttraining/tools/scripts/gpt2_model_transform.py index 9e018a34069e5..b9efea22066f1 100644 --- a/orttraining/tools/scripts/gpt2_model_transform.py +++ b/orttraining/tools/scripts/gpt2_model_transform.py @@ -1,10 +1,9 @@ ### Be noted: this script is developed against the model exported from Megatron GPT2 Pretraining script. 
import sys -import onnx -from onnx import helper, shape_inference -from onnx import TensorProto + import numpy as np +import onnx from onnx import numpy_helper if len(sys.argv) < 2: @@ -112,7 +111,7 @@ def process_concat(model): skip = True input_nodes.append(concat_input_node) - if skip == True: + if skip: continue # figure out target shape diff --git a/orttraining/tools/scripts/layer_norm_transform.py b/orttraining/tools/scripts/layer_norm_transform.py index 0ad4ea2559207..92db4a5149e3b 100644 --- a/orttraining/tools/scripts/layer_norm_transform.py +++ b/orttraining/tools/scripts/layer_norm_transform.py @@ -1,8 +1,8 @@ -import sys import os.path -from onnx import * -import onnx +import sys + import numpy as np +import onnx def find_node(graph_proto, op_type): @@ -17,10 +17,10 @@ def find_node(graph_proto, op_type): def gen_attribute(key, value): - attr = AttributeProto() + attr = onnx.AttributeProto() attr.name = key attr.ints.extend(int(v) for v in value) - attr.type = AttributeProto.INTS + attr.type = onnx.AttributeProto.INTS return attr @@ -120,7 +120,7 @@ def main(): layer_norm_output.append("saved_mean_" + str(id)) id = id + 1 layer_norm_output.append("saved_inv_std_var_" + str(id)) - layer_norm = helper.make_node( + layer_norm = onnx.helper.make_node( "LayerNormalization", layer_norm_input, layer_norm_output, diff --git a/orttraining/tools/scripts/model_transform.py b/orttraining/tools/scripts/model_transform.py index 8c0be5b08c04a..c4de2dfe00235 100644 --- a/orttraining/tools/scripts/model_transform.py +++ b/orttraining/tools/scripts/model_transform.py @@ -1,8 +1,7 @@ import sys -import onnx -from onnx import helper, shape_inference -from onnx import TensorProto + import numpy as np +import onnx from onnx import numpy_helper if len(sys.argv) < 2: diff --git a/orttraining/tools/scripts/nv_run_pretraining.py b/orttraining/tools/scripts/nv_run_pretraining.py index 3e51a8886ecb6..db0c51e8d2373 100644 --- a/orttraining/tools/scripts/nv_run_pretraining.py +++ b/orttraining/tools/scripts/nv_run_pretraining.py @@ -15,42 +15,30 @@ # limitations under the License. 
"""BERT finetuning runner.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + +import argparse # ================== -import csv -import os -import time import logging -import argparse +import os import random +import time +from concurrent.futures import ProcessPoolExecutor + +import amp_C +import apex_C import h5py -from tqdm import tqdm, trange -import os import numpy as np import torch -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset -from torch.utils.data.distributed import DistributedSampler -import math from apex import amp -import multiprocessing - -from tokenization import BertTokenizer -from modeling import BertForPreTraining, BertConfig -from optimization import BertLAMB - -from file_utils import PYTORCH_PRETRAINED_BERT_CACHE -from utils import is_main_process +from apex.amp import _amp_state from apex.parallel import DistributedDataParallel as DDP -from schedulers import LinearWarmUpScheduler from apex.parallel.distributed import flat_dist_call -import amp_C -import apex_C -from apex.amp import _amp_state - -from concurrent.futures import ProcessPoolExecutor +from modeling import BertConfig, BertForPreTraining +from optimization import BertLAMB +from torch.utils.data import DataLoader, Dataset, RandomSampler +from tqdm import tqdm +from utils import is_main_process logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO @@ -459,7 +447,6 @@ def main(): # Note: We loop infinitely over epochs, termination is handled via iteration count while True: - thread = None if not args.resume_from_checkpoint or epoch > 0 or args.phase2: files = [ os.path.join(args.input_dir, f) diff --git a/orttraining/tools/scripts/opset12_model_transform.py b/orttraining/tools/scripts/opset12_model_transform.py index c19aceb6216d8..453f2bd2a250e 100644 --- a/orttraining/tools/scripts/opset12_model_transform.py +++ b/orttraining/tools/scripts/opset12_model_transform.py @@ -13,10 +13,9 @@ # bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12.onnx import sys -import onnx -from onnx import helper, shape_inference -from onnx import TensorProto + import numpy as np +import onnx from onnx import numpy_helper if len(sys.argv) < 2: diff --git a/orttraining/tools/scripts/performance_investigation.py b/orttraining/tools/scripts/performance_investigation.py index b064b13fa6d34..c8550a4d73c49 100644 --- a/orttraining/tools/scripts/performance_investigation.py +++ b/orttraining/tools/scripts/performance_investigation.py @@ -1,4 +1,5 @@ import argparse + import onnx parser = argparse.ArgumentParser(description="ONNX file analyzer for performance investigation.") diff --git a/orttraining/tools/scripts/pipeline_model_split.py b/orttraining/tools/scripts/pipeline_model_split.py index b95bbe49003ec..6cb33603d57da 100644 --- a/orttraining/tools/scripts/pipeline_model_split.py +++ b/orttraining/tools/scripts/pipeline_model_split.py @@ -1,9 +1,7 @@ -import sys import os + import onnx -from onnx import helper -from onnx import TensorProto -from onnx import OperatorSetIdProto +from onnx import TensorProto, helper # Edge that needs to be cut for the split. 
# If the edge is feeding into more than one nodes, and not all the nodes belong to the same cut, @@ -287,7 +285,7 @@ def generate_subgraph(model, start_nodes, identity_node_list): try: if i in identity_node_index: del main_graph.graph.node[i] - except: + except Exception: print("error deleting identity node", i) all_visited_nodes = [] @@ -301,7 +299,7 @@ def generate_subgraph(model, start_nodes, identity_node_list): outputs0 = [] while stack0: node = stack0.pop() - if not node in visited0: + if node not in visited0: tranversed_node += 1 visited0.append(node) all_visited_nodes.append(node) @@ -338,7 +336,7 @@ def generate_subgraph(model, start_nodes, identity_node_list): del subgraph.graph.node[i] else: del main_graph.graph.node[i] - except: + except Exception: print("error deleting node", i) for i in reversed(range(len(main_graph.graph.input))): @@ -347,7 +345,7 @@ def generate_subgraph(model, start_nodes, identity_node_list): del subgraph.graph.input[i] else: del main_graph.graph.input[i] - except: + except Exception: print("error deleting inputs", i) for i in reversed(range(len(main_graph.graph.output))): @@ -356,7 +354,7 @@ def generate_subgraph(model, start_nodes, identity_node_list): del subgraph.graph.output[i] else: del main_graph.graph.output[i] - except: + except Exception: print("error deleting outputs ", i) print("model", str(model_count), " length ", len(subgraph.graph.node)) diff --git a/orttraining/tools/scripts/sqldb_to_tensors.py b/orttraining/tools/scripts/sqldb_to_tensors.py index cf24e0c294450..7476d2cbabc32 100644 --- a/orttraining/tools/scripts/sqldb_to_tensors.py +++ b/orttraining/tools/scripts/sqldb_to_tensors.py @@ -2,6 +2,7 @@ # Licensed under the MIT License. import sqlite3 + import onnx from onnx import numpy_helper diff --git a/orttraining/tools/scripts/watch_experiment.py b/orttraining/tools/scripts/watch_experiment.py index 33bb73f8dc9b9..5af7a11eeab67 100644 --- a/orttraining/tools/scripts/watch_experiment.py +++ b/orttraining/tools/scripts/watch_experiment.py @@ -1,13 +1,12 @@ import argparse -import sys import os - +import sys from concurrent.futures import ThreadPoolExecutor -from requests import Session -from threading import Event, Thread +from threading import Event -from azureml.core import Workspace, Experiment, Run from azureml._run_impl.run_watcher import RunWatcher +from azureml.core import Experiment, Workspace +from requests import Session parser = argparse.ArgumentParser() parser.add_argument( diff --git a/requirements-dev.txt b/requirements-dev.txt index 4a4e718d9fa60..1e4917c1c031c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,8 +1,6 @@ -black>=22.3 cerberus codecov flatbuffers -isort jinja2 numpy onnx @@ -19,3 +17,5 @@ scipy sympy wheel setuptools>=41.4.0 +lintrunner +lintrunner-adapters diff --git a/setup.py b/setup.py index 294b975a56595..44006123921e6 100644 --- a/setup.py +++ b/setup.py @@ -130,9 +130,11 @@ def get_tag(self): _, _, plat = _bdist_wheel.get_tag(self) if platform.system() == "Linux": # Get the right platform tag by querying the linker version - glibc_major, glibc_minor = popen("ldd --version | head -1").read().split()[-1].split(".") - """# See https://github.com/mayeut/pep600_compliance/blob/master/ - pep600_compliance/tools/manylinux-policy.json""" + glibc_major, glibc_minor = ( + popen("ldd --version | head -1").read().split()[-1].split(".") # noqa: DUO106 + ) + # See https://github.com/mayeut/pep600_compliance/blob/master/ + # pep600_compliance/tools/manylinux-policy.json if glibc_major == "2" and 
glibc_minor == "17": plat = "manylinux_2_17_x86_64.manylinux2014_x86_64" else: # For manylinux2014 and above, no alias is required @@ -631,7 +633,7 @@ def check_date_format(date_str): try: datetime.datetime.strptime(date_str, "%Y%m%d") return True - except: # noqa + except Exception: return False def reformat_run_count(count_str): @@ -642,7 +644,7 @@ def reformat_run_count(count_str): elif count >= 1000: raise RuntimeError(f"Too many builds for the same day: {count}") return "" - except: # noqa + except Exception: return "" build_suffix_is_date_format = check_date_format(build_suffix[:8]) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 01ae36affd488..e31242452d12d 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -734,11 +734,11 @@ def get_config_build_dir(build_dir, config): def run_subprocess( - args, cwd=None, capture_stdout=False, dll_path=None, shell=False, env={}, python_path=None, cuda_home=None + args, cwd=None, capture_stdout=False, dll_path=None, shell=False, env=None, python_path=None, cuda_home=None ): if isinstance(args, str): raise ValueError("args should be a sequence of strings, not a string") - + env = env or {} my_env = os.environ.copy() if dll_path: if is_windows(): diff --git a/tools/ci_build/github/android/build_aar_package.py b/tools/ci_build/github/android/build_aar_package.py index c0cacb4231665..5f6f753d1b33c 100644 --- a/tools/ci_build/github/android/build_aar_package.py +++ b/tools/ci_build/github/android/build_aar_package.py @@ -153,9 +153,11 @@ def _build_aar(args): use_shell = True if is_windows() else False # clean, build, and publish to a local directory - subprocess.run(gradle_command + ["clean"], env=temp_env, shell=use_shell, check=True, cwd=JAVA_ROOT) - subprocess.run(gradle_command + ["build"], env=temp_env, shell=use_shell, check=True, cwd=JAVA_ROOT) - subprocess.run(gradle_command + ["publish"], env=temp_env, shell=use_shell, check=True, cwd=JAVA_ROOT) + subprocess.run(gradle_command + ["clean"], env=temp_env, shell=use_shell, check=True, cwd=JAVA_ROOT) # noqa: DUO116 + subprocess.run(gradle_command + ["build"], env=temp_env, shell=use_shell, check=True, cwd=JAVA_ROOT) # noqa: DUO116 + subprocess.run( # noqa: DUO116 + gradle_command + ["publish"], env=temp_env, shell=use_shell, check=True, cwd=JAVA_ROOT + ) def parse_args(): diff --git a/tools/ci_build/github/azure-pipelines/python-checks-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/python-checks-ci-pipeline.yml deleted file mode 100644 index 17c2b8766d891..0000000000000 --- a/tools/ci_build/github/azure-pipelines/python-checks-ci-pipeline.yml +++ /dev/null @@ -1,19 +0,0 @@ -jobs: -- job: 'PythonCodeChecks' - pool: - vmImage: 'ubuntu-20.04' - - timeoutInMinutes: 10 - - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: '3.8' - addToPath: true - architecture: 'x64' - - - script: python -m pip install -r tools/ci_build/github/python_checks/requirements.txt - displayName: "Install requirements" - - - script: python -m flake8 --config .flake8 - displayName: "Run Flake8" diff --git a/tools/ci_build/github/python_checks/readme.md b/tools/ci_build/github/python_checks/readme.md deleted file mode 100644 index b31300d6cf07b..0000000000000 --- a/tools/ci_build/github/python_checks/readme.md +++ /dev/null @@ -1,18 +0,0 @@ -# Python Code Checks - -Python code checks are run by this [CI build](../azure-pipelines/python-checks-ci-pipeline.yml). -Here are instructions on how to run them manually. - -## Prerequisites - -Install requirements. 
-
-From the repo root, run:
-
-`$ python -m pip install -r tools/ci_build/github/python_checks/requirements.txt`
-
-## Flake8
-
-From the repo root, run:
-
-`$ python -m flake8 --config .flake8`
diff --git a/tools/ci_build/github/python_checks/requirements.txt b/tools/ci_build/github/python_checks/requirements.txt
deleted file mode 100644
index b5446261e8e51..0000000000000
--- a/tools/ci_build/github/python_checks/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-flake8==3.9
diff --git a/tools/doc/rename_folders.py b/tools/doc/rename_folders.py
index d65b8a350eed1..56b2a39357411 100644
--- a/tools/doc/rename_folders.py
+++ b/tools/doc/rename_folders.py
@@ -13,7 +13,7 @@ def rename_folder(root):
     Returns the list of renamed folders.
     """
     found = []
-    for r, dirs, files in os.walk(root):
+    for r, dirs, _ in os.walk(root):
         for name in dirs:
             if name.startswith("_"):
                 found.append((r, name))
@@ -35,7 +35,7 @@ def replace_files(root, renamed):
     subs = {r[1]: r[2] for r in renamed}
     reg = re.compile('(\\"[a-zA-Z0-9\\.\\/\\?\\:@\\-_=#]+\\.([a-zA-Z]){2,6}' '([a-zA-Z0-9\\.\\&\\/\\?\\:@\\-_=#])*\\")')

-    for r, dirs, files in os.walk(root):
+    for r, _, files in os.walk(root):
         for name in files:
             if os.path.splitext(name)[-1] != ".html":
                 continue
diff --git a/tools/nuget/validate_package.py b/tools/nuget/validate_package.py
index 5baa2f603c5d7..a8dbe1a955a8e 100644
--- a/tools/nuget/validate_package.py
+++ b/tools/nuget/validate_package.py
@@ -89,7 +89,7 @@ def check_if_dlls_are_present(
     platforms = platforms_supported.strip().split(",")
     if package_type == "tarball":
         file_list_in_package = list()
-        for (dirpath, dirnames, filenames) in os.walk(package_path):
+        for (dirpath, _, filenames) in os.walk(package_path):
             file_list_in_package += [os.path.join(dirpath, file) for file in filenames]
     else:
         file_list_in_package = zip_file.namelist()
@@ -194,7 +194,7 @@ def validate_tarball(args):
     package_folder = re.search("(.*)[.].*", package_name).group(1)

     print("tar zxvf " + package_name)
-    os.system("tar zxvf " + package_name)
+    os.system("tar zxvf " + package_name)  # noqa: DUO106

     is_windows_ai_package = False
     zip_file = None
@@ -276,7 +276,7 @@ def validate_nuget(args):

     # Make a copy of the Nuget package
     print("Copying [" + full_nuget_path + "] -> [" + nupkg_copy_name + "], and extracting its contents")
-    os.system("copy " + full_nuget_path + " " + nupkg_copy_name)
+    os.system("copy " + full_nuget_path + " " + nupkg_copy_name)  # noqa: DUO106

     # Convert nupkg to zip
     os.rename(nupkg_copy_name, zip_copy_name)
diff --git a/tools/python/PythonTools.md b/tools/python/PythonTools.md
index 2dbf962db3e57..a9dfe6470b365 100644
--- a/tools/python/PythonTools.md
+++ b/tools/python/PythonTools.md
@@ -98,7 +98,7 @@ import ort_test_dir_utils
 try:
     ort_test_dir_utils.run_test_dir('temp/examples/test1')
     ort_test_dir_utils.run_test_dir('temp/examples/test2/expand_elimination.onnx')
-except:
+except Exception:
     print("Exception:", sys.exc_info()[1])
 ```
diff --git a/tools/python/example_operator_perf_test.py b/tools/python/example_operator_perf_test.py
index 50a3edd5c9b27..41c0d605bc636 100644
--- a/tools/python/example_operator_perf_test.py
+++ b/tools/python/example_operator_perf_test.py
@@ -99,7 +99,7 @@ def run_test():
     # run the model and measure time after 'iters' calls
     while total < num_seconds:
         start = time.time_ns()
-        for i in range(iters):
+        for _ in range(iters):
             # ignore the outputs as we're not validating them in a performance test
             sess.run(None, inputs)
         end = time.time_ns()
diff --git a/tools/python/gen_contrib_doc.py
b/tools/python/gen_contrib_doc.py index 15e7f65d093d9..b8a5a943ab698 100644 --- a/tools/python/gen_contrib_doc.py +++ b/tools/python/gen_contrib_doc.py @@ -2,7 +2,6 @@ # This file is copied and adapted from https://github.com/onnx/onnx repository. # There was no copyright statement on the file at the time of copying. -from __future__ import absolute_import, division, print_function, unicode_literals import argparse import io @@ -10,9 +9,9 @@ import pathlib import sys from collections import defaultdict -from typing import Any, Dict, List, Sequence, Set, Text, Tuple +from typing import Any, DefaultDict, List, Sequence, Set, Text, Tuple -import numpy as np # type: ignore +import numpy as np from onnx import AttributeProto, FunctionProto import onnxruntime.capi.onnxruntime_pybind11_state as rtpy @@ -29,13 +28,13 @@ ext = ".md" -def display_number(v): # type: (int) -> Text +def display_number(v: int) -> str: if OpSchema.is_infinite(v): return "∞" return Text(v) -def should_render_domain(domain, domain_filter): # type: (Text) -> bool +def should_render_domain(domain, domain_filter) -> bool: if domain == ONNX_DOMAIN or domain == "" or domain == ONNX_ML_DOMAIN or domain == "ai.onnx.ml": return False @@ -45,18 +44,18 @@ def should_render_domain(domain, domain_filter): # type: (Text) -> bool return True -def format_name_with_domain(domain, schema_name): # type: (Text, Text) -> Text +def format_name_with_domain(domain: str, schema_name: str) -> str: if domain: return "{}.{}".format(domain, schema_name) else: return schema_name -def format_name_with_version(schema_name, version): # type: (Text, Text) -> Text +def format_name_with_version(schema_name: str, version: str) -> str: return "{}-{}".format(schema_name, version) -def display_attr_type(v): # type: (OpSchema.AttrType) -> Text +def display_attr_type(v: OpSchema.AttrType) -> str: assert isinstance(v, OpSchema.AttrType) s = Text(v) s = s[s.rfind(".") + 1 :].lower() @@ -65,33 +64,33 @@ def display_attr_type(v): # type: (OpSchema.AttrType) -> Text return s -def display_domain(domain): # type: (Text) -> Text +def display_domain(domain: str) -> str: if domain: return "the '{}' operator set".format(domain) else: return "the default ONNX operator set" -def display_domain_short(domain): # type: (Text) -> Text +def display_domain_short(domain: str) -> str: if domain: return domain else: return "ai.onnx (default)" -def display_version_link(name, version): # type: (Text, int) -> Text +def display_version_link(name: str, version: int) -> str: changelog_md = "Changelog" + ext name_with_ver = "{}-{}".format(name, version) return '{}'.format(changelog_md, name_with_ver, name_with_ver) -def display_function_version_link(name, version): # type: (Text, int) -> Text +def display_function_version_link(name: str, version: int) -> str: changelog_md = "FunctionsChangelog" + ext name_with_ver = "{}-{}".format(name, version) return '{}'.format(changelog_md, name_with_ver, name_with_ver) -def get_attribute_value(attr): # type: (AttributeProto) -> Any +def get_attribute_value(attr: AttributeProto) -> Any: if attr.HasField("f"): return attr.f elif attr.HasField("i"): @@ -116,7 +115,7 @@ def get_attribute_value(attr): # type: (AttributeProto) -> Any raise ValueError("Unsupported ONNX attribute: {}".format(attr)) -def display_schema(schema, versions): # type: (OpSchema, Sequence[OpSchema]) -> Text +def display_schema(schema: OpSchema, versions: Sequence[OpSchema]) -> str: s = "" # doc @@ -163,7 +162,7 @@ def display_schema(schema, versions): # type: (OpSchema, 
Sequence[OpSchema]) -> elif hasattr(attr, "default_value") and attr.default_value.name: default_value = get_attribute_value(attr.default_value) - def format_value(value): # type: (Any) -> Text + def format_value(value: Any) -> str: if isinstance(value, float): value = np.round(value, 5) if isinstance(value, (bytes, bytearray)) and sys.version_info[0] == 3: @@ -247,7 +246,7 @@ def format_value(value): # type: (Any) -> Text return s -def display_function(function, versions, domain=ONNX_DOMAIN): # type: (FunctionProto, List[int], Text) -> Text +def display_function(function: FunctionProto, versions: List[int], domain: str = ONNX_DOMAIN) -> str: s = "" if domain: @@ -303,15 +302,10 @@ def display_function(function, versions, domain=ONNX_DOMAIN): # type: (Function return s -def support_level_str(level): # type: (OpSchema.SupportType) -> Text +def support_level_str(level: OpSchema.SupportType) -> str: return "experimental " if level == OpSchema.SupportType.EXPERIMENTAL else "" -# def function_status_str(status=OperatorStatus.Value("EXPERIMENTAL")): # type: ignore -# return \ -# "experimental " if status == OperatorStatus.Value('EXPERIMENTAL') else "" # type: ignore - - def main(output_path: str, domain_filter: [str]): with io.open(output_path, "w", newline="", encoding="utf-8") as fout: @@ -323,9 +317,9 @@ def main(output_path: str, domain_filter: [str]): ) # domain -> support level -> name -> [schema] - index = defaultdict( + index: DefaultDict[Text, DefaultDict[int, DefaultDict[Text, List[OpSchema]]]] = defaultdict( lambda: defaultdict(lambda: defaultdict(list)) - ) # type: Dict[Text, Dict[int, Dict[Text, List[OpSchema]]]] # noqa: E501 + ) for schema in rtpy.get_all_operator_schema(): index[schema.domain][int(schema.support_level)][schema.name].append(schema) @@ -334,10 +328,8 @@ def main(output_path: str, domain_filter: [str]): # Preprocess the Operator Schemas # [(domain, [(support_level, [(schema name, current schema, all versions schemas)])])] - operator_schemas = ( - list() - ) # type: List[Tuple[Text, List[Tuple[int, List[Tuple[Text, OpSchema, List[OpSchema]]]]]]] # noqa: E501 - exsting_ops = set() # type: Set[Text] + operator_schemas: List[Tuple[Text, List[Tuple[int, List[Tuple[Text, OpSchema, List[OpSchema]]]]]]] = [] + exsting_ops: Set[str] = set() for domain, _supportmap in sorted(index.items()): if not should_render_domain(domain, domain_filter): continue @@ -361,7 +353,7 @@ def main(output_path: str, domain_filter: [str]): fout.write(s) for _, namemap in supportmap: - for n, schema, versions in namemap: + for n, schema, _ in namemap: s = ' * {}{}\n'.format( support_level_str(schema.support_level), format_name_with_domain(domain, n), diff --git a/tools/python/sparsify_initializers.py b/tools/python/sparsify_initializers.py index 17bddae6bbe40..7a5fba7429166 100644 --- a/tools/python/sparsify_initializers.py +++ b/tools/python/sparsify_initializers.py @@ -55,8 +55,8 @@ def setup_logging(verbose): # type: (bool) -> None def convert_tensor_to_sparse( - tensor, sparsity_threshold, tolerance -): # type: (TensorProto, float, float) -> Tuple[SparseTensorProto, float] + tensor: TensorProto, sparsity_threshold: float, tolerance: float +) -> Tuple[SparseTensorProto, float]: """returns a tuple of sparse_tensor and sparsity level""" values = [] indices = [] @@ -141,8 +141,8 @@ def convert_tensor_to_sparse( def convert_initializers( - model, exclude_names, sparsity_threshold, tolerance -): # type: (ModelProto, List[str], float, float) -> None + model: ModelProto, exclude_names: List[str], 
sparsity_threshold: float, tolerance: float +) -> None: graph = model.graph converted_sparse = [] remaining_initializers = [] diff --git a/tools/python/util/ort_format_model/operator_type_usage_processors.py b/tools/python/util/ort_format_model/operator_type_usage_processors.py index 8f21298518f87..3aeeedb9c5732 100644 --- a/tools/python/util/ort_format_model/operator_type_usage_processors.py +++ b/tools/python/util/ort_format_model/operator_type_usage_processors.py @@ -1,9 +1,10 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +from __future__ import annotations import json -import typing from abc import ABC, abstractmethod +from typing import Dict, Optional, Set import ort_flatbuffers_py.fbs as fbs @@ -65,9 +66,7 @@ def __init__(self, domain: str, optype: str): def process_node(self, node: fbs.Node, value_name_to_typeinfo: dict): pass - def is_typed_registration_needed( - self, type_in_registration: str, globally_allowed_types: typing.Optional[typing.Set[str]] - ): + def is_typed_registration_needed(self, type_in_registration: str, globally_allowed_types: Optional[Set[str]]): """ Given the string from a kernel registration, determine if the registration is required or not. :param type_in_registration: Type string from kernel registration @@ -113,10 +112,10 @@ def __init__( self, domain: str, optype: str, - inputs: [int] = [0], - outputs: [int] = [], - required_input_types: typing.Dict[int, typing.Set[str]] = {}, - required_output_types: typing.Dict[int, typing.Set[str]] = {}, + inputs: Optional[list[int]] = None, + outputs: Optional[list[int]] = None, + required_input_types: Optional[Dict[int, Set[str]]] = None, + required_output_types: Optional[Dict[int, Set[str]]] = None, ): """ Create DefaultTypeUsageProcessor. Types for one or more inputs and/or outputs can be tracked by the processor. @@ -134,6 +133,10 @@ def __init__( :param required_output_types: Required output types. May be empty. """ super().__init__(domain, optype) + inputs = inputs or [0] + outputs = outputs or [] + required_input_types = required_input_types or {} + required_output_types = required_output_types or {} self._input_types = {} self._output_types = {} @@ -190,9 +193,7 @@ def process_node(self, node: fbs.Node, value_name_to_typeinfo: dict): type_str = value_name_to_typestr(node.Outputs(o), value_name_to_typeinfo) self._output_types[o].add(type_str) - def is_typed_registration_needed( - self, type_in_registration: str, globally_allowed_types: typing.Optional[typing.Set[str]] - ): + def is_typed_registration_needed(self, type_in_registration: str, globally_allowed_types: Optional[Set[str]]): if 0 not in self._input_types.keys(): # currently all standard typed registrations are for input 0. # custom registrations can be handled by operator specific processors (e.g. OneHotProcessor below). @@ -266,9 +267,7 @@ def __init__(self, domain: str, optype: str): # init with tracking of input 1 only. super().__init__(domain, optype, inputs=[1], outputs=[]) - def is_typed_registration_needed( - self, type_in_registration: str, globally_allowed_types: typing.Optional[typing.Set[str]] - ): + def is_typed_registration_needed(self, type_in_registration: str, globally_allowed_types: Optional[Set[str]]): return self.is_input_type_enabled(type_in_registration, 1, globally_allowed_types) @@ -281,9 +280,7 @@ def __init__(self, domain: str, optype: str): # init with tracking of output 0 only. 
super().__init__(domain, optype, inputs=[], outputs=[0]) - def is_typed_registration_needed( - self, type_in_registration: str, globally_allowed_types: typing.Optional[typing.Set[str]] - ): + def is_typed_registration_needed(self, type_in_registration: str, globally_allowed_types: Optional[Set[str]]): return self.is_output_type_enabled(type_in_registration, 0, globally_allowed_types) @@ -305,9 +302,7 @@ def process_node(self, node: fbs.Node, value_name_to_typeinfo: dict): key = (type0, type2, type1) self._triples.add(key) - def is_typed_registration_needed( - self, type_in_registration: str, globally_allowed_types: typing.Optional[typing.Set[str]] - ): + def is_typed_registration_needed(self, type_in_registration: str, globally_allowed_types: Optional[Set[str]]): # the OneHot registration involves a concatenation of the 3 types involved reg_types = tuple([_reg_type_to_cpp_type(reg_type) for reg_type in _split_reg_types(type_in_registration)]) if globally_allowed_types is not None: @@ -640,7 +635,7 @@ class GloballyAllowedTypesOpTypeImplFilter(OpTypeImplFilterInterface): _valid_allowed_types = set(FbsTypeInfo.tensordatatype_to_string.values()) - def __init__(self, globally_allowed_types: typing.Set[str]): + def __init__(self, globally_allowed_types: Set[str]): self._operator_processors = _create_operator_type_usage_processors() if not globally_allowed_types.issubset(self._valid_allowed_types): diff --git a/tools/python/util/ort_format_model/ort_model_processor.py b/tools/python/util/ort_format_model/ort_model_processor.py index 7c65930e4cd0e..8fd228e1f4627 100644 --- a/tools/python/util/ort_format_model/ort_model_processor.py +++ b/tools/python/util/ort_format_model/ort_model_processor.py @@ -25,7 +25,7 @@ def __init__(self, model_path: str, required_ops: dict, processors: OperatorType self._op_type_processors = processors @staticmethod - def _setup_type_info(graph: fbs.Graph, outer_scope_value_typeinfo={}): + def _setup_type_info(graph: fbs.Graph, outer_scope_value_typeinfo=None): """ Setup the node args for this level of Graph. We copy the current list which represents the outer scope values, and add the local node args to that @@ -34,6 +34,7 @@ def _setup_type_info(graph: fbs.Graph, outer_scope_value_typeinfo={}): :param outer_scope_value_typeinfo: TypeInfo for outer scope values. Empty for the top-level graph in a model. :return: Dictionary of NodeArg name to TypeInfo """ + outer_scope_value_typeinfo = outer_scope_value_typeinfo or {} value_name_to_typeinfo = outer_scope_value_typeinfo.copy() for j in range(0, graph.NodeArgsLength()): n = graph.NodeArgs(j) diff --git a/tools/python/util/run.py b/tools/python/util/run.py index c3a389233ff72..98724ae956402 100644 --- a/tools/python/util/run.py +++ b/tools/python/util/run.py @@ -46,7 +46,7 @@ def run( def output(is_stream_captured): return subprocess.PIPE if is_stream_captured else (subprocess.DEVNULL if quiet else None) - completed_process = subprocess.run( + completed_process = subprocess.run( # noqa: DUO116 cmd, cwd=cwd, check=check,
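Editor's note: several hunks above (check_date_format, reformat_run_count, and the PythonTools.md snippet) replace bare `except:` clauses with `except Exception:`. As a minimal illustration of the difference, here is a simplified, self-contained version of check_date_format from one of the hunks above; the surrounding code is not part of the PR:

```python
# Simplified sketch of the bare-except fix applied in this PR; illustrative only.
import datetime


def check_date_format(date_str: str) -> bool:
    try:
        datetime.datetime.strptime(date_str, "%Y%m%d")
        return True
    except Exception:  # unlike a bare `except:`, this lets KeyboardInterrupt and SystemExit propagate
        return False


print(check_date_format("20230115"))   # True
print(check_date_format("not-a-date"))  # False
```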
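Editor's note: the run_subprocess, DefaultTypeUsageProcessor.__init__, and _setup_type_info changes above all replace mutable default arguments (`env={}`, `inputs=[0]`, `outer_scope_value_typeinfo={}`) with `None` defaults plus an `x = x or default` line in the body. The following hypothetical functions, not taken from the PR, sketch why that matters:

```python
# Hypothetical example of the mutable-default-argument pitfall the hunks above fix.
def append_bad(item, items=[]):
    # The single default list is created once and shared across every call.
    items.append(item)
    return items


def append_good(item, items=None):
    # Pattern used in the PR: default to None and create a fresh object per call.
    items = items or []
    items.append(item)
    return items


print(append_bad(1), append_bad(2))    # [1, 2] [1, 2]  <- state leaks between calls
print(append_good(1), append_good(2))  # [1] [2]
```

Note that `items or []` also replaces an explicitly passed empty list; that is harmless in these cases, but `if items is None:` is the stricter variant when an empty container must be preserved.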