diff --git a/compiler/rustc_codegen_cranelift/.github/workflows/abi-cafe.yml b/compiler/rustc_codegen_cranelift/.github/workflows/abi-cafe.yml
index 12aa69d3c7956..bd3b051185b46 100644
--- a/compiler/rustc_codegen_cranelift/.github/workflows/abi-cafe.yml
+++ b/compiler/rustc_codegen_cranelift/.github/workflows/abi-cafe.yml
@@ -35,6 +35,10 @@ jobs:
     steps:
     - uses: actions/checkout@v3

+    - name: CPU features
+      if: matrix.os == 'ubuntu-latest'
+      run: cat /proc/cpuinfo
+
     - name: Cache cargo target dir
       uses: actions/cache@v3
       with:
diff --git a/compiler/rustc_codegen_cranelift/.github/workflows/main.yml b/compiler/rustc_codegen_cranelift/.github/workflows/main.yml
index 47d9a3b93f729..05dc28d074530 100644
--- a/compiler/rustc_codegen_cranelift/.github/workflows/main.yml
+++ b/compiler/rustc_codegen_cranelift/.github/workflows/main.yml
@@ -66,6 +66,10 @@ jobs:
     steps:
     - uses: actions/checkout@v3

+    - name: CPU features
+      if: matrix.os == 'ubuntu-latest'
+      run: cat /proc/cpuinfo
+
     - name: Cache cargo target dir
       uses: actions/cache@v3
       with:
@@ -136,6 +140,9 @@ jobs:
     steps:
     - uses: actions/checkout@v3

+    - name: CPU features
+      run: cat /proc/cpuinfo
+
     - name: Prepare dependencies
       run: ./y.sh prepare

@@ -159,6 +166,9 @@ jobs:
     steps:
     - uses: actions/checkout@v3

+    - name: CPU features
+      run: cat /proc/cpuinfo
+
     - name: Cache cargo target dir
       uses: actions/cache@v3
       with:
diff --git a/compiler/rustc_codegen_cranelift/.github/workflows/rustc.yml b/compiler/rustc_codegen_cranelift/.github/workflows/rustc.yml
index b49dc3aff7aaa..cb5dd51fee310 100644
--- a/compiler/rustc_codegen_cranelift/.github/workflows/rustc.yml
+++ b/compiler/rustc_codegen_cranelift/.github/workflows/rustc.yml
@@ -11,6 +11,9 @@ jobs:
     steps:
     - uses: actions/checkout@v3

+    - name: CPU features
+      run: cat /proc/cpuinfo
+
     - name: Cache cargo target dir
       uses: actions/cache@v3
       with:
@@ -31,6 +34,9 @@ jobs:
     steps:
     - uses: actions/checkout@v3

+    - name: CPU features
+      run: cat /proc/cpuinfo
+
     - name: Cache cargo target dir
       uses: actions/cache@v3
       with:
diff --git a/compiler/rustc_codegen_cranelift/Readme.md b/compiler/rustc_codegen_cranelift/Readme.md
index 1a2b2bbc58812..ca6ecdf1d0e88 100644
--- a/compiler/rustc_codegen_cranelift/Readme.md
+++ b/compiler/rustc_codegen_cranelift/Readme.md
@@ -5,8 +5,48 @@ This has the potential to improve compilation times in debug mode.
 If your project doesn't use any of the things listed under "Not yet supported",
 it should work fine. If not please open an issue.

+## Download using Rustup
+
+The Cranelift codegen backend is distributed in nightly builds on Linux and x86_64 macOS. If you want to
+install it using Rustup, you can do that by running:
+
+```bash
+$ rustup component add rustc-codegen-cranelift-preview --toolchain nightly
+```
+
+Once it is installed, you can enable it with one of the following approaches:
+- `CARGO_PROFILE_DEV_CODEGEN_BACKEND=cranelift cargo +nightly build -Zcodegen-backend`
+- `RUSTFLAGS="-Zcodegen-backend=cranelift" cargo +nightly build`
+- Add the following to `.cargo/config.toml`:
+  ```toml
+  [unstable]
+  codegen-backend = true
+
+  [profile.dev]
+  codegen-backend = "cranelift"
+  ```
+- Add the following to `Cargo.toml`:
+  ```toml
+  # This line needs to come before anything else in Cargo.toml
+  cargo-features = ["codegen-backend"]
+
+  [profile.dev]
+  codegen-backend = "cranelift"
+  ```
+
+## Precompiled builds
+
+You can also download a pre-built version from the [releases] page.
+Extract the `dist` directory in the archive anywhere you want.
+If you want to use `cargo clif build` instead of having to specify the full path to the `cargo-clif` executable, you can add the `bin` subdirectory of the extracted `dist` directory to your `PATH`.
+(tutorial [for Windows](https://stackoverflow.com/a/44272417), and [for Linux/MacOS](https://unix.stackexchange.com/questions/26047/how-to-correctly-add-a-path-to-path/26059#26059)).
+
+[releases]: https://github.com/rust-lang/rustc_codegen_cranelift/releases/tag/dev
+
 ## Building and testing

+If you want to build the backend manually, you can clone it from GitHub and build it yourself:
+
 ```bash
 $ git clone https://github.com/rust-lang/rustc_codegen_cranelift
 $ cd rustc_codegen_cranelift
@@ -22,15 +62,6 @@ $ ./test.sh

 For more docs on how to build and test see [build_system/usage.txt](build_system/usage.txt) or the help message of `./y.sh`.

-## Precompiled builds
-
-Alternatively you can download a pre built version from the [releases] page.
-Extract the `dist` directory in the archive anywhere you want.
-If you want to use `cargo clif build` instead of having to specify the full path to the `cargo-clif` executable, you can add the `bin` subdirectory of the extracted `dist` directory to your `PATH`.
-(tutorial [for Windows](https://stackoverflow.com/a/44272417), and [for Linux/MacOS](https://unix.stackexchange.com/questions/26047/how-to-correctly-add-a-path-to-path/26059#26059)).
-
-[releases]: https://github.com/rust-lang/rustc_codegen_cranelift/releases/tag/dev
-
 ## Usage

 rustc_codegen_cranelift can be used as a near-drop-in replacement for `cargo build` or `cargo run` for existing projects.
diff --git a/compiler/rustc_codegen_cranelift/rust-toolchain b/compiler/rustc_codegen_cranelift/rust-toolchain
index b832b06e0ffba..80ef1e49f2319 100644
--- a/compiler/rustc_codegen_cranelift/rust-toolchain
+++ b/compiler/rustc_codegen_cranelift/rust-toolchain
@@ -1,3 +1,3 @@
 [toolchain]
-channel = "nightly-2023-11-10"
+channel = "nightly-2023-11-16"
 components = ["rust-src", "rustc-dev", "llvm-tools"]
diff --git a/compiler/rustc_codegen_cranelift/scripts/rustup.sh b/compiler/rustc_codegen_cranelift/scripts/rustup.sh
index e62788f2e507d..355282911c255 100755
--- a/compiler/rustc_codegen_cranelift/scripts/rustup.sh
+++ b/compiler/rustc_codegen_cranelift/scripts/rustup.sh
@@ -46,7 +46,7 @@ case $1 in
         git pull origin master
         branch=sync_cg_clif-$(date +%Y-%m-%d)
         git checkout -b "$branch"
-        "$cg_clif/git-fixed-subtree.sh" pull --prefix=compiler/rustc_codegen_cranelift/ https://github.com/bjorn3/rustc_codegen_cranelift.git master
+        "$cg_clif/git-fixed-subtree.sh" pull --prefix=compiler/rustc_codegen_cranelift/ https://github.com/rust-lang/rustc_codegen_cranelift.git master
         git push -u my "$branch"

         # immediately merge the merge commit into cg_clif to prevent merge conflicts when syncing
diff --git a/compiler/rustc_codegen_cranelift/scripts/setup_rust_fork.sh b/compiler/rustc_codegen_cranelift/scripts/setup_rust_fork.sh
index bbb8a010d965f..731828caae2c4 100644
--- a/compiler/rustc_codegen_cranelift/scripts/setup_rust_fork.sh
+++ b/compiler/rustc_codegen_cranelift/scripts/setup_rust_fork.sh
@@ -1,15 +1,17 @@
 #!/usr/bin/env bash
 set -e

+# CG_CLIF_FORCE_GNU_AS will force usage of as instead of the LLVM backend of rustc, as
+# the LLVM backend isn't compiled in here.
+export CG_CLIF_FORCE_GNU_AS=1
+
 # Compiletest expects all standard library paths to start with /rustc/FAKE_PREFIX.
 # CG_CLIF_STDLIB_REMAP_PATH_PREFIX will cause cg_clif's build system to pass
 # --remap-path-prefix to handle this.
-# CG_CLIF_FORCE_GNU_AS will force usage of as instead of the LLVM backend of rustc as we
-# the LLVM backend isn't compiled in here.
-CG_CLIF_FORCE_GNU_AS=1 CG_CLIF_STDLIB_REMAP_PATH_PREFIX=/rustc/FAKE_PREFIX ./y.sh build
+CG_CLIF_STDLIB_REMAP_PATH_PREFIX=/rustc/FAKE_PREFIX ./y.sh build

 echo "[SETUP] Rust fork"
-git clone https://github.com/rust-lang/rust.git || true
+git clone https://github.com/rust-lang/rust.git --filter=tree:0 || true
 pushd rust
 git fetch
 git checkout -- .
diff --git a/compiler/rustc_codegen_cranelift/scripts/test_bootstrap.sh b/compiler/rustc_codegen_cranelift/scripts/test_bootstrap.sh
index a8f6d7a202486..791d457993de3 100755
--- a/compiler/rustc_codegen_cranelift/scripts/test_bootstrap.sh
+++ b/compiler/rustc_codegen_cranelift/scripts/test_bootstrap.sh
@@ -11,7 +11,5 @@ rm -r compiler/rustc_codegen_cranelift/{Cargo.*,src}
 cp ../Cargo.* compiler/rustc_codegen_cranelift/
 cp -r ../src compiler/rustc_codegen_cranelift/src

-# CG_CLIF_FORCE_GNU_AS will force usage of as instead of the LLVM backend of rustc as we
-# the LLVM backend isn't compiled in here.
-CG_CLIF_FORCE_GNU_AS=1 ./x.py build --stage 1 library/std
+./x.py build --stage 1 library/std
 popd
diff --git a/compiler/rustc_codegen_cranelift/src/abi/mod.rs b/compiler/rustc_codegen_cranelift/src/abi/mod.rs
index c4572e035258d..0ff1473da4313 100644
--- a/compiler/rustc_codegen_cranelift/src/abi/mod.rs
+++ b/compiler/rustc_codegen_cranelift/src/abi/mod.rs
@@ -383,6 +383,7 @@ pub(crate) fn codegen_terminator_call<'tcx>(
                     args,
                     ret_place,
                     target,
+                    source_info.span,
                 );
                 return;
             }
diff --git a/compiler/rustc_codegen_cranelift/src/base.rs b/compiler/rustc_codegen_cranelift/src/base.rs
index 91b1547cb6ea6..71557d49ef2c3 100644
--- a/compiler/rustc_codegen_cranelift/src/base.rs
+++ b/compiler/rustc_codegen_cranelift/src/base.rs
@@ -456,7 +456,7 @@ fn codegen_fn_body(fx: &mut FunctionCx<'_, '_, '_>, start_block: Block) {
                 );
             }

-            crate::inline_asm::codegen_inline_asm(
+            crate::inline_asm::codegen_inline_asm_terminator(
                 fx,
                 source_info.span,
                 template,
diff --git a/compiler/rustc_codegen_cranelift/src/constant.rs b/compiler/rustc_codegen_cranelift/src/constant.rs
index b0853d30e03b8..cf68a3857c58c 100644
--- a/compiler/rustc_codegen_cranelift/src/constant.rs
+++ b/compiler/rustc_codegen_cranelift/src/constant.rs
@@ -1,10 +1,13 @@
 //! Handling of `static`s, `const`s and promoted allocations

+use std::cmp::Ordering;
+
 use cranelift_module::*;
 use rustc_data_structures::fx::{FxHashMap, FxHashSet};
 use rustc_middle::middle::codegen_fn_attrs::CodegenFnAttrFlags;
 use rustc_middle::mir::interpret::{read_target_uint, AllocId, GlobalAlloc, Scalar};
 use rustc_middle::mir::ConstValue;
+use rustc_middle::ty::ScalarInt;

 use crate::prelude::*;

@@ -430,9 +433,9 @@ fn define_all_allocs(tcx: TyCtxt<'_>, module: &mut dyn Module, cx: &mut Constant
 pub(crate) fn mir_operand_get_const_val<'tcx>(
     fx: &FunctionCx<'_, '_, 'tcx>,
     operand: &Operand<'tcx>,
-) -> Option<ConstValue<'tcx>> {
+) -> Option<ScalarInt> {
     match operand {
-        Operand::Constant(const_) => Some(eval_mir_constant(fx, const_).0),
+        Operand::Constant(const_) => eval_mir_constant(fx, const_).0.try_to_scalar_int(),
         // FIXME(rust-lang/rust#85105): Casts like `IMM8 as u32` result in the const being stored
         // inside a temporary before being passed to the intrinsic requiring the const argument.
         // This code tries to find a single constant defining definition of the referenced local.
@@ -440,7 +443,7 @@ pub(crate) fn mir_operand_get_const_val<'tcx>(
             if !place.projection.is_empty() {
                 return None;
             }
-            let mut computed_const_val = None;
+            let mut computed_scalar_int = None;
             for bb_data in fx.mir.basic_blocks.iter() {
                 for stmt in &bb_data.statements {
                     match &stmt.kind {
@@ -456,22 +459,38 @@ pub(crate) fn mir_operand_get_const_val<'tcx>(
                                     operand,
                                     ty,
                                 ) => {
-                                    if computed_const_val.is_some() {
+                                    if computed_scalar_int.is_some() {
                                         return None; // local assigned twice
                                     }
                                     if !matches!(ty.kind(), ty::Uint(_) | ty::Int(_)) {
                                         return None;
                                     }
-                                    let const_val = mir_operand_get_const_val(fx, operand)?;
-                                    if fx.layout_of(*ty).size
-                                        != const_val.try_to_scalar_int()?.size()
+                                    let scalar_int = mir_operand_get_const_val(fx, operand)?;
+                                    let scalar_int = match fx
+                                        .layout_of(*ty)
+                                        .size
+                                        .cmp(&scalar_int.size())
                                     {
-                                        return None;
-                                    }
-                                    computed_const_val = Some(const_val);
+                                        Ordering::Equal => scalar_int,
+                                        Ordering::Less => match ty.kind() {
+                                            ty::Uint(_) => ScalarInt::try_from_uint(
+                                                scalar_int.try_to_uint(scalar_int.size()).unwrap(),
+                                                fx.layout_of(*ty).size,
+                                            )
+                                            .unwrap(),
+                                            ty::Int(_) => ScalarInt::try_from_int(
+                                                scalar_int.try_to_int(scalar_int.size()).unwrap(),
+                                                fx.layout_of(*ty).size,
+                                            )
+                                            .unwrap(),
+                                            _ => unreachable!(),
+                                        },
+                                        Ordering::Greater => return None,
+                                    };
+                                    computed_scalar_int = Some(scalar_int);
                                 }
                                 Rvalue::Use(operand) => {
-                                    computed_const_val = mir_operand_get_const_val(fx, operand)
+                                    computed_scalar_int = mir_operand_get_const_val(fx, operand)
                                 }
                                 _ => return None,
                             }
@@ -522,7 +541,7 @@ pub(crate) fn mir_operand_get_const_val<'tcx>(
                     TerminatorKind::Call { .. } => {}
                 }
             }
-            computed_const_val
+            computed_scalar_int
         }
     }
 }
diff --git a/compiler/rustc_codegen_cranelift/src/inline_asm.rs b/compiler/rustc_codegen_cranelift/src/inline_asm.rs
index ce0eecca8a8ba..25d14319f5791 100644
--- a/compiler/rustc_codegen_cranelift/src/inline_asm.rs
+++ b/compiler/rustc_codegen_cranelift/src/inline_asm.rs
@@ -10,7 +10,7 @@ use target_lexicon::BinaryFormat;

 use crate::prelude::*;

-enum CInlineAsmOperand<'tcx> {
+pub(crate) enum CInlineAsmOperand<'tcx> {
     In {
         reg: InlineAsmRegOrRegClass,
         value: Value,
@@ -34,7 +34,7 @@ enum CInlineAsmOperand<'tcx> {
     },
 }

-pub(crate) fn codegen_inline_asm<'tcx>(
+pub(crate) fn codegen_inline_asm_terminator<'tcx>(
     fx: &mut FunctionCx<'_, '_, 'tcx>,
     span: Span,
     template: &[InlineAsmTemplatePiece],
@@ -42,8 +42,6 @@ pub(crate) fn codegen_inline_asm<'tcx>(
     options: InlineAsmOptions,
     destination: Option<mir::BasicBlock>,
 ) {
-    // FIXME add .eh_frame unwind info directives
-
     // Used by panic_abort on Windows, but uses a syntax which only happens to work with
     // asm!() by accident and breaks with the GNU assembler as well as global_asm!() for
     // the LLVM backend.
@@ -135,15 +133,33 @@ pub(crate) fn codegen_inline_asm<'tcx>(
         })
         .collect::<Vec<_>>();

-    let mut inputs = Vec::new();
-    let mut outputs = Vec::new();
+    codegen_inline_asm_inner(fx, template, &operands, options);
+
+    match destination {
+        Some(destination) => {
+            let destination_block = fx.get_block(destination);
+            fx.bcx.ins().jump(destination_block, &[]);
+        }
+        None => {
+            fx.bcx.ins().trap(TrapCode::UnreachableCodeReached);
+        }
+    }
+}
+
+pub(crate) fn codegen_inline_asm_inner<'tcx>(
+    fx: &mut FunctionCx<'_, '_, 'tcx>,
+    template: &[InlineAsmTemplatePiece],
+    operands: &[CInlineAsmOperand<'tcx>],
+    options: InlineAsmOptions,
+) {
+    // FIXME add .eh_frame unwind info directives

     let mut asm_gen = InlineAssemblyGenerator {
         tcx: fx.tcx,
         arch: fx.tcx.sess.asm_arch.unwrap(),
         enclosing_def_id: fx.instance.def_id(),
         template,
-        operands: &operands,
+        operands,
         options,
         registers: Vec::new(),
         stack_slots_clobber: Vec::new(),
@@ -165,6 +181,8 @@ pub(crate) fn codegen_inline_asm<'tcx>(
     let generated_asm = asm_gen.generate_asm_wrapper(&asm_name);
     fx.cx.global_asm.push_str(&generated_asm);

+    let mut inputs = Vec::new();
+    let mut outputs = Vec::new();
     for (i, operand) in operands.iter().enumerate() {
         match operand {
             CInlineAsmOperand::In { reg: _, value } => {
@@ -186,16 +204,6 @@ pub(crate) fn codegen_inline_asm<'tcx>(
     }

     call_inline_asm(fx, &asm_name, asm_gen.stack_slot_size, inputs, outputs);
-
-    match destination {
-        Some(destination) => {
-            let destination_block = fx.get_block(destination);
-            fx.bcx.ins().jump(destination_block, &[]);
-        }
-        None => {
-            fx.bcx.ins().trap(TrapCode::UnreachableCodeReached);
-        }
-    }
 }

 struct InlineAssemblyGenerator<'a, 'tcx> {
@@ -637,8 +645,21 @@ impl<'tcx> InlineAssemblyGenerator<'_, 'tcx> {
     ) {
         match arch {
             InlineAsmArch::X86_64 => {
-                write!(generated_asm, "    mov [rbx+0x{:x}], ", offset.bytes()).unwrap();
-                reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap();
+                match reg {
+                    InlineAsmReg::X86(reg)
+                        if reg as u32 >= X86InlineAsmReg::xmm0 as u32
+                            && reg as u32 <= X86InlineAsmReg::xmm15 as u32 =>
+                    {
+                        // rustc emits x0 rather than xmm0
+                        write!(generated_asm, "    movups [rbx+0x{:x}], ", offset.bytes()).unwrap();
+                        write!(generated_asm, "xmm{}", reg as u32 - X86InlineAsmReg::xmm0 as u32)
+                            .unwrap();
+                    }
+                    _ => {
+                        write!(generated_asm, "    mov [rbx+0x{:x}], ", offset.bytes()).unwrap();
+                        reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap();
+                    }
+                }
                 generated_asm.push('\n');
             }
             InlineAsmArch::AArch64 => {
@@ -663,8 +684,24 @@ impl<'tcx> InlineAssemblyGenerator<'_, 'tcx> {
     ) {
         match arch {
             InlineAsmArch::X86_64 => {
-                generated_asm.push_str("    mov ");
-                reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap();
+                match reg {
+                    InlineAsmReg::X86(reg)
+                        if reg as u32 >= X86InlineAsmReg::xmm0 as u32
+                            && reg as u32 <= X86InlineAsmReg::xmm15 as u32 =>
+                    {
+                        // rustc emits x0 rather than xmm0
+                        write!(
+                            generated_asm,
+                            "    movups xmm{}",
+                            reg as u32 - X86InlineAsmReg::xmm0 as u32
+                        )
+                        .unwrap();
+                    }
+                    _ => {
+                        generated_asm.push_str("    mov ");
+                        reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap()
+                    }
+                }
                 writeln!(generated_asm, ", [rbx+0x{:x}]", offset.bytes()).unwrap();
             }
             InlineAsmArch::AArch64 => {
@@ -720,7 +757,12 @@ fn call_inline_asm<'tcx>(
     fx.bcx.ins().call(inline_asm_func, &[stack_slot_addr]);

     for (offset, place) in outputs {
-        let ty = fx.clif_type(place.layout().ty).unwrap();
+        let ty = if place.layout().ty.is_simd() {
+            let (lane_count, lane_type) = place.layout().ty.simd_size_and_type(fx.tcx);
+            fx.clif_type(lane_type).unwrap().by(lane_count.try_into().unwrap()).unwrap()
+        } else {
+            fx.clif_type(place.layout().ty).unwrap()
+        };
         let value = stack_slot.offset(fx, i32::try_from(offset.bytes()).unwrap().into()).load(
             fx,
             ty,
@@ -729,83 +771,3 @@ fn call_inline_asm<'tcx>(
         place.write_cvalue(fx, CValue::by_val(value, place.layout()));
     }
 }
-
-pub(crate) fn codegen_xgetbv<'tcx>(
-    fx: &mut FunctionCx<'_, '_, 'tcx>,
-    xcr_no: Value,
-    ret: CPlace<'tcx>,
-) {
-    // FIXME add .eh_frame unwind info directives
-
-    let operands = vec![
-        CInlineAsmOperand::In {
-            reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)),
-            value: xcr_no,
-        },
-        CInlineAsmOperand::Out {
-            reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),
-            late: true,
-            place: Some(ret),
-        },
-        CInlineAsmOperand::Out {
-            reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),
-            late: true,
-            place: None,
-        },
-    ];
-    let options = InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM;
-
-    let mut inputs = Vec::new();
-    let mut outputs = Vec::new();
-
-    let mut asm_gen = InlineAssemblyGenerator {
-        tcx: fx.tcx,
-        arch: fx.tcx.sess.asm_arch.unwrap(),
-        enclosing_def_id: fx.instance.def_id(),
-        template: &[InlineAsmTemplatePiece::String(
-            "
-    xgetbv
-    // out = rdx << 32 | rax
-    shl rdx, 32
-    or rax, rdx
-    "
-            .to_string(),
-        )],
-        operands: &operands,
-        options,
-        registers: Vec::new(),
-        stack_slots_clobber: Vec::new(),
-        stack_slots_input: Vec::new(),
-        stack_slots_output: Vec::new(),
-        stack_slot_size: Size::from_bytes(0),
-    };
-    asm_gen.allocate_registers();
-    asm_gen.allocate_stack_slots();
-
-    let inline_asm_index = fx.cx.inline_asm_index.get();
-    fx.cx.inline_asm_index.set(inline_asm_index + 1);
-    let asm_name = format!(
-        "__inline_asm_{}_n{}",
-        fx.cx.cgu_name.as_str().replace('.', "__").replace('-', "_"),
-        inline_asm_index
-    );
-
-    let generated_asm = asm_gen.generate_asm_wrapper(&asm_name);
-    fx.cx.global_asm.push_str(&generated_asm);
-
-    for (i, operand) in operands.iter().enumerate() {
-        match operand {
-            CInlineAsmOperand::In { reg: _, value } => {
-                inputs.push((asm_gen.stack_slots_input[i].unwrap(), *value));
-            }
-            CInlineAsmOperand::Out { reg: _, late: _, place } => {
-                if let Some(place) = place {
-                    outputs.push((asm_gen.stack_slots_output[i].unwrap(), *place));
-                }
-            }
-            _ => unreachable!(),
-        }
-    }
-
-    call_inline_asm(fx, &asm_name, asm_gen.stack_slot_size, inputs, outputs);
-}
diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs
index e9b7daf14924d..659e6c133ef5e 100644
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs
+++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs
@@ -12,6 +12,7 @@ pub(crate) fn codegen_llvm_intrinsic_call<'tcx>(
     args: &[mir::Operand<'tcx>],
     ret: CPlace<'tcx>,
     target: Option<BasicBlock>,
+    span: Span,
 ) {
     if intrinsic.starts_with("llvm.aarch64") {
         return llvm_aarch64::codegen_aarch64_llvm_intrinsic_call(
@@ -31,6 +32,7 @@
             args,
             ret,
             target,
+            span,
         );
     }

diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
index 4c536048626ee..8dd2b6ed014e6 100644
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
+++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
@@ -1,7 +1,10 @@
 //! Emulate x86 LLVM intrinsics

+use rustc_ast::ast::{InlineAsmOptions, InlineAsmTemplatePiece};
 use rustc_middle::ty::GenericArgsRef;
+use rustc_target::asm::*;

+use crate::inline_asm::{codegen_inline_asm_inner, CInlineAsmOperand};
 use crate::intrinsics::*;
 use crate::prelude::*;

@@ -12,6 +15,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
     args: &[mir::Operand<'tcx>],
     ret: CPlace<'tcx>,
     target: Option<BasicBlock>,
+    span: Span,
 ) {
     match intrinsic {
         "llvm.x86.sse2.pause" | "llvm.aarch64.isb" => {
@@ -24,7 +28,35 @@

             let xcr_no = xcr_no.load_scalar(fx);

-            crate::inline_asm::codegen_xgetbv(fx, xcr_no, ret);
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String(
+                    "
+    xgetbv
+    // out = rdx << 32 | rax
+    shl rdx, 32
+    or rax, rdx
+    "
+                    .to_string(),
+                )],
+                &[
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)),
+                        value: xcr_no,
+                    },
+                    CInlineAsmOperand::Out {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),
+                        late: true,
+                        place: Some(ret),
+                    },
+                    CInlineAsmOperand::Out {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),
+                        late: true,
+                        place: None,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
         }

         "llvm.x86.sse3.ldu.dq" | "llvm.x86.avx.ldu.dq.256" => {
@@ -688,64 +720,278 @@
         "llvm.x86.pclmulqdq" => {
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772
-            intrinsic_args!(fx, args => (a, b, imm8); intrinsic);
+            intrinsic_args!(fx, args => (a, b, _imm8); intrinsic);

-            assert_eq!(a.layout(), b.layout());
-            let layout = a.layout();
+            let a = a.load_scalar(fx);
+            let b = b.load_scalar(fx);

-            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
-            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert_eq!(lane_ty, fx.tcx.types.i64);
-            assert_eq!(ret_lane_ty, fx.tcx.types.i64);
-            assert_eq!(lane_count, 2);
-            assert_eq!(ret_lane_count, 2);
+            let imm8 = if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[2])
+            {
+                imm8
+            } else {
+                fx.tcx.sess.span_fatal(
+                    span,
+                    "Index argument for `_mm_clmulepi64_si128` is not a constant",
+                );
+            };

-            let imm8 = imm8.load_scalar(fx);
+            let imm8 = imm8.try_to_u8().unwrap_or_else(|_| panic!("kind not scalar: {:?}", imm8));

-            let control0 = fx.bcx.ins().band_imm(imm8, 0b0000_0001);
-            let a_lane0 = a.value_lane(fx, 0).load_scalar(fx);
-            let a_lane1 = a.value_lane(fx, 1).load_scalar(fx);
-            let temp1 = fx.bcx.ins().select(control0, a_lane1, a_lane0);
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String(format!("pclmulqdq xmm0, xmm1, {imm8}"))],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: b,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }

-            let control4 = fx.bcx.ins().band_imm(imm8, 0b0001_0000);
-            let b_lane0 = b.value_lane(fx, 0).load_scalar(fx);
-            let b_lane1 = b.value_lane(fx, 1).load_scalar(fx);
-            let temp2 = fx.bcx.ins().select(control4, b_lane1, b_lane0);
+        "llvm.x86.aesni.aeskeygenassist" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128&ig_expand=261
+            intrinsic_args!(fx, args => (a, _imm8); intrinsic);

-            fn extract_bit(fx: &mut FunctionCx<'_, '_, '_>, val: Value, bit: i64) -> Value {
-                let tmp = fx.bcx.ins().ushr_imm(val, bit);
-                fx.bcx.ins().band_imm(tmp, 1)
-            }
+            let a = a.load_scalar(fx);

-            let mut res1 = fx.bcx.ins().iconst(types::I64, 0);
-            for i in 0..=63 {
-                let x = extract_bit(fx, temp1, 0);
-                let y = extract_bit(fx, temp2, i);
-                let mut temp = fx.bcx.ins().band(x, y);
-                for j in 1..=i {
-                    let x = extract_bit(fx, temp1, j);
-                    let y = extract_bit(fx, temp2, i - j);
-                    let z = fx.bcx.ins().band(x, y);
-                    temp = fx.bcx.ins().bxor(temp, z);
-                }
-                let temp = fx.bcx.ins().ishl_imm(temp, i);
-                res1 = fx.bcx.ins().bor(res1, temp);
-            }
-            ret.place_lane(fx, 0).to_ptr().store(fx, res1, MemFlags::trusted());
-
-            let mut res2 = fx.bcx.ins().iconst(types::I64, 0);
-            for i in 64..=127 {
-                let mut temp = fx.bcx.ins().iconst(types::I64, 0);
-                for j in i - 63..=63 {
-                    let x = extract_bit(fx, temp1, j);
-                    let y = extract_bit(fx, temp2, i - j);
-                    let z = fx.bcx.ins().band(x, y);
-                    temp = fx.bcx.ins().bxor(temp, z);
-                }
-                let temp = fx.bcx.ins().ishl_imm(temp, i);
-                res2 = fx.bcx.ins().bor(res2, temp);
-            }
-            ret.place_lane(fx, 1).to_ptr().store(fx, res2, MemFlags::trusted());
+            let imm8 = if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[1])
+            {
+                imm8
+            } else {
+                fx.tcx.sess.span_fatal(
+                    span,
+                    "Index argument for `_mm_aeskeygenassist_si128` is not a constant",
+                );
+            };
+
+            let imm8 = imm8.try_to_u8().unwrap_or_else(|_| panic!("kind not scalar: {:?}", imm8));
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String(format!("aeskeygenassist xmm0, xmm0, {imm8}"))],
+                &[CInlineAsmOperand::InOut {
+                    reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                    _late: true,
+                    in_value: a,
+                    out_place: Some(ret),
+                }],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.aesni.aesimc" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128&ig_expand=260
+            intrinsic_args!(fx, args => (a); intrinsic);
+
+            let a = a.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("aesimc xmm0, xmm0".to_string())],
+                &[CInlineAsmOperand::InOut {
+                    reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                    _late: true,
+                    in_value: a,
+                    out_place: Some(ret),
+                }],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.aesni.aesenc" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenc_si128&ig_expand=252
+            intrinsic_args!(fx, args => (a, round_key); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let round_key = round_key.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("aesenc xmm0, xmm1".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: round_key,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.aesni.aesenclast" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128&ig_expand=257
+            intrinsic_args!(fx, args => (a, round_key); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let round_key = round_key.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("aesenclast xmm0, xmm1".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: round_key,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.aesni.aesdec" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128&ig_expand=242
+            intrinsic_args!(fx, args => (a, round_key); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let round_key = round_key.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("aesdec xmm0, xmm1".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: round_key,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.aesni.aesdeclast" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128&ig_expand=247
+            intrinsic_args!(fx, args => (a, round_key); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let round_key = round_key.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("aesdeclast xmm0, xmm1".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: round_key,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.sha256rnds2" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256rnds2_epu32&ig_expand=5977
+            intrinsic_args!(fx, args => (a, b, k); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let b = b.load_scalar(fx);
+            let k = k.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("sha256rnds2 xmm1, xmm2".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm2)),
+                        value: b,
+                    },
+                    // Implicit argument to the sha256rnds2 instruction
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        value: k,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.sha256msg1" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256msg1_epu32&ig_expand=5975
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let b = b.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("sha256msg1 xmm1, xmm2".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm2)),
+                        value: b,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.sha256msg2" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256msg2_epu32&ig_expand=5976
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let b = b.load_scalar(fx);
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String("sha256msg2 xmm1, xmm2".to_string())],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm2)),
+                        value: b,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
         }

         "llvm.x86.avx.ptestz.256" => {
diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs
index ea137c4ca1e8c..0bd211fd614f0 100644
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs
+++ b/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs
@@ -282,11 +282,11 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>(
             fx.tcx.sess.span_fatal(span, "Index argument for `simd_insert` is not a constant");
         };

-        let idx = idx_const
-            .try_to_bits(Size::from_bytes(4 /* u32*/))
-            .unwrap_or_else(|| panic!("kind not scalar: {:?}", idx_const));
+        let idx: u32 = idx_const
+            .try_to_u32()
+            .unwrap_or_else(|_| panic!("kind not scalar: {:?}", idx_const));
         let (lane_count, _lane_ty) = base.layout().ty.simd_size_and_type(fx.tcx);
-        if idx >= lane_count.into() {
+        if u64::from(idx) >= lane_count {
             fx.tcx.sess.span_fatal(
                 fx.mir.span,
                 format!("[simd_insert] idx {} >= lane_count {}", idx, lane_count),
@@ -331,10 +331,10 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>(
         };

         let idx = idx_const
-            .try_to_bits(Size::from_bytes(4 /* u32*/))
-            .unwrap_or_else(|| panic!("kind not scalar: {:?}", idx_const));
+            .try_to_u32()
+            .unwrap_or_else(|_| panic!("kind not scalar: {:?}", idx_const));
         let (lane_count, _lane_ty) = v.layout().ty.simd_size_and_type(fx.tcx);
-        if idx >= lane_count.into() {
+        if u64::from(idx) >= lane_count {
             fx.tcx.sess.span_fatal(
                 fx.mir.span,
                 format!("[simd_extract] idx {} >= lane_count {}", idx, lane_count),
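For context on the constant-immediate requirement introduced above: `llvm.x86.pclmulqdq` and `llvm.x86.aesni.aeskeygenassist` are now lowered through `codegen_inline_asm_inner`, which formats the immediate directly into the instruction text, so the backend must see it as a compile-time constant (hence the `mir_operand_get_const_val` lookup and the `span_fatal` fallback). On the user side the const-generic `core::arch` intrinsics already guarantee this. A minimal sketch of user code that exercises the new `pclmulqdq` lowering — illustrative only, not part of the patch; it uses only the standard `std::arch::x86_64` intrinsics:

```rust
// Carry-less multiplication via the intrinsic that cg_clif now lowers
// to a single `pclmulqdq` instruction through codegen_inline_asm_inner.
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;

    // The IMM8 const generic guarantees the immediate reaches the backend
    // as a compile-time constant; IMM8 = 0 multiplies the low 64-bit lanes
    // of both operands.
    #[target_feature(enable = "pclmulqdq")]
    unsafe fn clmul_low(a: __m128i, b: __m128i) -> i64 {
        let product = _mm_clmulepi64_si128::<0>(a, b);
        _mm_cvtsi128_si64(product) // low 64 bits of the 128-bit product
    }

    if is_x86_feature_detected!("pclmulqdq") {
        unsafe {
            let a = _mm_set_epi64x(0, 0b1011); // polynomial x^3 + x + 1
            let b = _mm_set_epi64x(0, 0b0110); // polynomial x^2 + x
            // Product over GF(2): x^5 + x^4 + x^3 + x = 0b111010
            assert_eq!(clmul_low(a, b), 0b11_1010);
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```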