From f4a369e77d73a3555d8955851aa811bff4afb88b Mon Sep 17 00:00:00 2001
From: Ian McIntyre <ianpmcintyre@gmail.com>
Date: Fri, 29 Dec 2023 17:01:55 -0500
Subject: [PATCH] Refactor LPSPI with word packing, futures

Instead of using LPSPI continuous transactions, we pack the user's data
into u32 words. The primitives for translating the user's data to / from
the data FIFOs should help us as we consider async LPSPI drivers. And,
in order to implement embedded-hal 1.0 traits, we'll need something like
the dummy transmit / receive helpers, since we need to handle differing
transmit / receive buffer sizes. The included unit tests don't trigger
an error in Miri, and they try to simulate how we'd use the primitives
in firmware. (This is an "it's not obviously wrong" test, not an "it's
correct" test; help me review here.)

The commit introduces spinning futures into the LPSPI driver. By
combining and spinning on these futures, we can realize in-place
transfers, read-only transactions, write-only transactions, etc. These
implementations flush the FIFOs, allowing users to synchronize external
components with LPSPI I/O.

We no longer return the Busy error; we'll wait for transmit FIFO space.
We also never return the NoData error, instead returning success when
there's no I/O to do. Since this commit is a non-breaking change, the
two errors are still available in the error enum. I'll remove them
later.

I'm moving the blocking SPI example into RTIC and rewriting the driver
test. The tests demonstrate overlapping writes, writes with flushes, and
in-place transfers with a physical loopback. There are also tests that
show how word sizes and bit orders interact. I'd appreciate it if folks
could test these changes in their systems, since they affect how the
embedded-hal implementations behave. I'm only testing this commit on a
1170EVK with the new example.
---
 .github/workflows/rust.yml    |   8 +-
 CHANGELOG.md                  |   8 +
 Cargo.toml                    |   7 +-
 examples/hal_spi.rs           |  92 -----
 examples/rtic_spi_blocking.rs | 168 +++++++++
 src/common/lpspi.rs           | 646 +++++++++++++++++++++++++++++-----
 src/lib.rs                    |  15 +
 7 files changed, 750 insertions(+), 194 deletions(-)
 delete mode 100644 examples/hal_spi.rs
 create mode 100644 examples/rtic_spi_blocking.rs

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 00ccaf27..bafc1f1a 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -87,10 +87,10 @@ jobs:
         - --examples --features=board/imxrt1010evk,board/lcd1602
         - --examples --features=board/imxrt1060evk,board/lcd1602
         # SPI examples (might break other examples)
-        - --example=hal_spi --example=rtic_spi --example=async_dma_spi --features=board/teensy4,board/spi
-        - --example=hal_spi --example=rtic_spi --example=async_dma_spi --features=board/imxrt1010evk,board/spi
-        - --example=hal_spi --example=rtic_spi --example=async_dma_spi --features=board/imxrt1060evk,board/spi
-        - --example=hal_spi --example=rtic_spi --example=async_dma_spi --features=board/imxrt1170evk-cm7,board/spi
+        - --example=rtic_spi_blocking --example=rtic_spi --example=async_dma_spi --features=board/teensy4,board/spi
+        - --example=rtic_spi_blocking --example=rtic_spi --example=async_dma_spi --features=board/imxrt1010evk,board/spi
+        - --example=rtic_spi_blocking --example=rtic_spi --example=async_dma_spi --features=board/imxrt1060evk,board/spi
+        - --example=rtic_spi_blocking --example=rtic_spi --example=async_dma_spi --features=board/imxrt1170evk-cm7,board/spi
         # The i.MX RT 1170 EVK (CM7) target is WIP. The list below describes the working examples.
         - --features=board/imxrt1170evk-cm7,board/lcd1602 --example=hal_led
           --example=hal_gpio_input --example=rtic_gpio_input
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e1faed3c..99cce155 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,14 @@ Introduce LPSPI improvements:
 - Allow users to change the watermark while enabled. Deprecate the corresponding
   method on the `Disabled` helper.
 
+Change how the LPSPI driver manages the FIFOs. As a result of this change, the
+driver never returns the `Busy` or `NoData` errors through the embedded-hal
+interfaces. Instead of returning `Busy`, the driver blocks until there's space in
+the FIFO. If the caller provides an empty buffer, then the result is OK.
+
+The LPSPI embedded-hal (0.2) implementations will implicitly flush after blocking
+I/O. Users can rely on this behavior to synchronize external components.
+
 ## [0.5.4] 2023-11-26
 
 Add CCM APIs for configuring FlexIO clocks on 1000 targets.
diff --git a/Cargo.toml b/Cargo.toml
index 77c9dd02..a841283f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -32,6 +32,11 @@ default-features = false
 [dependencies.nb]
 version = "1"
 
+[dependencies.futures]
+version = "0.3.30"
+default-features = false
+features = ["async-await"]
+
 [dependencies.eh02]
 package = "embedded-hal"
 version = "0.2"
@@ -155,7 +160,7 @@ name = "async_dma_spi"
 required-features = ["board/spi"]
 
 [[example]]
-name = "hal_spi"
+name = "rtic_spi_blocking"
 required-features = ["board/spi"]
 
 [[example]]
diff --git a/examples/hal_spi.rs b/examples/hal_spi.rs
deleted file mode 100644
index ff4977e8..00000000
--- a/examples/hal_spi.rs
+++ /dev/null
@@ -1,92 +0,0 @@
-//! Demonstrates a blocking SPI peripheral.
-//!
-//! Connect your SDI and SDO pins together, then run this example.
-//! The example prints success / errors to the board's serial console.
-//! You should see a 1MHz SPI clock, and that the elements of a write /
-//! transfer operation occur within a single low PCS.
-
-#![no_main]
-#![no_std]
-
-use imxrt_hal as hal;
-
-use eh02::{
-    blocking::serial::Write as _,
-    blocking::spi::{Transfer, Write},
-};
-use hal::lpspi::LpspiError;
-
-const GPT1_DELAY_MS: u32 = board::GPT1_FREQUENCY / 1_000 * 500;
-const GPT1_OCR: hal::gpt::OutputCompareRegister = hal::gpt::OutputCompareRegister::OCR1;
-
-/// Change me to experiment with different word sizes.
-/// Valid types: u8, u16, u32.
-type Elem = u8;
-
-fn write_error<T>(console: &mut board::Console, result: Result<T, LpspiError>) {
-    use hal::lpspi::Direction;
-    match result {
-        Err(LpspiError::Busy) => {
-            console.bwrite_all(b"Error: BUSY\r\n").ok();
-        }
-        Err(LpspiError::Fifo(Direction::Rx)) => {
-            console.bwrite_all(b"Error: RX FIFO\r\n").ok();
-        }
-        Err(LpspiError::Fifo(Direction::Tx)) => {
-            console.bwrite_all(b"Error: TX FIFO\r\n").ok();
-        }
-        Err(LpspiError::NoData) => {
-            console.bwrite_all(b"Error: NO DATA\r\n").ok();
-        }
-        Err(LpspiError::FrameSize) => {
-            console.bwrite_all(b"Error: FRAME SIZE\r\n").ok();
-        }
-        Ok(_) => {}
-    }
-}
-
-#[imxrt_rt::entry]
-fn main() -> ! {
-    let (
-        board::Common { mut gpt1, .. },
-        board::Specifics {
-            mut spi,
-            mut console,
-            ..
-        },
-    ) = board::new();
-
-    gpt1.set_output_compare_count(GPT1_OCR, GPT1_DELAY_MS);
-    gpt1.set_mode(hal::gpt::Mode::Restart);
-    gpt1.enable();
-
-    console.bwrite_all(b"Starting example...\r\n").ok();
-    loop {
-        let data: [Elem; 5] = [0xDE, 0xAD, 0xBE, 0xEF, 0xA5];
-        let mut buffer: [Elem; 5] = data;
-
-        while !gpt1.is_elapsed(GPT1_OCR) {}
-        gpt1.clear_elapsed(GPT1_OCR);
-
-        console.bwrite_all(b"Transfer... ").ok();
-        let result = spi.transfer(&mut buffer);
-        if result.is_err() {
-            write_error(&mut console, result);
-        } else if buffer != data {
-            console.bwrite_all(b"Data mismatch\r\n").ok();
-        } else {
-            console.bwrite_all(b"OK\r\n").ok();
-        }
-
-        while !gpt1.is_elapsed(GPT1_OCR) {}
-        gpt1.clear_elapsed(GPT1_OCR);
-
-        console.bwrite_all(b"Write... ").ok();
-        let result = spi.write(&buffer[..3]);
-        if result.is_err() {
-            write_error(&mut console, result);
-        } else {
-            console.bwrite_all(b"OK\r\n").ok();
-        }
-    }
-}
diff --git a/examples/rtic_spi_blocking.rs b/examples/rtic_spi_blocking.rs
new file mode 100644
index 00000000..b7e8af9e
--- /dev/null
+++ b/examples/rtic_spi_blocking.rs
@@ -0,0 +1,168 @@
+//! Demonstrates a SPI device with blocking I/O.
+//!
+//! Connect SDI to SDO. The example performs blocking writes and
+//! transfers from the idle task. You can observe the
+//! I/O with a scope / logic analyzer. The SPI CLK runs at 1MHz.
+//!
+//! Keep an eye on the defmt log to see if tests fail.
+
+#![no_std]
+#![no_main]
+
+#[rtic::app(device = board, peripherals = false)]
+mod app {
+
+    use imxrt_hal as hal;
+
+    const PIT_DELAY_MS: u32 = board::PIT_FREQUENCY / 1_000 * 250;
+
+    #[local]
+    struct Local {
+        spi: board::Spi,
+        pit: hal::pit::Pit<2>,
+    }
+
+    #[shared]
+    struct Shared {}
+
+    #[init]
+    fn init(_: init::Context) -> (Shared, Local, init::Monotonics) {
+        let (
+            board::Common {
+                pit: (_, _, pit, _),
+                ..
+            },
+            board::Specifics { spi, .. },
+        ) = board::new();
+        (Shared {}, Local { spi, pit }, init::Monotonics())
+    }
+
+    #[idle(local = [spi, pit])]
+    fn idle(cx: idle::Context) -> ! {
+        let idle::LocalResources { spi, pit, .. } = cx.local;
+        pit.set_load_timer_value(PIT_DELAY_MS);
+
+        let mut delay = move || {
+            pit.enable();
+            while !pit.is_elapsed() {}
+            pit.clear_elapsed();
+            pit.disable();
+        };
+
+        loop {
+            for _ in 0..3 {
+                delay();
+            }
+
+            // For studying the effects of bit order and word size.
+            //
+            // If you have a logic analyzer that can change its word
+            // size and bit order, use this sequence to evaluate how
+            // the driver packs your transfer elements.
+            {
+                use eh02::blocking::spi::Write;
+                use hal::lpspi::BitOrder::{self, *};
+
+                const BIT_ORDERS: [BitOrder; 2] = [Msb, Lsb];
+
+                const U32_WORDS: [u32; 2] = [0xDEADBEEFu32, 0xAD1CAC1D];
+                for bit_order in BIT_ORDERS {
+                    spi.set_bit_order(bit_order);
+                    spi.write(&U32_WORDS).unwrap();
+                }
+
+                const U8_WORDS: [u8; 7] = [0xDEu8, 0xAD, 0xBE, 0xEF, 0xA5, 0x00, 0x1D];
+                for bit_order in BIT_ORDERS {
+                    spi.set_bit_order(bit_order);
+                    spi.write(&U8_WORDS).unwrap();
+                }
+
+                const U16_WORDS: [u16; 3] = [0xDEADu16, 0xBEEF, 0xA5A5];
+                for bit_order in BIT_ORDERS {
+                    spi.set_bit_order(bit_order);
+                    spi.write(&U16_WORDS).unwrap();
+                }
+
+                delay();
+            }
+
+            // Change me to explore bit order behaviors in the
+            // remaining write / loopback transfer tests.
+            spi.set_bit_order(hal::lpspi::BitOrder::Msb);
+
+            // Make sure concatenated elements look correct on the wire.
+            {
+                use eh02::blocking::spi::Write;
+
+                spi.write(&[1u8, 2, 3]).unwrap();
+                spi.write(&[1u8, 2, 3, 4]).unwrap();
+                spi.write(&[1u8, 2, 3, 4, 5]).unwrap();
+                spi.write(&[1u8, 2, 3, 4, 5, 6]).unwrap();
+                spi.write(&[1u8, 2, 3, 4, 5, 6, 7]).unwrap();
+
+                spi.write(&[0x0102u16, 0x0304, 0x0506]).unwrap();
+                spi.write(&[0x0102u16, 0x0304, 0x0506, 0x0708]).unwrap();
+                spi.write(&[0x0102u16, 0x0304, 0x0506, 0x0708, 0x090A])
+                    .unwrap();
+
+                spi.write(&[0x01020304u32, 0x05060708, 0x090A0B0C]).unwrap();
+
+                delay();
+            }
+
+            {
+                use eh02::blocking::spi::{Transfer, Write};
+
+                // Change me to test different Elem sizes, buffer sizes,
+                // bit patterns.
+                type Elem = u8;
+                const SENTINEL: Elem = 0x0F;
+                const BUFFER: [Elem; 13] = [SENTINEL; 13];
+
+                // Simple loopback transfer. Easy to find with your
+                // scope.
+                let mut buffer = BUFFER;
+                spi.transfer(&mut buffer).unwrap();
+                if buffer != BUFFER {
+                    defmt::error!("Simple transfer buffer mismatch!");
+                }
+
+                delay();
+
+                // Adjacent loopback transfer. Look for the big
+                // burst of data on your scope.
+                let mut buffer = BUFFER;
+                let mut error = false;
+                for idx in 0u32..16 {
+                    buffer.fill(SENTINEL.rotate_right(idx));
+                    let expected = buffer;
+                    spi.transfer(&mut buffer).unwrap();
+                    error |= buffer != expected;
+                }
+                if error {
+                    defmt::error!("At least one of the bursted transfers didn't match!");
+                }
+
+                delay();
+
+                // Simple write.
+                let buffer = BUFFER;
+                spi.write(&buffer).unwrap();
+
+                delay();
+
+                // Pipelined writes. Look for the burst of data
+                // on your scope. Internally, the writes will flush,
+                // so the delay between writes should be about
+                // the same as it is between the transfers above.
+                let mut buffer = BUFFER;
+                for idx in 0..16 {
+                    buffer.fill(SENTINEL.rotate_right(idx));
+                    spi.write(&buffer).unwrap();
+                }
+
+                delay();
+            }
+        }
+    }
+}
diff --git a/src/common/lpspi.rs b/src/common/lpspi.rs
index c2c03115..900b4c54 100644
--- a/src/common/lpspi.rs
+++ b/src/common/lpspi.rs
@@ -78,6 +78,9 @@
 //! transactions. However, keep in mind that disabling the receiver during a continuous transaction
 //! may not work as expected.
 
+use core::marker::PhantomData;
+use core::task::Poll;
+
 use crate::iomuxc::{consts, lpspi};
 use crate::ral;
 
@@ -599,16 +602,6 @@ impl<P, const N: u8> Lpspi<P, N> {
         }
     }
 
-    /// Check for any receiver errors.
-    fn recv_ok(&self) -> Result<(), LpspiError> {
-        let status = self.status();
-        if status.intersects(Status::RECEIVE_ERROR) {
-            Err(LpspiError::Fifo(Direction::Rx))
-        } else {
-            Ok(())
-        }
-    }
-
     /// Place `word` into the transmit FIFO.
     ///
     /// This will result in the value being sent from the LPSPI.
@@ -618,17 +611,79 @@ impl<P, const N: u8> Lpspi<P, N> {
         ral::write_reg!(ral::lpspi, self.lpspi, TDR, word);
     }
 
-    pub(crate) fn wait_for_transmit_fifo_space(&mut self) -> Result<(), LpspiError> {
-        loop {
+    /// Wait for transmit FIFO space in a (concurrent) spin loop.
+    ///
+    /// This future does not care about the TX FIFO watermark. Instead, it
+    /// checks the FIFO's size with an additional read.
+    pub(crate) async fn spin_for_fifo_space(&self) -> Result<(), LpspiError> {
+        core::future::poll_fn(|_| {
             let status = self.status();
             if status.intersects(Status::TRANSMIT_ERROR) {
-                return Err(LpspiError::Fifo(Direction::Tx));
+                return Poll::Ready(Err(LpspiError::Fifo(Direction::Tx)));
             }
             let fifo_status = self.fifo_status();
             if !fifo_status.is_full(Direction::Tx) {
-                return Ok(());
+                Poll::Ready(Ok(()))
+            } else {
+                Poll::Pending
+            }
+        })
+        .await
+    }
+
+    pub(crate) fn wait_for_transmit_fifo_space(&self) -> Result<(), LpspiError> {
+        crate::spin_on(self.spin_for_fifo_space())
+    }
+
+    /// Wait for receive data in a (concurrent) spin loop.
+    ///
+    /// This future does not care about the RX FIFO watermark. Instead, it
+    /// checks the FIFO's size with an additional read.
+    async fn spin_for_word(&self) -> Result<u32, LpspiError> {
+        core::future::poll_fn(|_| {
+            let status = self.status();
+            if status.intersects(Status::RECEIVE_ERROR) {
+                return Poll::Ready(Err(LpspiError::Fifo(Direction::Rx)));
+            }
+
+            let fifo_status = self.fifo_status();
+            if !fifo_status.is_empty(Direction::Rx) {
+                let data = self.read_data_unchecked();
+                Poll::Ready(Ok(data))
+            } else {
+                Poll::Pending
             }
+        })
+        .await
+    }
+
+    /// Send `len` LPSPI words (u32s) out of the peripheral.
+    ///
+    /// Expected to run in a (concurrent) spin loop, possibly with
+    /// `spin_receive`.
+    async fn spin_transmit(
+        &self,
+        mut data: impl TransmitData,
+        len: usize,
+    ) -> Result<(), LpspiError> {
+        for _ in 0..len {
+            self.spin_for_fifo_space().await?;
+            let word = data.next_word(self.bit_order);
+            self.enqueue_data(word);
         }
+        Ok(())
+    }
+
+    /// Accept `len` LPSPI words (u32s) from the peripheral.
+    ///
+    /// Expected to run in a (concurrent) spin loop, possibly with
+    /// `spin_transmit`.
+    async fn spin_receive(&self, mut data: impl ReceiveData, len: usize) -> Result<(), LpspiError> {
+        for _ in 0..len {
+            let word = self.spin_for_word().await?;
+            data.next_word(word);
+        }
+        Ok(())
     }
 
     /// Set the SPI mode for the peripheral.
@@ -694,107 +749,55 @@ impl<P, const N: u8> Lpspi<P, N> {
         }
     }
 
-    /// Exchanges data with the SPI device.
-    ///
-    /// This routine uses continuous transfers to perform the transaction, no matter the
-    /// primitive type. There's an optimization for &[u32] that we're missing; in this case,
-    /// we don't necessarily need to use continuous transfers. The frame size could be set to
-    /// 8 * buffer.len() * sizeof(u32), and we copy user words into the transmit queue as-is.
-    /// But handling the packing of u8s and u16s into the u32 transmit queue in software is
-    /// extra work, work that's effectively achieved when we use continuous transfers.
-    /// We're guessing that the time to pop a transmit command from the queue is much faster
-    /// than the time taken to pop from the data queue, so the extra queue utilization shouldn't
-    /// matter.
-    fn exchange<W>(&mut self, buffer: &mut [W]) -> Result<(), LpspiError>
-    where
-        W: Word,
-    {
-        if self.status().intersects(Status::BUSY) {
-            return Err(LpspiError::Busy);
-        } else if buffer.is_empty() {
-            return Err(LpspiError::NoData);
+    fn exchange<W: Word>(&mut self, data: &mut [W]) -> Result<(), LpspiError> {
+        if data.is_empty() {
+            return Ok(());
         }
 
-        self.clear_fifos();
-
-        let mut transaction = Transaction::new(8 * core::mem::size_of::<W>() as u16)?;
+        let mut transaction = Transaction::new_words(data)?;
         transaction.bit_order = self.bit_order();
-        transaction.continuous = true;
-
-        let mut tx_idx = 0usize;
-        let mut rx_idx = 0usize;
 
-        // Continue looping while there is either tx OR rx remaining
-        while tx_idx < buffer.len() || rx_idx < buffer.len() {
-            if tx_idx < buffer.len() {
-                let word = buffer[tx_idx];
+        self.wait_for_transmit_fifo_space()?;
+        self.enqueue_transaction(&transaction);
 
-                // Turn off TCR CONT on last tx as a workaround so that the final
-                // falling edge comes through:
-                // https://community.nxp.com/t5/i-MX-RT/RT1050-LPSPI-last-bit-not-completing-in-continuous-mode/m-p/898460
-                if tx_idx + 1 == buffer.len() {
-                    transaction.continuous = false;
-                }
+        let word_count = word_count(data);
+        let (tx, rx) = transfer_in_place(data);
 
-                self.wait_for_transmit_fifo_space()?;
-                self.enqueue_transaction(&transaction);
+        crate::spin_on(futures::future::try_join(
+            self.spin_transmit(tx, word_count),
+            self.spin_receive(rx, word_count),
+        ))
+        .map_err(|err| {
+            self.recover_from_error();
+            err
+        })?;
 
-                self.wait_for_transmit_fifo_space()?;
-                self.enqueue_data(word.into());
-                transaction.continuing = true;
-                tx_idx += 1;
-            }
-
-            if rx_idx < buffer.len() {
-                self.recv_ok()?;
-                if let Some(word) = self.read_data() {
-                    buffer[rx_idx] = word.try_into().unwrap_or(W::MAX);
-                    rx_idx += 1;
-                }
-            }
-        }
+        self.flush()?;
 
         Ok(())
     }
 
-    /// Write data to the transmit queue without subsequently reading
-    /// the receive queue.
-    ///
-    /// Use this method when you know that the receiver queue is disabled
-    /// (RXMASK high in TCR).
-    ///
-    /// Similar to `exchange`, this is using continuous transfers for all supported primitives.
-    fn write_no_read<W>(&mut self, buffer: &[W]) -> Result<(), LpspiError>
-    where
-        W: Word,
-    {
-        if self.status().intersects(Status::BUSY) {
-            return Err(LpspiError::Busy);
-        } else if buffer.is_empty() {
-            return Err(LpspiError::NoData);
+    fn write_no_read<W: Word>(&mut self, data: &[W]) -> Result<(), LpspiError> {
+        if data.is_empty() {
+            return Ok(());
         }
 
-        self.clear_fifos();
-
-        let mut transaction = Transaction::new(8 * core::mem::size_of::<W>() as u16)?;
-        transaction.bit_order = self.bit_order();
-        transaction.continuous = true;
+        let mut transaction = Transaction::new_words(data)?;
         transaction.receive_data_mask = true;
+        transaction.bit_order = self.bit_order();
 
-        for word in buffer {
-            self.wait_for_transmit_fifo_space()?;
-            self.enqueue_transaction(&transaction);
+        self.wait_for_transmit_fifo_space()?;
+        self.enqueue_transaction(&transaction);
 
-            self.wait_for_transmit_fifo_space()?;
-            self.enqueue_data((*word).into());
-            transaction.continuing = true;
-        }
+        let word_count = word_count(data);
+        let tx = TransmitBuffer::new(data);
 
-        transaction.continuing = false;
-        transaction.continuous = false;
+        crate::spin_on(self.spin_transmit(tx, word_count)).map_err(|err| {
+            self.recover_from_error();
+            err
+        })?;
 
-        self.wait_for_transmit_fifo_space()?;
-        self.enqueue_transaction(&transaction);
+        self.flush()?;
 
         Ok(())
     }
@@ -913,6 +916,15 @@ impl<P, const N: u8> Lpspi<P, N> {
     pub fn set_watermark(&mut self, direction: Direction, watermark: u8) -> u8 {
         set_watermark(&self.lpspi, direction, watermark)
     }
+
+    /// Recover from a transaction error.
+    fn recover_from_error(&mut self) {
+        // Resets the peripheral and flushes whatever is in the FIFOs.
+        self.soft_reset();
+
+        // Reset the status flags, clearing the error condition for the next use.
+        self.clear_status(Status::TRANSMIT_ERROR | Status::RECEIVE_ERROR);
+    }
 }
 
 bitflags::bitflags! {
@@ -1199,22 +1211,462 @@ impl<P, const N: u8> eh02::blocking::spi::Write<u32> for Lpspi<P, N> {
 /// Describes SPI words that can participate in transactions.
 trait Word: Copy + Into<u32> + TryFrom<u32> {
     const MAX: Self;
+    const ZERO: Self;
+
+    /// Repeatedly call `provider` to produce elements of this type,
+    /// then pack those elements into one LPSPI word.
+    fn pack_word(bit_order: BitOrder, provider: impl FnMut() -> Option<Self>) -> u32;
+
+    /// Given a word, deconstruct the word and call the
+    /// `sink` with those components.
+    fn unpack_word(word: u32, sink: impl FnMut(Self));
 }
 
 impl Word for u8 {
     const MAX: u8 = u8::MAX;
+    const ZERO: u8 = 0;
+    fn pack_word(bit_order: BitOrder, mut provider: impl FnMut() -> Option<Self>) -> u32 {
+        let mut word = 0;
+        match bit_order {
+            BitOrder::Msb => {
+                for _ in 0..4 {
+                    if let Some(byte) = provider() {
+                        word <<= 8;
+                        word |= u32::from(byte);
+                    }
+                }
+            }
+            BitOrder::Lsb => {
+                for offset in 0..4 {
+                    if let Some(byte) = provider() {
+                        word |= u32::from(byte) << (8 * offset);
+                    }
+                }
+            }
+        }
+
+        word
+    }
+    fn unpack_word(word: u32, mut sink: impl FnMut(Self)) {
+        for offset in [0, 8, 16, 24] {
+            sink((word >> offset) as u8);
+        }
+    }
 }
 
 impl Word for u16 {
     const MAX: u16 = u16::MAX;
+    const ZERO: u16 = 0;
+    fn pack_word(bit_order: BitOrder, mut provider: impl FnMut() -> Option<Self>) -> u32 {
+        let mut word = 0;
+        match bit_order {
+            BitOrder::Msb => {
+                for _ in 0..2 {
+                    if let Some(half) = provider() {
+                        word <<= 16;
+                        word |= u32::from(half);
+                    }
+                }
+            }
+            BitOrder::Lsb => {
+                for offset in 0..2 {
+                    if let Some(half) = provider() {
+                        word |= u32::from(half) << (16 * offset);
+                    }
+                }
+            }
+        }
+
+        word
+    }
+    fn unpack_word(word: u32, mut sink: impl FnMut(Self)) {
+        for offset in [0, 16] {
+            sink((word >> offset) as u16);
+        }
+    }
 }
 
 impl Word for u32 {
     const MAX: u32 = u32::MAX;
+    const ZERO: u32 = 0;
+    fn pack_word(_: BitOrder, mut provider: impl FnMut() -> Option<Self>) -> u32 {
+        provider().unwrap_or(0)
+    }
+    fn unpack_word(word: u32, mut sink: impl FnMut(Self)) {
+        sink(word)
+    }
+}
+
+/// Generalizes how we prepare LPSPI words for transmit.
+trait TransmitData {
+    /// Get the next word for the transmit FIFO.
+    ///
+    /// If you're out of words, return 0.
+    fn next_word(&mut self, bit_order: BitOrder) -> u32;
+}
+
+/// Generalizes how we save LPSPI data into memory.
+trait ReceiveData {
+    /// Invoked each time we read data from the queue.
+    fn next_word(&mut self, word: u32);
+}
+
+/// Transmit data from a buffer.
+struct TransmitBuffer<'a, W> {
+    /// The read position.
+    ptr: *const W,
+    /// One past the end of the buffer.
+    end: *const W,
+    _buffer: PhantomData<&'a [W]>,
+}
+
+impl<'a, W> TransmitBuffer<'a, W>
+where
+    W: Word,
+{
+    fn new(buffer: &'a [W]) -> Self {
+        // Safety: pointer offset math meets expectations.
+        unsafe { Self::from_raw(buffer.as_ptr(), buffer.len()) }
+    }
+
+    /// # Safety
+    ///
+    /// `ptr + len` must be in bounds, or one past the end of the
+    /// allocation.
+    unsafe fn from_raw(ptr: *const W, len: usize) -> Self {
+        Self {
+            ptr,
+            end: unsafe { ptr.add(len) },
+            _buffer: PhantomData,
+        }
+    }
+
+    /// Read the next element from the buffer.
+    fn next_read(&mut self) -> Option<W> {
+        // Safety: read the next word only if we're in bounds.
+        unsafe {
+            (self.ptr != self.end).then(|| {
+                let word = self.ptr.read();
+                self.ptr = self.ptr.add(1);
+                word
+            })
+        }
+    }
+}
+
+impl<W> TransmitData for TransmitBuffer<'_, W>
+where
+    W: Word,
+{
+    fn next_word(&mut self, bit_order: BitOrder) -> u32 {
+        W::pack_word(bit_order, || self.next_read())
+    }
+}
+
+/// Transmits dummy values.
+struct TransmitDummies;
+
+impl TransmitData for TransmitDummies {
+    fn next_word(&mut self, _: BitOrder) -> u32 {
+        u32::MAX
+    }
+}
+
+/// Receive data into a buffer.
+struct ReceiveBuffer<'a, W> {
+    /// The write position.
+    ptr: *mut W,
+    /// One past the end of the buffer.
+    end: *const W,
+    _buffer: PhantomData<&'a [W]>,
 }
 
+impl<'a, W> ReceiveBuffer<'a, W>
+where
+    W: Word,
+{
+    #[cfg(test)] // TODO(mciantyre) remove once needed in non-test code.
+    fn new(buffer: &'a mut [W]) -> Self {
+        // Safety: pointer offset math meets expectations.
+        unsafe { Self::from_raw(buffer.as_mut_ptr(), buffer.len()) }
+    }
+
+    /// # Safety
+    ///
+    /// `ptr + len` must be in bounds, or one past the end of the
+    /// allocation.
+    unsafe fn from_raw(ptr: *mut W, len: usize) -> Self {
+        Self {
+            ptr,
+            end: unsafe { ptr.cast_const().add(len) },
+            _buffer: PhantomData,
+        }
+    }
+
+    /// Put the next element into the buffer.
+    fn next_write(&mut self, elem: W) {
+        // Safety: write the next word only if we're in bounds.
+        // Words are primitive types; we don't need to execute
+        // a drop when we overwrite a value in memory.
+        unsafe {
+            if self.ptr.cast_const() != self.end {
+                self.ptr.write(elem);
+                self.ptr = self.ptr.add(1);
+            }
+        }
+    }
+}
+
+impl<W> ReceiveData for ReceiveBuffer<'_, W>
+where
+    W: Word,
+{
+    fn next_word(&mut self, word: u32) {
+        W::unpack_word(word, |elem| self.next_write(elem));
+    }
+}
+
+/// Receive dummy data.
+struct ReceiveDummies;
+
+impl ReceiveData for ReceiveDummies {
+    fn next_word(&mut self, _: u32) {}
+}
+
+/// Computes how many `W`s fit inside a LPSPI word.
+const fn per_word<W: Word>() -> usize {
+    core::mem::size_of::<u32>() / core::mem::size_of::<W>()
+}
+
+/// Computes how many u32 words we need to transact this buffer.
+const fn word_count<W: Word>(words: &[W]) -> usize {
+    (words.len() + per_word::<W>() - 1) / per_word::<W>()
+}
+
+/// Creates the transmit and receive buffer objects for an
+/// in-place transfer.
+fn transfer_in_place<W: Word>(buffer: &mut [W]) -> (TransmitBuffer<'_, W>, ReceiveBuffer<'_, W>) {
+    // Safety: pointer math meets expectation. This produces
+    // a mutable and immutable pointer to the same mutable buffer.
+    // Module inspection shows that these pointers never become
+    // references. We maintain the lifetime across both objects,
+    // so the buffer isn't dropped.
+    unsafe {
+        let len = buffer.len();
+        let ptr = buffer.as_mut_ptr();
+        (
+            TransmitBuffer::from_raw(ptr, len),
+            ReceiveBuffer::from_raw(ptr, len),
+        )
+    }
+}
+
+/// Tests try to approximate the way we'll use TransmitBuffer and ReceiveBuffer
+/// in firmware. Consider running these with miri to evaluate unsafe usages.
 #[cfg(test)]
 mod tests {
+    #[test]
+    fn transfer_in_place_interleaved_read_write_u32() {
+        const BUFFER: [u32; 9] = [42u32, 43, 44, 45, 46, 47, 48, 49, 50];
+        let mut buffer = BUFFER;
+        let (mut tx, mut rx) = super::transfer_in_place(&mut buffer);
+
+        for elem in BUFFER {
+            assert_eq!(elem, tx.next_read().unwrap());
+            rx.next_write(elem + 1);
+        }
+
+        assert_eq!(buffer, [43, 44, 45, 46, 47, 48, 49, 50, 51]);
+    }
+
+    #[test]
+    fn transfer_in_place_interleaved_write_read_u32() {
+        const BUFFER: [u32; 9] = [42u32, 43, 44, 45, 46, 47, 48, 49, 50];
+        let mut buffer = BUFFER; // Working copy; BUFFER stays as the reference.
+        let (mut tx, mut rx) = super::transfer_in_place(&mut buffer);
+
+        for elem in BUFFER {
+            rx.next_write(elem + 1); // Write first...
+            assert_eq!(elem + 1, tx.next_read().unwrap()); // ...so the read sees it.
+        }
+
+        assert_eq!(buffer, [43, 44, 45, 46, 47, 48, 49, 50, 51]);
+    }
+
+    #[test]
+    fn transfer_in_place_bulk_read_write_u32() {
+        const BUFFER: [u32; 9] = [42u32, 43, 44, 45, 46, 47, 48, 49, 50];
+        let mut buffer = BUFFER; // Working copy; BUFFER stays as the reference.
+        let (mut tx, mut rx) = super::transfer_in_place(&mut buffer);
+
+        for elem in BUFFER {
+            assert_eq!(elem, tx.next_read().unwrap()); // Perform every read first.
+        }
+        for elem in BUFFER {
+            rx.next_write(elem + 1); // Then perform every write.
+        }
+
+        assert_eq!(buffer, [43, 44, 45, 46, 47, 48, 49, 50, 51]);
+    }
+
+    #[test]
+    fn transfer_in_place_bulk_write_read_u32() {
+        const BUFFER: [u32; 9] = [42u32, 43, 44, 45, 46, 47, 48, 49, 50];
+        let mut buffer = BUFFER; // Working copy; BUFFER stays as the reference.
+        let (mut tx, mut rx) = super::transfer_in_place(&mut buffer);
+
+        for elem in BUFFER {
+            rx.next_write(elem + 1); // Perform every write first.
+        }
+        for elem in BUFFER {
+            assert_eq!(elem + 1, tx.next_read().unwrap()); // Reads see the written values.
+        }
+
+        assert_eq!(buffer, [43, 44, 45, 46, 47, 48, 49, 50, 51]);
+    }
+
+    #[test]
+    fn transmit_buffer() {
+        use super::{BitOrder::*, TransmitBuffer, TransmitData};
+
+        //
+        // u32
+        //
+        // This is the easiest to understand w.r.t. the bit order, since this is the natural word
+        // size of the peripheral. No matter the bit order, we produce the same word for the TX
+        // FIFO. The hardware handles the MSB or LSB transform.
+
+        let mut tx = TransmitBuffer::new(&[0xDEADBEEFu32, 0xAD1CAC1D]);
+        assert_eq!(tx.next_word(Msb), 0xDEADBEEF);
+        assert_eq!(tx.next_word(Msb), 0xAD1CAC1D);
+        assert_eq!(tx.next_word(Msb), 0); // Buffer exhausted: expect zero.
+
+        let mut tx = TransmitBuffer::new(&[0xDEADBEEFu32, 0xAD1CAC1D]);
+        assert_eq!(tx.next_word(Lsb), 0xDEADBEEF);
+        assert_eq!(tx.next_word(Lsb), 0xAD1CAC1D);
+        assert_eq!(tx.next_word(Lsb), 0); // Buffer exhausted: expect zero.
+
+        //
+        // u8
+        //
+        // If the user prefers u8 words, then we should pack the bytes into a u32 such that the
+        // hardware's MSB/LSB transform maintains the (literal) byte order.
+
+        let mut tx = TransmitBuffer::new(&[0xDEu8, 0xAD, 0xBE, 0xEF, 0xA5, 0x00, 0x1D]);
+        assert_eq!(tx.next_word(Msb), 0xDEADBEEF);
+        assert_eq!(tx.next_word(Msb), 0x00A5001D); // Partial word: 3 bytes, top byte zero.
+        assert_eq!(tx.next_word(Msb), 0);
+        assert_eq!(tx.next_word(Msb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEu8, 0xAD, 0xBE, 0xEF, 0xA5, 0x00, 0x1D]);
+        assert_eq!(tx.next_word(Lsb), 0xEFBEADDE);
+        assert_eq!(tx.next_word(Lsb), 0x001D00A5); // Partial word: 3 bytes, top byte zero.
+        assert_eq!(tx.next_word(Lsb), 0);
+        assert_eq!(tx.next_word(Lsb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEu8, 0xAD, 0xBE, 0xEF]);
+        assert_eq!(tx.next_word(Msb), 0xDEADBEEF);
+        assert_eq!(tx.next_word(Msb), 0);
+        assert_eq!(tx.next_word(Msb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEu8, 0xAD, 0xBE, 0xEF]);
+        assert_eq!(tx.next_word(Lsb), 0xEFBEADDE);
+        assert_eq!(tx.next_word(Lsb), 0);
+        assert_eq!(tx.next_word(Lsb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEu8, 0xAD, 0xBE]);
+        assert_eq!(tx.next_word(Msb), 0x00DEADBE);
+        assert_eq!(tx.next_word(Msb), 0);
+        assert_eq!(tx.next_word(Msb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEu8, 0xAD, 0xBE]);
+        assert_eq!(tx.next_word(Lsb), 0x00BEADDE);
+        assert_eq!(tx.next_word(Lsb), 0);
+        assert_eq!(tx.next_word(Lsb), 0);
+
+        //
+        // u16
+        //
+        // Same goes here: we should combine u16s such that the hardware transfers elements
+        // in order while applying the MSB/LSB transform on each u16.
+
+        let mut tx = TransmitBuffer::new(&[0xDEADu16, 0xBEEF, 0xA5A5]);
+        assert_eq!(tx.next_word(Msb), 0xDEADBEEF);
+        assert_eq!(tx.next_word(Msb), 0x0000A5A5); // Partial word: one u16, top half zero.
+        assert_eq!(tx.next_word(Msb), 0);
+        assert_eq!(tx.next_word(Msb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEADu16, 0xBEEF, 0xA5A5]);
+        assert_eq!(tx.next_word(Lsb), 0xBEEFDEAD);
+        assert_eq!(tx.next_word(Lsb), 0x0000A5A5); // Partial word: one u16, top half zero.
+        assert_eq!(tx.next_word(Lsb), 0);
+        assert_eq!(tx.next_word(Lsb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEADu16, 0xBEEF]);
+        assert_eq!(tx.next_word(Msb), 0xDEADBEEF);
+        assert_eq!(tx.next_word(Msb), 0);
+        assert_eq!(tx.next_word(Msb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEADu16, 0xBEEF]);
+        assert_eq!(tx.next_word(Lsb), 0xBEEFDEAD);
+        assert_eq!(tx.next_word(Lsb), 0);
+        assert_eq!(tx.next_word(Lsb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEADu16]);
+        assert_eq!(tx.next_word(Msb), 0x0000DEAD);
+        assert_eq!(tx.next_word(Msb), 0);
+        assert_eq!(tx.next_word(Msb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEADu16]);
+        assert_eq!(tx.next_word(Lsb), 0x0000DEAD);
+        assert_eq!(tx.next_word(Lsb), 0);
+        assert_eq!(tx.next_word(Lsb), 0);
+    }
+
+    #[test]
+    fn receive_buffer() {
+        use super::{ReceiveBuffer, ReceiveData};
+
+        //
+        // u8
+        //
+
+        let mut buffer = [0u8; 9];
+        let mut rx = ReceiveBuffer::new(&mut buffer);
+        rx.next_word(0xDEADBEEF);
+        rx.next_word(0xAD1CAC1D);
+        rx.next_word(0x04030201); // One slot left: only 0x01 is expected to land.
+        rx.next_word(0x55555555); // Buffer is full: expected to be ignored.
+        assert_eq!(
+            buffer,
+            [0xEF, 0xBE, 0xAD, 0xDE, 0x1D, 0xAC, 0x1C, 0xAD, 0x01]
+        );
+
+        //
+        // u16
+        //
+
+        let mut buffer = [0u16; 5];
+        let mut rx = ReceiveBuffer::new(&mut buffer);
+        rx.next_word(0xDEADBEEF);
+        rx.next_word(0xAD1CAC1D);
+        rx.next_word(0x04030201); // One slot left: only 0x0201 is expected to land.
+        rx.next_word(0x55555555); // Buffer is full: expected to be ignored.
+        assert_eq!(buffer, [0xBEEF, 0xDEAD, 0xAC1D, 0xAD1C, 0x0201]);
+
+        //
+        // u32
+        //
+
+        let mut buffer = [0u32; 3];
+        let mut rx = ReceiveBuffer::new(&mut buffer);
+        rx.next_word(0xDEADBEEF);
+        rx.next_word(0xAD1CAC1D);
+        rx.next_word(0x77777777);
+        rx.next_word(0x55555555); // Buffer is full: expected to be ignored.
+        assert_eq!(buffer, [0xDEADBEEF, 0xAD1CAC1D, 0x77777777]);
+    }
+
     #[test]
     fn transaction_frame_sizes() {
         assert!(super::Transaction::new_words(&[1u8]).is_ok());
diff --git a/src/lib.rs b/src/lib.rs
index d0993d63..e9caa0ea 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -295,3 +295,18 @@ pub mod iomuxc {
 
 #[cfg_attr(family = "none", allow(unused_imports))] // Nothing to export in this build.
 pub use crate::chip::reexports::*;
+
+/// Polls `future` in a busy loop with a no-op waker until it's ready, then returns its output.
+fn spin_on<F: core::future::Future>(future: F) -> F::Output {
+    use core::task::{Context, Poll};
+
+    let waker = futures::task::noop_waker(); // Wake-ups are unused; the loop re-polls anyway.
+    let mut context = Context::from_waker(&waker);
+    let mut future = core::pin::pin!(future); // Pin the future locally so it can be polled.
+
+    loop {
+        if let Poll::Ready(result) = future.as_mut().poll(&mut context) {
+            return result;
+        }
+    }
+}