From 013e49d8599e5ba32c82d55941cb08232c879d79 Mon Sep 17 00:00:00 2001 From: Matt Edholm Date: Thu, 14 May 2026 11:50:36 -0400 Subject: [PATCH] fix(13e6): partition + SPI corruption + bootstrap stay-awake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three problems surfaced during the first 13.3" end-to-end run: 1) LittleFS IntegerDivideByZero on 200 → write /img.bin. Cause: the ~3.5 MB SPIFFS in default_16MB.csv can't fit three 960 KB setup screens + a 960 KB cached image (~3.84 MB). Switching to a custom partitions_13e6.csv with 24 MB LittleFS on the 32 MB flash. 2) Yellow wash across the panel on long SPI bursts. Cause: SPI DMA from a PSRAM-backed scratch buffer hits a cache-coherency window — the CPU's writes hadn't reached PSRAM yet when DMA read it. Push each half in 8 KB chunks through an internal-SRAM (DMA-coherent) scratch, and drop the bus clock to 4 MHz to match the 7.3" production speed. 3) Bootstrap window (no image yet) was deep-sleeping for 15 s between polls — each cycle a ~5 s ROM-boot + Wi-Fi reconnect, so the user waited ~20 s × N retries between scanning the setup QR and seeing their first photo land. Now normal_operation_impl returns early during bootstrap and main.cpp's normal_operation loops with a 2 s delay, keeping Wi-Fi up. Once the first image arrives, the normal scheduled deep sleep takes over. Also fixes a related bug Matt called out: a transient TLS hiccup during bootstrap was hitting the 5xx fallback path and painting a full yellow fill over the green setup QR, leaving the user with no claim path. Criterion is now "does /img.bin exist?" (panel has something worth showing with a border) rather than "is currentImgId set?", so a fresh device with no cached image preserves the setup screen through transient network errors. Diagnostic prints in the panel driver + [op] start/code lines in normal_operation_impl that proved invaluable during bringup; leaving them in for now. 
Tests updated for the new bootstrap semantics (deep sleep no longer arms on bootstrap-cycle 204/404/5xx); 43/43 native tests pass, 7.3" production build stays byte-identical. Co-Authored-By: Claude Opus 4.7 (1M context) --- partitions_13e6.csv | 17 ++++++++ platformio.ini | 10 +++-- src/config.h | 7 ++++ src/main.cpp | 15 +++++-- src/operation.h | 43 ++++++++++++-------- src/panels/waveshare13e6/v1/epd_driver.cpp | 11 +++++- test/test_normal_operation/test_main.cpp | 46 ++++++++++++++-------- 7 files changed, 106 insertions(+), 43 deletions(-) create mode 100644 partitions_13e6.csv diff --git a/partitions_13e6.csv b/partitions_13e6.csv new file mode 100644 index 0000000..fa9304b --- /dev/null +++ b/partitions_13e6.csv @@ -0,0 +1,17 @@ +# Custom partition layout for the Waveshare ESP32-S3-ePaper-13.3E6 (32 MB OPI flash). +# +# default_16MB.csv ships ~3.5 MB to SPIFFS — too small once we add a +# panel-native /img.bin (960 KB) on top of the three setup-screen .bin +# files (also 960 KB each). LittleFS panics in lfs_ctz_traverse / lfs.c +# with an IntegerDivideByZero when it runs out of blocks mid-write. +# +# This layout: 4 MB app0 + 4 MB app1 (keep OTA option open) + 24 MB +# LittleFS + 64 KB coredump on 32 MB flash. Plenty of headroom. +# +# Name, Type, SubType, Offset, Size, Flags +nvs, data, nvs, 0x9000, 0x5000, +otadata, data, ota, 0xe000, 0x2000, +app0, app, ota_0, 0x10000, 0x400000, +app1, app, ota_1, 0x410000, 0x400000, +spiffs, data, spiffs, 0x810000, 0x17E0000, +coredump, data, coredump, 0x1FF0000,0x10000, diff --git a/platformio.ini b/platformio.ini index 85a6c04..1ee51cc 100644 --- a/platformio.ini +++ b/platformio.ini @@ -101,10 +101,12 @@ board_build.flash_mode = opi board_upload.flash_size = 32MB board_build.arduino.memory_type = opi_opi board_build.filesystem = littlefs -; Default partition table reserves ~1.5MB for SPIFFS — not enough for three -; 960 KB setup-screen .bin files (2.88 MB minimum + LittleFS metadata). 
-; 16MB preset gives ~3.5 MB to the filesystem, tight but works. -board_build.partitions = default_16MB.csv +; Custom partition layout: 4 MB app slots + 24 MB LittleFS on 32 MB flash. +; The 16 MB preset's ~3.5 MB SPIFFS is too tight — three 960 KB setup +; screens + a 960 KB cached /img.bin overflow it, and LittleFS panics +; in lfs_ctz_traverse (lfs.c:2988) with an IntegerDivideByZero +; mid-write rather than returning a clean "no space" error. +board_build.partitions = partitions_13e6.csv extra_scripts = pre:scripts/data_dir.py build_src_filter = + diff --git a/src/config.h b/src/config.h index d9df0e1..1831b33 100644 --- a/src/config.h +++ b/src/config.h @@ -165,4 +165,11 @@ #ifndef FIRST_IMAGE_POLL_INTERVAL_MS #define FIRST_IMAGE_POLL_INTERVAL_MS 15000ULL #endif +// During bootstrap (no image yet) the device stays awake and re-polls on +// this short interval — keeping WiFi up between requests so the user doesn't +// wait through a ~5 s deep-sleep + reconnect on every retry. Once the first +// image arrives the device enters the normal scheduled deep sleep. +#ifndef BOOTSTRAP_RETRY_INTERVAL_MS +#define BOOTSTRAP_RETRY_INTERVAL_MS 2000ULL +#endif #define IMAGE_PATH "/img.bin" diff --git a/src/main.cpp b/src/main.cpp index 52652ef..9d05bd7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -288,10 +288,17 @@ static void normal_operation(const String& mac) { WiFiClientSecure client; client.setInsecure(); // V1: no cert pinning for personal-scale device - HTTPClient http; - http.begin(client, url); - - normal_operation_impl(mac, http, url, prefs); + // Bootstrap loop: normal_operation_impl deep-sleeps (never returns) once + // we've received our first image. While in the pre-image window it + // returns instead, so we keep WiFi up and retry on a short interval — + // way faster end-to-end than waiting through a deep-sleep + reconnect + // for every "no image yet" poll. 
+ while (true) { + HTTPClient http; + http.begin(client, url); + normal_operation_impl(mac, http, url, prefs); + delay(BOOTSTRAP_RETRY_INTERVAL_MS); + } } // ── Setup ───────────────────────────────────────────────────────────────────── diff --git a/src/operation.h b/src/operation.h index 5f9a7ed..911b29e 100644 --- a/src/operation.h +++ b/src/operation.h @@ -107,6 +107,7 @@ inline bool check_reset_button() { template void normal_operation_impl(const String& mac, HTTP& http, const String& url, Preferences& prefs) { + Serial.println("[op] start GET " + url); prefs.begin(NVS_NAMESPACE, true); int32_t currentImgId = prefs.getInt(NVS_KEY_IMG_ID, -1); bool drawNeeded = prefs.getInt(NVS_KEY_DRAW_NEEDED, 0) != 0; @@ -162,6 +163,7 @@ void normal_operation_impl(const String& mac, HTTP& http, const String& url, Pre const char* collectHeaders[] = { "X-Interval-Ms", "X-Image-Id", "X-Image-Sha256", "X-Claimed" }; http.collectHeaders(collectHeaders, 4); int code = http.GET(); + Serial.println("[op] GET code=" + String(code)); // Server confirmed we're claimed → flag clears, regardless of what // happened to the response body. Without this, every poll forever @@ -292,27 +294,31 @@ void normal_operation_impl(const String& mac, HTTP& http, const String& url, Pre // previous cycle. http.end(); } else { - // Sync failed (5xx, timeout, malformed). Per FR38, the last-good image - // must persist; only the border indicates the error. epd_draw_image_with_border - // falls back to a full fill if the cached file is missing or wrong size, - // so first-boot error still gets a visible signal. + // Sync failed (5xx, timeout, TLS handshake failure → code=-1, etc.). + // Criterion is "does /img.bin exist?", not "is currentImgId >= 0?": + // • file exists → a real photo is on the panel. Draw it with a + // yellow border so the user sees a sync signal without losing it. + // Per FR38, the last-good image must persist. + // • no file → bootstrap window. 
Panel is showing the setup-claim + // QR which the user still needs to scan. A transient TLS hiccup + // must NOT wipe that to a yellow fill — it would leave the + // recipient with no way to claim the frame until the next 200. http.end(); - displayInitialized = true; - epd_init(); File r = LittleFS.open(IMAGE_PATH, "r"); if (r) { Serial.println(String("[op] sync fail code=") + String(code) + " -> drawing image with yellow border"); + displayInitialized = true; + epd_init(); epd_draw_image_with_border(r, COLOR_YELLOW, BORDER_THICKNESS_PX); r.close(); + prefs.begin(NVS_NAMESPACE, false); + prefs.putInt(NVS_KEY_ERR_BORDER, 1); + prefs.end(); } else { Serial.println(String("[op] sync fail code=") + String(code) + - " -> no cached image, falling back to full yellow fill"); - epd_fill(COLOR_YELLOW); + " with no cached image; preserving setup screen"); } - prefs.begin(NVS_NAMESPACE, false); - prefs.putInt(NVS_KEY_ERR_BORDER, 1); - prefs.end(); } // Only power off the display if it was initialized this cycle. Calling @@ -321,16 +327,19 @@ void normal_operation_impl(const String& mac, HTTP& http, const String& url, Pre // entire poll interval on every 304 response. if (displayInitialized) epd_sleep(); - // Bootstrap-fast polling: until we've ever displayed an image, ignore - // the schedule and poll every 15 s. Without this, a freshly-claimed - // device whose owner has wakeTimes set to e.g. noon would sit dark for - // up to 24 h before the first photo lands. We re-read img_id from NVS - // because the 200 path persists it AFTER the local var was captured. + // Bootstrap window: the device has never received an image — the user is + // still mid-setup, looking at the green setup QR, waiting for the first + // photo to land. Return WITHOUT arming deep sleep. The caller is expected + // to re-invoke us on a short timer until imgIdAfter goes >= 0, keeping + // WiFi up and the device responsive. 
The previous "15 s deep-sleep + // between bootstrap polls" wasted ~5 s per cycle on flash boot + WiFi + // reconnect, so end-user wait until first image was ~20 s × N retries. prefs.begin(NVS_NAMESPACE, true); int32_t imgIdAfter = prefs.getInt(NVS_KEY_IMG_ID, -1); prefs.end(); if (imgIdAfter < 0) { - sleepMs = FIRST_IMAGE_POLL_INTERVAL_MS; + Serial.println("[op] bootstrap: no image yet, caller will retry"); + return; } esp_sleep_enable_timer_wakeup(sleepMs * 1000ULL); diff --git a/src/panels/waveshare13e6/v1/epd_driver.cpp b/src/panels/waveshare13e6/v1/epd_driver.cpp index d65a657..9967673 100644 --- a/src/panels/waveshare13e6/v1/epd_driver.cpp +++ b/src/panels/waveshare13e6/v1/epd_driver.cpp @@ -209,6 +209,9 @@ static void ensure_dma_scratch() { if (!g_dma_scratch) { g_dma_scratch = (uint8_t*)heap_caps_malloc( DMA_CHUNK, MALLOC_CAP_INTERNAL | MALLOC_CAP_DMA); + Serial.printf("[epd13e6] dma_scratch=%p (free internal=%u)\n", + g_dma_scratch, + (unsigned)heap_caps_get_free_size(MALLOC_CAP_INTERNAL)); } } @@ -235,7 +238,12 @@ static void push_half(int cs_pin, const uint8_t* half_fb, size_t bytes) { static void push_full_frame(const uint8_t* fb) { constexpr size_t HALF_BYTES = (size_t)HALF_BYTES_ROW * H; uint8_t* slice = (uint8_t*)heap_caps_malloc(HALF_BYTES, MALLOC_CAP_SPIRAM); - if (!slice) return; + if (!slice) { + Serial.printf("[epd13e6] push_full_frame: slice alloc FAILED (free PSRAM=%u)\n", + (unsigned)ESP.getFreePsram()); + return; + } + Serial.println("[epd13e6] push_full_frame: pushing halves"); for (uint16_t y = 0; y < H; y++) { memcpy(slice + (size_t)y * HALF_BYTES_ROW, @@ -252,6 +260,7 @@ static void push_full_frame(const uint8_t* fb) { push_half(PIN_CS_S, slice, HALF_BYTES); heap_caps_free(slice); + Serial.println("[epd13e6] push_full_frame: done"); } // ── epd.h surface ────────────────────────────────────────────────────────────── diff --git a/test/test_normal_operation/test_main.cpp b/test/test_normal_operation/test_main.cpp index d0d94d7..ebe7595 
100644 --- a/test/test_normal_operation/test_main.cpp +++ b/test/test_normal_operation/test_main.cpp @@ -141,7 +141,9 @@ void test_fw04_204_no_prior_image_does_not_redraw() { "204 must not redraw the QR — panel already holds it from provisioning"); TEST_ASSERT_EQUAL(0, g_epd_init_count); TEST_ASSERT_EQUAL(0, g_epd_draw_image_count); - TEST_ASSERT_TRUE(g_deep_sleep_started); + // Bootstrap: no deep sleep. Caller (main.cpp normal_operation loop) keeps + // WiFi up and retries on a short BOOTSTRAP_RETRY_INTERVAL_MS timer. + TEST_ASSERT_FALSE(g_deep_sleep_started); } // FW-04b: 204 after a real image was previously displayed — panel holds the @@ -157,14 +159,15 @@ void test_fw04b_204_with_prior_image_does_not_redraw() { TEST_ASSERT_EQUAL(0, g_epd_fill_count); } -// FW-05: 404 — same logic as 204; panel keeps whatever's there. +// FW-05: 404 — same logic as 204; panel keeps whatever's there. Without a +// prior image, the bootstrap path also skips deep sleep (caller retries). void test_fw05_404_does_not_redraw() { g_http_get_code = 404; normal_operation_impl(String("mac"), http, String("url"), prefs); TEST_ASSERT_EQUAL(0, g_show_setup_qr_count); TEST_ASSERT_EQUAL(0, g_epd_init_count); TEST_ASSERT_EQUAL(0, g_epd_draw_image_count); - TEST_ASSERT_TRUE(g_deep_sleep_started); + TEST_ASSERT_FALSE(g_deep_sleep_started); } // FW-06a: 5xx error WITH a cached image → preserve last image and overlay a @@ -185,18 +188,24 @@ void test_fw06a_error_with_cache_draws_border_not_fill() { TEST_ASSERT_EQUAL(1, prefs.getInt(NVS_KEY_ERR_BORDER, -1)); } -// FW-06b: 5xx error with NO cached image → fall back to full yellow fill so -// the user still sees a sync-fail signal on a fresh device. -void test_fw06b_error_without_cache_falls_back_to_fill() { +// FW-06b: 5xx error during the bootstrap window (no cached image) MUST NOT +// paint anything — the panel is holding the green setup-claim QR, which the +// user still needs to scan. 
A transient TLS hiccup or server blip wiping it +// to yellow leaves the recipient stranded with no path to claim the frame. +// err_border MUST NOT be set either, since there's nothing to "recover" from +// on the next healthy response. +void test_fw06b_error_without_cache_preserves_setup_screen() { g_http_get_code = 500; // LittleFS has no IMAGE_PATH entry normal_operation_impl(String("mac"), http, String("url"), prefs); TEST_ASSERT_EQUAL(0, g_epd_draw_border_count); - TEST_ASSERT_EQUAL(1, g_epd_fill_count); - TEST_ASSERT_EQUAL(COLOR_YELLOW, g_epd_fill_last_color); - TEST_ASSERT_EQUAL(1, prefs.getInt(NVS_KEY_ERR_BORDER, -1)); + TEST_ASSERT_EQUAL_MESSAGE(0, g_epd_fill_count, + "5xx during bootstrap must NOT paint yellow over the setup QR"); + // err_border NOT set — default sentinel (-1) means key wasn't written. + TEST_ASSERT_EQUAL(-1, prefs.getInt(NVS_KEY_ERR_BORDER, -1)); + TEST_ASSERT_FALSE(g_deep_sleep_started); } // FW-06c: 304 with err_border flag set (sync recovered after a previous @@ -389,17 +398,20 @@ void test_fw_deep_sleep_arms_ext0_button_wakeup() { } // FW-FIRST-IMG-A: device has never received an image (img_id = -1) AND the -// poll didn't deliver one (e.g. 204 because no images approved yet). Sleep -// must be the 15s bootstrap interval, NOT whatever the server's schedule -// says. Without this, a fresh device on a noon-daily schedule sits dark -// for up to 24 h before the first photo lands. -void test_fw_first_image_bootstrap_polls_at_15s_when_no_image_yet() { +// poll didn't deliver one (e.g. 204 because no images approved yet). The +// bootstrap path must SKIP deep sleep entirely — the caller (main.cpp's +// normal_operation loop) keeps WiFi alive and re-invokes the function on +// a short BOOTSTRAP_RETRY_INTERVAL_MS timer so the user doesn't watch a +// ~5 s deep-sleep + wifi-reconnect on every "no image yet" cycle. 
+// Without this contract, a fresh device on a noon-daily schedule would +// deep-sleep for the server's 6-hour interval and sit dark all day. +void test_fw_first_image_bootstrap_skips_deep_sleep() { g_http_get_code = 204; // Server tries to set a 6-hour interval for the user's noon-daily schedule. g_http_response_headers["X-Interval-Ms"] = String((unsigned long)(6ULL * 60 * 60 * 1000)).c_str(); // No img_id in NVS → device has never seen an image. normal_operation_impl(String("mac"), http, String("url"), prefs); - TEST_ASSERT_EQUAL_UINT64(FIRST_IMAGE_POLL_INTERVAL_MS * 1000ULL, g_sleep_us); + TEST_ASSERT_FALSE(g_deep_sleep_started); } // FW-FIRST-IMG-B: once we've persisted an image (200 path wrote img_id), the @@ -542,7 +554,7 @@ int main(int argc, char** argv) { RUN_TEST(test_fw04b_204_with_prior_image_does_not_redraw); RUN_TEST(test_fw05_404_does_not_redraw); RUN_TEST(test_fw06a_error_with_cache_draws_border_not_fill); - RUN_TEST(test_fw06b_error_without_cache_falls_back_to_fill); + RUN_TEST(test_fw06b_error_without_cache_preserves_setup_screen); RUN_TEST(test_fw06c_304_after_error_repaints_clean); RUN_TEST(test_fw06d_304_steady_state_does_not_fill_yellow); RUN_TEST(test_fw06e_200_after_error_clears_flag); @@ -560,7 +572,7 @@ int main(int argc, char** argv) { RUN_TEST(test_fw_no_flag_means_no_header); RUN_TEST(test_fw_X_Claimed_response_clears_flag); RUN_TEST(test_fw_no_X_Claimed_response_keeps_flag); - RUN_TEST(test_fw_first_image_bootstrap_polls_at_15s_when_no_image_yet); + RUN_TEST(test_fw_first_image_bootstrap_skips_deep_sleep); RUN_TEST(test_fw_first_image_bootstrap_clears_after_200); RUN_TEST(test_fw_first_image_just_arrived_uses_server_interval); RUN_TEST(test_fw_deep_sleep_arms_ext0_button_wakeup);