Skip to content

Espressif ESP32-S3-BOX-3 Voice Assistant

Complete configuration for the Espressif ESP32-S3-BOX-3 with persistent Home Assistant timers.

Hardware Overview

| Feature    | Value                                            |
|------------|--------------------------------------------------|
| Board      | ESP32-S3 with PSRAM                              |
| Display    | 320x240 ILI9341/ILI9xxx LCD                      |
| Touch      | GT911 capacitive                                 |
| Audio      | ES7210 ADC + ES8311 DAC                          |
| Wake word  | On-device micro_wake_word                        |
| Additional | Mute button, presence sensor, battery monitoring |

Prerequisites

  • ESPHome 2025.5.0 or newer
  • Home Assistant with:
      ◦ A timer entity matching your area (e.g., timer.playroom)
      ◦ A template sensor exposing the timer's remaining seconds (e.g., sensor.playroom_timer_remaining_seconds)
      ◦ Intent scripts for timer control
      ◦ A timer-finished automation

Complete Configuration

Download the complete ESPHome configuration file:

Download esp32-s3-box-3-voice-assistant.yaml

Configuration

Substitutions

# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: substitutions
substitutions:
  # Device identity. Both names are derived from `timer_area` (defined below),
  # so changing the area also renames the device.
  device_name: "${timer_area}-voice-assistant"
  friendly_name: "${timer_area} Voice Assistant"
  device_description: "ESP32-S3-BOX-3"

  # REQUIRED: Set this to match your HA area
  # The value is also spliced into the Home Assistant entity ID
  # `sensor.<timer_area>_timer_remaining_seconds` read by the timer sensors.
  # NOTE(review): a multi-word area needs underscores for the entity ID but
  # ends up in the device hostname too - confirm that is acceptable.
  timer_area: "playroom"

  # Generic voice assistant images
  # One 320x240 illustration per UI state; consumed by the `image:` section.
  loading_illustration_file: https://github.com/esphome/wake-word-voice-assistants/raw/main/casita/loading_320_240.png
  idle_illustration_file: https://github.com/esphome/wake-word-voice-assistants/raw/main/casita/idle_320_240.png
  listening_illustration_file: https://github.com/esphome/wake-word-voice-assistants/raw/main/casita/listening_320_240.png
  thinking_illustration_file: https://github.com/esphome/wake-word-voice-assistants/raw/main/casita/thinking_320_240.png
  replying_illustration_file: https://github.com/esphome/wake-word-voice-assistants/raw/main/casita/replying_320_240.png
  error_illustration_file: https://github.com/esphome/wake-word-voice-assistants/raw/main/casita/error_320_240.png
  timer_finished_illustration_file: https://github.com/esphome/wake-word-voice-assistants/raw/main/casita/timer_finished_320_240.png

  # Background colors
  # Hex RGB values (no leading '#'); consumed by the `color:` section via `hex:`.
  loading_illustration_background_color: "000000"
  idle_illustration_background_color: "000000"
  listening_illustration_background_color: "FFFFFF"
  thinking_illustration_background_color: "FFFFFF"
  replying_illustration_background_color: "FFFFFF"
  error_illustration_background_color: "000000"

  # Voice assistant phase IDs
  # These are substituted verbatim into C++ lambdas as integer literals and
  # compared against / assigned to the `voice_assistant_phase` global.
  voice_assist_idle_phase_id: "1"
  voice_assist_listening_phase_id: "2"
  voice_assist_thinking_phase_id: "3"
  voice_assist_replying_phase_id: "4"
  voice_assist_not_ready_phase_id: "10"
  voice_assist_error_phase_id: "11"
  voice_assist_muted_phase_id: "12"
  voice_assist_timer_finished_phase_id: "20"
  voice_assist_ota_phase_id: "30"

  # Font configuration
  # `font_glyphsets` and `font_family` feed every entry of the `font:` section.
  # NOTE(review): `allowed_characters` is not referenced in the visible part of
  # this file - confirm whether it is still needed.
  allowed_characters: " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~°"
  font_glyphsets: "GF_Latin_Core"
  font_family: Figtree

ESPHome Core

# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: esphome
esphome:
  # Identity fields are filled in from the substitutions block.
  name: ${device_name}
  friendly_name: ${friendly_name}
  comment: ${device_description}
  min_version: 2025.5.0
  name_add_mac_suffix: false
  on_boot:
    # Draw once at boot, then wait 30 s. If `init_in_progress` is still set
    # after that, clear it and redraw so the screen does not stay on the
    # initializing page (which `draw_display` shows while the flag is true).
    - priority: 600
      then:
        - script.execute: draw_display
        - delay: 30s
        - if:
            condition:
              lambda: return id(init_in_progress);
            then:
              - lambda: id(init_in_progress) = false;
              - script.execute: draw_display

ESP32 Platform

# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: esp32
# Target platform: ESP32-S3-BOX board definition, 16 MB flash, 240 MHz CPU,
# built on the ESP-IDF framework.
esp32:
  board: esp32s3box
  flash_size: 16MB
  cpu_frequency: 240MHz
  framework:
    type: esp-idf
    # Raw ESP-IDF Kconfig overrides. Going by their names they select the
    # 240 MHz default CPU frequency and a 64 KB data cache with 64-byte lines.
    sdkconfig_options:
      CONFIG_ESP32S3_DEFAULT_CPU_FREQ_240: "y"
      CONFIG_ESP32S3_DATA_CACHE_64KB: "y"
      CONFIG_ESP32S3_DATA_CACHE_LINE_64B: "y"

# External PSRAM in octal mode at 80 MHz. The media player is configured with
# `task_stack_in_psram: true`, so it depends on this section.
psram:
  mode: octal
  speed: 80MHz

API Services

# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: api
# Native API. Connection changes trigger a redraw, and four custom services
# are declared for timer control from the Home Assistant side.
api:
  # `draw_display` chooses the page based on `api.connected`, so redraw on
  # every connection change.
  on_client_connected:
    - script.execute: draw_display
  on_client_disconnected:
    - script.execute: draw_display

  services:
    # Start the alarm: turning on `timer_ringing` plays the timer sound and
    # switches the phase to "timer finished" (see the switch's on_turn_on).
    - service: timer_finished
      then:
        - logger.log: "Timer finished! Playing alarm..."
        - switch.turn_on: timer_ringing

    # Only logs the received duration and redraws; the duration itself is not
    # stored on the device.
    - service: timer_started
      variables:
        duration: int
      then:
        - logger.log:
            format: "Timer started with duration: %d seconds"
            args: ["duration"]
        - script.execute: draw_display

    # Silences a ringing alarm (if any) and redraws.
    - service: timer_cancelled
      then:
        - logger.log: "Timer cancelled"
        - switch.turn_off: timer_ringing
        - script.execute: draw_display

    # Silences a ringing alarm without logging or an extra redraw here; the
    # switch's on_turn_off performs its own redraw.
    - service: stop_alarm
      then:
        - switch.turn_off: timer_ringing

OTA, Logger, WiFi, Time

# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: ota
# Over-the-air updates. Each stage mirrors its state into the `ota_progress`
# global (0..100, or -1 on failure) and forces a redraw of `s3_box_lcd`.
ota:
  - platform: esphome
    id: ota_esphome
    on_begin:
      # Stop wake-word detection and switch the UI into the OTA phase/page.
      - script.execute: stop_wake_word
      - lambda: |-
          id(voice_assistant_phase) = ${voice_assist_ota_phase_id};
          id(ota_progress) = 0;
      - display.page.show: ota_page
      - component.update: s3_box_lcd
    on_progress:
      # `x` is the progress value supplied by the trigger.
      - lambda: id(ota_progress) = (int)x;
      - component.update: s3_box_lcd
    on_end:
      - lambda: id(ota_progress) = 100;
      - component.update: s3_box_lcd
    on_error:
      # Show the error page for 5 s ...
      - lambda: id(ota_progress) = -1;
      - display.page.show: error_page
      - component.update: s3_box_lcd
      - delay: 5s
      # ... then undo what on_begin did. Without leaving the OTA phase,
      # `draw_display` would put the OTA page straight back, and the wake word
      # stopped in on_begin would never be restarted after a failed upload.
      # `start_wake_word` itself honours the mute state.
      - script.execute: set_idle_or_mute_phase
      - script.execute: start_wake_word
      - script.execute: draw_display

# Logging over the USB Serial/JTAG port at DEBUG level, with the noisier
# component tags raised to WARN/ERROR.
logger:
  level: DEBUG
  hardware_uart: USB_SERIAL_JTAG
  logs:
    text_sensor: WARN
    sensor: WARN
    component: ERROR

# Wi-Fi credentials come from the secrets file. `draw_display` chooses the
# page based on `wifi.connected`, so redraw on every connection change.
wifi:
  ssid: !secret wifi_ssid
  password: !secret wifi_password
  on_connect:
    - script.execute: draw_display
  on_disconnect:
    - script.execute: draw_display

# SNTP time source; both the server list and the timezone are read from the
# secrets file (`ntp_servers`, `timezone`).
time:
  - platform: sntp
    id: sntp_time
    servers: !secret ntp_servers
    timezone: !secret timezone

Timer Sync Intervals

# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: interval
# Safety net for the timer UI: every 30 s, if the screen is in the idle or
# muted phase while Home Assistant reports an active/paused timer, force a
# redraw so the timer shows up.
interval:
  - interval: 30s
    then:
      - lambda: |-
          // Only the idle and muted phases are eligible for a sync redraw.
          const int phase = id(voice_assistant_phase);
          if (phase != ${voice_assist_idle_phase_id} &&
              phase != ${voice_assist_muted_phase_id}) {
            return;
          }

          const std::string timer = id(timer_state).state;
          if (timer == "active" || timer == "paused") {
            ESP_LOGD("timer_sync", "Timer is %s but display is idle - triggering redraw", timer.c_str());
            id(draw_display).execute();
          }

Buttons

# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: button
button:
  # Restart button exposed under the name "Restart".
  - platform: restart
    id: restart_btn
    name: Restart

  # Factory reset is internal (not exposed); it is pressed by the 10 s hold
  # on `left_top_button`.
  - platform: factory_reset
    id: factory_reset_btn
    internal: true

Home Assistant Timer Sensors

# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: sensor
sensor:
  # Exposes the `voice_assistant_phase` global as a number, polled every 500 ms.
  - platform: template
    name: "Voice Assistant Phase"
    id: voice_assistant_phase_sensor
    lambda: |-
      return (float)id(voice_assistant_phase);
    update_interval: 500ms

  # The three sensors below all read the same Home Assistant entity,
  # `sensor.<timer_area>_timer_remaining_seconds`: its state, and its
  # `duration_seconds` and `progress_percent` attributes.

  # Remaining seconds (entity state). Every new value redraws the display.
  - platform: homeassistant
    id: timer_remaining
    name: "Timer remaining"
    entity_id: sensor.${timer_area}_timer_remaining_seconds
    unit_of_measurement: "s"
    device_class: "duration"
    on_value:
      then:
        - script.execute: draw_display

  # Total timer duration; used with `timer_remaining` to size the timeline bar.
  - platform: homeassistant
    id: timer_duration
    name: "Timer duration"
    entity_id: sensor.${timer_area}_timer_remaining_seconds
    attribute: duration_seconds
    unit_of_measurement: "s"
    device_class: "duration"

  # Progress percentage; internal only.
  - platform: homeassistant
    id: timer_progress
    entity_id: sensor.${timer_area}_timer_remaining_seconds
    attribute: progress_percent
    internal: true

  # Temperature / humidity from an AHT20 on I2C bus `bus_b`, read every 60 s.
  - platform: aht10
    i2c_id: bus_b
    variant: AHT20
    temperature:
      name: "Temperature"
      id: s3temp
    humidity:
      name: "Humidity"
    update_interval: 60s

  # Battery voltage from the ADC on GPIO10, scaled by 4.11.
  # NOTE(review): the factor presumably compensates for an on-board voltage
  # divider - confirm against the board schematic.
  - platform: adc
    pin: GPIO10
    id: battery_voltage
    unit_of_measurement: "V"
    accuracy_decimals: 1
    device_class: "voltage"
    entity_category: "diagnostic"
    update_interval: 30s
    attenuation: auto
    filters:
      - multiply: 4.11

  # Battery percentage derived from `battery_voltage`: linear map of
  # 2.7 V -> 0 % and 4.2 V -> 100 %, clamped to 0..100.
  - platform: copy
    id: battery_percent
    source_id: battery_voltage
    name: "Battery level"
    unit_of_measurement: "%"
    accuracy_decimals: 0
    device_class: "battery"
    entity_category: "diagnostic"
    filters:
      - lambda: return (x - 2.7) / (4.2 - 2.7) * 100;
      - clamp:
          min_value: 0
          max_value: 100

  # Raw Wi-Fi signal strength, sampled every 30 s.
  - platform: wifi_signal
    name: "WiFi db"
    id: wifi_signal_db
    update_interval: 30s

  # Wi-Fi signal as a percentage: 2 * (dB + 100), limited to 0..100.
  - platform: copy
    source_id: wifi_signal_db
    name: "WiFi Signal"
    id: wifi_percent
    filters:
      - lambda: return min(max(2 * (x + 100.0), 0.0), 100.0);
    unit_of_measurement: "%"
    entity_category: "diagnostic"

Touch and Binary Sensors

# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: touchscreen
# GT911 touch controller on I2C bus `bus_a` at address 0x5D, with its
# interrupt line on GPIO3.
touchscreen:
  - platform: gt911
    i2c_id: bus_a
    address: 0x5D
    id: gt911_touchscreen
    interrupt_pin:
      number: GPIO3
      # GPIO3 is flagged as a strapping pin; the warning is silenced here.
      ignore_strapping_warning: true
# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: binary_sensor
binary_sensor:
  # Presence input on GPIO21, exposed as an occupancy sensor.
  - platform: gpio
    pin:
      number: GPIO21
    name: "Presence detect"
    device_class: "occupancy"

  # GT911 touch button (index 0). A press is ignored while
  # `init_in_progress` is set; otherwise the FIRST matching rule wins:
  #   1. timer ringing            -> silence it
  #   2. voice assistant running  -> stop it
  #   3. announcement playing     -> stop the announcement
  #   4. media playing            -> pause
  #   5. not muted                -> play the wake sound, wait for it to
  #                                  finish, then start the voice assistant
  - platform: gt911
    id: touch_area
    index: 0
    on_press:
      then:
        - if:
            condition:
              lambda: return !id(init_in_progress);
            then:
              - if:
                  condition:
                    switch.is_on: timer_ringing
                  then:
                    - switch.turn_off: timer_ringing
                  else:
                    - if:
                        condition:
                          voice_assistant.is_running:
                        then:
                          - voice_assistant.stop:
                        else:
                          - if:
                              condition:
                                media_player.is_announcing:
                              then:
                                media_player.stop:
                                  announcement: true
                              else:
                                - if:
                                    condition:
                                      media_player.is_playing:
                                    then:
                                      - media_player.pause:
                                    else:
                                      - if:
                                          condition:
                                            and:
                                              - lambda: return !id(is_muted);
                                              - not: voice_assistant.is_running
                                          then:
                                            - media_player.speaker.play_on_device_media_file:
                                                media_file: wake_word_triggered_sound_file
                                                announcement: true
                                            - wait_until:
                                                - not:
                                                    - media_player.is_announcing:
                                            - voice_assistant.start:

  # Button on GPIO0 (active low, internal pull-up). A short click silences a
  # ringing timer; holding for at least 10 s presses the factory reset button.
  - platform: gpio
    pin:
      number: GPIO0
      mode: INPUT_PULLUP
      inverted: true
    id: left_top_button
    internal: true
    on_multi_click:
      - timing:
          - ON for at least 50ms
          - OFF for at least 50ms
        then:
          - switch.turn_off: timer_ringing
      - timing:
          - ON for at least 10s
        then:
          - button.press: factory_reset_btn

  # Mute input on GPIO1 (active low, internal pull-up). Press runs
  # `enable_mute`, release runs `disable_mute`; `trigger_on_initial_state`
  # makes the state at boot fire the matching trigger as well.
  - platform: gpio
    pin:
      number: GPIO1
      mode: INPUT_PULLUP
      inverted: true
    id: mute_button
    name: "Mute Button"
    internal: true
    trigger_on_initial_state: true
    on_press:
      then:
        - script.execute: enable_mute
    on_release:
      then:
        - script.execute: disable_mute

I2C, I2S Audio, Microphone, Speaker

# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: i2c
# Two I2C buses, both with internal pull-ups and bus scanning enabled.
i2c:
  # bus_a (100 kHz): used by the GT911 touchscreen and the ES7210 / ES8311
  # audio codecs.
  - id: bus_a
    sda: GPIO08
    scl: GPIO18
    scan: true
    sda_pullup_enabled: true
    scl_pullup_enabled: true
    frequency: 100kHz
  # bus_b (50 kHz): used by the AHT20 temperature/humidity sensor.
  - sda: GPIO41
    scl: GPIO40
    scan: true
    sda_pullup_enabled: true
    scl_pullup_enabled: true
    frequency: 50kHz
    id: bus_b
# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: audio
# I2S bus clock pins; the speaker references this bus by id.
i2s_audio:
  - id: i2s_audio_bus
    i2s_lrclk_pin: GPIO45
    i2s_bclk_pin: GPIO17
    i2s_mclk_pin: GPIO2

# ES7210 ADC on I2C bus `bus_a`: 16-bit samples at 16 kHz, matching the
# microphone settings.
audio_adc:
  - platform: es7210
    id: es7210_adc
    bits_per_sample: 16bit
    sample_rate: 16000
    i2c_id: bus_a

# ES8311 DAC on I2C bus `bus_a`: 16-bit samples at 48 kHz, matching the
# speaker settings.
audio_dac:
  - platform: es8311
    id: es8311_dac
    bits_per_sample: 16bit
    sample_rate: 48000
    i2c_id: bus_a

# I2S microphone input on GPIO16 via an external ADC, 16-bit at 16 kHz.
# Referenced by the voice assistant as `box_mic`.
microphone:
  - platform: i2s_audio
    id: box_mic
    sample_rate: 16000
    i2s_din_pin: GPIO16
    bits_per_sample: 16bit
    adc_type: external

# I2S speaker output on GPIO15 through the ES8311 DAC: 16-bit, 48 kHz, left
# channel, 100 ms buffer. Used by the media player's announcement pipeline.
speaker:
  - id: i2s_audio_speaker
    platform: i2s_audio
    i2s_audio_id: i2s_audio_bus
    i2s_dout_pin: GPIO15
    dac_type: external
    sample_rate: 48000
    bits_per_sample: 16bit
    channel: left
    audio_dac: es8311_dac
    buffer_duration: 100ms

# Speaker-backed media player with a single (announcement) pipeline. It also
# bundles the two on-device sound files used by the timer alarm and the
# touch-to-talk flow.
media_player:
  - platform: speaker
    name: None
    id: speaker_media_player
    volume_min: 0.5
    volume_max: 0.8
    task_stack_in_psram: true
    announcement_pipeline:
      speaker: i2s_audio_speaker
      format: FLAC
      sample_rate: 48000
      num_channels: 1
    files:
      - id: timer_finished_sound
        file: https://github.com/esphome/home-assistant-voice-pe/raw/dev/sounds/timer_finished.flac
      - id: wake_word_triggered_sound_file
        file: https://github.com/esphome/home-assistant-voice-pe/raw/dev/sounds/wake_word_triggered.flac
    on_announcement:
      # Flag the announcement and start the lifecycle tracker, which clears
      # the flag again once playback ends (or never starts).
      - lambda: id(announcement_in_progress) = true;
      - script.execute: track_announcement_lifecycle
      # Stop wake-word detection while the microphone is capturing.
      - if:
          condition:
            - microphone.is_capturing:
          then:
            - script.execute: stop_wake_word
      # For a standalone announcement (no voice assistant run, no alarm) the
      # display is switched to the muted phase for the duration.
      - if:
          condition:
            and:
              - not:
                  voice_assistant.is_running:
              - switch.is_off: timer_ringing
          then:
            - lambda: id(voice_assistant_phase) = ${voice_assist_muted_phase_id};
            - script.execute: draw_display
    on_idle:
      # After a 100 ms settle delay, and only if nothing else is active,
      # restart wake-word detection and restore the idle/muted phase.
      - delay: 100ms
      - if:
          condition:
            and:
              - not:
                  voice_assistant.is_running:
              - switch.is_off: timer_ringing
              - not:
                  media_player.is_announcing:
              - lambda: return !id(announcement_in_progress);
          then:
            - script.execute: start_wake_word
            - script.execute: set_idle_or_mute_phase
            - script.execute: draw_display

Wake Word and Voice Assistant

# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: wake_word
# On-device wake word engine with two models: `okay_nabu` and an internal
# `stop` model, plus a VAD model.
micro_wake_word:
  id: mww
  models:
    - model: okay_nabu
      id: okay_nabu
    - model: https://github.com/kahrendt/microWakeWord/releases/download/stop/stop.json
      id: stop
      internal: true
  vad:
    model: github://esphome/micro-wake-word-models/models/v2/vad.json
  # Any detection starts the voice assistant, passing the detected wake word on.
  on_wake_word_detected:
    - voice_assistant.start:
        wake_word: !lambda return wake_word;

# Voice assistant pipeline. Each callback updates the `voice_assistant_phase`
# global and/or the request/response text sensors, then redraws the display.
voice_assistant:
  id: va
  microphone: box_mic
  media_player: speaker_media_player
  micro_wake_word: mww
  noise_suppression_level: 2
  auto_gain: 31dBFS
  volume_multiplier: 2.0
  # Listening: enter the listening phase and reset both texts to "...".
  on_listening:
    - lambda: id(voice_assistant_phase) = ${voice_assist_listening_phase_id};
    - text_sensor.template.publish:
        id: text_request
        state: "..."
    - text_sensor.template.publish:
        id: text_response
        state: "..."
    - script.execute: draw_display
  # Speech ended: enter the thinking phase.
  on_stt_vad_end:
    - lambda: id(voice_assistant_phase) = ${voice_assist_thinking_phase_id};
    - script.execute: draw_display
  # Transcript available: publish it as the request text.
  on_stt_end:
    - text_sensor.template.publish:
        id: text_request
        state: !lambda return x;
    - script.execute: draw_display
  # Reply text available: publish it and enter the replying phase.
  on_tts_start:
    - text_sensor.template.publish:
        id: text_response
        state: !lambda return x;
    - lambda: id(voice_assistant_phase) = ${voice_assist_replying_phase_id};
    - script.execute: draw_display
  # Run finished. If an announcement is flagged as in progress, nothing is
  # restarted here. Otherwise: wait up to 3 s for an announcement to start,
  # wait for all audio to finish, then restart the wake word and restore the
  # idle/muted phase. Both texts are cleared in either case.
  # NOTE(review): this calls `micro_wake_word.start` directly rather than the
  # `start_wake_word` script, so the `is_muted` check is bypassed - confirm
  # that is intended.
  on_end:
    - if:
        condition:
          - lambda: return id(announcement_in_progress);
        then:
          - logger.log: "on_end: Skipping - announcement in progress"
        else:
          - wait_until:
              condition:
                - media_player.is_announcing:
              timeout: 3s
          - wait_until:
              - and:
                  - not:
                      media_player.is_announcing:
                  - not:
                      speaker.is_playing:
          - lambda: id(va).set_use_wake_word(false);
          - micro_wake_word.start:
          - script.execute: set_idle_or_mute_phase
          - script.execute: draw_display
    - text_sensor.template.publish:
        id: text_request
        state: ""
    - text_sensor.template.publish:
        id: text_response
        state: ""
  # Errors are ignored during initialization; otherwise the error phase is
  # shown for 1 s before returning to idle/muted.
  on_error:
    - if:
        condition:
          lambda: return !id(init_in_progress);
        then:
          - lambda: id(voice_assistant_phase) = ${voice_assist_error_phase_id};
          - script.execute: draw_display
          - delay: 1s
          - script.execute: set_idle_or_mute_phase
          - script.execute: draw_display
  # Client connected: initialization is over, start listening for the wake word.
  on_client_connected:
    - lambda: id(init_in_progress) = false;
    - script.execute: start_wake_word
    - script.execute: set_idle_or_mute_phase
    - script.execute: draw_display
  # Client lost: stop the wake word and show the "not ready" phase.
  on_client_disconnected:
    - script.execute: stop_wake_word
    - lambda: id(voice_assistant_phase) = ${voice_assist_not_ready_phase_id};
    - script.execute: draw_display

  # Timer Event Stubs - HA handles actual timer logic
  # These handlers only log (or do nothing); the on-device timer UI is driven
  # by the Home Assistant timer sensors instead.
  on_timer_started:
    - logger.log:
        format: "Timer started (handled by HA): %s"
        args: ["timer.id.c_str()"]
  on_timer_finished:
    - logger.log: "Timer finished event received (handled by HA automation)"
  on_timer_cancelled:
    - logger.log: "Timer cancelled (handled by HA)"
  on_timer_updated:
    - logger.log: "Timer updated (handled by HA)"
  on_timer_tick:
    - lambda: return;

Text Sensors

# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: text_sensor
text_sensor:
  - platform: template
    name: "Voice Assistant State"
    id: voice_assistant_state_sensor
    # Publishes the `voice_assistant_phase` global as a readable label, polled
    # every 500 ms. Every phase ID declared in the substitutions has a label,
    # including the OTA phase set by `ota.on_begin`; anything else is reported
    # as "unknown".
    lambda: |-
      int phase = id(voice_assistant_phase);
      switch(phase) {
        case ${voice_assist_idle_phase_id}: return {"idle"};
        case ${voice_assist_listening_phase_id}: return {"listening"};
        case ${voice_assist_thinking_phase_id}: return {"thinking"};
        case ${voice_assist_replying_phase_id}: return {"replying"};
        case ${voice_assist_error_phase_id}: return {"error"};
        case ${voice_assist_not_ready_phase_id}: return {"not_ready"};
        case ${voice_assist_muted_phase_id}: return {"muted"};
        case ${voice_assist_timer_finished_phase_id}: return {"timer_finished"};
        case ${voice_assist_ota_phase_id}: return {"ota"};
        default: return {"unknown"};
      }
    update_interval: 500ms

  - platform: homeassistant
    # Mirrors the `timer_state` attribute of the Home Assistant timer sensor.
    # The scripts and the sync interval compare it against "active" and
    # "paused". Every change redraws the display.
    id: timer_state
    entity_id: sensor.${timer_area}_timer_remaining_seconds
    attribute: timer_state
    internal: true
    on_value:
      then:
        - script.execute: draw_display

  - id: text_request
    # Holds the request text published by the voice assistant callbacks.
    platform: template
    on_value:
      # When the new state is longer than 32 characters, overwrite the stored
      # state with a copy shortened via `str_truncate(..., 31)` plus "...".
      # NOTE(review): this assigns `.state` directly instead of publishing -
      # confirm that only local readers need the shortened text.
      lambda: |-
        if(id(text_request).state.length()>32) {
          std::string name = id(text_request).state.c_str();
          std::string truncated = esphome::str_truncate(name.c_str(),31);
          id(text_request).state = (truncated+"...").c_str();
        }

  - id: text_response
    # Holds the response text published by the voice assistant callbacks.
    platform: template
    on_value:
      # When the new state is longer than 32 characters, overwrite the stored
      # state with a copy shortened via `str_truncate(..., 31)` plus "...".
      # NOTE(review): this assigns `.state` directly instead of publishing -
      # confirm that only local readers need the shortened text.
      lambda: |-
        if(id(text_response).state.length()>32) {
          std::string name = id(text_response).state.c_str();
          std::string truncated = esphome::str_truncate(name.c_str(),31);
          id(text_response).state = (truncated+"...").c_str();
        }

Switches

# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: switch
# LEDC (PWM) output on GPIO47; drives the `led` light below, which is exposed
# as "Screen".
output:
  - platform: ledc
    pin: GPIO47
    id: backlight_output

# Dimmable "Screen" light on top of `backlight_output`. Restores its previous
# state on boot, defaulting to ON, and fades over 250 ms.
light:
  - platform: monochromatic
    id: led
    name: Screen
    icon: "mdi:television"
    entity_category: config
    output: backlight_output
    restore_mode: RESTORE_DEFAULT_ON
    default_transition_length: 250ms

switch:
  # GPIO46 "Speaker Enable" line; restored on boot (default ON) and hidden
  # from the default entity list.
  - platform: gpio
    name: Speaker Enable
    pin: GPIO46
    restore_mode: RESTORE_DEFAULT_ON
    entity_category: config
    disabled_by_default: true

  # Software mute switch. Its reported state comes from the `is_muted` global
  # (non-optimistic); turning it on/off runs `toggle_mute` only when that
  # actually changes the current mute state.
  - platform: template
    name: Mute
    id: mute
    icon: "mdi:microphone-off"
    optimistic: false
    restore_mode: RESTORE_DEFAULT_OFF
    entity_category: config
    lambda: |-
      return id(is_muted);
    turn_on_action:
      - if:
          condition:
            lambda: return !id(is_muted);
          then:
            - script.execute: toggle_mute
    turn_off_action:
      - if:
          condition:
            lambda: return id(is_muted);
          then:
            - script.execute: toggle_mute

  # Timer alarm. ON = alarm sounding; always OFF after boot.
  - platform: template
    id: timer_ringing
    name: "Timer Ringing"
    icon: "mdi:bell-ring-outline"
    optimistic: true
    restore_mode: ALWAYS_OFF
    # Turning off: disable repeat on the announcement pipeline, reset the
    # playlist delay to 0, stop the announcement, then restore the idle/muted
    # phase and redraw.
    on_turn_off:
      - lambda: |-
          id(speaker_media_player)
            ->make_call()
            .set_command(media_player::MediaPlayerCommand::MEDIA_PLAYER_COMMAND_REPEAT_OFF)
            .set_announcement(true)
            .perform();
          id(speaker_media_player)->set_playlist_delay_ms(speaker::AudioPipelineType::ANNOUNCEMENT, 0);
      - media_player.stop:
          announcement: true
      - script.execute: set_idle_or_mute_phase
      - script.execute: draw_display
    # Turning on: enter the timer-finished phase, enable repeat-one with a
    # 1000 ms playlist delay, and play the alarm sound. The switch turns
    # itself off after 15 minutes.
    # NOTE(review): the 15 min delay started here does not appear to be
    # cancelled when the switch is turned off early, so it could cut short an
    # alarm that starts later - confirm.
    on_turn_on:
      - lambda: id(voice_assistant_phase) = ${voice_assist_timer_finished_phase_id};
      - script.execute: draw_display
      - lambda: |-
          id(speaker_media_player)
            ->make_call()
            .set_command(media_player::MediaPlayerCommand::MEDIA_PLAYER_COMMAND_REPEAT_ONE)
            .set_announcement(true)
            .perform();
          id(speaker_media_player)->set_playlist_delay_ms(speaker::AudioPipelineType::ANNOUNCEMENT, 1000);
      - media_player.speaker.play_on_device_media_file:
          media_file: timer_finished_sound
          announcement: true
      - delay: 15min
      - switch.turn_off: timer_ringing

Global Variables

# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: globals
# Runtime state shared between automations. Nothing is persisted across
# reboots (`restore_value: false` everywhere).
globals:
  # True from boot until the voice assistant client connects or the 30 s
  # on_boot fallback fires; while true, `draw_display` shows the
  # initializing page.
  - id: init_in_progress
    type: bool
    restore_value: false
    initial_value: "true"
  # Current UI phase; holds one of the `voice_assist_*_phase_id` values and
  # starts as "not ready".
  - id: voice_assistant_phase
    type: int
    restore_value: false
    initial_value: ${voice_assist_not_ready_phase_id}
  # OTA progress written by the OTA triggers: 0 at begin, the reported
  # progress while running, 100 at end, -1 on error.
  - id: ota_progress
    type: int
    restore_value: false
    initial_value: "0"
  # Set in the media player's on_announcement and cleared by
  # `track_announcement_lifecycle`.
  - id: announcement_in_progress
    type: bool
    restore_value: false
    initial_value: "false"
  # Mute state maintained by the mute scripts and reported by the Mute switch.
  - id: is_muted
    type: bool
    restore_value: false
    initial_value: "false"

Scripts

# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: script
script:
  - id: draw_display
    # Selects the page for the current device state and pushes it to
    # `s3_box_lcd`. Precedence, highest first: still initializing, no Wi-Fi,
    # no API connection, then the page for the current `voice_assistant_phase`.
    then:
      - if:
          condition:
            lambda: return id(init_in_progress);
          then:
            - display.page.show: initializing_page
            - component.update: s3_box_lcd
          else:
            - if:
                condition:
                  not:
                    wifi.connected:
                then:
                  - display.page.show: no_wifi_page
                  - component.update: s3_box_lcd
                else:
                  - if:
                      condition:
                        not:
                          api.connected:
                      then:
                        - display.page.show: no_ha_page
                        - component.update: s3_box_lcd
                      else:
                        - lambda: |
                            // Resolve the phase to its page (idle page for any
                            // unlisted phase), then show and redraw exactly once.
                            auto *target_page = idle_page;
                            switch (id(voice_assistant_phase)) {
                              case ${voice_assist_listening_phase_id}:
                                target_page = listening_page;
                                break;
                              case ${voice_assist_thinking_phase_id}:
                                target_page = thinking_page;
                                break;
                              case ${voice_assist_replying_phase_id}:
                                target_page = replying_page;
                                break;
                              case ${voice_assist_error_phase_id}:
                                target_page = error_page;
                                break;
                              case ${voice_assist_muted_phase_id}:
                                target_page = muted_page;
                                break;
                              case ${voice_assist_not_ready_phase_id}:
                                target_page = no_ha_page;
                                break;
                              case ${voice_assist_timer_finished_phase_id}:
                                target_page = timer_finished_page;
                                break;
                              case ${voice_assist_ota_phase_id}:
                                target_page = ota_page;
                                break;
                              default:
                                break;
                            }
                            id(s3_box_lcd).show_page(target_page);
                            id(s3_box_lcd).update();

  - id: draw_timer_timeline
    # Draws the timer progress bar along the bottom edge of the 320x240
    # screen: a white 15 px strip with a coloured bar whose width is
    # proportional to remaining/duration. Active timers use
    # `active_timer_color`, paused timers `paused_timer_color`; nothing is
    # drawn for any other state or when no valid values are available.
    then:
      - lambda: |
          const std::string state = id(timer_state).state;
          const bool is_active = (state == "active");
          if (!is_active && state != "paused") {
            return;
          }

          // The sensor states are NaN until a first value has arrived;
          // converting NaN to int is undefined behaviour, so check before casting.
          const float remaining_raw = id(timer_remaining).state;
          const float duration_raw = id(timer_duration).state;
          if (std::isnan(remaining_raw) || std::isnan(duration_raw)) {
            return;
          }

          const int remaining = (int) remaining_raw;
          const int duration = (int) duration_raw;
          if (duration <= 0 || remaining <= 0) {
            return;
          }

          // Scale to the screen width; never exceed it if "remaining" is
          // momentarily larger than "duration".
          int active_pixels = (320 * remaining) / duration;
          if (active_pixels > 320) {
            active_pixels = 320;
          }
          if (active_pixels <= 0) {
            return;
          }

          id(s3_box_lcd).filled_rectangle(0, 225, 320, 15, Color::WHITE);
          if (is_active) {
            id(s3_box_lcd).filled_rectangle(0, 226, active_pixels, 13, id(active_timer_color));
          } else {
            id(s3_box_lcd).filled_rectangle(0, 226, active_pixels, 13, id(paused_timer_color));
          }

  - id: draw_active_timer_widget
    # Draws the countdown box (white fill, black outline) with the remaining
    # time while the timer is "active" or "paused". Shows HH:MM when at least
    # one hour remains, otherwise MM:SS. Nothing is drawn until a remaining
    # value has been received.
    then:
      - lambda: |
          const std::string state = id(timer_state).state;
          if (state != "active" && state != "paused") {
            return;
          }

          // The sensor state is NaN until a first value has arrived;
          // converting NaN to int is undefined behaviour, so check before casting.
          const float remaining_raw = id(timer_remaining).state;
          if (std::isnan(remaining_raw)) {
            return;
          }
          // A negative value would produce garbage digits; show 00:00 instead.
          const int remaining = remaining_raw < 0 ? 0 : (int) remaining_raw;

          id(s3_box_lcd).filled_rectangle(80, 40, 160, 50, Color::WHITE);
          id(s3_box_lcd).rectangle(80, 40, 160, 50, Color::BLACK);

          const int hours_left = remaining / 3600;
          const int minutes_left = (remaining % 3600) / 60;
          const int seconds_left = remaining % 60;

          if (hours_left > 0) {
            id(s3_box_lcd).printf(120, 47, id(font_timer), Color::BLACK, "%02d:%02d", hours_left, minutes_left);
          } else {
            id(s3_box_lcd).printf(120, 47, id(font_timer), Color::BLACK, "%02d:%02d", minutes_left, seconds_left);
          }

  - id: enable_mute
    # Mutes the device: stops the wake word, waits 100 ms, mutes the
    # microphone, then records the state (global + Mute switch) and shows the
    # muted phase. Run by the mute button's on_press.
    mode: single
    then:
      - script.execute: stop_wake_word
      - delay: 100ms
      - microphone.mute:
      - lambda: id(is_muted) = true;
      - switch.template.publish:
          id: mute
          state: ON
      - lambda: id(voice_assistant_phase) = ${voice_assist_muted_phase_id};
      - script.execute: draw_display

  - id: disable_mute
    # Unmutes the device: records the state (global + Mute switch), shows the
    # idle phase, unmutes the microphone and, after 100 ms, restarts the wake
    # word. Run by the mute button's on_release.
    # NOTE(review): the phase is set to idle unconditionally, even if another
    # phase (e.g. timer finished) is currently shown - confirm.
    mode: single
    then:
      - lambda: id(is_muted) = false;
      - switch.template.publish:
          id: mute
          state: OFF
      - lambda: id(voice_assistant_phase) = ${voice_assist_idle_phase_id};
      - script.execute: draw_display
      - microphone.unmute:
      - delay: 100ms
      - script.execute: start_wake_word

  - id: toggle_mute
    # Flips the mute state based on `is_muted`. The two branches repeat the
    # action sequences of `disable_mute` (when currently muted) and
    # `enable_mute` (when currently unmuted). Run by the Mute switch; a new
    # call restarts a run that is still in progress (`mode: restart`).
    mode: restart
    then:
      - if:
          condition:
            lambda: return id(is_muted);
          then:
            - lambda: id(is_muted) = false;
            - switch.template.publish:
                id: mute
                state: OFF
            - lambda: id(voice_assistant_phase) = ${voice_assist_idle_phase_id};
            - script.execute: draw_display
            - microphone.unmute:
            - delay: 100ms
            - script.execute: start_wake_word
          else:
            - script.execute: stop_wake_word
            - delay: 100ms
            - microphone.mute:
            - lambda: id(is_muted) = true;
            - switch.template.publish:
                id: mute
                state: ON
            - lambda: id(voice_assistant_phase) = ${voice_assist_muted_phase_id};
            - script.execute: draw_display

  - id: start_wake_word
    # Starts on-device wake-word detection, but only when the device is not
    # muted and the voice assistant is not already running.
    then:
      - if:
          condition:
            lambda: return !id(is_muted);
          then:
            - if:
                condition:
                  not:
                    - voice_assistant.is_running:
                then:
                  - lambda: id(va).set_use_wake_word(false);
                  - micro_wake_word.start:

  - id: stop_wake_word
    # Stops on-device wake-word detection unconditionally.
    then:
      - micro_wake_word.stop:

  - id: set_idle_or_mute_phase
    # Sets `voice_assistant_phase` to the resting phase: idle when not muted,
    # muted otherwise. Does not redraw; callers run `draw_display` themselves.
    then:
      - if:
          condition:
            lambda: return !id(is_muted);
          then:
            - lambda: id(voice_assistant_phase) = ${voice_assist_idle_phase_id};
          else:
            - lambda: id(voice_assistant_phase) = ${voice_assist_muted_phase_id};

  - id: track_announcement_lifecycle
    # Clears `announcement_in_progress` once an announcement is over. Waits up
    # to 30 s for playback to start; if it never does, the flag is cleared
    # immediately. Otherwise waits up to 10 min for playback to end, then
    # clears the flag. Started from the media player's on_announcement; a new
    # announcement restarts the tracking (`mode: restart`).
    mode: restart
    then:
      - logger.log: "Announcement lifecycle: waiting for audio to start..."
      - wait_until:
          condition:
            - media_player.is_announcing:
          timeout: 30s
      - if:
          condition:
            - not:
                media_player.is_announcing:
          then:
            - logger.log: "Announcement lifecycle: timed out waiting for audio"
            - lambda: id(announcement_in_progress) = false;
          else:
            - logger.log: "Announcement lifecycle: audio playing, waiting for completion..."
            - wait_until:
                condition:
                  - not:
                      media_player.is_announcing:
                timeout: 10min
            - logger.log: "Announcement lifecycle: complete"
            - lambda: id(announcement_in_progress) = false;

Images, Fonts, Colors

# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: image
# Full-screen illustrations, one per UI state. Every image is resized to
# 320x240 and stored as RGB with an alpha channel. The first seven come from
# the `*_illustration_file` substitutions; the last two are fixed URLs.
image:
  - file: ${error_illustration_file}
    id: casita_error
    resize: 320x240
    type: RGB
    transparency: alpha_channel
  - file: ${idle_illustration_file}
    id: casita_idle
    resize: 320x240
    type: RGB
    transparency: alpha_channel
  - file: ${listening_illustration_file}
    id: casita_listening
    resize: 320x240
    type: RGB
    transparency: alpha_channel
  - file: ${thinking_illustration_file}
    id: casita_thinking
    resize: 320x240
    type: RGB
    transparency: alpha_channel
  - file: ${replying_illustration_file}
    id: casita_replying
    resize: 320x240
    type: RGB
    transparency: alpha_channel
  - file: ${timer_finished_illustration_file}
    id: casita_timer_finished
    resize: 320x240
    type: RGB
    transparency: alpha_channel
  - file: ${loading_illustration_file}
    id: casita_initializing
    resize: 320x240
    type: RGB
    transparency: alpha_channel
  # Connectivity error illustrations (no Wi-Fi / no Home Assistant).
  - file: https://github.com/esphome/wake-word-voice-assistants/raw/main/error_box_illustrations/error-no-wifi.png
    id: error_no_wifi
    resize: 320x240
    type: RGB
    transparency: alpha_channel
  - file: https://github.com/esphome/wake-word-voice-assistants/raw/main/error_box_illustrations/error-no-ha.png
    id: error_no_ha
    resize: 320x240
    type: RGB
    transparency: alpha_channel
# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: font
# Google Fonts faces; family and glyph sets come from the font_family and
# font_glyphsets substitutions.
font:
  # Light italic, 15 px.
  - id: font_request
    size: 15
    file:
      type: gfonts
      family: ${font_family}
      italic: true
      weight: 300
    glyphsets:
      - ${font_glyphsets}
  # Light upright, 15 px.
  - id: font_response
    size: 15
    file:
      type: gfonts
      family: ${font_family}
      weight: 300
    glyphsets:
      - ${font_glyphsets}
  # Light upright, 30 px.
  - id: font_timer
    size: 30
    file:
      type: gfonts
      family: ${font_family}
      weight: 300
    glyphsets:
      - ${font_glyphsets}
  # Bold, 24 px.
  - id: font_ota
    size: 24
    file:
      type: gfonts
      family: ${font_family}
      weight: 700
    glyphsets:
      - ${font_glyphsets}
# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: color
# Named colors referenced from display lambdas via id(...).
color:
  # Page background colors, taken from the
  # *_illustration_background_color substitutions.
  - id: idle_color
    hex: ${idle_illustration_background_color}
  - id: listening_color
    hex: ${listening_illustration_background_color}
  - id: thinking_color
    hex: ${thinking_illustration_background_color}
  - id: replying_color
    hex: ${replying_illustration_background_color}
  - id: loading_color
    hex: ${loading_illustration_background_color}
  - id: error_color
    hex: ${error_illustration_background_color}
  # Fixed colors; hex values are quoted so they are always read as strings.
  - id: active_timer_color
    hex: "26ed3a"
  - id: paused_timer_color
    hex: "3b89e3"
  - id: ota_progress_color
    hex: "ff6600"
# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: spi
# SPI bus definition. Only clock and MOSI are configured (no miso_pin),
# so the bus carries data from the ESP32 to the peripheral only.
spi:
  - id: spi_bus
    clk_pin: 7
    mosi_pin: 6

Display

# file: esphome/examples/esp32-s3-box-3-voice-assistant.yaml
# section: display
# LCD configuration and one page per UI state. Each page lambda draws the
# whole frame for that state.
display:
  - platform: ili9xxx
    id: s3_box_lcd
    model: S3BOX
    invert_colors: false
    data_rate: 40MHz
    cs_pin: 5
    dc_pin: 4
    reset_pin:
      number: 48
      inverted: true
    # No periodic refresh: redraws have to be requested explicitly by other
    # parts of the configuration.
    update_interval: never
    pages:
      # Idle: background, idle illustration, timer timeline.
      - id: idle_page
        lambda: |-
          it.fill(id(idle_color));
          it.image((it.get_width() / 2), (it.get_height() / 2), id(casita_idle), ImageAlign::CENTER);
          id(draw_timer_timeline).execute();

      # Listening: background, listening illustration, timer timeline.
      - id: listening_page
        lambda: |-
          it.fill(id(listening_color));
          it.image((it.get_width() / 2), (it.get_height() / 2), id(casita_listening), ImageAlign::CENTER);
          id(draw_timer_timeline).execute();

      # Thinking: adds a white box at the top holding the request text.
      - id: thinking_page
        lambda: |-
          it.fill(id(thinking_color));
          it.image((it.get_width() / 2), (it.get_height() / 2), id(casita_thinking), ImageAlign::CENTER);
          it.filled_rectangle(20, 20, 280, 30, Color::WHITE);
          it.rectangle(20, 20, 280, 30, Color::BLACK);
          it.printf(30, 25, id(font_request), Color::BLACK, "%s", id(text_request).state.c_str());
          id(draw_timer_timeline).execute();

      # Replying: request text box at the top, response text box at the bottom.
      - id: replying_page
        lambda: |-
          it.fill(id(replying_color));
          it.image((it.get_width() / 2), (it.get_height() / 2), id(casita_replying), ImageAlign::CENTER);
          it.filled_rectangle(20, 20, 280, 30, Color::WHITE);
          it.rectangle(20, 20, 280, 30, Color::BLACK);
          it.filled_rectangle(20, 190, 280, 30, Color::WHITE);
          it.rectangle(20, 190, 280, 30, Color::BLACK);
          it.printf(30, 25, id(font_request), Color::BLACK, "%s", id(text_request).state.c_str());
          it.printf(30, 195, id(font_response), Color::BLACK, "%s", id(text_response).state.c_str());
          id(draw_timer_timeline).execute();

      # Timer finished: idle background with the timer-finished illustration.
      - id: timer_finished_page
        lambda: |-
          it.fill(id(idle_color));
          it.image((it.get_width() / 2), (it.get_height() / 2), id(casita_timer_finished), ImageAlign::CENTER);

      # Generic error.
      - id: error_page
        lambda: |-
          it.fill(id(error_color));
          it.image((it.get_width() / 2), (it.get_height() / 2), id(casita_error), ImageAlign::CENTER);

      # Home Assistant connection error: illustration only, no background fill.
      - id: no_ha_page
        lambda: |-
          it.image((it.get_width() / 2), (it.get_height() / 2), id(error_no_ha), ImageAlign::CENTER);

      # Wi-Fi connection error: illustration only, no background fill.
      - id: no_wifi_page
        lambda: |-
          it.image((it.get_width() / 2), (it.get_height() / 2), id(error_no_wifi), ImageAlign::CENTER);

      # Start-up / loading.
      - id: initializing_page
        lambda: |-
          it.fill(id(loading_color));
          it.image((it.get_width() / 2), (it.get_height() / 2), id(casita_initializing), ImageAlign::CENTER);

      # Muted: black screen with the timer timeline and active timer widget.
      - id: muted_page
        lambda: |-
          it.fill(Color::BLACK);
          id(draw_timer_timeline).execute();
          id(draw_active_timer_widget).execute();

      # OTA update: progress bar plus a status line. A negative ota_progress
      # is rendered as a failed update.
      - id: ota_page
        lambda: |-
          it.fill(id(error_color));
          it.image((it.get_width() / 2), (it.get_height() / 2), id(casita_error), ImageAlign::CENTER);
          // Progress bar frame: 280x30 box at (20, 200).
          it.filled_rectangle(20, 200, 280, 30, Color::WHITE);
          it.rectangle(20, 200, 280, 30, Color::BLACK);
          // The fill sits 2 px inside the frame, so it is at most 276 px wide.
          int progress_width = (id(ota_progress) * 276) / 100;
          if (progress_width > 276) {
            // Clamp so a value above 100 cannot draw past the frame.
            progress_width = 276;
          }
          if (progress_width > 0) {
            it.filled_rectangle(22, 202, progress_width, 26, id(ota_progress_color));
          }
          if (id(ota_progress) >= 0) {
            it.printf(160, 185, id(font_ota), Color::WHITE, TextAlign::BOTTOM_CENTER, "Upgrading: %d%%", id(ota_progress));
          } else {
            it.printf(160, 185, id(font_ota), Color::WHITE, TextAlign::BOTTOM_CENTER, "Update Failed!");
          }

Testing

After flashing:

  1. Say "Okay Nabu" followed by "Set a timer for 1 minute"
  2. Verify the timer progress bar appears at the bottom of the display
  3. Wait for timer to complete or say "Cancel the timer"
  4. If you let the timer run to completion, verify the alarm plays and can be dismissed by touching the screen