STT append spoken text (#4216)

* STT append spoken text
* remove default args from calls
* fix bug where message and input were out of sync
* update diffs
2025-07-31 12:23:36 -07:00 · 2025-07-31 12:23:36 -07:00 · 755ef4bb80
commit 755ef4bb80
parent 4b7932f9e1
3 changed files with 60 additions and 15 deletions
--- a/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/SpeechToText/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/SpeechToText/index.jsx
@ -1,4 +1,4 @@
-import { useEffect, useCallback } from "react";
+import { useEffect, useCallback, useRef } from "react";
 import { Microphone } from "@phosphor-icons/react";
 import { Tooltip } from "react-tooltip";
 import _regeneratorRuntime from "regenerator-runtime";
@ -19,6 +19,7 @@ const SILENCE_INTERVAL = 3_200; // wait in seconds of silence before closing.
 * @returns {React.ReactElement} The SpeechToText component
 */
 export default function SpeechToText({ sendCommand }) {
+  const previousTranscriptRef = useRef("");
  const {
    transcript,
    listening,
@ -39,6 +40,7 @@ export default function SpeechToText({ sendCommand }) {
    }

    resetTranscript();
+    previousTranscriptRef.current = "";
    SpeechRecognition.startListening({
      continuous: browserSupportsContinuousListening,
      language: window?.navigator?.language ?? "en-US",
@ -47,14 +49,19 @@ export default function SpeechToText({ sendCommand }) {

  function endSTTSession() {
    SpeechRecognition.stopListening();
-    if (transcript.length > 0) {
+
+    // If auto submit is enabled, send an empty string to the chat window to submit the current transcript
+    // since every chunk of text should have been streamed to the chat window by now.
+    if (Appearance.get("autoSubmitSttInput")) {
      sendCommand({
-        text: transcript,
-        autoSubmit: Appearance.get("autoSubmitSttInput"),
+        text: "",
+        autoSubmit: true,
+        writeMode: "append",
      });
    }

    resetTranscript();
+    previousTranscriptRef.current = "";
    clearTimeout(timeout);
  }

@ -95,7 +102,15 @@ export default function SpeechToText({ sendCommand }) {

  useEffect(() => {
    if (transcript?.length > 0 && listening) {
-      sendCommand({ text: transcript });
+      const previousTranscript = previousTranscriptRef.current;
+      const newContent = transcript.slice(previousTranscript.length);
+
+      // Stream only the newly transcribed portion: `transcript` is an accumulating
+      // string, not just the latest chunk, so we slice off what was already sent.
+      if (newContent.length > 0)
+        sendCommand({ text: newContent, writeMode: "append" });
+
+      previousTranscriptRef.current = transcript;
      clearTimeout(timeout);
      timeout = setTimeout(() => {
        endSTTSession();
--- a/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/PromptInput/index.jsx
@ -25,6 +25,7 @@ import useTextSize from "@/hooks/useTextSize";
 import { useTranslation } from "react-i18next";
 import Appearance from "@/models/appearance";

+export const PROMPT_INPUT_ID = "primary-prompt-input";
 export const PROMPT_INPUT_EVENT = "set_prompt_input";
 const MAX_EDIT_STACK_SIZE = 100;

@ -51,10 +52,12 @@ export default function PromptInput({
   * To prevent too many re-renders we remotely listen for updates from the parent
   * via an event cycle. Otherwise, using message as a prop leads to a re-render every
   * change on the input.
-   * @param {Event} e
+   * @param {{detail: {messageContent: string, writeMode: 'replace' | 'append'}}} e
   */
  function handlePromptUpdate(e) {
-    setPromptInput(e?.detail ?? "");
+    const { messageContent, writeMode = "replace" } = e?.detail ?? {};
+    if (writeMode === "append") setPromptInput((prev) => prev + messageContent);
+    else setPromptInput(messageContent ?? "");
  }

  useEffect(() => {
@ -261,6 +264,7 @@ export default function PromptInput({
            <AttachmentManager attachments={attachments} />
            <div className="flex items-center border-b border-theme-chat-input-border mx-3">
              <textarea
+                id={PROMPT_INPUT_ID}
                ref={textareaRef}
                onChange={handleChange}
                onKeyDown={captureEnterOrUndo}
--- a/frontend/src/components/WorkspaceChat/ChatContainer/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/index.jsx
@ -1,7 +1,10 @@
 import { useState, useEffect, useContext } from "react";
 import ChatHistory from "./ChatHistory";
 import { CLEAR_ATTACHMENTS_EVENT, DndUploaderContext } from "./DnDWrapper";
-import PromptInput, { PROMPT_INPUT_EVENT } from "./PromptInput";
+import PromptInput, {
+  PROMPT_INPUT_EVENT,
+  PROMPT_INPUT_ID,
+} from "./PromptInput";
 import Workspace from "@/models/workspace";
 import handleChat, { ABORT_STREAM_EVENT } from "@/utils/chat";
 import { isMobile } from "react-device-detect";
@ -38,12 +41,21 @@ export default function ChatContainer({ workspace, knownHistory = [] }) {
    clearTranscriptOnListen: true,
  });

-  // Emit an update to the state of the prompt input without directly
-  // passing a prop in so that it does not re-render constantly.
-  function setMessageEmit(messageContent = "") {
-    setMessage(messageContent);
+  /**
+   * Emit an update to the state of the prompt input without directly
+   * passing a prop in so that it does not re-render constantly.
+   * @param {string} messageContent - The message content to set
+   * @param {'replace' | 'append'} writeMode - Replace current text or append to existing text (default: replace)
+   */
+  function setMessageEmit(messageContent = "", writeMode = "replace") {
+    if (writeMode === "append") setMessage((prev) => prev + messageContent);
+    else setMessage(messageContent ?? "");
+
+    // Push the update to the PromptInput component (same logic as above to keep in sync)
    window.dispatchEvent(
-      new CustomEvent(PROMPT_INPUT_EVENT, { detail: messageContent })
+      new CustomEvent(PROMPT_INPUT_EVENT, {
+        detail: { messageContent, writeMode },
+      })
    );
  }

@ -102,6 +114,7 @@ export default function ChatContainer({ workspace, knownHistory = [] }) {
   * @param {boolean} options.autoSubmit - Determines if the text should be sent immediately or if it should be added to the message state (default: false)
   * @param {Object[]} options.history - The history of the chat prior to this message for overriding the current chat history
   * @param {Object[import("./DnDWrapper").Attachment]} options.attachments - The attachments to send to the LLM for this message
+   * @param {'replace' | 'append'} options.writeMode - Replace current text or append to existing text (default: replace)
   * @returns {void}
   */
  const sendCommand = async ({
@ -109,13 +122,26 @@ export default function ChatContainer({ workspace, knownHistory = [] }) {
    autoSubmit = false,
    history = [],
    attachments = [],
+    writeMode = "replace",
  } = {}) => {
-    if (!text || text === "") return false;
+    // If we are not auto-submitting, we can just emit the text to the prompt input.
    if (!autoSubmit) {
-      setMessageEmit(text);
+      setMessageEmit(text, writeMode);
      return;
    }

+    // If we are auto-submitting in append mode,
+    // then we need to update text with whatever is in the prompt input + the text we are sending.
+    // @note: `message` will not work here since it is not updated yet.
+    // If text is still empty after this, then we should just return.
+    if (writeMode === "append") {
+      const currentText = document.getElementById(PROMPT_INPUT_ID)?.value;
+      text = currentText + text;
+    }
+
+    if (!text || text === "") return false;
+    // If we are auto-submitting,
+    // then we can replace the current text since this is not accumulating.
    let prevChatHistory;
    if (history.length > 0) {
      // use pre-determined history chain.