fix(tts): strip Markdown syntax before sending text to TTS engines (#5560)

* fix(tts): strip Markdown syntax before sending text to TTS engines Chat responses are rendered as Markdown but the TTS components piped the raw response into Piper / the browser's `SpeechSynthesis` API. The synthesizer reads every special character literally — `**bold**` becomes "asterisk asterisk bold asterisk asterisk", `# Heading` becomes "pound heading", code fences are read backtick-by-backtick, and bullet lists become "hyphen item". The result is unintelligible whenever the assistant includes any formatting, which is most of the time. This commit adds a small `messageToSpeech` helper that converts a Markdown chat message into plain text suitable for TTS: - fenced code blocks and images are dropped (nothing useful to read) - inline code and link labels keep their text content - emphasis markers, headings, blockquote markers, list markers, and horizontal rules are stripped while preserving the underlying words - HTML tags are removed but their text content kept - table pipes become commas so rows read naturally The helper is regex-based — no new dependency — and is wired into both the native (`SpeechSynthesis`) and Piper TTS components in `WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton`. Closes #5557. --- Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
2026-05-02 01:14:19 +05:30 · 2026-05-02 01:14:19 +05:30 · 7d884c7863
commit 7d884c7863
parent 667db6d13b
3 changed files with 96 additions and 2 deletions
--- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/native.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/native.jsx
@ -1,5 +1,6 @@
 import React, { useEffect, useState } from "react";
 import { SpeakerHigh, PauseCircle } from "@phosphor-icons/react";
+import messageToSpeech from "@/utils/chat/messageToSpeech";

 export default function NativeTTSMessage({ chatId, message }) {
  const [speaking, setSpeaking] = useState(false);
@ -25,7 +26,7 @@ export default function NativeTTSMessage({ chatId, message }) {
    }

    if (window.speechSynthesis.speaking && !speaking) return;
-    const utterance = new SpeechSynthesisUtterance(message);
+    const utterance = new SpeechSynthesisUtterance(messageToSpeech(message));
    utterance.addEventListener("end", endSpeechUtterance);
    window.speechSynthesis.speak(utterance);
    setSpeaking(true);
--- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/piperTTS.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton/piperTTS.jsx
@ -1,6 +1,7 @@
 import { useEffect, useState, useRef } from "react";
 import { SpeakerHigh, PauseCircle, CircleNotch } from "@phosphor-icons/react";
 import PiperTTSClient from "@/utils/piperTTS";
+import messageToSpeech from "@/utils/chat/messageToSpeech";

 export default function PiperTTS({ chatId, voiceId = null, message }) {
  const playerRef = useRef(null);
@ -19,7 +20,9 @@ export default function PiperTTS({ chatId, voiceId = null, message }) {
      if (!audioSrc) {
        setLoading(true);
        const client = new PiperTTSClient({ voiceId });
-        const blobUrl = await client.getAudioBlobForText(message);
+        const blobUrl = await client.getAudioBlobForText(
+          messageToSpeech(message)
+        );
        setAudioSrc(blobUrl);
        setLoading(false);
      } else {
--- a/frontend/src/utils/chat/messageToSpeech.js
+++ b/frontend/src/utils/chat/messageToSpeech.js
@ -0,0 +1,90 @@
+/**
+ * Convert a Markdown chat message to plain text suitable for text-to-speech.
+ *
+ * Chat history is rendered as Markdown. Handed directly to a TTS engine
+ * (Piper, the browser's `SpeechSynthesis`, etc.) the markup is read aloud
+ * literally: `**bold**` becomes "asterisk asterisk bold asterisk asterisk",
+ * bullets become "hyphen item", links become "open bracket label ...".
+ *
+ * This helper strips the most common Markdown syntax while preserving the
+ * spoken content. It is intentionally regex-based (no extra dependency), so
+ * it is a best-effort approximation rather than a full Markdown parser.
+ * AsyncTTS does not use this helper as the cloud based TTS engines do not
+ * need this cleanup and seem to handle the Markdown syntax just fine.
+ *
+ * @param {string} message - The raw markdown message body.
+ * @returns {string} Plain text for TTS; "" for empty or non-string input.
+ */
+export default function messageToSpeech(message = "") {
+  if (typeof message !== "string" || message.length === 0) return "";
+
+  let text = message;
+
+  // Fenced code blocks: drop entirely — code read aloud is unintelligible.
+  text = text.replace(/```[\s\S]*?```/g, " ");
+  text = text.replace(/~~~[\s\S]*?~~~/g, " ");
+
+  // Inline code: strip the backticks but keep the text inside.
+  text = text.replace(/`([^`]*)`/g, "$1");
+
+  // Images are dropped; links keep their visible label and lose the URL.
+  text = text.replace(/!\[[^\]]*\]\([^)]*\)/g, " ");
+  text = text.replace(/\[([^\]]+)\]\([^)]*\)/g, "$1");
+
+  // Reference-style link definitions: drop the whole line.
+  text = text.replace(/^\s*\[[^\]]+\]:\s*\S+.*$/gm, "");
+
+  /*
+   * Blockquote markers (`>`), every nesting level. Stripped first so quoted
+   * headings, rules and list items reach the line-anchored rules below.
+   */
+  text = text.replace(/^\s{0,3}(?:>[ \t]?)+/gm, "");
+
+  // Heading markers (`#`, `##`, ...): keep the heading text only.
+  text = text.replace(/^\s{0,3}#{1,6}\s+/gm, "");
+
+  /*
+   * Horizontal rules: drop entirely. Runs before the list rules, which would
+   * otherwise eat the first `-`/`*` of a spaced rule such as `- - -`.
+   */
+  text = text.replace(/^\s{0,3}(?:[-*_]\s*){3,}\s*$/gm, " ");
+
+  // List markers (`-`, `*`, `+`, `1.`, `12)`), then task-list checkboxes.
+  text = text.replace(/^\s*[-*+]\s+/gm, "");
+  text = text.replace(/^\s*\d+[.)]\s+/gm, "");
+  text = text.replace(/^\s*\[[ xX]\]\s+/gm, "");
+
+  /*
+   * Bold / italic / strikethrough, longest markers first. `*` and `_` have
+   * separate rules so either may wrap text containing the other. A single
+   * `*` must hug its text (`2 * 3 * 4` survives) and `_` only counts at word
+   * boundaries (`snake_case_name` survives).
+   */
+  const emphasisRules = [
+    [/\*\*\*([^*]+)\*\*\*/g, "$1"],
+    [/\*\*([^*]+)\*\*/g, "$1"],
+    [/\*([^\s*](?:[^*\n]*[^\s*])?)\*/g, "$1"],
+    [/(^|\W)___([^_]+)___(?!\w)/g, "$1$2"],
+    [/(^|\W)__([^_]+)__(?!\w)/g, "$1$2"],
+    [/(^|\W)_([^_\n]+)_(?!\w)/g, "$1$2"],
+    [/~~([^~]+)~~/g, "$1"],
+  ];
+  // Two passes unwrap one level of nesting, e.g. `**bold *both* bold**`.
+  for (let pass = 0; pass < 2; pass += 1) {
+    for (const [pattern, replacement] of emphasisRules) {
+      text = text.replace(pattern, replacement);
+    }
+  }
+
+  // Tables: drop the alignment row (`---|---|---`); pipes become commas.
+  text = text.replace(/^\s*\|?\s*[:\-\s|]+\|[:\-\s|]+\s*$/gm, "");
+  text = text.replace(/\|/g, ", ");
+
+  // HTML comments and tags: strip markup, keep text. A tag name must start
+  // with a letter so comparisons like `a < b and c > d` are left intact.
+  text = text.replace(/<!--[\s\S]*?-->/g, " ");
+  text = text.replace(/<\/?[A-Za-z][^>]*>/g, " ");
+
+  // Collapse repeated whitespace (newlines and spaces) to single spaces.
+  return text.replace(/\s+/g, " ").trim();
+}