fix(tts): strip Markdown syntax before sending text to TTS engines (#5560)

* fix(tts): strip Markdown syntax before sending text to TTS engines

Chat responses are rendered as Markdown but the TTS components piped the
raw response into Piper / the browser's `SpeechSynthesis` API. The
synthesizer reads every special character literally — `**bold**` becomes
"asterisk asterisk bold asterisk asterisk", `# Heading` becomes "pound
heading", code fences are read backtick-by-backtick, and bullet lists
become "hyphen item". The result is unintelligible whenever the assistant
includes any formatting, which is most of the time.

This commit adds a small `messageToSpeech` helper that converts a
Markdown chat message into plain text suitable for TTS:

- fenced code blocks and images are dropped (nothing useful to read)
- inline code and link labels keep their text content
- emphasis markers, headings, blockquote markers, list markers, and
  horizontal rules are stripped while preserving the underlying words
- HTML tags are removed but their text content kept
- table pipes become commas so rows read naturally

The helper is regex-based — no new dependency — and is wired into both
the native (`SpeechSynthesis`) and Piper TTS components in
`WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton`.

Closes #5557.
---
Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
Gopal Bagaswar 2026-05-02 01:14:19 +05:30 committed by GitHub
parent 667db6d13b
commit 7d884c7863
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 96 additions and 2 deletions

View File

@ -1,5 +1,6 @@
import React, { useEffect, useState } from "react"; import React, { useEffect, useState } from "react";
import { SpeakerHigh, PauseCircle } from "@phosphor-icons/react"; import { SpeakerHigh, PauseCircle } from "@phosphor-icons/react";
import messageToSpeech from "@/utils/chat/messageToSpeech";
export default function NativeTTSMessage({ chatId, message }) { export default function NativeTTSMessage({ chatId, message }) {
const [speaking, setSpeaking] = useState(false); const [speaking, setSpeaking] = useState(false);
@ -25,7 +26,7 @@ export default function NativeTTSMessage({ chatId, message }) {
} }
if (window.speechSynthesis.speaking && !speaking) return; if (window.speechSynthesis.speaking && !speaking) return;
const utterance = new SpeechSynthesisUtterance(message); const utterance = new SpeechSynthesisUtterance(messageToSpeech(message));
utterance.addEventListener("end", endSpeechUtterance); utterance.addEventListener("end", endSpeechUtterance);
window.speechSynthesis.speak(utterance); window.speechSynthesis.speak(utterance);
setSpeaking(true); setSpeaking(true);

View File

@ -1,6 +1,7 @@
import { useEffect, useState, useRef } from "react"; import { useEffect, useState, useRef } from "react";
import { SpeakerHigh, PauseCircle, CircleNotch } from "@phosphor-icons/react"; import { SpeakerHigh, PauseCircle, CircleNotch } from "@phosphor-icons/react";
import PiperTTSClient from "@/utils/piperTTS"; import PiperTTSClient from "@/utils/piperTTS";
import messageToSpeech from "@/utils/chat/messageToSpeech";
export default function PiperTTS({ chatId, voiceId = null, message }) { export default function PiperTTS({ chatId, voiceId = null, message }) {
const playerRef = useRef(null); const playerRef = useRef(null);
@ -19,7 +20,9 @@ export default function PiperTTS({ chatId, voiceId = null, message }) {
if (!audioSrc) { if (!audioSrc) {
setLoading(true); setLoading(true);
const client = new PiperTTSClient({ voiceId }); const client = new PiperTTSClient({ voiceId });
const blobUrl = await client.getAudioBlobForText(message); const blobUrl = await client.getAudioBlobForText(
messageToSpeech(message)
);
setAudioSrc(blobUrl); setAudioSrc(blobUrl);
setLoading(false); setLoading(false);
} else { } else {

View File

@ -0,0 +1,90 @@
/**
 * Remove Markdown emphasis markers (`*`, `_`, `~~`) while keeping the words
 * they wrap.
 *
 * Asterisk and underscore markers are handled independently so that emphasis
 * written with one character may contain the other (`**snake_case**`).
 * Underscore markers are only treated as emphasis when they sit on a word
 * boundary, so identifiers such as `snake_case_name` are left intact, and a
 * single `*` must hug its content, so spaced arithmetic such as `2 * 3 * 4`
 * is left intact as well.
 *
 * The replacements are repeated until the text stops changing so nested
 * emphasis (`**bold and *italic* text**`) is fully unwrapped. Every successful
 * replacement removes at least two marker characters, so the loop always
 * terminates.
 *
 * @param {string} input - Text that may contain emphasis markers.
 * @returns {string} The text with emphasis markers removed.
 */
function stripEmphasis(input) {
  let text = input;
  let previous;
  do {
    previous = text;
    text = text
      // Asterisk emphasis, longest marker first so `**` is not read as `*` + `*`.
      .replace(/\*\*\*([^*]+)\*\*\*/g, "$1")
      .replace(/\*\*([^*]+)\*\*/g, "$1")
      // Single `*`: content must start and end with a non-space character.
      .replace(/\*([^\s*](?:[^*\n]*[^\s*])?)\*/g, "$1")
      // Underscore emphasis: opening marker must not follow a word character
      // and the closing marker must not precede one. `$1` restores the
      // boundary character consumed in front of the opening marker.
      .replace(/(^|\W)___([^_]+)___(?!\w)/g, "$1$2")
      .replace(/(^|\W)__([^_]+)__(?!\w)/g, "$1$2")
      .replace(/(^|\W)_([^\s_](?:[^_\n]*[^\s_])?)_(?!\w)/g, "$1$2")
      // Strikethrough.
      .replace(/~~([^~]+)~~/g, "$1");
  } while (text !== previous);
  return text;
}

/**
 * Convert a chat message string to plain text suitable for text-to-speech.
 *
 * The chat history we render is Markdown. When that text is handed directly
 * to a TTS engine (Piper, the browser's `SpeechSynthesis`, etc.) the engine
 * reads every special character literally:
 * - `**bold**` becomes "asterisk asterisk bold asterisk asterisk"
 * - `_italic_` becomes "underscore italic underscore"
 * - inline code/code fences are read verbatim with backticks
 * - bullet lists become "hyphen item" / "asterisk item"
 * - links become "open bracket label close bracket open paren url close paren"
 *
 * This helper strips the most common Markdown syntax while preserving the
 * underlying spoken content. It is intentionally regex-based (no extra
 * dependency) so it can be safely used both on the client and inside the
 * native browser TTS path, which has no access to the server's markdown-it
 * tokenizer. AsyncTTS does not use this helper, as the cloud-based TTS engines
 * do not need this cleanup and seem to handle the Markdown syntax just fine.
 *
 * Non-string or empty input yields an empty string.
 *
 * @param {string} message - The raw markdown message body.
 * @returns {string} A plain-text string suitable for TTS, with all runs of
 *   whitespace collapsed to single spaces and the ends trimmed.
 */
export default function messageToSpeech(message = "") {
  if (typeof message !== "string" || message.length === 0) return "";
  let text = message;

  /*
   * Remove fenced code blocks entirely - reading code aloud is rarely
   * useful and produces a long stream of unintelligible characters.
   */
  text = text.replace(/```[\s\S]*?```/g, " ");
  text = text.replace(/~~~[\s\S]*?~~~/g, " ");

  // Strip inline code wrappers but keep the text inside.
  text = text.replace(/`([^`]*)`/g, "$1");

  // Images: drop entirely - there's nothing useful to speak.
  text = text.replace(/!\[[^\]]*\]\([^)]*\)/g, " ");

  /*
   * Links: keep the visible label, drop the URL.
   * `[label](url)` -> `label`
   */
  text = text.replace(/\[([^\]]+)\]\([^)]*\)/g, "$1");

  // Reference-style link definitions: drop the URL line entirely.
  text = text.replace(/^\s*\[[^\]]+\]:\s*\S+.*$/gm, "");

  /*
   * Blockquote markers (`>`): drop the leading marker(s). This runs before
   * heading/list handling so quoted headings and lists (`> # Title`) are
   * still recognised, and it removes every nesting level (`> > text`).
   */
  text = text.replace(/^[ \t]{0,3}(?:>[ \t]?)+/gm, "");

  // Heading markers (`#`, `##`, ...): keep the heading text only.
  text = text.replace(/^\s{0,3}#{1,6}\s+/gm, "");

  /*
   * Horizontal rules: drop entirely. This must run before list markers are
   * stripped, otherwise a spaced rule such as `- - -` loses its first marker
   * and is no longer recognisable as a rule.
   */
  text = text.replace(/^[ \t]{0,3}(?:[-*_][ \t]*){3,}$/gm, " ");

  /*
   * Unordered list markers (`-`, `*`, `+`) and ordered list markers
   * (`1.`, `12)`): drop the marker, keep the item text.
   */
  text = text.replace(/^\s*[-*+]\s+/gm, "");
  text = text.replace(/^\s*\d+[.)]\s+/gm, "");

  // Bold / italic / strikethrough emphasis (including nested emphasis).
  text = stripEmphasis(text);

  /*
   * Tables: convert pipe separators to commas so rows read naturally,
   * and drop the alignment row (`---|---|---`).
   */
  text = text.replace(/^\s*\|?\s*[:\-\s|]+\|[:\-\s|]+\s*$/gm, "");
  text = text.replace(/\|/g, ", ");

  /*
   * HTML: strip comments and tags but keep the text between tags. A tag must
   * start with a letter right after `<` or `</`, so comparisons written in
   * prose (`a < b and c > d`) are not mistaken for markup.
   */
  text = text.replace(/<!--[\s\S]*?-->/g, " ");
  text = text.replace(/<\/?[A-Za-z][^<>]*>/g, " ");

  // Collapse repeated whitespace (newlines and spaces) to single spaces.
  return text.replace(/\s+/g, " ").trim();
}