fix(tts): strip Markdown syntax before sending text to TTS engines (#5560)
* fix(tts): strip Markdown syntax before sending text to TTS engines Chat responses are rendered as Markdown but the TTS components piped the raw response into Piper / the browser's `SpeechSynthesis` API. The synthesizer reads every special character literally — `**bold**` becomes "asterisk asterisk bold asterisk asterisk", `# Heading` becomes "pound heading", code fences are read backtick-by-backtick, and bullet lists become "hyphen item". The result is unintelligible whenever the assistant includes any formatting, which is most of the time. This commit adds a small `messageToSpeech` helper that converts a Markdown chat message into plain text suitable for TTS: - fenced code blocks and images are dropped (nothing useful to read) - inline code and link labels keep their text content - emphasis markers, headings, blockquote markers, list markers, and horizontal rules are stripped while preserving the underlying words - HTML tags are removed but their text content kept - table pipes become commas so rows read naturally The helper is regex-based — no new dependency — and is wired into both the native (`SpeechSynthesis`) and Piper TTS components in `WorkspaceChat/ChatContainer/ChatHistory/HistoricalMessage/Actions/TTSButton`. Closes #5557. --- Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
parent
667db6d13b
commit
7d884c7863
@ -1,5 +1,6 @@
|
||||
import React, { useEffect, useState } from "react";
|
||||
import { SpeakerHigh, PauseCircle } from "@phosphor-icons/react";
|
||||
import messageToSpeech from "@/utils/chat/messageToSpeech";
|
||||
|
||||
export default function NativeTTSMessage({ chatId, message }) {
|
||||
const [speaking, setSpeaking] = useState(false);
|
||||
@ -25,7 +26,7 @@ export default function NativeTTSMessage({ chatId, message }) {
|
||||
}
|
||||
|
||||
if (window.speechSynthesis.speaking && !speaking) return;
|
||||
const utterance = new SpeechSynthesisUtterance(message);
|
||||
const utterance = new SpeechSynthesisUtterance(messageToSpeech(message));
|
||||
utterance.addEventListener("end", endSpeechUtterance);
|
||||
window.speechSynthesis.speak(utterance);
|
||||
setSpeaking(true);
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
import { useEffect, useState, useRef } from "react";
|
||||
import { SpeakerHigh, PauseCircle, CircleNotch } from "@phosphor-icons/react";
|
||||
import PiperTTSClient from "@/utils/piperTTS";
|
||||
import messageToSpeech from "@/utils/chat/messageToSpeech";
|
||||
|
||||
export default function PiperTTS({ chatId, voiceId = null, message }) {
|
||||
const playerRef = useRef(null);
|
||||
@ -19,7 +20,9 @@ export default function PiperTTS({ chatId, voiceId = null, message }) {
|
||||
if (!audioSrc) {
|
||||
setLoading(true);
|
||||
const client = new PiperTTSClient({ voiceId });
|
||||
const blobUrl = await client.getAudioBlobForText(message);
|
||||
const blobUrl = await client.getAudioBlobForText(
|
||||
messageToSpeech(message)
|
||||
);
|
||||
setAudioSrc(blobUrl);
|
||||
setLoading(false);
|
||||
} else {
|
||||
|
||||
90
frontend/src/utils/chat/messageToSpeech.js
Normal file
90
frontend/src/utils/chat/messageToSpeech.js
Normal file
@ -0,0 +1,90 @@
|
||||
/*
 * Upper bound on emphasis-stripping passes. Each pass peels one level of
 * nested emphasis; the cap keeps pathological input from looping for long.
 */
const MAX_EMPHASIS_PASSES = 5;

/**
 * Remove bold / italic / strikethrough markers while keeping the wrapped text.
 *
 * A single pass cannot unwrap nested emphasis: in
 * `**bold with _nested_ italic**` the inner `_` prevents the `**` pattern
 * from matching until the inner pair has been removed. The pass is therefore
 * repeated until the text stops changing (or the pass cap is reached).
 *
 * @param {string} input - Text that may still contain emphasis markers.
 * @returns {string} The text with paired emphasis markers removed.
 */
function stripEmphasisMarkers(input) {
  let current = input;
  for (let pass = 0; pass < MAX_EMPHASIS_PASSES; pass += 1) {
    // Order matters: longer markers (`***`, `___`, `**`, `__`) go before the
    // single-character ones so a marker is never half-consumed.
    const stripped = current
      .replace(/(\*\*\*|___)([^*_]+)\1/g, "$2")
      .replace(/(\*\*|__)([^*_]+)\1/g, "$2")
      .replace(/(\*|_)([^*_\n]+)\1/g, "$2")
      .replace(/~~([^~]+)~~/g, "$1");
    if (stripped === current) break;
    current = stripped;
  }
  return current;
}

/**
 * Convert a chat message string to plain text suitable for text-to-speech.
 *
 * The chat history we render is Markdown — when we hand that text directly to
 * a TTS engine (Piper, the browser's `SpeechSynthesis`, etc.) the engine reads
 * every special character literally:
 *   - `**bold**` becomes "asterisk asterisk bold asterisk asterisk"
 *   - `_italic_` becomes "underscore italic underscore"
 *   - inline code/code fences are read verbatim with backticks
 *   - bullet lists become "hyphen item" / "asterisk item"
 *   - links become "open bracket label close bracket open paren url close paren"
 *
 * This helper strips the most common Markdown syntax while preserving the
 * underlying spoken content. It is intentionally regex-based (no extra
 * dependency) so it can be safely used both on the client and inside the
 * native browser TTS path which has no access to the server's markdown-it
 * tokenizer. AsyncTTS does not use this helper as the cloud based TTS engines
 * do not need this cleanup and seem to handle the Markdown syntax just fine.
 *
 * Anything that is not a non-empty string yields an empty string.
 *
 * @param {string} message - The raw markdown message body.
 * @returns {string} A plain-text, single-line string suitable for TTS.
 */
export default function messageToSpeech(message = "") {
  if (typeof message !== "string" || message.length === 0) return "";

  let text = message;

  /*
   * Remove fenced code blocks entirely — reading code aloud is rarely
   * useful and produces a long stream of unintelligible characters.
   */
  text = text.replace(/```[\s\S]*?```/g, " ");
  text = text.replace(/~~~[\s\S]*?~~~/g, " ");

  // Strip inline code wrappers but keep the text inside.
  text = text.replace(/`([^`]*)`/g, "$1");

  // Images: drop entirely — there's nothing useful to speak.
  text = text.replace(/!\[[^\]]*\]\([^)]*\)/g, " ");

  /*
   * Links: keep the visible label, drop the URL.
   *   `[label](url)` -> `label`
   */
  text = text.replace(/\[([^\]]+)\]\([^)]*\)/g, "$1");

  // Reference-style link definitions: drop the URL line entirely.
  text = text.replace(/^\s*\[[^\]]+\]:\s*\S+.*$/gm, "");

  /*
   * Horizontal rules: drop entirely. This must run BEFORE list-marker
   * stripping — otherwise a spaced rule such as `- - -` loses its first
   * `- ` as a "list marker" and the remaining `- -` is read aloud.
   */
  text = text.replace(/^\s{0,3}(?:[-*_]\s*){3,}\s*$/gm, " ");

  // Heading markers (`#`, `##`, ...): keep the heading text only.
  text = text.replace(/^\s{0,3}#{1,6}\s+/gm, "");

  // Blockquote markers (`>`): drop the leading marker.
  text = text.replace(/^\s{0,3}>\s?/gm, "");

  /*
   * Unordered list markers (`-`, `*`, `+`) and ordered list markers
   * (`1.`, `12)`): drop the marker, keep the item text.
   */
  text = text.replace(/^\s*[-*+]\s+/gm, "");
  text = text.replace(/^\s*\d+[.)]\s+/gm, "");

  // Bold / italic / strikethrough, including nested combinations.
  text = stripEmphasisMarkers(text);

  /*
   * Tables: convert pipe separators to commas so rows read naturally,
   * and drop the alignment row (`---|---|---`).
   */
  text = text.replace(/^\s*\|?\s*[:\-\s|]+\|[:\-\s|]+\s*$/gm, "");
  text = text.replace(/\|/g, ", ");

  /*
   * HTML tags and comments: strip but keep the text content between tags.
   * A tag must start with a letter (optionally preceded by `/` or `!`) so
   * that plain comparisons such as `a < b and c > d` are left intact.
   */
  text = text.replace(/<!--[\s\S]*?-->|<[\/!]?[A-Za-z][^<>]*>/g, " ");

  // Collapse repeated whitespace (newlines and spaces) to single spaces.
  text = text.replace(/\s+/g, " ").trim();

  return text;
}
|
||||
Loading…
Reference in New Issue
Block a user