Export image support for JSON and JSONL (#4359)

* export image support for json and jsonl

* add tests and cleanup functionality

* add test for convertTo prepare function

* comment

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>
This commit is contained in:
Sean Hatfield 2025-09-03 10:30:57 -07:00 committed by GitHub
parent bb7d65f0eb
commit e31465a639
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 294 additions and 8 deletions

View File

@ -0,0 +1,238 @@
/* eslint-env jest */
const { prepareChatsForExport } = require("../../../utils/helpers/chat/convertTo");
// Mock the database models
jest.mock("../../../models/workspaceChats");
jest.mock("../../../models/embedChats");
const { WorkspaceChats } = require("../../../models/workspaceChats");
const { EmbedChats } = require("../../../models/embedChats");
/**
 * Builds a workspace chat record for the export tests.
 *
 * The record carries `workspaceId` (and a matching `workspace.id`) because the
 * JSONL export groups messages by `chat.workspaceId`; without it every chat
 * would be grouped under the coerced key "undefined".
 *
 * @param {boolean} [withImages=false] - When true, the serialized response
 *   includes two image attachments whose `contentString` is already a data URL.
 * @returns {Object} A chat record whose `response` field is a JSON string.
 */
const mockChat = (withImages = false) => {
  const attachments = withImages
    ? [
        { mime: "image/png", name: "image.png", contentString: "data:image/png;base64,iVBORw0KGg....=" },
        { mime: "image/jpeg", name: "image2.jpeg", contentString: "data:image/jpeg;base64,iVBORw0KGg....=" },
      ]
    : [];

  return {
    id: 1,
    workspaceId: 1,
    prompt: "Test prompt",
    // The response is stored serialized, so the export code has to parse it.
    response: JSON.stringify({
      text: "Test response",
      attachments,
      sources: [],
      metrics: {},
    }),
    createdAt: new Date(),
    workspace: { id: 1, name: "Test Workspace", openAiPrompt: "Test OpenAI Prompt" },
    user: { username: "testuser" },
    feedbackScore: 1,
  };
};
describe("prepareChatsForExport", () => {
  // Expected csv/json row for a workspace chat, before any format-specific
  // fields (such as `attachments` for json) are added.
  const expectedWorkspaceRow = (chat) => ({
    id: chat.id,
    prompt: chat.prompt,
    response: JSON.parse(chat.response).text,
    sent_at: chat.createdAt,
    rating: chat.feedbackScore ? "GOOD" : "BAD",
    username: chat.user.username,
    workspace: chat.workspace.name,
  });

  // Expected JSONL message whose content is a single text part.
  const textMessage = (role, text) => ({
    role,
    content: [{ type: "text", text }],
  });

  beforeEach(() => {
    // Reset call history, then default both model lookups to "no chats" so
    // each test opts in to the records it needs.
    jest.clearAllMocks();
    WorkspaceChats.whereWithData = jest.fn().mockResolvedValue([]);
    EmbedChats.whereWithEmbedAndWorkspace = jest.fn().mockResolvedValue([]);
  });

  test("should throw error for invalid chat type", async () => {
    await expect(prepareChatsForExport("json", "invalid"))
      .rejects
      .toThrow("Invalid chat type: invalid");
  });

  test("should throw error for invalid export type", async () => {
    await expect(prepareChatsForExport("invalid", "workspace"))
      .rejects
      .toThrow("Invalid export type: invalid");
  });

  // Only the json format is exercised here: json rows carry an `attachments`
  // array while csv rows do not, so csv is asserted in its own test below.
  test("should return prepared data in csv and json format for workspace chat type", async () => {
    const chatExample = mockChat();
    WorkspaceChats.whereWithData.mockResolvedValue([chatExample]);

    const result = await prepareChatsForExport("json", "workspace");

    expect(result).toBeDefined();
    expect(result).toEqual([
      {
        ...expectedWorkspaceRow(chatExample),
        attachments: [],
      },
    ]);
  });

  test("Should handle attachments for workspace chat type when json format is selected", async () => {
    const chatExample = mockChat(true);
    WorkspaceChats.whereWithData.mockResolvedValue([chatExample]);

    const result = await prepareChatsForExport("json", "workspace");
    const responseJson = JSON.parse(chatExample.response);

    expect(result).toBeDefined();
    expect(result).toEqual([
      {
        ...expectedWorkspaceRow(chatExample),
        attachments: [
          {
            type: "image",
            image: responseJson.attachments[0].contentString,
          },
          {
            type: "image",
            image: responseJson.attachments[1].contentString,
          },
        ],
      },
    ]);
  });

  test("Should ignore attachments for workspace chat type when csv format is selected", async () => {
    const chatExample = mockChat(true);
    WorkspaceChats.whereWithData.mockResolvedValue([chatExample]);

    const result = await prepareChatsForExport("csv", "workspace");

    expect(result).toBeDefined();
    // `result` is an array of rows; the attachment check must look at the row
    // itself, otherwise it is trivially true for any array.
    expect(result[0].attachments).not.toBeDefined();
    expect(result).toEqual([expectedWorkspaceRow(chatExample)]);
  });

  test("should return prepared data in jsonAlpaca format for workspace chat type", async () => {
    const chatExample = mockChat();
    const imageChatExample = mockChat(true);
    WorkspaceChats.whereWithData.mockResolvedValue([chatExample, imageChatExample]);

    const result = await prepareChatsForExport("jsonAlpaca", "workspace");
    const responseJson1 = JSON.parse(chatExample.response);
    const responseJson2 = JSON.parse(imageChatExample.response);

    expect(result).toBeDefined();
    // Alpaca format does not support attachments - so they are not included
    expect(result[0].attachments).not.toBeDefined();
    expect(result[1].attachments).not.toBeDefined();
    expect(result).toEqual([
      {
        instruction: chatExample.workspace.openAiPrompt,
        input: chatExample.prompt,
        output: responseJson1.text,
      },
      {
        instruction: imageChatExample.workspace.openAiPrompt,
        input: imageChatExample.prompt,
        output: responseJson2.text,
      },
    ]);
  });

  test("should return prepared data in jsonl format for workspace chat type", async () => {
    const chatExample = mockChat();
    const responseJson = JSON.parse(chatExample.response);
    WorkspaceChats.whereWithData.mockResolvedValue([chatExample]);

    const result = await prepareChatsForExport("jsonl", "workspace");

    expect(result).toBeDefined();
    // The export groups messages under the chat's `workspaceId`.
    expect(result).toEqual({
      [chatExample.workspaceId]: {
        messages: [
          textMessage("system", chatExample.workspace.openAiPrompt),
          textMessage("user", chatExample.prompt),
          textMessage("assistant", responseJson.text),
        ],
      },
    });
  });

  test("should return prepared data in jsonl format for workspace chat type with attachments", async () => {
    const chatExample = mockChat();
    const imageChatExample = mockChat(true);
    const responseJson = JSON.parse(chatExample.response);
    const imageResponseJson = JSON.parse(imageChatExample.response);
    WorkspaceChats.whereWithData.mockResolvedValue([chatExample, imageChatExample]);

    const result = await prepareChatsForExport("jsonl", "workspace");

    expect(result).toBeDefined();
    // Both chats share a workspace, so they land in one conversation with a
    // single system message; images are attached to the user turn only.
    expect(result).toEqual({
      [chatExample.workspaceId]: {
        messages: [
          textMessage("system", chatExample.workspace.openAiPrompt),
          textMessage("user", chatExample.prompt),
          textMessage("assistant", responseJson.text),
          {
            role: "user",
            content: [
              {
                type: "text",
                text: imageChatExample.prompt,
              },
              {
                type: "image",
                image: imageResponseJson.attachments[0].contentString,
              },
              {
                type: "image",
                image: imageResponseJson.attachments[1].contentString,
              },
            ],
          },
          textMessage("assistant", imageResponseJson.text),
        ],
      },
    });
  });
});

View File

@ -34,6 +34,7 @@ async function convertToJSONAlpaca(preparedData) {
return JSON.stringify(preparedData, null, 4);
}
// You can validate JSONL outputs on https://jsonlines.org/validator/
async function convertToJSONL(workspaceChatsMap) {
return Object.values(workspaceChatsMap)
.map((workspaceChats) => JSON.stringify(workspaceChats))
@ -64,12 +65,24 @@ async function prepareChatsForExport(format = "jsonl", chatType = "workspace") {
if (format === "csv" || format === "json") {
const preparedData = chats.map((chat) => {
const responseJson = JSON.parse(chat.response);
const responseJson = safeJsonParse(chat.response, {});
const baseData = {
id: chat.id,
prompt: chat.prompt,
response: responseJson.text,
sent_at: chat.createdAt,
// Only add attachments to the json format since we cannot arrange attachments in csv format
...(format === "json"
? {
attachments:
responseJson.attachments?.length > 0
? responseJson.attachments.map((attachment) => ({
type: "image",
image: attachmentToDataUrl(attachment),
}))
: [],
}
: {}),
};
if (chatType === "embed") {
@ -101,9 +114,10 @@ async function prepareChatsForExport(format = "jsonl", chatType = "workspace") {
return preparedData;
}
// jsonAlpaca format does not support array outputs
if (format === "jsonAlpaca") {
const preparedData = chats.map((chat) => {
const responseJson = JSON.parse(chat.response);
const responseJson = safeJsonParse(chat.response, {});
return {
instruction: buildSystemPrompt(
chat,
@ -117,31 +131,54 @@ async function prepareChatsForExport(format = "jsonl", chatType = "workspace") {
return preparedData;
}
// Export to JSONL format (recommended for fine-tuning)
const workspaceChatsMap = chats.reduce((acc, chat) => {
const { prompt, response, workspaceId } = chat;
const responseJson = JSON.parse(response);
const responseJson = safeJsonParse(response, { attachments: [] });
const attachments = responseJson.attachments;
if (!acc[workspaceId]) {
acc[workspaceId] = {
messages: [
{
role: "system",
content:
content: [
{
type: "text",
text:
chat.workspace?.openAiPrompt ||
"Given the following conversation, relevant context, and a follow up question, reply with an answer to the current question the user is asking. Return only your response to the question given the above information following the users instructions as needed.",
},
],
},
],
};
}
acc[workspaceId].messages.push(
{
role: "user",
content: prompt,
content: [
{
type: "text",
text: prompt,
},
...(attachments?.length > 0
? attachments.map((attachment) => ({
type: "image",
image: attachmentToDataUrl(attachment),
}))
: []),
],
},
{
role: "assistant",
content: responseJson.text,
content: [
{
type: "text",
text: responseJson.text,
},
],
}
);
@ -203,6 +240,17 @@ function buildSystemPrompt(chat, prompt = null) {
return `${prompt ?? STANDARD_PROMPT}${context}`;
}
/**
 * Converts an attachment's content string to a proper data URL format if needed.
 *
 * Content that already starts with "data:" is returned untouched; anything else
 * is treated as a raw base64 payload and wrapped using the attachment's mime type.
 * A malformed attachment does not throw, so a single bad record cannot abort
 * an entire export.
 *
 * @param {Object} attachment - The attachment object containing contentString and mime type
 * @returns {string} The properly formatted data URL, or an empty string when the
 *   attachment has no usable string content
 */
function attachmentToDataUrl(attachment) {
  const contentString = attachment?.contentString;
  if (typeof contentString !== "string" || contentString.length === 0) return "";
  if (contentString.startsWith("data:")) return contentString;

  // Fall back to a generic binary type rather than emitting "data:undefined;...".
  const mime = attachment.mime || "application/octet-stream";
  return `data:${mime};base64,${contentString}`;
}
module.exports = {
prepareChatsForExport,
exportChatsAsType,