Add OCR of image support (#3219)
* OCR PDFs as fallback in spawn thread * wip * build our own worker fanout and wrapper * norm pkgs * Add image OCR support
This commit is contained in:
parent
2a9066e83a
commit
89bba68219
2
.github/workflows/dev-build.yaml
vendored
2
.github/workflows/dev-build.yaml
vendored
@ -6,7 +6,7 @@ concurrency:
|
|||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
branches: ['ocr-parse-pdfs'] # put your current branch to create a build. Core team only.
|
branches: ['ocr-parse-images'] # put your current branch to create a build. Core team only.
|
||||||
paths-ignore:
|
paths-ignore:
|
||||||
- '**.md'
|
- '**.md'
|
||||||
- 'cloud-deployments/*'
|
- 'cloud-deployments/*'
|
||||||
|
|||||||
48
collector/processSingleFile/convert/asImage.js
Normal file
48
collector/processSingleFile/convert/asImage.js
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
const { v4 } = require("uuid");
|
||||||
|
const { tokenizeString } = require("../../utils/tokenizer");
|
||||||
|
const {
|
||||||
|
createdDate,
|
||||||
|
trashFile,
|
||||||
|
writeToServerDocuments,
|
||||||
|
} = require("../../utils/files");
|
||||||
|
const OCRLoader = require("../../utils/OCRLoader");
|
||||||
|
const { default: slugify } = require("slugify");
|
||||||
|
|
||||||
|
/**
 * Converts an uploaded image into a document by running OCR over it and
 * persisting the extracted text via `writeToServerDocuments`.
 * The source image is trashed once processing finishes, whether or not any
 * text was found.
 *
 * @param {Object} params
 * @param {string} params.fullFilePath - Path to the image file on disk.
 * @param {string} params.filename - File name used for the title, slug and log messages.
 * @returns {Promise<{success: boolean, reason: (string|null), documents: Array}>}
 *   On success `documents` holds the single value returned by `writeToServerDocuments`.
 */
async function asImage({ fullFilePath = "", filename = "" }) {
  const content = await new OCRLoader().ocrImage(fullFilePath);

  // `ocrImage` can hand back null on failure; OCR of an image without
  // readable text may also yield only whitespace. Neither is worth embedding.
  if (!content?.trim().length) {
    console.error(`Resulting text content was empty for ${filename}.`);
    trashFile(fullFilePath);
    return {
      success: false,
      reason: `No text content found in ${filename}.`,
      documents: [],
    };
  }

  console.log(`-- Working ${filename} --`);
  const data = {
    id: v4(),
    url: `file://${fullFilePath}`,
    title: filename,
    docAuthor: "Unknown", // TODO: Find a better author
    description: "Unknown", // TODO: Find a better description
    docSource: "an image file uploaded by the user.",
    chunkSource: "",
    published: createdDate(fullFilePath),
    // The text comes from OCR and may be separated by newlines rather than
    // spaces, so count words on any whitespace run.
    wordCount: content.trim().split(/\s+/).length,
    pageContent: content,
    token_count_estimate: tokenizeString(content),
  };

  const document = writeToServerDocuments(
    data,
    `${slugify(filename)}-${data.id}`
  );
  trashFile(fullFilePath);
  console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
  return { success: true, reason: null, documents: [document] };
}
|
||||||
|
|
||||||
|
module.exports = asImage;
|
||||||
@ -185,6 +185,67 @@ class OCRLoader {
|
|||||||
});
|
});
|
||||||
return documents;
|
return documents;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads an image file and returns the OCRed text.
|
||||||
|
* @param {string} filePath - The path to the image file.
|
||||||
|
* @param {Object} options - The options for the OCR.
|
||||||
|
* @param {number} options.maxExecutionTime - The maximum execution time of the OCR in milliseconds.
|
||||||
|
* @returns {Promise<string|null>} The OCRed text, or null if the file is missing or the OCR fails or times out.
|
||||||
|
*/
|
||||||
|
async ocrImage(filePath, { maxExecutionTime = 300_000 } = {}) {
|
||||||
|
let content = "";
|
||||||
|
let worker = null;
|
||||||
|
if (
|
||||||
|
!filePath ||
|
||||||
|
!fs.existsSync(filePath) ||
|
||||||
|
!fs.statSync(filePath).isFile()
|
||||||
|
) {
|
||||||
|
this.log(`File ${filePath} does not exist. Skipping OCR.`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
const documentTitle = path.basename(filePath);
|
||||||
|
try {
|
||||||
|
this.log(`Starting OCR of ${documentTitle}`);
|
||||||
|
const startTime = Date.now();
|
||||||
|
const { createWorker, OEM } = require("tesseract.js");
|
||||||
|
worker = await createWorker("eng", OEM.LSTM_ONLY, {
|
||||||
|
cachePath: this.cacheDir,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Race the timeout with the OCR
|
||||||
|
const timeoutPromise = new Promise((_, reject) => {
|
||||||
|
setTimeout(() => {
|
||||||
|
reject(
|
||||||
|
new Error(
|
||||||
|
`OCR job took too long to complete (${
|
||||||
|
maxExecutionTime / 1000
|
||||||
|
} seconds)`
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}, maxExecutionTime);
|
||||||
|
});
|
||||||
|
|
||||||
|
const processImage = async () => {
|
||||||
|
const { data } = await worker.recognize(filePath, {}, "text");
|
||||||
|
content = data.text;
|
||||||
|
};
|
||||||
|
|
||||||
|
await Promise.race([timeoutPromise, processImage()]);
|
||||||
|
this.log(`Completed OCR of ${documentTitle}!`, {
|
||||||
|
executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,
|
||||||
|
});
|
||||||
|
|
||||||
|
return content;
|
||||||
|
} catch (e) {
|
||||||
|
this.log(`Error: ${e.message}`);
|
||||||
|
return null;
|
||||||
|
} finally {
|
||||||
|
if (!worker) return;
|
||||||
|
await worker.terminate();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = OCRLoader;
|
module.exports = OCRLoader;
|
||||||
|
|||||||
@ -27,6 +27,9 @@ const ACCEPTED_MIMES = {
|
|||||||
"video/mp4": [".mp4"],
|
"video/mp4": [".mp4"],
|
||||||
"video/mpeg": [".mpeg"],
|
"video/mpeg": [".mpeg"],
|
||||||
"application/epub+zip": [".epub"],
|
"application/epub+zip": [".epub"],
|
||||||
|
"image/png": [".png"],
|
||||||
|
"image/jpeg": [".jpg"],
|
||||||
|
"image/jpg": [".jpg"],
|
||||||
};
|
};
|
||||||
|
|
||||||
const SUPPORTED_FILETYPE_CONVERTERS = {
|
const SUPPORTED_FILETYPE_CONVERTERS = {
|
||||||
@ -55,6 +58,10 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
|
|||||||
".wav": "./convert/asAudio.js",
|
".wav": "./convert/asAudio.js",
|
||||||
".mp4": "./convert/asAudio.js",
|
".mp4": "./convert/asAudio.js",
|
||||||
".mpeg": "./convert/asAudio.js",
|
".mpeg": "./convert/asAudio.js",
|
||||||
|
|
||||||
|
".png": "./convert/asImage.js",
|
||||||
|
".jpg": "./convert/asImage.js",
|
||||||
|
".jpeg": "./convert/asImage.js",
|
||||||
};
|
};
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
const MimeLib = require("mime");
|
const MimeLib = require("mime");
|
||||||
class MimeDetector {
|
class MimeDetector {
|
||||||
nonTextTypes = ["multipart", "image", "model", "audio", "video", "font"];
|
nonTextTypes = ["multipart", "model", "audio", "video", "font"];
|
||||||
badMimes = [
|
badMimes = [
|
||||||
"application/octet-stream",
|
"application/octet-stream",
|
||||||
"application/zip",
|
"application/zip",
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user