Add OCR support for images (#3219)

* OCR PDFs as fallback in spawn thread * wip * build our own worker fanout and wrapper * norm pkgs * Add image OCR support
2025-02-14 12:07:33 -08:00 · 2025-02-14 12:07:33 -08:00 · 89bba68219
commit 89bba68219
parent 2a9066e83a
5 changed files with 118 additions and 2 deletions
--- a/.github/workflows/dev-build.yaml
+++ b/.github/workflows/dev-build.yaml
@ -6,7 +6,7 @@ concurrency:
 on:
  push:
-    branches: ['ocr-parse-pdfs'] # put your current branch to create a build. Core team only.
+    branches: ['ocr-parse-images'] # put your current branch to create a build. Core team only.
    paths-ignore:
      - '**.md'
      - 'cloud-deployments/*'
--- a/collector/processSingleFile/convert/asImage.js
+++ b/collector/processSingleFile/convert/asImage.js
@ -0,0 +1,48 @@
 const { v4 } = require("uuid");
 const { tokenizeString } = require("../../utils/tokenizer");
 const {
  createdDate,
  trashFile,
  writeToServerDocuments,
 } = require("../../utils/files");
 const OCRLoader = require("../../utils/OCRLoader");
 const { default: slugify } = require("slugify");
 /**
  * Converts an uploaded image into an embeddable document by running OCR on it.
  * The source file is removed (via trashFile) whether or not text was found.
  * @param {Object} params
  * @param {string} params.fullFilePath - Path of the uploaded image on disk.
  * @param {string} params.filename - Original name of the upload; used for the title and slug.
  * @returns {Promise<{success: boolean, reason: string|null, documents: Object[]}>}
  * `success: false` with a `reason` when OCR produced no usable text.
  */
 async function asImage({ fullFilePath = "", filename = "" }) {
  const content = await new OCRLoader().ocrImage(fullFilePath);
  // ocrImage returns null on failure; OCR of a blank image can also yield
  // whitespace only - neither is worth writing out as a document.
  if (!content?.trim().length) {
    console.error(`Resulting text content was empty for ${filename}.`);
    trashFile(fullFilePath);
    return {
      success: false,
      reason: `No text content found in ${filename}.`,
      documents: [],
    };
  }
  console.log(`-- Working ${filename} --`);
  const data = {
    id: v4(),
    url: "file://" + fullFilePath,
    title: filename,
    docAuthor: "Unknown", // TODO: Find a better author
    description: "Unknown", // TODO: Find a better description
    // Previously labelled as a text file, which mis-described the provenance.
    docSource: "an image file uploaded by the user.",
    chunkSource: "",
    published: createdDate(fullFilePath),
    wordCount: content.split(" ").length,
    pageContent: content,
    token_count_estimate: tokenizeString(content),
  };
  const document = writeToServerDocuments(
    data,
    `${slugify(filename)}-${data.id}`
  );
  trashFile(fullFilePath);
  console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
  return { success: true, reason: null, documents: [document] };
 }
 module.exports = asImage;
--- a/collector/utils/OCRLoader/index.js
+++ b/collector/utils/OCRLoader/index.js
@ -185,6 +185,67 @@ class OCRLoader {
    });
    return documents;
  }
  /**
   * Loads an image file and returns the OCRed text.
   * @param {string} filePath - The path to the image file.
   * @param {Object} options - The options for the OCR.
   * @param {number} options.maxExecutionTime - The maximum execution time of the OCR in milliseconds.
   * @returns {Promise<string>} The OCRed text.
   */
  async ocrImage(filePath, { maxExecutionTime = 300_000 } = {}) {
    let content = "";
    let worker = null;
    if (
      !filePath ||
      !fs.existsSync(filePath) ||
      !fs.statSync(filePath).isFile()
    ) {
      this.log(`File ${filePath} does not exist. Skipping OCR.`);
      return null;
    }
    const documentTitle = path.basename(filePath);
    try {
      this.log(`Starting OCR of ${documentTitle}`);
      const startTime = Date.now();
      const { createWorker, OEM } = require("tesseract.js");
      worker = await createWorker("eng", OEM.LSTM_ONLY, {
        cachePath: this.cacheDir,
      });
      // Race the timeout with the OCR
      const timeoutPromise = new Promise((_, reject) => {
        setTimeout(() => {
          reject(
            new Error(
              `OCR job took too long to complete (${
                maxExecutionTime / 1000
              } seconds)`
            )
          );
        }, maxExecutionTime);
      });
      const processImage = async () => {
        const { data } = await worker.recognize(filePath, {}, "text");
        content = data.text;
      };
      await Promise.race([timeoutPromise, processImage()]);
      this.log(`Completed OCR of ${documentTitle}!`, {
        executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,
      });
      return content;
    } catch (e) {
      this.log(`Error: ${e.message}`);
      return null;
    } finally {
      if (!worker) return;
      await worker.terminate();
    }
  }
 }
 module.exports = OCRLoader;
--- a/collector/utils/constants.js
+++ b/collector/utils/constants.js
@ -27,6 +27,9 @@ const ACCEPTED_MIMES = {
  "video/mp4": [".mp4"],
  "video/mpeg": [".mpeg"],
  "application/epub+zip": [".epub"],
  "image/png": [".png"],
  "image/jpeg": [".jpg"],
  "image/jpg": [".jpg"],
 };
 const SUPPORTED_FILETYPE_CONVERTERS = {
@ -55,6 +58,10 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
  ".wav": "./convert/asAudio.js",
  ".mp4": "./convert/asAudio.js",
  ".mpeg": "./convert/asAudio.js",
  ".png": "./convert/asImage.js",
  ".jpg": "./convert/asImage.js",
  ".jpeg": "./convert/asImage.js",
 };
 module.exports = {
--- a/collector/utils/files/mime.js
+++ b/collector/utils/files/mime.js
@ -1,6 +1,6 @@
 const MimeLib = require("mime");
 class MimeDetector {
-  nonTextTypes = ["multipart", "image", "model", "audio", "video", "font"];
+  nonTextTypes = ["multipart", "model", "audio", "video", "font"];
  badMimes = [
    "application/octet-stream",
    "application/zip",