feat: Add multilingual support for ocr module (#3325)
* Add multilingual support for ocr module * Add OCR language as server var that is passed into Collector Support all valid tesseract language codes Filter and parse only valid codes with fallbacks * persist TARGET_OCR_LANG * update docker example env --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
parent
c928d3d0c5
commit
df166eb64e
@ -8,8 +8,10 @@ const {
|
||||
const OCRLoader = require("../../utils/OCRLoader");
|
||||
const { default: slugify } = require("slugify");
|
||||
|
||||
async function asImage({ fullFilePath = "", filename = "" }) {
|
||||
let content = await new OCRLoader().ocrImage(fullFilePath);
|
||||
async function asImage({ fullFilePath = "", filename = "", options = {} }) {
|
||||
let content = await new OCRLoader({
|
||||
targetLanguages: options?.ocr?.langList,
|
||||
}).ocrImage(fullFilePath);
|
||||
|
||||
if (!content?.length) {
|
||||
console.error(`Resulting text content was empty for ${filename}.`);
|
||||
|
||||
@ -9,7 +9,7 @@ const { default: slugify } = require("slugify");
|
||||
const PDFLoader = require("./PDFLoader");
|
||||
const OCRLoader = require("../../../utils/OCRLoader");
|
||||
|
||||
async function asPdf({ fullFilePath = "", filename = "" }) {
|
||||
async function asPdf({ fullFilePath = "", filename = "", options = {} }) {
|
||||
const pdfLoader = new PDFLoader(fullFilePath, {
|
||||
splitPages: true,
|
||||
});
|
||||
@ -22,7 +22,9 @@ async function asPdf({ fullFilePath = "", filename = "" }) {
|
||||
console.log(
|
||||
`[asPDF] No text content found for ${filename}. Will attempt OCR parse.`
|
||||
);
|
||||
docs = await new OCRLoader().ocrPDF(fullFilePath);
|
||||
docs = await new OCRLoader({
|
||||
targetLanguages: options?.ocr?.langList,
|
||||
}).ocrPDF(fullFilePath);
|
||||
}
|
||||
|
||||
for (const doc of docs) {
|
||||
|
||||
@ -1,14 +1,61 @@
|
||||
const fs = require("fs");
|
||||
const os = require("os");
|
||||
const path = require("path");
|
||||
const { VALID_LANGUAGE_CODES } = require("./validLangs");
|
||||
|
||||
class OCRLoader {
|
||||
constructor() {
|
||||
/**
|
||||
* The language code(s) to use for the OCR.
|
||||
* @type {string[]}
|
||||
*/
|
||||
language;
|
||||
/**
|
||||
* The cache directory for the OCR.
|
||||
* @type {string}
|
||||
*/
|
||||
cacheDir;
|
||||
|
||||
/**
|
||||
* The constructor for the OCRLoader.
|
||||
* @param {Object} options - The options for the OCRLoader.
|
||||
* @param {string} options.targetLanguages - The target languages to use for the OCR as a comma separated string. eg: "eng,deu,..."
|
||||
*/
|
||||
constructor({ targetLanguages = "eng" } = {}) {
|
||||
this.language = this.parseLanguages(targetLanguages);
|
||||
this.cacheDir = path.resolve(
|
||||
process.env.STORAGE_DIR
|
||||
? path.resolve(process.env.STORAGE_DIR, `models`, `tesseract`)
|
||||
: path.resolve(__dirname, `../../../server/storage/models/tesseract`)
|
||||
);
|
||||
|
||||
// Ensure the cache directory exists or else Tesseract will persist the cache in the default location.
|
||||
if (!fs.existsSync(this.cacheDir))
|
||||
fs.mkdirSync(this.cacheDir, { recursive: true });
|
||||
this.log(
|
||||
`OCRLoader initialized with language support for:`,
|
||||
this.language.map((lang) => VALID_LANGUAGE_CODES[lang]).join(", ")
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the language code from a provided comma separated string of language codes.
|
||||
* @param {string} language - The language code to parse.
|
||||
* @returns {string[]} The parsed language code.
|
||||
*/
|
||||
parseLanguages(language = null) {
|
||||
try {
|
||||
if (!language || typeof language !== "string") return ["eng"];
|
||||
const langList = language
|
||||
.split(",")
|
||||
.map((lang) => (lang.trim() !== "" ? lang.trim() : null))
|
||||
.filter(Boolean)
|
||||
.filter((lang) => VALID_LANGUAGE_CODES.hasOwnProperty(lang));
|
||||
if (langList.length === 0) return ["eng"];
|
||||
return langList;
|
||||
} catch (e) {
|
||||
this.log(`Error parsing languages: ${e.message}`, e.stack);
|
||||
return ["eng"];
|
||||
}
|
||||
}
|
||||
|
||||
log(text, ...args) {
|
||||
@ -70,7 +117,7 @@ class OCRLoader {
|
||||
Array(NUM_WORKERS)
|
||||
.fill(0)
|
||||
.map(() =>
|
||||
createWorker("eng", OEM.LSTM_ONLY, {
|
||||
createWorker(this.language, OEM.LSTM_ONLY, {
|
||||
cachePath: this.cacheDir,
|
||||
})
|
||||
)
|
||||
@ -188,7 +235,7 @@ class OCRLoader {
|
||||
this.log(`Starting OCR of ${documentTitle}`);
|
||||
const startTime = Date.now();
|
||||
const { createWorker, OEM } = require("tesseract.js");
|
||||
worker = await createWorker("eng", OEM.LSTM_ONLY, {
|
||||
worker = await createWorker(this.language, OEM.LSTM_ONLY, {
|
||||
cachePath: this.cacheDir,
|
||||
});
|
||||
|
||||
|
||||
155
collector/utils/OCRLoader/validLangs.js
Normal file
155
collector/utils/OCRLoader/validLangs.js
Normal file
@ -0,0 +1,155 @@
|
||||
/*
|
||||
|
||||
To get the list of valid language codes - do the following:
|
||||
Open the following URL in your browser: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
|
||||
|
||||
Check this element is the proper table tbody with all the codes via console:
|
||||
document.getElementsByTagName('table').item(0).children.item(1)
|
||||
|
||||
Now, copy the following code and paste it into the console:
|
||||
function parseLangs() {
|
||||
let langs = {};
|
||||
Array.from(document.getElementsByTagName('table').item(0).children.item(1).children).forEach((el) => {
|
||||
const [codeEl, languageEl, ...rest] = el.children
|
||||
const code = codeEl.innerText.trim()
|
||||
const language = languageEl.innerText.trim()
|
||||
if (!!code && !!language) langs[code] = language
|
||||
})
|
||||
return langs;
|
||||
}
|
||||
|
||||
Now, run the function:
|
||||
copy(parseLangs())
|
||||
*/
|
||||
|
||||
const VALID_LANGUAGE_CODES = {
|
||||
afr: "Afrikaans",
|
||||
amh: "Amharic",
|
||||
ara: "Arabic",
|
||||
asm: "Assamese",
|
||||
aze: "Azerbaijani",
|
||||
aze_cyrl: "Azerbaijani - Cyrilic",
|
||||
bel: "Belarusian",
|
||||
ben: "Bengali",
|
||||
bod: "Tibetan",
|
||||
bos: "Bosnian",
|
||||
bre: "Breton",
|
||||
bul: "Bulgarian",
|
||||
cat: "Catalan; Valencian",
|
||||
ceb: "Cebuano",
|
||||
ces: "Czech",
|
||||
chi_sim: "Chinese - Simplified",
|
||||
chi_tra: "Chinese - Traditional",
|
||||
chr: "Cherokee",
|
||||
cos: "Corsican",
|
||||
cym: "Welsh",
|
||||
dan: "Danish",
|
||||
dan_frak: "Danish - Fraktur (contrib)",
|
||||
deu: "German",
|
||||
deu_frak: "German - Fraktur (contrib)",
|
||||
deu_latf: "German (Fraktur Latin)",
|
||||
dzo: "Dzongkha",
|
||||
ell: "Greek, Modern (1453-)",
|
||||
eng: "English",
|
||||
enm: "English, Middle (1100-1500)",
|
||||
epo: "Esperanto",
|
||||
equ: "Math / equation detection module",
|
||||
est: "Estonian",
|
||||
eus: "Basque",
|
||||
fao: "Faroese",
|
||||
fas: "Persian",
|
||||
fil: "Filipino (old - Tagalog)",
|
||||
fin: "Finnish",
|
||||
fra: "French",
|
||||
frk: "German - Fraktur (now deu_latf)",
|
||||
frm: "French, Middle (ca.1400-1600)",
|
||||
fry: "Western Frisian",
|
||||
gla: "Scottish Gaelic",
|
||||
gle: "Irish",
|
||||
glg: "Galician",
|
||||
grc: "Greek, Ancient (to 1453) (contrib)",
|
||||
guj: "Gujarati",
|
||||
hat: "Haitian; Haitian Creole",
|
||||
heb: "Hebrew",
|
||||
hin: "Hindi",
|
||||
hrv: "Croatian",
|
||||
hun: "Hungarian",
|
||||
hye: "Armenian",
|
||||
iku: "Inuktitut",
|
||||
ind: "Indonesian",
|
||||
isl: "Icelandic",
|
||||
ita: "Italian",
|
||||
ita_old: "Italian - Old",
|
||||
jav: "Javanese",
|
||||
jpn: "Japanese",
|
||||
kan: "Kannada",
|
||||
kat: "Georgian",
|
||||
kat_old: "Georgian - Old",
|
||||
kaz: "Kazakh",
|
||||
khm: "Central Khmer",
|
||||
kir: "Kirghiz; Kyrgyz",
|
||||
kmr: "Kurmanji (Kurdish - Latin Script)",
|
||||
kor: "Korean",
|
||||
kor_vert: "Korean (vertical)",
|
||||
kur: "Kurdish (Arabic Script)",
|
||||
lao: "Lao",
|
||||
lat: "Latin",
|
||||
lav: "Latvian",
|
||||
lit: "Lithuanian",
|
||||
ltz: "Luxembourgish",
|
||||
mal: "Malayalam",
|
||||
mar: "Marathi",
|
||||
mkd: "Macedonian",
|
||||
mlt: "Maltese",
|
||||
mon: "Mongolian",
|
||||
mri: "Maori",
|
||||
msa: "Malay",
|
||||
mya: "Burmese",
|
||||
nep: "Nepali",
|
||||
nld: "Dutch; Flemish",
|
||||
nor: "Norwegian",
|
||||
oci: "Occitan (post 1500)",
|
||||
ori: "Oriya",
|
||||
osd: "Orientation and script detection module",
|
||||
pan: "Panjabi; Punjabi",
|
||||
pol: "Polish",
|
||||
por: "Portuguese",
|
||||
pus: "Pushto; Pashto",
|
||||
que: "Quechua",
|
||||
ron: "Romanian; Moldavian; Moldovan",
|
||||
rus: "Russian",
|
||||
san: "Sanskrit",
|
||||
sin: "Sinhala; Sinhalese",
|
||||
slk: "Slovak",
|
||||
slk_frak: "Slovak - Fraktur (contrib)",
|
||||
slv: "Slovenian",
|
||||
snd: "Sindhi",
|
||||
spa: "Spanish; Castilian",
|
||||
spa_old: "Spanish; Castilian - Old",
|
||||
sqi: "Albanian",
|
||||
srp: "Serbian",
|
||||
srp_latn: "Serbian - Latin",
|
||||
sun: "Sundanese",
|
||||
swa: "Swahili",
|
||||
swe: "Swedish",
|
||||
syr: "Syriac",
|
||||
tam: "Tamil",
|
||||
tat: "Tatar",
|
||||
tel: "Telugu",
|
||||
tgk: "Tajik",
|
||||
tgl: "Tagalog (new - Filipino)",
|
||||
tha: "Thai",
|
||||
tir: "Tigrinya",
|
||||
ton: "Tonga",
|
||||
tur: "Turkish",
|
||||
uig: "Uighur; Uyghur",
|
||||
ukr: "Ukrainian",
|
||||
urd: "Urdu",
|
||||
uzb: "Uzbek",
|
||||
uzb_cyrl: "Uzbek - Cyrilic",
|
||||
vie: "Vietnamese",
|
||||
yid: "Yiddish",
|
||||
yor: "Yoruba",
|
||||
};
|
||||
|
||||
module.exports.VALID_LANGUAGE_CODES = VALID_LANGUAGE_CODES;
|
||||
@ -321,3 +321,8 @@ GID='1000'
|
||||
# Enable simple SSO passthrough to pre-authenticate users from a third party service.
|
||||
# See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information.
|
||||
# SIMPLE_SSO_ENABLED=1
|
||||
|
||||
# Specify the target languages for when using OCR to parse images and PDFs.
|
||||
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
|
||||
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.
|
||||
# TARGET_OCR_LANG=eng,deu,ita,spa,fra,por,rus,nld,tur,hun,pol
|
||||
@ -310,3 +310,8 @@ TTS_PROVIDER="native"
|
||||
# Enable simple SSO passthrough to pre-authenticate users from a third party service.
|
||||
# See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information.
|
||||
# SIMPLE_SSO_ENABLED=1
|
||||
|
||||
# Specify the target languages for when using OCR to parse images and PDFs.
|
||||
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
|
||||
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.
|
||||
# TARGET_OCR_LANG=eng,deu,ita,spa,fra,por,rus,nld,tur,hun,pol
|
||||
@ -20,6 +20,9 @@ class CollectorApi {
|
||||
whisperProvider: process.env.WHISPER_PROVIDER || "local",
|
||||
WhisperModelPref: process.env.WHISPER_MODEL_PREF,
|
||||
openAiKey: process.env.OPEN_AI_KEY || null,
|
||||
ocr: {
|
||||
langList: process.env.TARGET_OCR_LANG || "eng",
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@ -978,6 +978,9 @@ function dumpENV() {
|
||||
|
||||
// Nvidia NIM Keys that are automatically managed
|
||||
"NVIDIA_NIM_LLM_MODEL_TOKEN_LIMIT",
|
||||
|
||||
// OCR Language Support
|
||||
"TARGET_OCR_LANG",
|
||||
];
|
||||
|
||||
// Simple sanitization of each value to prevent ENV injection via newline or quote escaping.
|
||||
|
||||
Loading…
Reference in New Issue
Block a user