merlyn/collector/utils/OCRLoader/validLangs.js
AbelDuan df166eb64e
feat: Add multilingual support for ocr module (#3325)
* Add multilingual support for ocr module

* Add OCR language as server var that is passed into Collector
Support all valid tesseract language codes
Filter and parse only valid codes with fallbacks

* persist TARGET_OCR_LANG

* update docker example env

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
2025-02-27 12:31:17 -08:00

156 lines
3.7 KiB
JavaScript

/*
To get the list of valid language codes - do the following:
Open the following URL in your browser: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
Verify via the console that this element is the table's tbody containing all the codes:
document.getElementsByTagName('table').item(0).children.item(1)
Now, copy the following code and paste it into the console:
function parseLangs() {
let langs = {};
Array.from(document.getElementsByTagName('table').item(0).children.item(1).children).forEach((el) => {
const [codeEl, languageEl, ...rest] = el.children
const code = codeEl.innerText.trim()
const language = languageEl.innerText.trim()
if (!!code && !!language) langs[code] = language
})
return langs;
}
Now, run the function and copy its result to the clipboard:
copy(parseLangs())
*/
/**
 * Lookup table of Tesseract language codes mapped to their human-readable
 * names, as listed in the Tesseract "Data Files" documentation table.
 *
 * Notes:
 * - `equ` and `osd` are detection modules rather than languages.
 * - Labels (including upstream spellings such as "Cyrilic") are kept exactly
 *   as published so that regenerating the table yields an identical result.
 * - The object is frozen because it is shared by every importer; a consumer
 *   must not be able to add, remove, or relabel codes for everyone else.
 * - This is a plain object, so inherited keys such as "constructor" resolve
 *   through the prototype. Validate codes with an own-property check
 *   (`hasOwnProperty` / `Object.hasOwn`), not with `in` or a truthiness test.
 *
 * @type {Readonly<Record<string, string>>}
 */
const VALID_LANGUAGE_CODES = Object.freeze({
  afr: "Afrikaans",
  amh: "Amharic",
  ara: "Arabic",
  asm: "Assamese",
  aze: "Azerbaijani",
  aze_cyrl: "Azerbaijani - Cyrilic",
  bel: "Belarusian",
  ben: "Bengali",
  bod: "Tibetan",
  bos: "Bosnian",
  bre: "Breton",
  bul: "Bulgarian",
  cat: "Catalan; Valencian",
  ceb: "Cebuano",
  ces: "Czech",
  chi_sim: "Chinese - Simplified",
  chi_tra: "Chinese - Traditional",
  chr: "Cherokee",
  cos: "Corsican",
  cym: "Welsh",
  dan: "Danish",
  dan_frak: "Danish - Fraktur (contrib)",
  deu: "German",
  deu_frak: "German - Fraktur (contrib)",
  deu_latf: "German (Fraktur Latin)",
  dzo: "Dzongkha",
  ell: "Greek, Modern (1453-)",
  eng: "English",
  enm: "English, Middle (1100-1500)",
  epo: "Esperanto",
  equ: "Math / equation detection module",
  est: "Estonian",
  eus: "Basque",
  fao: "Faroese",
  fas: "Persian",
  fil: "Filipino (old - Tagalog)",
  fin: "Finnish",
  fra: "French",
  frk: "German - Fraktur (now deu_latf)",
  frm: "French, Middle (ca.1400-1600)",
  fry: "Western Frisian",
  gla: "Scottish Gaelic",
  gle: "Irish",
  glg: "Galician",
  grc: "Greek, Ancient (to 1453) (contrib)",
  guj: "Gujarati",
  hat: "Haitian; Haitian Creole",
  heb: "Hebrew",
  hin: "Hindi",
  hrv: "Croatian",
  hun: "Hungarian",
  hye: "Armenian",
  iku: "Inuktitut",
  ind: "Indonesian",
  isl: "Icelandic",
  ita: "Italian",
  ita_old: "Italian - Old",
  jav: "Javanese",
  jpn: "Japanese",
  kan: "Kannada",
  kat: "Georgian",
  kat_old: "Georgian - Old",
  kaz: "Kazakh",
  khm: "Central Khmer",
  kir: "Kirghiz; Kyrgyz",
  kmr: "Kurmanji (Kurdish - Latin Script)",
  kor: "Korean",
  kor_vert: "Korean (vertical)",
  kur: "Kurdish (Arabic Script)",
  lao: "Lao",
  lat: "Latin",
  lav: "Latvian",
  lit: "Lithuanian",
  ltz: "Luxembourgish",
  mal: "Malayalam",
  mar: "Marathi",
  mkd: "Macedonian",
  mlt: "Maltese",
  mon: "Mongolian",
  mri: "Maori",
  msa: "Malay",
  mya: "Burmese",
  nep: "Nepali",
  nld: "Dutch; Flemish",
  nor: "Norwegian",
  oci: "Occitan (post 1500)",
  ori: "Oriya",
  osd: "Orientation and script detection module",
  pan: "Panjabi; Punjabi",
  pol: "Polish",
  por: "Portuguese",
  pus: "Pushto; Pashto",
  que: "Quechua",
  ron: "Romanian; Moldavian; Moldovan",
  rus: "Russian",
  san: "Sanskrit",
  sin: "Sinhala; Sinhalese",
  slk: "Slovak",
  slk_frak: "Slovak - Fraktur (contrib)",
  slv: "Slovenian",
  snd: "Sindhi",
  spa: "Spanish; Castilian",
  spa_old: "Spanish; Castilian - Old",
  sqi: "Albanian",
  srp: "Serbian",
  srp_latn: "Serbian - Latin",
  sun: "Sundanese",
  swa: "Swahili",
  swe: "Swedish",
  syr: "Syriac",
  tam: "Tamil",
  tat: "Tatar",
  tel: "Telugu",
  tgk: "Tajik",
  tgl: "Tagalog (new - Filipino)",
  tha: "Thai",
  tir: "Tigrinya",
  ton: "Tonga",
  tur: "Turkish",
  uig: "Uighur; Uyghur",
  ukr: "Ukrainian",
  urd: "Urdu",
  uzb: "Uzbek",
  uzb_cyrl: "Uzbek - Cyrilic",
  vie: "Vietnamese",
  yid: "Yiddish",
  yor: "Yoruba",
});
// Expose the language-code table as a named CommonJS export.
module.exports.VALID_LANGUAGE_CODES = VALID_LANGUAGE_CODES;