* Add multilingual support for OCR module * Add OCR language as a server variable that is passed into the Collector; support all valid Tesseract language codes; filter and parse only valid codes, with fallbacks * persist TARGET_OCR_LANG * update docker example env --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
156 lines
3.7 KiB
JavaScript
156 lines
3.7 KiB
JavaScript
/*
  To get the list of valid language codes, do the following:

  Open the following URL in your browser:
  https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html

  In the browser console, check that this element is the table's tbody containing all the codes:
  document.getElementsByTagName('table').item(0).children.item(1)

  Now, copy the following code and paste it into the console:
  function parseLangs() {
    let langs = {};
    Array.from(document.getElementsByTagName('table').item(0).children.item(1).children).forEach((el) => {
      const [codeEl, languageEl, ...rest] = el.children
      const code = codeEl.innerText.trim()
      const language = languageEl.innerText.trim()
      if (!!code && !!language) langs[code] = language
    })
    return langs;
  }

  Now, run the function; the resulting object is copied to your clipboard:
  copy(parseLangs())
*/
|
|
|
|
/**
 * Lookup table of supported OCR language codes.
 *
 * Keys are Tesseract language codes (e.g. `eng`, `chi_sim`); values are the
 * human-readable language names. The entries are taken from the Tesseract
 * "Data Files in different versions" table, so the names keep the upstream
 * spelling exactly as published there.
 *
 * The object is frozen because it is shared by every importer of this module:
 * an accidental write or delete in one consumer must not change which codes
 * are considered valid everywhere else.
 *
 * @type {Readonly<Record<string, string>>}
 */
const VALID_LANGUAGE_CODES = Object.freeze({
  afr: "Afrikaans",
  amh: "Amharic",
  ara: "Arabic",
  asm: "Assamese",
  aze: "Azerbaijani",
  aze_cyrl: "Azerbaijani - Cyrilic",
  bel: "Belarusian",
  ben: "Bengali",
  bod: "Tibetan",
  bos: "Bosnian",
  bre: "Breton",
  bul: "Bulgarian",
  cat: "Catalan; Valencian",
  ceb: "Cebuano",
  ces: "Czech",
  chi_sim: "Chinese - Simplified",
  chi_tra: "Chinese - Traditional",
  chr: "Cherokee",
  cos: "Corsican",
  cym: "Welsh",
  dan: "Danish",
  dan_frak: "Danish - Fraktur (contrib)",
  deu: "German",
  deu_frak: "German - Fraktur (contrib)",
  deu_latf: "German (Fraktur Latin)",
  dzo: "Dzongkha",
  ell: "Greek, Modern (1453-)",
  eng: "English",
  enm: "English, Middle (1100-1500)",
  epo: "Esperanto",
  equ: "Math / equation detection module",
  est: "Estonian",
  eus: "Basque",
  fao: "Faroese",
  fas: "Persian",
  fil: "Filipino (old - Tagalog)",
  fin: "Finnish",
  fra: "French",
  frk: "German - Fraktur (now deu_latf)",
  frm: "French, Middle (ca.1400-1600)",
  fry: "Western Frisian",
  gla: "Scottish Gaelic",
  gle: "Irish",
  glg: "Galician",
  grc: "Greek, Ancient (to 1453) (contrib)",
  guj: "Gujarati",
  hat: "Haitian; Haitian Creole",
  heb: "Hebrew",
  hin: "Hindi",
  hrv: "Croatian",
  hun: "Hungarian",
  hye: "Armenian",
  iku: "Inuktitut",
  ind: "Indonesian",
  isl: "Icelandic",
  ita: "Italian",
  ita_old: "Italian - Old",
  jav: "Javanese",
  jpn: "Japanese",
  kan: "Kannada",
  kat: "Georgian",
  kat_old: "Georgian - Old",
  kaz: "Kazakh",
  khm: "Central Khmer",
  kir: "Kirghiz; Kyrgyz",
  kmr: "Kurmanji (Kurdish - Latin Script)",
  kor: "Korean",
  kor_vert: "Korean (vertical)",
  kur: "Kurdish (Arabic Script)",
  lao: "Lao",
  lat: "Latin",
  lav: "Latvian",
  lit: "Lithuanian",
  ltz: "Luxembourgish",
  mal: "Malayalam",
  mar: "Marathi",
  mkd: "Macedonian",
  mlt: "Maltese",
  mon: "Mongolian",
  mri: "Maori",
  msa: "Malay",
  mya: "Burmese",
  nep: "Nepali",
  nld: "Dutch; Flemish",
  nor: "Norwegian",
  oci: "Occitan (post 1500)",
  ori: "Oriya",
  osd: "Orientation and script detection module",
  pan: "Panjabi; Punjabi",
  pol: "Polish",
  por: "Portuguese",
  pus: "Pushto; Pashto",
  que: "Quechua",
  ron: "Romanian; Moldavian; Moldovan",
  rus: "Russian",
  san: "Sanskrit",
  sin: "Sinhala; Sinhalese",
  slk: "Slovak",
  slk_frak: "Slovak - Fraktur (contrib)",
  slv: "Slovenian",
  snd: "Sindhi",
  spa: "Spanish; Castilian",
  spa_old: "Spanish; Castilian - Old",
  sqi: "Albanian",
  srp: "Serbian",
  srp_latn: "Serbian - Latin",
  sun: "Sundanese",
  swa: "Swahili",
  swe: "Swedish",
  syr: "Syriac",
  tam: "Tamil",
  tat: "Tatar",
  tel: "Telugu",
  tgk: "Tajik",
  tgl: "Tagalog (new - Filipino)",
  tha: "Thai",
  tir: "Tigrinya",
  ton: "Tonga",
  tur: "Turkish",
  uig: "Uighur; Uyghur",
  ukr: "Ukrainian",
  urd: "Urdu",
  uzb: "Uzbek",
  uzb_cyrl: "Uzbek - Cyrilic",
  vie: "Vietnamese",
  yid: "Yiddish",
  yor: "Yoruba",
});
|
|
|
|
// Expose the language-code lookup table as a named property on this module's CommonJS exports.
module.exports.VALID_LANGUAGE_CODES = VALID_LANGUAGE_CODES;
|