Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Script detection #80

Open
pjebs opened this issue Jan 26, 2025 · 2 comments
Open

Script detection #80

pjebs opened this issue Jan 26, 2025 · 2 comments

Comments

@pjebs
Copy link

pjebs commented Jan 26, 2025

It would be great if memory and CPU usage could be reduced when all that is required is Unicode script detection for a given input text.

Right now I have to detect language first, then use a table to link the language to the writing system.

Seems overkill.

@pjebs
Copy link
Author

pjebs commented Jan 26, 2025

// _HiraganaKatakana matches every rune whose script is Hiragana or Katakana.
//
// The table is built with mergeRangeTables rather than by appending the two
// scripts' R16/R32 slices. unicode.RangeTable requires each slice to be sorted
// and non-overlapping, and the supplementary-plane Katakana ranges interleave
// with the Hiragana ones, so a plain concatenation makes unicode.Is miss runes
// such as U+1B000 (KATAKANA LETTER ARCHAIC E).
var _HiraganaKatakana = mergeRangeTables(unicode.Hiragana, unicode.Katakana)

// mergeRangeTables returns a new RangeTable that matches every rune matched by
// at least one of tables. Nil tables are skipped. The inputs are not modified
// and the result shares no storage with them.
func mergeRangeTables(tables ...*unicode.RangeTable) *unicode.RangeTable {
	// Flatten every range into stride-1 spans. A strided range from one table
	// may straddle ranges from another, so strided ranges are expanded into
	// single-rune spans; that is the only way the merged result can stay
	// non-overlapping.
	var spans []unicode.Range32
	add := func(lo, hi, stride uint32) {
		if stride <= 1 {
			spans = append(spans, unicode.Range32{Lo: lo, Hi: hi, Stride: 1})
			return
		}
		for c := lo; c <= hi; c += stride {
			spans = append(spans, unicode.Range32{Lo: c, Hi: c, Stride: 1})
		}
	}
	for _, t := range tables {
		if t == nil {
			continue
		}
		for _, r := range t.R16 {
			add(uint32(r.Lo), uint32(r.Hi), uint32(r.Stride))
		}
		for _, r := range t.R32 {
			add(r.Lo, r.Hi, r.Stride)
		}
	}

	// Insertion sort by lower bound: the span list is short and built from
	// already-sorted runs, and this keeps the helper free of extra imports.
	for i := 1; i < len(spans); i++ {
		for j := i; j > 0 && spans[j].Lo < spans[j-1].Lo; j-- {
			spans[j], spans[j-1] = spans[j-1], spans[j]
		}
	}

	// Split back into the 16-bit and 32-bit halves, coalescing spans that
	// overlap or touch so the table stays compact.
	merged := &unicode.RangeTable{}
	for _, s := range spans {
		if s.Hi <= 0xFFFF {
			if n := len(merged.R16); n > 0 && s.Lo <= uint32(merged.R16[n-1].Hi)+1 {
				if hi := uint16(s.Hi); hi > merged.R16[n-1].Hi {
					merged.R16[n-1].Hi = hi
				}
				continue
			}
			merged.R16 = append(merged.R16, unicode.Range16{Lo: uint16(s.Lo), Hi: uint16(s.Hi), Stride: 1})
			continue
		}
		if n := len(merged.R32); n > 0 && s.Lo <= merged.R32[n-1].Hi+1 {
			if s.Hi > merged.R32[n-1].Hi {
				merged.R32[n-1].Hi = s.Hi
			}
			continue
		}
		merged.R32 = append(merged.R32, s)
	}

	// LatinOffset is the number of leading R16 entries that lie entirely
	// within Latin-1; unicode.Is relies on it for its fast path.
	for _, r := range merged.R16 {
		if r.Hi > unicode.MaxLatin1 {
			break
		}
		merged.LatinOffset++
	}
	return merged
}

// canadianSyllabics is a short name for the Unified Canadian Aboriginal
// Syllabics table, the default script for Cree, Inuktitut and Ojibwe.
var canadianSyllabics = unicode.Canadian_Aboriginal

// langScripts maps a lower-case ISO 639-1 language code to the Unicode range
// table of the script the language is primarily written in.
//
// Compiled from sources including
// https://www.unicode.org/cldr/charts/46/supplemental/languages_and_scripts.html
//
// Where a language has several scripts, the CLDR default ("likely") script is
// used; historic or secondary scripts (for example Osmanya for Somali or
// Baybayin for Tagalog) are deliberately not listed. The trailing comment on
// each entry gives the English and native names of the language.
//
// The map is built once at package initialisation and must be treated as
// read-only.
var langScripts = map[string]*unicode.RangeTable{
	"aa": unicode.Latin,      // Name: "Afar", NativeName: "Afaraf"},
	"ab": unicode.Cyrillic,   // Name: "Abkhaz", NativeName: "Аҧсуа бызшәа"},
	"ae": unicode.Avestan,    // Name: "Avestan", NativeName: "Avesta"},
	"af": unicode.Latin,      // Name: "Afrikaans", NativeName: "Afrikaans"},
	"ak": unicode.Latin,      // Name: "Akan", NativeName: "Akan"},
	"am": unicode.Ethiopic,   // Name: "Amharic", NativeName: "አማርኛ"},
	"an": unicode.Latin,      // Name: "Aragonese", NativeName: "Aragonés"},
	"ar": unicode.Arabic,     // Name: "Arabic", NativeName: "اللغة العربية"},
	"as": unicode.Bengali,    // Name: "Assamese", NativeName: "অসমীয়া"},
	"av": unicode.Cyrillic,   // Name: "Avaric", NativeName: "Авар мацӀ"},
	"ay": unicode.Latin,      // Name: "Aymara", NativeName: "Aymar aru"},
	"az": unicode.Latin,      // Name: "Azerbaijani", NativeName: "Azərbaycan dili"},
	"ba": unicode.Cyrillic,   // Name: "Bashkir", NativeName: "Башҡорт теле"},
	"be": unicode.Cyrillic,   // Name: "Belarusian", NativeName: "Беларуская мова"},
	"bg": unicode.Cyrillic,   // Name: "Bulgarian", NativeName: "Български език"},
	"bh": unicode.Devanagari, // Name: "Bihari", NativeName: "भोजपुरी"},
	"bi": unicode.Latin,      // Name: "Bislama", NativeName: "Bislama"},
	"bm": unicode.Latin,      // Name: "Bambara", NativeName: "Bamanankan"},
	"bn": unicode.Bengali,    // Name: "Bengali", NativeName: "বাংলা"},
	"bo": unicode.Tibetan,    // Name: "Tibetan Standard", NativeName: "བོད་ཡིག"},
	"br": unicode.Latin,      // Name: "Breton", NativeName: "Brezhoneg"},
	"bs": unicode.Latin,      // Name: "Bosnian", NativeName: "Bosanski jezik"},
	"ca": unicode.Latin,      // Name: "Catalan", NativeName: "Català"},
	"ce": unicode.Cyrillic,   // Name: "Chechen", NativeName: "Нохчийн мотт"},
	"ch": unicode.Latin,      // Name: "Chamorro", NativeName: "Chamoru"},
	"co": unicode.Latin,      // Name: "Corsican", NativeName: "Corsu"},
	"cr": canadianSyllabics,  // Name: "Cree", NativeName: "ᓀᐦᐃᔭᐍᐏᐣ"},
	"cs": unicode.Latin,      // Name: "Czech", NativeName: "Čeština"},
	"cu": unicode.Cyrillic,   // Name: "Old Church Slavonic", NativeName: "Ѩзыкъ словѣньскъ"},
	"cv": unicode.Cyrillic,   // Name: "Chuvash", NativeName: "Чӑваш чӗлхи"},
	"cy": unicode.Latin,      // Name: "Welsh", NativeName: "Cymraeg"},
	"da": unicode.Latin,      // Name: "Danish", NativeName: "Dansk"},
	"de": unicode.Latin,      // Name: "German", NativeName: "Deutsch"},
	"dv": unicode.Thaana,     // Name: "Divehi", NativeName: "Dhivehi"},
	"dz": unicode.Tibetan,    // Name: "Dzongkha", NativeName: "རྫོང་ཁ"},
	"ee": unicode.Latin,      // Name: "Ewe", NativeName: "Eʋegbe"},
	"el": unicode.Greek,      // Name: "Greek", NativeName: "Ελληνικά"},
	"en": unicode.Latin,      // Name: "English", NativeName: "English"},
	"eo": unicode.Latin,      // Name: "Esperanto", NativeName: "Esperanto"},
	"es": unicode.Latin,      // Name: "Spanish", NativeName: "Español"},
	"et": unicode.Latin,      // Name: "Estonian", NativeName: "Eesti"},
	"eu": unicode.Latin,      // Name: "Basque", NativeName: "Euskara"},
	"fa": unicode.Arabic,     // Name: "Persian", NativeName: "فارسی"},
	"ff": unicode.Latin,      // Name: "Fula", NativeName: "Fulfulde"},
	"fi": unicode.Latin,      // Name: "Finnish", NativeName: "Suomi"},
	"fj": unicode.Latin,      // Name: "Fijian", NativeName: "Vakaviti"},
	"fo": unicode.Latin,      // Name: "Faroese", NativeName: "Føroyskt"},
	"fr": unicode.Latin,      // Name: "French", NativeName: "Français"},
	"fy": unicode.Latin,      // Name: "Western Frisian", NativeName: "Frysk"},
	"ga": unicode.Latin,      // Name: "Irish", NativeName: "Gaeilge"},
	"gd": unicode.Latin,      // Name: "Scottish Gaelic", NativeName: "Gàidhlig"},
	"gl": unicode.Latin,      // Name: "Galician", NativeName: "Galego"},
	"gn": unicode.Latin,      // Name: "Guaraní", NativeName: "Avañeẽ"},
	"gu": unicode.Gujarati,   // Name: "Gujarati", NativeName: "ગુજરાતી"},
	"gv": unicode.Latin,      // Name: "Manx", NativeName: "Gaelg"},
	"ha": unicode.Latin,      // Name: "Hausa", NativeName: "هَوُسَ"},
	"he": unicode.Hebrew,     // Name: "Hebrew", NativeName: "עברית"},
	"hi": unicode.Devanagari, // Name: "Hindi", NativeName: "हिन्दी"},
	"ho": unicode.Latin,      // Name: "Hiri Motu", NativeName: "Hiri Motu"},
	"hr": unicode.Latin,      // Name: "Croatian", NativeName: "Hrvatski jezik"},
	"ht": unicode.Latin,      // Name: "Haitian", NativeName: "Kreyòl ayisyen"},
	"hu": unicode.Latin,      // Name: "Hungarian", NativeName: "Magyar"},
	"hy": unicode.Armenian,   // Name: "Armenian", NativeName: "Հայերեն"},
	"hz": unicode.Latin,      // Name: "Herero", NativeName: "Otjiherero"},
	"ia": unicode.Latin,      // Name: "Interlingua", NativeName: "Interlingua"},
	"id": unicode.Latin,      // Name: "Indonesian", NativeName: "Indonesian"},
	"ie": unicode.Latin,      // Name: "Interlingue", NativeName: "Interlingue"},
	"ig": unicode.Latin,      // Name: "Igbo", NativeName: "Asụsụ Igbo"},
	"ii": unicode.Yi,         // Name: "Nuosu", NativeName: "ꆈꌠ꒿ Nuosuhxop"},
	"ik": unicode.Latin,      // Name: "Inupiaq", NativeName: "Iñupiaq"},
	"io": unicode.Latin,      // Name: "Ido", NativeName: "Ido"},
	"is": unicode.Latin,      // Name: "Icelandic", NativeName: "Íslenska"},
	"it": unicode.Latin,      // Name: "Italian", NativeName: "Italiano"},
	"iu": canadianSyllabics,  // Name: "Inuktitut", NativeName: "ᐃᓄᒃᑎᑐᑦ"},
	"ja": _HiraganaKatakana,  // Name: "Japanese", NativeName: "日本語"},
	"jv": unicode.Latin,      // Name: "Javanese", NativeName: "Basa Jawa"},
	"ka": unicode.Georgian,   // Name: "Georgian", NativeName: "Ქართული"},
	"kg": unicode.Latin,      // Name: "Kongo", NativeName: "Kikongo"},
	"ki": unicode.Latin,      // Name: "Kikuyu", NativeName: "Gĩkũyũ"},
	"kj": unicode.Latin,      // Name: "Kwanyama", NativeName: "Kuanyama"},
	"kk": unicode.Cyrillic,   // Name: "Kazakh", NativeName: "Қазақ тілі"},
	"kl": unicode.Latin,      // Name: "Kalaallisut", NativeName: "Kalaallisut"},
	"km": unicode.Khmer,      // Name: "Khmer", NativeName: "ខេមរភាសា"},
	"kn": unicode.Kannada,    // Name: "Kannada", NativeName: "ಕನ್ನಡ"},
	"ko": unicode.Hangul,     // Name: "Korean", NativeName: "한국어"},
	"kr": unicode.Latin,      // Name: "Kanuri", NativeName: "Kanuri"},
	"ks": unicode.Arabic,     // Name: "Kashmiri", NativeName: "कश्मीरी"},
	"ku": unicode.Latin,      // Name: "Kurdish", NativeName: "Kurdî"},
	"kv": unicode.Cyrillic,   // Name: "Komi", NativeName: "Коми кыв"},
	"kw": unicode.Latin,      // Name: "Cornish", NativeName: "Kernewek"},
	"ky": unicode.Cyrillic,   // Name: "Kyrgyz", NativeName: "Кыргызча"},
	"la": unicode.Latin,      // Name: "Latin", NativeName: "Latine"},
	"lb": unicode.Latin,      // Name: "Luxembourgish", NativeName: "Lëtzebuergesch"},
	"lg": unicode.Latin,      // Name: "Ganda", NativeName: "Luganda"},
	"li": unicode.Latin,      // Name: "Limburgish", NativeName: "Limburgs"},
	"ln": unicode.Latin,      // Name: "Lingala", NativeName: "Lingála"},
	"lo": unicode.Lao,        // Name: "Lao", NativeName: "ພາສາ"},
	"lt": unicode.Latin,      // Name: "Lithuanian", NativeName: "Lietuvių kalba"},
	"lu": unicode.Latin,      // Name: "Luba-Katanga", NativeName: "Tshiluba"},
	"lv": unicode.Latin,      // Name: "Latvian", NativeName: "Latviešu valoda"},
	"mg": unicode.Latin,      // Name: "Malagasy", NativeName: "Fiteny malagasy"},
	"mh": unicode.Latin,      // Name: "Marshallese", NativeName: "Kajin M̧ajeļ"},
	"mi": unicode.Latin,      // Name: "Māori", NativeName: "Te reo Māori"},
	"mk": unicode.Cyrillic,   // Name: "Macedonian", NativeName: "Македонски јазик"},
	"ml": unicode.Malayalam,  // Name: "Malayalam", NativeName: "മലയാളം"},
	"mn": unicode.Cyrillic,   // Name: "Mongolian", NativeName: "Монгол хэл"},
	"mr": unicode.Devanagari, // Name: "Marathi", NativeName: "मराठी"},
	"ms": unicode.Latin,      // Name: "Malay", NativeName: "هاس ملايو‎"},
	"mt": unicode.Latin,      // Name: "Maltese", NativeName: "Malti"},
	"my": unicode.Myanmar,    // Name: "Burmese", NativeName: "ဗမာစာ"},
	"na": unicode.Latin,      // Name: "Nauru", NativeName: "Ekakairũ Naoero"},
	"nb": unicode.Latin,      // Name: "Norwegian Bokmål", NativeName: "Norsk bokmål"},
	"nd": unicode.Latin,      // Name: "Northern Ndebele", NativeName: "IsiNdebele"},
	"ne": unicode.Devanagari, // Name: "Nepali", NativeName: "नेपाली"},
	"ng": unicode.Latin,      // Name: "Ndonga", NativeName: "Owambo"},
	"nl": unicode.Latin,      // Name: "Dutch", NativeName: "Nederlands"},
	"nn": unicode.Latin,      // Name: "Norwegian Nynorsk", NativeName: "Norsk nynorsk"},
	"no": unicode.Latin,      // Name: "Norwegian", NativeName: "Norsk"},
	"nr": unicode.Latin,      // Name: "Southern Ndebele", NativeName: "IsiNdebele"},
	"nv": unicode.Latin,      // Name: "Navajo", NativeName: "Diné bizaad"},
	"ny": unicode.Latin,      // Name: "Chichewa", NativeName: "ChiCheŵa"},
	"oc": unicode.Latin,      // Name: "Occitan", NativeName: "Occitan"},
	"oj": canadianSyllabics,  // Name: "Ojibwe", NativeName: "ᐊᓂᔑᓈᐯᒧᐎᓐ"},
	"om": unicode.Latin,      // Name: "Oromo", NativeName: "Afaan Oromoo"},
	"or": unicode.Oriya,      // Name: "Oriya", NativeName: "ଓଡ଼ିଆ"},
	"os": unicode.Cyrillic,   // Name: "Ossetian", NativeName: "Ирон æвзаг"},
	"pa": unicode.Gurmukhi,   // Name: "Panjabi", NativeName: "ਪੰਜਾਬੀ"},
	"pi": unicode.Devanagari, // Name: "Pāli", NativeName: "पाऴि"},
	"pl": unicode.Latin,      // Name: "Polish", NativeName: "Język polski"},
	"ps": unicode.Arabic,     // Name: "Pashto", NativeName: "پښتو"},
	"pt": unicode.Latin,      // Name: "Portuguese", NativeName: "Português"},
	"qu": unicode.Latin,      // Name: "Quechua", NativeName: "Runa Simi"},
	"rm": unicode.Latin,      // Name: "Romansh", NativeName: "Rumantsch grischun"},
	"rn": unicode.Latin,      // Name: "Kirundi", NativeName: "Ikirundi"},
	"ro": unicode.Latin,      // Name: "Romanian", NativeName: "Română"},
	"ru": unicode.Cyrillic,   // Name: "Russian", NativeName: "Русский"},
	"rw": unicode.Latin,      // Name: "Kinyarwanda", NativeName: "Ikinyarwanda"},
	"sa": unicode.Devanagari, // Name: "Sanskrit", NativeName: "संस्कृतम्"},
	"sc": unicode.Latin,      // Name: "Sardinian", NativeName: "Sardu"},
	"sd": unicode.Arabic,     // Name: "Sindhi", NativeName: "सिन्धी"},
	"se": unicode.Latin,      // Name: "Northern Sami", NativeName: "Davvisámegiella"},
	"sg": unicode.Latin,      // Name: "Sango", NativeName: "Yângâ tî sängö"},
	"si": unicode.Sinhala,    // Name: "Sinhala", NativeName: "සිංහල"},
	"sk": unicode.Latin,      // Name: "Slovak", NativeName: "Slovenčina"},
	"sl": unicode.Latin,      // Name: "Slovene", NativeName: "Slovenski jezik"},
	"sm": unicode.Latin,      // Name: "Samoan", NativeName: "Gagana faa Samoa"},
	"sn": unicode.Latin,      // Name: "Shona", NativeName: "ChiShona"},
	"so": unicode.Latin,      // Name: "Somali", NativeName: "Soomaaliga"},
	"sq": unicode.Latin,      // Name: "Albanian", NativeName: "Shqip"},
	"sr": unicode.Cyrillic,   // Name: "Serbian", NativeName: "Српски језик"},
	"ss": unicode.Latin,      // Name: "Swati", NativeName: "SiSwati"},
	"st": unicode.Latin,      // Name: "Southern Sotho", NativeName: "Sesotho"},
	"su": unicode.Latin,      // Name: "Sundanese", NativeName: "Basa Sunda"},
	"sv": unicode.Latin,      // Name: "Swedish", NativeName: "Svenska"},
	"sw": unicode.Latin,      // Name: "Swahili", NativeName: "Kiswahili"},
	"ta": unicode.Tamil,      // Name: "Tamil", NativeName: "தமிழ்"},
	"te": unicode.Telugu,     // Name: "Telugu", NativeName: "తెలుగు"},
	"tg": unicode.Cyrillic,   // Name: "Tajik", NativeName: "Тоҷикӣ"},
	"th": unicode.Thai,       // Name: "Thai", NativeName: "ไทย"},
	"ti": unicode.Ethiopic,   // Name: "Tigrinya", NativeName: "ትግርኛ"},
	"tk": unicode.Latin,      // Name: "Turkmen", NativeName: "Türkmen"},
	"tl": unicode.Latin,      // Name: "Tagalog", NativeName: "Wikang Tagalog"},
	"tn": unicode.Latin,      // Name: "Tswana", NativeName: "Setswana"},
	"to": unicode.Latin,      // Name: "Tonga", NativeName: "Faka Tonga"},
	"tr": unicode.Latin,      // Name: "Turkish", NativeName: "Türkçe"},
	"ts": unicode.Latin,      // Name: "Tsonga", NativeName: "Xitsonga"},
	"tt": unicode.Cyrillic,   // Name: "Tatar", NativeName: "Татар теле"},
	"tw": unicode.Latin,      // Name: "Twi", NativeName: "Twi"},
	"ty": unicode.Latin,      // Name: "Tahitian", NativeName: "Reo Tahiti"},
	"ug": unicode.Arabic,     // Name: "Uyghur", NativeName: "ئۇيغۇرچە‎"},
	"uk": unicode.Cyrillic,   // Name: "Ukrainian", NativeName: "Українська"},
	"ur": unicode.Arabic,     // Name: "Urdu", NativeName: "اردو"},
	"uz": unicode.Latin,      // Name: "Uzbek", NativeName: "Ўзбек"},
	"ve": unicode.Latin,      // Name: "Venda", NativeName: "Tshivenḓa"},
	"vi": unicode.Latin,      // Name: "Vietnamese", NativeName: "Tiếng Việt"},
	"vo": unicode.Latin,      // Name: "Volapük", NativeName: "Volapük"},
	"wa": unicode.Latin,      // Name: "Walloon", NativeName: "Walon"},
	"wo": unicode.Latin,      // Name: "Wolof", NativeName: "Wollof"},
	"xh": unicode.Latin,      // Name: "Xhosa", NativeName: "IsiXhosa"},
	"yi": unicode.Hebrew,     // Name: "Yiddish", NativeName: "ייִדיש"},
	"yo": unicode.Latin,      // Name: "Yoruba", NativeName: "Yorùbá"},
	"za": unicode.Latin,      // Name: "Zhuang", NativeName: "Saɯ cueŋƅ"},
	"zh": unicode.Han,        // Name: "Chinese", NativeName: "中文"},
	"zu": unicode.Latin,      // Name: "Zulu", NativeName: "IsiZulu"},
}

// langToScript returns the Unicode range table of the script that lang is
// primarily written in, or nil if lang is not a known language code.
//
// lang is a two-letter ISO 639-1 language code; matching is case-insensitive.
// The returned table is shared between callers and must not be modified.
func langToScript(lang string) *unicode.RangeTable {
	return langScripts[strings.ToLower(lang)]
}

@tassa-yoniso-manasi-karoto

I made an implementation for ISO 639-3, based on Unicode's documentation, here

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants