-
Notifications
You must be signed in to change notification settings - Fork 67
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Script detection #80
Comments
// _HiraganaKatakana matches every rune that belongs to the Hiragana or the
// Katakana script.
//
// The table is built by merging the two standard tables rather than by
// concatenating their range slices: unicode.Is requires R16 and R32 to be
// sorted and non-overlapping, and the supplementary-plane ranges of the two
// scripts interleave, so a plain append yields a table that misclassifies
// some runes.
var _HiraganaKatakana = mergeRangeTables(unicode.Hiragana, unicode.Katakana)

// mergeRangeTables returns a newly allocated table containing the union of
// the runes in tables. Nil tables are skipped. Every input is expected to be
// a well-formed unicode.RangeTable (ranges sorted and non-overlapping). The
// result uses stride-1 ranges only, so strided input ranges that interleave
// with ranges of another table are merged correctly.
func mergeRangeTables(tables ...*unicode.RangeTable) *unicode.RangeTable {
	var union []rune // ascending, duplicate-free
	for _, t := range tables {
		if t == nil {
			continue
		}

		// Expand the table into its individual runes. A well-formed table
		// yields them in ascending order (all of R16 precedes all of R32).
		var runes []rune
		for _, r := range t.R16 {
			step := rune(r.Stride)
			if step < 1 {
				step = 1 // guard against a malformed zero stride
			}
			for c := rune(r.Lo); c <= rune(r.Hi); c += step {
				runes = append(runes, c)
			}
		}
		for _, r := range t.R32 {
			step := rune(r.Stride)
			if step < 1 {
				step = 1 // guard against a malformed zero stride
			}
			for c := rune(r.Lo); c <= rune(r.Hi); c += step {
				runes = append(runes, c)
			}
		}

		// Two-way merge of the sorted sequences, dropping duplicates.
		merged := make([]rune, 0, len(union)+len(runes))
		i, j := 0, 0
		for i < len(union) && j < len(runes) {
			switch a, b := union[i], runes[j]; {
			case a < b:
				merged = append(merged, a)
				i++
			case a > b:
				merged = append(merged, b)
				j++
			default:
				merged = append(merged, a)
				i++
				j++
			}
		}
		merged = append(merged, union[i:]...)
		merged = append(merged, runes[j:]...)
		union = merged
	}

	// Collapse runs of consecutive code points into stride-1 ranges.
	out := &unicode.RangeTable{}
	for i := 0; i < len(union); {
		lo, hi := union[i], union[i]
		i++
		// Never extend a run across U+FFFF/U+10000: R16 holds the former,
		// R32 must only hold values >= 0x10000.
		for i < len(union) && union[i] == hi+1 && union[i] != 0x10000 {
			hi = union[i]
			i++
		}
		if hi <= 0xFFFF {
			out.R16 = append(out.R16, unicode.Range16{Lo: uint16(lo), Hi: uint16(hi), Stride: 1})
			if hi <= unicode.MaxLatin1 {
				out.LatinOffset++
			}
			continue
		}
		out.R32 = append(out.R32, unicode.Range32{Lo: uint32(lo), Hi: uint32(hi), Stride: 1})
	}
	return out
}
// lang is 2 letter language code ISO639-1
func langToScript(lang string) *unicode.RangeTable {
// Compiled from sources including https://www.unicode.org/cldr/charts/46/supplemental/languages_and_scripts.html
scripts := map[string]*unicode.RangeTable{
"aa": unicode.Latin, // Name: "Afar", NativeName: "Afaraf"},
"ab": unicode.Cyrillic, // Name: "Abkhaz", NativeName: "Аҧсуа бызшәа"},
"ae": unicode.Avestan, // Name: "Avestan", NativeName: "Avesta"},
"af": unicode.Latin, // Name: "Afrikaans", NativeName: "Afrikaans"},
"ak": unicode.Latin, // Name: "Akan", NativeName: "Akan"},
"am": unicode.Ethiopic, // Name: "Amharic", NativeName: "አማርኛ"},
"an": unicode.Latin, // Name: "Aragonese", NativeName: "Aragonés"},
"ar": unicode.Arabic, // Name: "Arabic", NativeName: "اللغة العربية"},
"as": unicode.Bengali, // Name: "Assamese", NativeName: "অসমীয়া"},
"av": unicode.Cyrillic, // Name: "Avaric", NativeName: "Авар мацӀ"},
"ay": unicode.Latin, // Name: "Aymara", NativeName: "Aymar aru"},
"az": unicode.Latin, // Name: "Azerbaijani", NativeName: "Azərbaycan dili"},
"ba": unicode.Cyrillic, // Name: "Bashkir", NativeName: "Башҡорт теле"},
"be": unicode.Cyrillic, // Name: "Belarusian", NativeName: "Беларуская мова"},
"bg": unicode.Cyrillic, // Name: "Bulgarian", NativeName: "Български език"},
"bh": nil, // Name: "Bihari", NativeName: "भोजपुरी"},
"bi": unicode.Latin, // Name: "Bislama", NativeName: "Bislama"},
"bm": unicode.Latin, // Name: "Bambara", NativeName: "Bamanankan"},
"bn": unicode.Bengali, // Name: "Bengali", NativeName: "বাংলা"},
"bo": unicode.Tibetan, // Name: "Tibetan Standard", NativeName: "བོད་ཡིག"},
"br": unicode.Latin, // Name: "Breton", NativeName: "Brezhoneg"},
"bs": unicode.Latin, // Name: "Bosnian", NativeName: "Bosanski jezik"},
"ca": unicode.Latin, // Name: "Catalan", NativeName: "Català"},
"ce": unicode.Cyrillic, // Name: "Chechen", NativeName: "Нохчийн мотт"},
"ch": unicode.Latin, // Name: "Chamorro", NativeName: "Chamoru"},
"co": unicode.Latin, // Name: "Corsican", NativeName: "Corsu"},
"cr": unicode.Latin, // Name: "Cree", NativeName: "ᓀᐦᐃᔭᐍᐏᐣ"},
"cs": unicode.Latin, // Name: "Czech", NativeName: "Čeština"},
"cu": nil, // Name: "Old Church Slavonic", NativeName: "Ѩзыкъ словѣньскъ"},
"cv": unicode.Cyrillic, // Name: "Chuvash", NativeName: "Чӑваш чӗлхи"},
"cy": unicode.Latin, // Name: "Welsh", NativeName: "Cymraeg"},
"da": unicode.Latin, // Name: "Danish", NativeName: "Dansk"},
"de": unicode.Latin, // Name: "German", NativeName: "Deutsch"},
"dv": unicode.Thaana, // Name: "Divehi", NativeName: "Dhivehi"},
"dz": unicode.Tibetan, // Name: "Dzongkha", NativeName: "རྫོང་ཁ"},
"ee": nil, // Name: "Ewe", NativeName: "Eʋegbe"},
"el": unicode.Greek, // Name: "Greek", NativeName: "Ελληνικά"},
"en": unicode.Latin, // Name: "English", NativeName: "English"},
"eo": unicode.Latin, // Name: "Esperanto", NativeName: "Esperanto"},
"es": unicode.Latin, // Name: "Spanish", NativeName: "Español"},
"et": unicode.Latin, // Name: "Estonian", NativeName: "Eesti"},
"eu": unicode.Latin, // Name: "Basque", NativeName: "Euskara"},
"fa": unicode.Arabic, // Name: "Persian", NativeName: "فارسی"},
"ff": unicode.Latin, // Name: "Fula", NativeName: "Fulfulde"},
"fi": unicode.Latin, // Name: "Finnish", NativeName: "Suomi"},
"fj": unicode.Latin, // Name: "Fijian", NativeName: "Vakaviti"},
"fo": unicode.Latin, // Name: "Faroese", NativeName: "Føroyskt"},
"fr": unicode.Latin, // Name: "French", NativeName: "Français"},
"fy": unicode.Latin, // Name: "Western Frisian", NativeName: "Frysk"},
"ga": unicode.Latin, // Name: "Irish", NativeName: "Gaeilge"},
"gd": unicode.Latin, // Name: "Scottish Gaelic", NativeName: "Gàidhlig"},
"gl": unicode.Latin, // Name: "Galician", NativeName: "Galego"},
"gn": unicode.Latin, // Name: "Guaraní", NativeName: "Avañeẽ"},
"gu": unicode.Gujarati, // Name: "Gujarati", NativeName: "ગુજરાતી"},
"gv": unicode.Latin, // Name: "Manx", NativeName: "Gaelg"},
"ha": unicode.Latin, // Name: "Hausa", NativeName: "هَوُسَ"},
"he": unicode.Hebrew, // Name: "Hebrew", NativeName: "עברית"},
"hi": unicode.Devanagari, // Name: "Hindi", NativeName: "हिन्दी"},
"ho": unicode.Latin, // Name: "Hiri Motu", NativeName: "Hiri Motu"},
"hr": unicode.Latin, // Name: "Croatian", NativeName: "Hrvatski jezik"},
"ht": unicode.Latin, // Name: "Haitian", NativeName: "Kreyòl ayisyen"},
"hu": unicode.Latin, // Name: "Hungarian", NativeName: "Magyar"},
"hy": unicode.Armenian, // Name: "Armenian", NativeName: "Հայերեն"},
"hz": unicode.Latin, // Name: "Herero", NativeName: "Otjiherero"},
"ia": unicode.Latin, // Name: "Interlingua", NativeName: "Interlingua"},
"id": unicode.Latin, // Name: "Indonesian", NativeName: "Indonesian"},
"ie": unicode.Latin, // Name: "Interlingue", NativeName: "Interlingue"},
"ig": unicode.Latin, // Name: "Igbo", NativeName: "Asụsụ Igbo"},
"ii": nil, // Name: "Nuosu", NativeName: "ꆈꌠ꒿ Nuosuhxop"},
"ik": unicode.Latin, // Name: "Inupiaq", NativeName: "Iñupiaq"},
"io": nil, // Name: "Ido", NativeName: "Ido"},
"is": unicode.Latin, // Name: "Icelandic", NativeName: "Íslenska"},
"it": unicode.Latin, // Name: "Italian", NativeName: "Italiano"},
"iu": unicode.Latin, // Name: "Inuktitut", NativeName: "ᐃᓄᒃᑎᑐᑦ"},
"ja": _HiraganaKatakana, // Name: "Japanese", NativeName: "日本語"},
"jv": unicode.Javanese, // Name: "Javanese", NativeName: "Basa Jawa"},
"ka": unicode.Georgian, // Name: "Georgian", NativeName: "Ქართული"},
"kg": unicode.Latin, // Name: "Kongo", NativeName: "Kikongo"},
"ki": unicode.Latin, // Name: "Kikuyu", NativeName: "Gĩkũyũ"},
"kj": nil, // Name: "Kwanyama", NativeName: "Kuanyama"},
"kk": unicode.Cyrillic, // Name: "Kazakh", NativeName: "Қазақ тілі"},
"kl": unicode.Latin, // Name: "Kalaallisut", NativeName: "Kalaallisut"},
"km": unicode.Khmer, // Name: "Khmer", NativeName: "ខេមរភាសា"},
"kn": unicode.Kannada, // Name: "Kannada", NativeName: "ಕನ್ನಡ"},
"ko": unicode.Hangul, // Name: "Korean", NativeName: "한국어"},
"kr": unicode.Latin, // Name: "Kanuri", NativeName: "Kanuri"},
"ks": unicode.Arabic, // Name: "Kashmiri", NativeName: "कश्मीरी"},
"ku": unicode.Arabic, // Name: "Kurdish", NativeName: "Kurdî"},
"kv": unicode.Cyrillic, // Name: "Komi", NativeName: "Коми кыв"},
"kw": unicode.Latin, // Name: "Cornish", NativeName: "Kernewek"},
"ky": unicode.Cyrillic, // Name: "Kyrgyz", NativeName: "Кыргызча"},
"la": unicode.Latin, // Name: "Latin", NativeName: "Latine"},
"lb": unicode.Latin, // Name: "Luxembourgish", NativeName: "Lëtzebuergesch"},
"lg": unicode.Latin, // Name: "Ganda", NativeName: "Luganda"},
"li": unicode.Latin, // Name: "Limburgish", NativeName: "Limburgs"},
"ln": unicode.Latin, // Name: "Lingala", NativeName: "Lingála"},
"lo": unicode.Lao, // Name: "Lao", NativeName: "ພາສາ"},
"lt": unicode.Latin, // Name: "Lithuanian", NativeName: "Lietuvių kalba"},
"lu": nil, // Name: "Luba-Katanga", NativeName: "Tshiluba"},
"lv": unicode.Latin, // Name: "Latvian", NativeName: "Latviešu valoda"},
"mg": unicode.Latin, // Name: "Malagasy", NativeName: "Fiteny malagasy"},
"mh": unicode.Latin, // Name: "Marshallese", NativeName: "Kajin M̧ajeļ"},
"mi": unicode.Latin, // Name: "Māori", NativeName: "Te reo Māori"},
"mk": unicode.Cyrillic, // Name: "Macedonian", NativeName: "Македонски јазик"},
"ml": unicode.Malayalam, // Name: "Malayalam", NativeName: "മലയാളം"},
"mn": unicode.Mongolian, // Name: "Mongolian", NativeName: "Монгол хэл"},
"mr": unicode.Devanagari, // Name: "Marathi", NativeName: "मराठी"},
"ms": unicode.Latin, // Name: "Malay", NativeName: "هاس ملايو"},
"mt": unicode.Latin, // Name: "Maltese", NativeName: "Malti"},
"my": unicode.Myanmar, // Name: "Burmese", NativeName: "ဗမာစာ"},
"na": unicode.Latin, // Name: "Nauru", NativeName: "Ekakairũ Naoero"},
"nb": unicode.Latin, // Name: "Norwegian Bokmål", NativeName: "Norsk bokmål"},
"nd": unicode.Latin, // Name: "Northern Ndebele", NativeName: "IsiNdebele"},
"ne": unicode.Devanagari, // Name: "Nepali", NativeName: "नेपाली"},
"ng": unicode.Latin, // Name: "Ndonga", NativeName: "Owambo"},
"nl": unicode.Latin, // Name: "Dutch", NativeName: "Nederlands"},
"nn": unicode.Latin, // Name: "Norwegian Nynorsk", NativeName: "Norsk nynorsk"},
"no": unicode.Latin, // Name: "Norwegian", NativeName: "Norsk"},
"nr": unicode.Latin, // Name: "Southern Ndebele", NativeName: "IsiNdebele"},
"nv": unicode.Latin, // Name: "Navajo", NativeName: "Diné bizaad"},
"ny": nil, // Name: "Chichewa", NativeName: "ChiCheŵa"},
"oc": unicode.Latin, // Name: "Occitan", NativeName: "Occitan"},
"oj": nil, // Name: "Ojibwe", NativeName: "ᐊᓂᔑᓈᐯᒧᐎᓐ"},
"om": unicode.Latin, // Name: "Oromo", NativeName: "Afaan Oromoo"},
"or": unicode.Oriya, // Name: "Oriya", NativeName: "ଓଡ଼ିଆ"},
"os": nil, // Name: "Ossetian", NativeName: "Ирон æвзаг"},
"pa": unicode.Gurmukhi, // Name: "Panjabi", NativeName: "ਪੰਜਾਬੀ"},
"pi": unicode.Devanagari, // Name: "Pāli", NativeName: "पाऴि"},
"pl": unicode.Latin, // Name: "Polish", NativeName: "Język polski"},
"ps": unicode.Arabic, // Name: "Pashto", NativeName: "پښتو"},
"pt": unicode.Latin, // Name: "Portuguese", NativeName: "Português"},
"qu": unicode.Latin, // Name: "Quechua", NativeName: "Runa Simi"},
"rm": unicode.Latin, // Name: "Romansh", NativeName: "Rumantsch grischun"},
"rn": nil, // Name: "Kirundi", NativeName: "Ikirundi"},
"ro": unicode.Latin, // Name: "Romanian", NativeName: "Română"},
"ru": unicode.Cyrillic, // Name: "Russian", NativeName: "Русский"},
"rw": unicode.Latin, // Name: "Kinyarwanda", NativeName: "Ikinyarwanda"},
"sa": unicode.Devanagari, // Name: "Sanskrit", NativeName: "संस्कृतम्"},
"sc": unicode.Latin, // Name: "Sardinian", NativeName: "Sardu"},
"sd": unicode.Arabic, // Name: "Sindhi", NativeName: "सिन्धी"},
"se": unicode.Latin, // Name: "Northern Sami", NativeName: "Davvisámegiella"},
"sg": unicode.Latin, // Name: "Sango", NativeName: "Yângâ tî sängö"},
"si": unicode.Sinhala, // Name: "Sinhala", NativeName: "සිංහල"},
"sk": unicode.Latin, // Name: "Slovak", NativeName: "Slovenčina"},
"sl": unicode.Latin, // Name: "Slovene", NativeName: "Slovenski jezik"},
"sm": unicode.Latin, // Name: "Samoan", NativeName: "Gagana faa Samoa"},
"sn": unicode.Latin, // Name: "Shona", NativeName: "ChiShona"},
"so": unicode.Osmanya, // Name: "Somali", NativeName: "Soomaaliga"},
"sq": unicode.Latin, // Name: "Albanian", NativeName: "Shqip"},
"sr": unicode.Cyrillic, // Name: "Serbian", NativeName: "Српски језик"},
"ss": unicode.Latin, // Name: "Swati", NativeName: "SiSwati"},
"st": unicode.Latin, // Name: "Southern Sotho", NativeName: "Sesotho"},
"su": unicode.Sundanese, // Name: "Sundanese", NativeName: "Basa Sunda"},
"sv": unicode.Latin, // Name: "Swedish", NativeName: "Svenska"},
"sw": unicode.Latin, // Name: "Swahili", NativeName: "Kiswahili"},
"ta": unicode.Tamil, // Name: "Tamil", NativeName: "தமிழ்"},
"te": unicode.Telugu, // Name: "Telugu", NativeName: "తెలుగు"},
"tg": unicode.Cyrillic, // Name: "Tajik", NativeName: "Тоҷикӣ"},
"th": unicode.Thai, // Name: "Thai", NativeName: "ไทย"},
"ti": unicode.Ethiopic, // Name: "Tigrinya", NativeName: "ትግርኛ"},
"tk": unicode.Cyrillic, // Name: "Turkmen", NativeName: "Türkmen"},
"tl": unicode.Tagalog, // Name: "Tagalog", NativeName: "Wikang Tagalog"},
"tn": unicode.Latin, // Name: "Tswana", NativeName: "Setswana"},
"to": unicode.Latin, // Name: "Tonga", NativeName: "Faka Tonga"},
"tr": unicode.Latin, // Name: "Turkish", NativeName: "Türkçe"},
"ts": unicode.Latin, // Name: "Tsonga", NativeName: "Xitsonga"},
"tt": unicode.Cyrillic, // Name: "Tatar", NativeName: "Татар теле"},
"tw": nil, // Name: "Twi", NativeName: "Twi"},
"ty": unicode.Latin, // Name: "Tahitian", NativeName: "Reo Tahiti"},
"ug": unicode.Arabic, // Name: "Uyghur", NativeName: "ئۇيغۇرچە"},
"uk": unicode.Cyrillic, // Name: "Ukrainian", NativeName: "Українська"},
"ur": unicode.Arabic, // Name: "Urdu", NativeName: "اردو"},
"uz": unicode.Latin, // Name: "Uzbek", NativeName: "Ўзбек"},
"ve": unicode.Latin, // Name: "Venda", NativeName: "Tshivenḓa"},
"vi": unicode.Latin, // Name: "Vietnamese", NativeName: "Tiếng Việt"},
"vo": unicode.Latin, // Name: "Volapük", NativeName: "Volapük"},
"wa": unicode.Latin, // Name: "Walloon", NativeName: "Walon"},
"wo": unicode.Latin, // Name: "Wolof", NativeName: "Wollof"},
"xh": unicode.Latin, // Name: "Xhosa", NativeName: "IsiXhosa"},
"yi": unicode.Hebrew, // Name: "Yiddish", NativeName: "ייִדיש"},
"yo": unicode.Latin, // Name: "Yoruba", NativeName: "Yorùbá"},
"za": unicode.Han, // Name: "Zhuang", NativeName: "Saɯ cueŋƅ"},
"zh": unicode.Han, // Name: "Chinese", NativeName: "中文"},
"zu": unicode.Latin, // Name: "Zulu", NativeName: "IsiZulu"},
}
return scripts[strings.ToLower(lang)]
} |
I made an implementation for ISO 639-3, based on Unicode's documentation, here.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It would be great if memory and CPU usage could be reduced when all that is required is Unicode script detection for a given input text.
Right now I have to detect the language first, then use a table to map the language to its writing system.
That seems like overkill.
The text was updated successfully, but these errors were encountered: