Skip to content

Commit

Permalink
Fix language detection of non-latin alphabets even at few characters (m…
Browse files Browse the repository at this point in the history
  • Loading branch information
Gargron authored Mar 15, 2019
1 parent 51226ab commit 2ced6a1
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 9 deletions.
31 changes: 24 additions & 7 deletions app/lib/language_detector.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,23 @@
class LanguageDetector
include Singleton

# Text with at least this many characters is treated as long enough for
# detection (compared against text.size in sufficient_text_length?).
CHARACTER_THRESHOLD = 140

# Matches runs of characters from scripts that are taken as a strong hint of
# the language even in short text (used by language_specific_character_set?).
RELIABLE_CHARACTERS_RE = /[\p{Hebrew}\p{Arabic}\p{Syriac}\p{Thaana}\p{Nko}\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}]+/m

# Builds the CLD3 identifier that detect_language_code queries.
def initialize
  # NOTE(review): per the cld3 bindings these two arguments are the minimum
  # and maximum number of bytes of input to consider — confirm against the
  # installed gem version.
  min_bytes = 1
  max_bytes = 2048

  @identifier = CLD3::NNetLanguageIdentifier.new(min_bytes, max_bytes)
end

# Determines the language of +text+.
#
# The text is first passed through prepare_text; if nothing is left, nil is
# returned. Otherwise the detected language code is returned, falling back to
# default_locale(account) when detection yields nothing.
def detect(text, account)
  cleaned = prepare_text(text)
  return nil if cleaned.blank?

  language = detect_language_code(cleaned)
  language || default_locale(account)
end

# Lists the languages CLD3 knows about as unique symbols, each name being
# passed through iso6391 first.
#
# The list is built from a constant, so it is computed once and cached in
# @language_names; callers receive the same Array on every call and should
# not mutate it.
def language_names
  @language_names ||= CLD3::TaskContextParams::LANGUAGE_NAMES.map { |name| iso6391(name.to_s).to_sym }.uniq
end

private
Expand All @@ -29,12 +29,29 @@ def prepare_text(text)
end

# True when +text+ should not be handed to the language identifier; this is
# the exact negation of reliable_input?.
def unreliable_input?(text)
  !reliable_input?(text)
end

# Input is considered reliable when it is long enough, or, failing that, when
# enough of it is written in a language-specific script. The script check is
# only evaluated when the length check fails.
def reliable_input?(text)
  long_enough = sufficient_text_length?(text)
  return long_enough if long_enough

  language_specific_character_set?(text)
end

# Whether +text+ has at least CHARACTER_THRESHOLD characters.
def sufficient_text_length?(text)
  character_count = text.size
  character_count >= CHARACTER_THRESHOLD
end

# Whether more than 30% of the characters in +text+ belong to one of the
# scripts matched by RELIABLE_CHARACTERS_RE.
#
# @param text [String] text to inspect
# @return [Boolean] false when no such characters are present
def language_specific_character_set?(text)
  words = text.scan(RELIABLE_CHARACTERS_RE)
  return false if words.empty?

  # The ratio is taken over every character of the text (spaces and
  # punctuation included), not only over letters.
  words.sum(&:size).fdiv(text.size) > 0.3
end

# Asks the identifier for the language of +text+.
#
# Returns nil when the input is judged unreliable or when the identifier's
# result is not flagged reliable; otherwise the result's language is passed
# through iso6391 and returned as a symbol.
def detect_language_code(text)
  return nil if unreliable_input?(text)

  guess = @identifier.find_language(text)
  return nil unless guess.reliable?

  iso6391(guess.language.to_s).to_sym
end
Expand Down Expand Up @@ -77,6 +94,6 @@ def remove_html(text)
end

# Fallback locale used when detection yields nothing.
#
# Returns nil for accounts that are not local. For local accounts, returns
# the account's user_locale as a symbol, or I18n.default_locale when the
# account has none.
def default_locale(account)
  return unless account.local?

  account.user_locale&.to_sym || I18n.default_locale
end
end
4 changes: 2 additions & 2 deletions spec/lib/language_detector_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,11 @@
end

describe 'remote user' do
  # A short string written entirely in Hangul is expected to be detected even
  # though it is far shorter than the length threshold and the account is
  # remote.
  it 'detects Korean language' do
    string = '안녕하세요'
    result = described_class.instance.detect(string, account_remote)

    expect(result).to eq :ko
  end
end

Expand Down

0 comments on commit 2ced6a1

Please sign in to comment.