Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael Hansen committed Sep 26, 2019
0 parents commit b510aae
Show file tree
Hide file tree
Showing 27 changed files with 280,445 additions and 0 deletions.
9 changes: 9 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
3 changes: 3 additions & 0 deletions SOURCE
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
cmusphinx-ru-5.2

https://sourceforge.net/projects/cmusphinx/files/Acoustic%20and%20Language%20Models/Russian/
3 changes: 3 additions & 0 deletions acoustic_model/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Work in progress on Russian

Uses simplified Russian dictionary
116 changes: 116 additions & 0 deletions acoustic_model/dictionary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-


# Converts a stress-marked vocabulary into a pronunciation dictionary.
# A "+" is placed immediately before the stressed vowel. For example,
# the input
#
# абстракцион+истов
# абстр+акцию
# абстр+акция
#
# produces
#
# абстракционистов a0 b s t r a0 k c i0 o0 nj i1 s t o0 v
# абстракцию a0 b s t r a1 k c i0 j u0
# абстракция a0 b s t r a1 k c i0 j a0
#

import sys

# Letters that soften the consonant directly before them: pallatize() adds
# a "j" suffix to a soft/hard consonant followed by one of these.
softletters = set(u"яёюиье")

# Symbols after which я, ю, е, ё are pronounced with a leading "j" phone
# (word boundary "#", hyphen, the signs ъ/ь, and every vowel letter).
startsyl = set(u"#ъьаяоёуюэеиы-")

# Bookkeeping symbols that convert() removes from the final phone list.
others = {"#", "+", "-", u"ь", u"ъ"}

# Consonant letters that have both a hard and a soft (palatalized) form.
# The value is the hard phone name; pallatize() appends "j" for the soft one.
softhard_cons = {
    u"б": u"b",
    u"в": u"v",
    u"г": u"g",
    u"Г": u"g",
    u"д": u"d",
    u"з": u"z",
    u"к": u"k",
    u"л": u"l",
    u"м": u"m",
    u"н": u"n",
    u"п": u"p",
    u"р": u"r",
    u"с": u"s",
    u"т": u"t",
    u"ф": u"f",
    u"х": u"h",
}

# Consonant letters that map to one fixed phone; pallatize() never adds the
# "j" suffix to these, whatever letter follows.
other_cons = {
    u"ж": u"zh",
    u"ц": u"c",
    u"ч": u"ch",
    u"ш": u"sh",
    u"щ": u"sch",
    u"й": u"j",
}

# Vowel letter -> base phone name.  convert_vowels() appends the stress
# digit (0 or 1) to the value, e.g. u"а" with stress 1 becomes "a1".
vowels = {
    u"а": u"a",
    u"я": u"a",
    u"у": u"u",
    u"ю": u"u",
    u"о": u"o",
    u"ё": u"o",
    u"э": u"e",
    u"е": u"e",
    u"и": u"i",
    u"ы": u"y",
}

def pallatize(phones):
    """Replace consonant letters in *phones* with phone names, in place.

    ``phones`` is a list of ``(symbol, stress)`` tuples.  Every entry except
    the last one is examined:

    * a symbol found in ``other_cons`` becomes the phone listed there;
    * a symbol found in ``softhard_cons`` becomes its phone name, with a
      ``"j"`` suffix when the next entry's symbol is in ``softletters``.

    Rewritten entries always carry stress ``0``; every other entry is left
    untouched.  The list is modified in place and nothing is returned.
    """
    # Pair each entry with its successor.  Both slices are copies, so
    # assigning into ``phones`` inside the loop does not disturb iteration.
    for i, (cur, nxt) in enumerate(zip(phones[:-1], phones[1:])):
        letter = cur[0]
        if letter in other_cons:
            phones[i] = (other_cons[letter], 0)
        elif letter in softhard_cons:
            soft_suffix = "j" if nxt[0] in softletters else ""
            phones[i] = (softhard_cons[letter] + soft_suffix, 0)

def convert_vowels(phones):
    """Flatten ``(symbol, stress)`` tuples into a list of phone strings.

    A symbol that is a key of ``vowels`` becomes its phone name followed by
    the stress value from the tuple (for example ``(u"а", 1)`` gives
    ``"a1"``).  Any other symbol is copied to the output unchanged.  When
    the symbol is one of я, ю, е, ё and the previous symbol is in
    ``startsyl``, an extra ``"j"`` phone is emitted in front of it.

    Returns the new list; ``phones`` itself is not modified.
    """
    iotated = set(u"яюеё")  # built once instead of on every iteration
    result = []
    previous = ""
    for phone in phones:
        symbol = phone[0]
        if symbol in iotated and previous in startsyl:
            result.append("j")
        if symbol in vowels:
            result.append(vowels[symbol] + str(phone[1]))
        else:
            result.append(symbol)
        previous = symbol
    return result

def convert(stressword):
    """Convert one stress-marked word into a space-separated phone string.

    ``stressword`` is a word in which ``+`` immediately precedes the
    stressed vowel, e.g. ``абстр+акцию`` -> ``a0 b s t r a1 k c i0 j u0``.

    The argument may be UTF-8 encoded bytes (the only form the original
    Python 2 code accepted) or text.  The result has the same type as the
    argument: UTF-8 bytes for bytes input, text for text input.
    """
    # Work on text internally.  ``bytes`` is ``str`` on Python 2, so a plain
    # Python 2 string takes this branch exactly as it did before.
    was_bytes = isinstance(stressword, bytes)
    if was_bytes:
        stressword = stressword.decode("utf-8")

    # "#" marks the word boundaries: it lets convert_vowels() treat the
    # first letter as syllable-initial and gives pallatize() a successor
    # for the last real letter.
    letters = u"#" + stressword + u"#"

    # Attach a stress flag to every symbol; "+" itself is dropped and marks
    # only the symbol that follows it.
    stress_phones = []
    stress = 0
    for letter in letters:
        if letter == u"+":
            stress = 1
        else:
            stress_phones.append((letter, stress))
            stress = 0

    # Replace consonant letters with phone names (in place).
    pallatize(stress_phones)

    # Replace vowel letters with stress-tagged phones, inserting "j" where
    # required.
    phones = convert_vowels(stress_phones)

    # Drop boundary markers and the hard/soft signs.
    phones = [x for x in phones if x not in others]

    result = u" ".join(phones)
    return result.encode("utf-8") if was_bytes else result

# Command-line driver: read the stress-marked word list named by the first
# argument and write one "<word> <phones>" dictionary line per entry.
# The ``with`` block guarantees the input file is closed.
with open(sys.argv[1]) as wordlist:
    for line in wordlist:
        stressword = line.strip()
        if not stressword:
            # A blank line would otherwise yield an empty dictionary entry.
            continue
        word = stressword.replace("+", "")
        # sys.stdout.write emits the same "word phones\n" text as the old
        # print statement while staying valid syntax on Python 2 and 3.
        sys.stdout.write(word + " " + convert(stressword) + "\n")

10 changes: 10 additions & 0 deletions acoustic_model/feat.params
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
-lowerf 130
-upperf 6800
-nfilt 25
-transform dct
-lifter 22
-feat 1s_c_d_dd
-agc none
-cmn current
-varnorm no
-cmninit 40,3,-1
Binary file added acoustic_model/feature_transform
Binary file not shown.
Loading

0 comments on commit b510aae

Please sign in to comment.