-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Created scripts to allow for the discovery of the continuation lexica (and their content) which are passed through to generate an FST output string.
- Loading branch information
Showing
2 changed files
with
197 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
#!/bin/sh

# Filter: reads LEXC source on stdin and writes it to stdout with flag
# diacritics added.
#
#   * For every "LEXICON <name>" line a flag @P.LEXICON.<name>@ is created
#     (with each "0" in the flag written as "%0").
#   * For every symbol collected from the Multichar_Symbols section that either
#     ends in "+" or starts with "+", a flag @P.FSTTAG.<symbol>@ is created.
#   * All flags are printed directly after the Multichar_Symbols header line.
#   * Inside a lexicon, each entry line (text up to the first ";", no "!" before
#     it) is rewritten:
#       - "upper:lower" entries get the lexicon flag plus the flags of all tags
#         found on the upper side, prefixed to BOTH sides;
#       - all other entries get the lexicon flag and a space prefixed.
#
# Exits with status 1 if two LEXICON lines declare the same name.
# Requires gawk (three-argument match() and PROCINFO["sorted_in"]).

gawk '
# Return s with every regexp metacharacter backslash-escaped, so that s can be
# embedded in a dynamic regexp and match only itself.
function regexp_quote(s)
{
  gsub(/[][\\.^$*+?(){}|]/, "\\\\&", s);
  return s;
}

# Return the flag diacritic for lexicon NAME, with each "0" written as "%0".
function lexicon_flag(name,    flag)
{
  flag=sprintf("@P.LEXICON.%s@", name);
  gsub("0", "%0", flag);
  return flag;
}

# ---- Pass 1: buffer every line, collect tags and lexicon names ----
{
  line[NR]=$0;

  # The Multichar_Symbols section runs from its header to the first LEXICON.
  if(match($0, "^Multichar_Symbols")!=0)
    multichar=1;
  if(match($0, "^LEXICON")!=0)
    multichar=0;

  # Multichar symbols (i.e. tags): prefixal "xxx+" or suffixal "+xxx".
  if(multichar && match($0, "(^[^\\+ \t]+\\+|^\\+[^ \t]+)", f)!=0)
    tagflags[f[1]]="@P.FSTTAG." f[1] "@";

  # Continuation lexicon names; duplicates are a fatal error.
  if(match($0, "^LEXICON[ \t]+([^ \t]+)", f)!=0)
  {
    lexicon[f[1]]++;
    if(lexicon[f[1]]>=2)
    {
      printf "Aborting - More than one continuation lexicon with the same name:\n" > "/dev/stderr";
      printf "=> LEXICON: %s\n", f[1] > "/dev/stderr";
      # "exit" still runs END; this marker lets END bail out immediately.
      _assert_exit=1;
      exit 1;
    }
    lexflag=lexicon_flag(f[1]);
    flags[lexflag]=lexflag;
  }
}

# ---- Pass 2: re-emit the buffered lines with flags added ----
END {
  if(_assert_exit) exit 1;
  delete lexicon;

  # Single regexp covering all tags declared in Multichar_Symbols. Each tag is
  # fully regexp-quoted so that it matches literally.
  tagregexp="";
  for(t in tagflags)
    tagregexp = tagregexp (tagregexp=="" ? "" : "|") regexp_quote(t);
  # Write every "0" (with or without preceding "%") as "%0".
  gsub("[%]*0", "%0", tagregexp);

  # Start outside any lexicon: the value left over from pass 1 must not leak
  # into lines that precede the first LEXICON.
  lexflag="";
  PROCINFO["sorted_in"]="@ind_str_asc";

  for(i=1; i<=NR; i++)
  {
    if(index(line[i], "Multichar_Symbols")!=0)
    {
      # Declare all generated flags right after the section header.
      print line[i];
      for(flag in flags)
        print flag;
      for(tag in tagflags)
        print tagflags[tag];
      printf "\n";
      # The header is done; continue below with the following line.
      i++;
      if(i>NR) break;
    }

    if(match(line[i], "^LEXICON[ \t]+([^ \t]+)", f)!=0)
    {
      print line[i];
      lexflag=lexicon_flag(f[1]);
    }
    else if(lexflag!="" && match(line[i], "^([^!;]+)(;)(.*)$", f)!=0)
    {
      content=f[1];
      sep=f[2];
      comment=f[3];
      n=split(content, ff, ":");
      if(n==2)
      {
        # Collect the flags of all tags on the upper side. Matched tags are cut
        # out of the scratch copy by position, so no regexp has to be rebuilt
        # from the tag text. An empty tagregexp (no tags declared) would match
        # the empty string forever, hence the explicit guards.
        anl=ff[1]; tagflag="";
        while(tagregexp!="" && match(anl, tagregexp)!=0 && RLENGTH>0)
        {
          tag=substr(anl, RSTART, RLENGTH);
          if(tag in tagflags)
            tagflag=tagflag tagflags[tag];
          anl=substr(anl, 1, RSTART-1) substr(anl, RSTART+RLENGTH);
        }
        # Lexicon and tag flags go on both sides of the entry.
        content=lexflag tagflag ff[1] ":" lexflag tagflag ff[2];
      }
      else
      {
        # Entry without a single upper:lower pair: prefix the lexicon flag,
        # separated by a space unless the content itself starts with a flag.
        content=lexflag " " content;
        gsub("@ @", "@@", content);
      }
      print content sep comment;
    }
    else
      print line[i];
  }
}'
41 changes: 41 additions & 0 deletions
41
tools/shellscripts/parse-fst-output-string-4-lexicon-and-tag-flags.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
#!/bin/sh

# Filter: reads FST output strings (one per line) on stdin that contain
# @P.LEXICON.<name>@ and @P.FSTTAG.<tag>@ flags, and prints a table with one
# row per lexicon flag: INDEX, CONTLEX, INPUT, OUTPUT.
#
# Stage 1 splits each line at the lexicon flags. For the text following each
# flag it strips all FSTTAG flags, concatenating their tag names as INPUT and
# keeping the remaining text as OUTPUT, and emits tab-separated fields.
# Stage 2 right/left-aligns the fields into columns; empty fields are shown
# as "0".
# Requires gawk (four-argument split() and three-argument match()).

gawk '{
  # chunk[k+1] is the text following the k-th lexicon flag, flag[k] the flag.
  n=split($0, chunk, "@P\\.LEXICON\\.[^@]+@", flag);
  for(i=1; i<n; i++)
  {
    match(flag[i], "@P\\.LEXICON\\.([^@]+)@", f);
    clex=f[1];
    out=chunk[i+1]; inp="";
    # Cut each FSTTAG flag out by position; this always shortens "out", so the
    # loop terminates whatever characters the tag name contains.
    while(match(out, "@P\\.FSTTAG\\.([^@]+)@", ff)!=0)
    {
      inp=inp ff[1];
      out=substr(out, 1, RSTART-1) substr(out, RSTART+RLENGTH);
    }
    printf "%i\t%s\t%s\t%s\n", i, clex, inp, out;
  }
}' |

gawk -F"\t" 'BEGIN { max1=5; max2=7; max3=5; max4=5; }
{
  # Track the widest value per column (minimum: width of the header label).
  if(length($1)>max1) max1=length($1);
  if(length($2)>max2) max2=length($2);
  if(length($3)>max3) max3=length($3);
  if(length($4)>max4) max4=length($4);
  # Empty fields are displayed as "0".
  for(j=1; j<=NF; j++)
    cell[NR, j] = ($j=="") ? "0" : $j;
}
END {
  # The header index column is one wider to cover the ":" in the data rows.
  hdrfmt="%" (max1+1) "s %" max2 "s\t%" max3 "s : %-" max4 "s\n";
  rowfmt="%" max1 "i: %" max2 "s\t%" max3 "s : %-" max4 "s\n";
  printf hdrfmt, "INDEX", "CONTLEX", "INPUT", "OUTPUT";
  for(i=1; i<=NR; i++)
    printf rowfmt, cell[i, 1], cell[i, 2], cell[i, 3], cell[i, 4];
}'
|