Created scripts to allow for the discovery of the continuation lexica (and their content) which are passed through to generate an FST output string.
aarppe committed Nov 4, 2024
1 parent 99c37a4 commit cfc48fc
Showing 2 changed files with 197 additions and 0 deletions.
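
For context, a minimal usage sketch for the first of the two scripts below: it reads LEXC code on standard input and writes the flag-augmented LEXC code to standard output (the file names analyser.lexc and analyser-with-flags.lexc are hypothetical):

sh tools/shellscripts/add-lexicon-and-tag-flags-2-lexc.sh < analyser.lexc > analyser-with-flags.lexc
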
156 changes: 156 additions & 0 deletions tools/shellscripts/add-lexicon-and-tag-flags-2-lexc.sh
@@ -0,0 +1,156 @@
#!/bin/sh

gawk 'BEGIN { multichar=0; }
{
line[NR]=$0;
# Set flag for recognizing multichar symbols,
# within the Multichar_Symbols field in LEXC code
if(match($0, "^Multichar_Symbols")!=0)
multichar=1;
if(match($0, "^LEXICON")!=0)
multichar=0;
# Recognizing multichar symbols (i.e. tags) and creating corresponding flags
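# e.g. a Multichar_Symbols entry "+N" (hypothetical tag) yields the tag flag @P.FSTTAG.+N@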
if(multichar && match($0, "(^[^\\+ \t]+\\+|^\\+[^ \t]+)", f)!=0)
{
tagflag="@P.FSTTAG." f[1] "@";
tagflags[f[1]]=tagflag;
taglen[f[1]]=length(f[1]);
}
# Recognizing contlex names and creating corresponding flags
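# e.g. "LEXICON Nouns" (hypothetical name) yields the lexicon flag @P.LEXICON.Nouns@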
if(match($0, "^LEXICON[ \t]+([^ \t]+)", f)!=0)
{
lexicon[f[1]]++;
if(lexicon[f[1]]>=2)
{
printf "Aborting - More than one continuation lexicon with the same name:\n" > "/dev/stderr";
printf "=> LEXICON: %s\n", f[1] > "/dev/stderr";
_assert_exit=1;
exit 1;
}
lexflag=sprintf("@P.LEXICON.%s@", f[1]);
gsub("0", "%0", lexflag);
flags[lexflag]=lexflag;
}
}
END {
if(_assert_exit) exit 1;
delete lexicon;
# Creating single regexp covering all tags in LEXC code
# (as defined in the Multichar_Symbols field in LEXC code)
tagregexp="";
for(t in taglen)
tagregexp = tagregexp "|" t;
# Remove initial "|" operator
sub("^\\|", "", tagregexp);
# Re-encode certain special characters
gsub("\\+", "\\+", tagregexp);
gsub("[-]", "\\-", tagregexp);
gsub("0", "%0", tagregexp);
gsub("[%]+0", "%0", tagregexp);
for(i=1; i<=NR; i++)
{
if(index(line[i], "Multichar_Symbols")!=0)
{
print line[i];
PROCINFO["sorted_in"]="@ind_str_asc";
for(flag in flags)
print flag;
for(tag in tagflags)
print tagflags[tag];
printf "\n";
i++;
}
if(match(line[i], "^LEXICON[ \t]+([^ \t]+)", f)!=0)
{
print line[i];
lexflag=sprintf("@P.LEXICON.%s@", f[1]);
gsub("0", "%0", lexflag);
}
else
if(match(line[i], "^([^!;]+)(;)(.*)$", f)!=0)
{
content=f[1];
sep=f[2];
comment=f[3];
n=split(content, ff, ":");
if(n==2)
{
anl=ff[1]; tagflag="";
# Encoding tags as flags, by matching with longest-to-shortest tags
# PROCINFO["sorted_in"]="@val_num_desc";
# for(t in taglen)
# {
# if(index(anl, t)!=0)
# {
# tagflag=tagflag tagflags[t];
# sub("\\+", "\\+", t);
# sub(t, "", anl);
# }
# }
# Encoding tags as flags, by matching with single regexp including all tags
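# e.g. a hypothetical upper side "niska+N" accumulates @P.FSTTAG.+N@ into tagflag, and +N is removed from the working copy anl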
while(match(anl, tagregexp, fff)!=0)
{
tag=fff[0]; # print "Pah0:"tag;
tagflag=tagflag tagflags[tag];
sub("\\+", "\\+", tag);
sub("\\.", "\\.", tag);
sub(tag, "", anl);
}
# Encoding tags as flags, by matching with regexp identifying potential tags
# starting with prefixal tags ([...]+) and then suffixal tags (+[...])
# Does not fully work with the combination of prefixal and suffixal tags in LEXC code
# if(match(anl, "(^[^@\\+]+\\+)|(@[^@\\+]+\\+))", fff)!=0)
# while(match(anl, "[^@\\+]+\\+", fff)!=0)
# {
# tag=fff[0]; # print "Pah1:"tag;
# tagflag=tagflag tagflags[tag];
# sub("\\+", "\\+", tag);
# sub("\\.", "\\.", tag);
# if(tag in tagflags)
# sub(tag, "", anl);
# else
# break;
# }
# if(match(anl, "(^\\+[^@\\+]+)|(@\\+[^@\\+]+))", fff)!=0)
# while(match(anl, "\\+[^@\\+]+", fff)!=0)
# {
# tag=fff[0]; # print "Pah2:"tag;
# tagflag=tagflag tagflags[tag];
# sub("\\+", "\\+", tag);
# sub("\\.", "\\.", tag);
# if(tag in tagflags)
# sub(tag, "", anl);
# else
# break;
# }
# Adding lexicon and tag flags to LEXC code
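# e.g. with +N declared above and under a hypothetical LEXICON Nouns, the entry "niska+N:niska NOUN ;" becomes
# "@P.LEXICON.Nouns@@P.FSTTAG.+N@niska+N:@P.LEXICON.Nouns@@P.FSTTAG.+N@niska NOUN ;"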
content=lexflag tagflag ff[1] ":" lexflag tagflag ff[2];
}
else
{
if(match(content, "^[ ]*[^ ]+[ ]*;")!=0)
content=lexflag content;
else
content=lexflag " " content;
gsub("@ @", "@@", content);
}
print content sep comment;
}
else
print line[i];
}
}'
@@ -0,0 +1,41 @@
#!/bin/sh

gawk '{
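# Split each FST output string on the lexicon flags @P.LEXICON.X@;
# s[] holds the matched flags, a[] the segments that follow them.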
n=split($0, a, "@P\\.LEXICON\\.[^@]+@", s);
for(i=1; i<=n-1; i++)
{
match(s[i], "@P\\.LEXICON\\.([^@]+)@", f);
out=a[i+1]; inp=""; clex=f[1];
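# Collect the tag flags @P.FSTTAG.Y@ of this segment as the input side;
# the remaining material is the output side.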
while(match(out, "@P\\.FSTTAG\\.([^@]+)@", ff)!=0)
{
gsub("\\+", "\\+", ff[0]);
gsub("\\.", "\\.", ff[0]);
sub(ff[0], "", out);
inp=inp ff[1];
}
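# Use "0" to mark an empty input side (no tag flags in this segment)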
if(inp=="") inp="0";
printf "%i\t%s\t%s\t%s\n", i, clex, inp, out;
}
}' |

gawk -F"\t" 'BEGIN { max1=5; max2=7; max3=5; max4=5; }
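# Track the maximal column widths and store the cells, then print an aligned
# INDEX / CONTLEX / INPUT / OUTPUT table in the END block.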
{
if(length($1)>max1) max1=length($1);
if(length($2)>max2) max2=length($2);
if(length($3)>max3) max3=length($3);
if(length($4)>max4) max4=length($4);
for(j=1; j<=NF; j++)
if($j=="")
cell[NR, j]="0";
else
cell[NR, j]=$j;
}
END {
printf "%"(max1+1)"s %"max2"s\t%"max3"s : %-"max4"s\n", "INDEX", "CONTLEX", "INPUT", "OUTPUT";
for(i=1; i<=NR; i++)
printf "%"max1"i: %"max2"s\t%"max3"s : %-"max4"s\n", cell[i, 1], cell[i, 2], cell[i, 3], cell[i, 4];
}'
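
For context, a minimal sketch of how the second script might be exercised on a single flagged FST output string; the script name extract-contlex-path.sh is hypothetical (the second file path is not shown in this view), and the input string is a made-up example:

echo '@P.LEXICON.Nouns@@P.FSTTAG.+N@niska' | sh tools/shellscripts/extract-contlex-path.sh
# prints an aligned INDEX / CONTLEX / INPUT : OUTPUT table, here roughly "1: Nouns  +N : niska"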
