Created scripts to allow for the discovery of the continuation lexica (and their content) which are passed through to generate an FST output string.
giellalt · Nov 4, 2024 · cfc48fc · cfc48fc
1 parent 99c37a4
commit cfc48fc
Show file tree

Hide file tree

Showing 2 changed files with 197 additions and 0 deletions.
diff --git a/tools/shellscripts/add-lexicon-and-tag-flags-2-lexc.sh b/tools/shellscripts/add-lexicon-and-tag-flags-2-lexc.sh
@@ -0,0 +1,156 @@
+#!/bin/sh
+
+# Reads LEXC source on standard input and writes it to standard output with
+# P-type flag diacritics added to every lexicon entry: one flag naming the
+# enclosing LEXICON and one flag per declared tag found on the upper side.
+# The generated flags are also listed right after the Multichar_Symbols line.
+# Requires gawk. Exits with status 1 when two LEXICON headers share a name.
+
+gawk '
+# NOTE: this program is a single-quoted shell string, so the apostrophe
+# character must not appear anywhere in it, comments included.
+
+# Return s with every regexp operator backslash-escaped, so that the result
+# matches s literally when used as a dynamic regexp.
+function re_escape(s)
+{
+  gsub("[][(){}|.^$*+?\\\\]", "\\\\&", s);
+  return s;
+}
+
+# Return s with each literal zero written as %0 (the LEXC escape), collapsing
+# any percent signs that already preceded the zero.
+function lexc_zero(s)
+{
+  gsub("0", "%0", s);
+  gsub("[%]+0", "%0", s);
+  return s;
+}
+
+# Pass 1: buffer the whole input and collect tag and lexicon names.
+{
+  line[NR] = $0;
+
+  # Tags are only collected between the Multichar_Symbols keyword and the
+  # first LEXICON header.
+  if ($0 ~ "^[ \t]*Multichar_Symbols")
+    multichar = 1;
+  if ($0 ~ "^LEXICON")
+    multichar = 0;
+
+  # A tag is a line-initial symbol that either ends in "+" (prefixal tag)
+  # or starts with "+" (suffixal tag).
+  if (multichar && match($0, "(^[^\\+ \t]+\\+|^\\+[^ \t]+)", f) != 0)
+    tagflags[f[1]] = "@P.FSTTAG." f[1] "@";
+
+  # Each LEXICON header yields one flag; duplicate names are fatal because
+  # the flag would no longer identify a single lexicon.
+  if (match($0, "^LEXICON[ \t]+([^ \t]+)", f) != 0)
+    {
+      if (++lexicon[f[1]] >= 2)
+        {
+          printf "Aborting - More than one continuation lexicon with the same name:\n" > "/dev/stderr";
+          printf "=> LEXICON: %s\n", f[1] > "/dev/stderr";
+          _assert_exit = 1;
+          exit 1;
+        }
+      lexflag = sprintf("@P.LEXICON.%s@", f[1]);
+      gsub("0", "%0", lexflag);
+      flags[lexflag] = lexflag;
+    }
+}
+
+# Pass 2: replay the buffered lines, inserting the flags.
+END {
+  # exit in the main rule still runs END, so re-raise the failure here.
+  if (_assert_exit) exit 1;
+
+  # Deterministic (index-sorted) traversal for every for-in loop below.
+  PROCINFO["sorted_in"] = "@ind_str_asc";
+
+  # One alternation covering all declared tags. Every tag is fully escaped,
+  # so each alternative matches exactly its own text, and gawk picks the
+  # leftmost-longest alternative. tagbytext maps the text that the regexp
+  # matches (zero-normalised) back to the tag as it was declared.
+  tagregexp = "";
+  for (t in tagflags)
+    {
+      literal = lexc_zero(t);
+      tagbytext[literal] = t;
+      if (tagregexp != "")
+        tagregexp = tagregexp "|";
+      tagregexp = tagregexp re_escape(literal);
+    }
+
+  inlexicon = 0;
+  for (i = 1; i <= NR; i++)
+    {
+      # Declare all generated flags directly below the keyword line.
+      if (line[i] ~ "^[ \t]*Multichar_Symbols")
+        {
+          print line[i];
+          for (flag in flags)
+            print flag;
+          for (t in tagflags)
+            print tagflags[t];
+          printf "\n";
+          continue;
+        }
+
+      # A LEXICON header switches the flag used for the entries below it.
+      if (match(line[i], "^LEXICON[ \t]+([^ \t]+)", f) != 0)
+        {
+          print line[i];
+          lexflag = sprintf("@P.LEXICON.%s@", f[1]);
+          gsub("0", "%0", lexflag);
+          inlexicon = 1;
+          continue;
+        }
+
+      # Anything before the first LEXICON (e.g. Definitions), and any line
+      # that is not "content ; rest", is passed through unchanged.
+      if (!inlexicon || match(line[i], "^([^!;]+)(;)(.*)$", f) == 0)
+        {
+          print line[i];
+          continue;
+        }
+
+      content = f[1];
+      sep = f[2];
+      comment = f[3];
+
+      n = split(content, ff, ":");
+      if (n == 2)
+        {
+          # upper:lower entry. Collect the flags of all tags on the upper
+          # side, cutting each match out by position so that the loop
+          # always makes progress. An empty regexp (no tags declared) would
+          # match the empty string forever, hence the guard.
+          anl = ff[1];
+          tagflag = "";
+          if (tagregexp != "")
+            while (match(anl, tagregexp) != 0)
+              {
+                tag = substr(anl, RSTART, RLENGTH);
+                key = (tag in tagflags) ? tag : tagbytext[tag];
+                tagflag = tagflag tagflags[key];
+                anl = substr(anl, 1, RSTART - 1) substr(anl, RSTART + RLENGTH);
+              }
+
+          # The flags are prefixed to both sides; the tags themselves stay.
+          # NOTE(review): leading whitespace of the entry ends up between
+          # the flags and the upper string - confirm this is acceptable.
+          content = lexflag tagflag ff[1] ":" lexflag tagflag ff[2];
+        }
+      else
+        {
+          # No single upper:lower pair: put the lexicon flag in front,
+          # separated by a space, and fuse it with a directly following
+          # flag diacritic.
+          # NOTE(review): for a colon-less word entry this leaves the flag
+          # as a token of its own - confirm the intended LEXC output.
+          content = lexflag " " content;
+          gsub("@ @", "@@", content);
+        }
+
+      print content sep comment;
+    }
+}'
diff --git a/tools/shellscripts/parse-fst-output-string-4-lexicon-and-tag-flags.sh b/tools/shellscripts/parse-fst-output-string-4-lexicon-and-tag-flags.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+
+# Reads FST output strings on standard input, one per line, that carry the
+# @P.LEXICON.NAME@ and @P.FSTTAG.TAG@ flags, and prints a table with one row
+# per lexicon visited: index, lexicon name, tags, remaining output string.
+# Requires gawk.
+
+gawk '
+# Stage 1: emit one tab-separated record per lexicon flag.
+# NOTE: no apostrophe may appear in this single-quoted program.
+{
+  # chunk[k+1] is the text following the k-th lexicon flag; sepflag[k] is
+  # that flag itself. Text before the first flag is ignored.
+  n = split($0, chunk, "@P\\.LEXICON\\.[^@]+@", sepflag);
+  for (i = 1; i <= n - 1; i++)
+    {
+      match(sepflag[i], "@P\\.LEXICON\\.([^@]+)@", f);
+      clex = f[1];
+      out = chunk[i + 1];
+      inp = "";
+
+      # Move every tag flag out of the output string into the input column.
+      # The flag is cut out by position, so tags containing regexp
+      # operators are removed reliably and the loop always terminates.
+      while (match(out, "@P\\.FSTTAG\\.([^@]+)@", ff) != 0)
+        {
+          inp = inp ff[1];
+          out = substr(out, 1, RSTART - 1) substr(out, RSTART + RLENGTH);
+        }
+
+      # Empty columns are left empty here; stage 2 shows them as 0.
+      printf "%i\t%s\t%s\t%s\n", i, clex, inp, out;
+    }
+}' |
+
+gawk -F"\t" '
+# Stage 2: right-align the records into a table.
+# Column widths start at fixed minimums and grow to the longest cell.
+BEGIN { w1 = 5; w2 = 7; w3 = 5; w4 = 5; }
+{
+  if (length($1) > w1) w1 = length($1);
+  if (length($2) > w2) w2 = length($2);
+  if (length($3) > w3) w3 = length($3);
+  if (length($4) > w4) w4 = length($4);
+
+  # An empty cell is displayed as 0.
+  for (j = 1; j <= NF; j++)
+    cell[NR, j] = ($j == "") ? "0" : $j;
+}
+END {
+  # The header index column is one wider than the data column to make up
+  # for the colon that follows each row index.
+  hdrfmt = "%" (w1 + 1) "s %" w2 "s\t%" w3 "s : %-" w4 "s\n";
+  rowfmt = "%" w1 "i: %" w2 "s\t%" w3 "s : %-" w4 "s\n";
+
+  printf hdrfmt, "INDEX", "CONTLEX", "INPUT", "OUTPUT";
+  for (i = 1; i <= NR; i++)
+    printf rowfmt, cell[i, 1], cell[i, 2], cell[i, 3], cell[i, 4];
+}'
+