From 613d090b3c46c36e4e503ed6463201f87bc09378 Mon Sep 17 00:00:00 2001
From: Felix Stutz
Date: Mon, 27 Jun 2022 13:38:38 -0400
Subject: [PATCH 01/64] First PR for new annotations and new model for DFG nodes (#583)

* WIP: first changes for new annotations

* Make parallelization use mapper info from new annotations

* Make parallelization use mapper info from new annotations

* Delete com_mapper field in dfg_node and use the info from new annotations

* Use input info from new annotations

* Use output info from new annotations

* Make dfg_options use the information from new annotations and completely remove use of old aggregator com_aggregator

* Make to_ast work for wf.sh with some hacks, e.g. to_ast for eager to handle special case of intermediate file as last operand and fixed parsing issue for newline

* Remove com_mapper and com_aggregator from DFGNode

* WIP: incorporating remodelled command invocations

* Parsing with new dataflow node model works

* WIP: parallelization

* Rudimentary parallelization with new annotations works

* Add way to specify where to find repository for annotations repository

* Do not require flag `r_split` since we do consecutive chunks for now

* 1st part of changes due to comments for PR

* minor fix

* 2nd part of changes due to comments for PR

Co-authored-by: Felix Stutz
---
 README.md                                     |   6 +
 TODO.md                                       |  10 +
 compiler/annotations.py                       |   1 +
 compiler/annotations_utils/util_aggregator.py |  59 ++++
 .../annotations_utils/util_cmd_invocations.py |  96 ++++++
 .../util_file_descriptors.py                  |  21 ++
 compiler/annotations_utils/util_mapper.py     |  97 ++++++
 compiler/annotations_utils/util_parsing.py    |  92 ++++++
 compiler/config.py                            |   9 +
 compiler/definitions/ir/aggregator_node.py    |  58 +++-
 compiler/definitions/ir/arg.py                |   7 +
 compiler/definitions/ir/dfg_node.py           | 243 +++++++--------
 compiler/definitions/ir/nodes/cat.py          |  29 +-
 compiler/definitions/ir/nodes/eager.py        | 106 ++++++-
 compiler/definitions/ir/nodes/pash_split.py   |  40 ++-
 compiler/definitions/ir/nodes/r_split.py      |  49 ++-
 compiler/definitions/ir/resource.py           |   1 +
 compiler/ir.py                                | 291 ++++++++++++------
 compiler/pash_runtime.py                      | 133 ++++----
 compiler/util.py                              |  15 +
 20 files changed, 1013 insertions(+), 350 deletions(-)
 create mode 100644 TODO.md
 create mode 100644 compiler/annotations_utils/util_aggregator.py
 create mode 100644 compiler/annotations_utils/util_cmd_invocations.py
 create mode 100644 compiler/annotations_utils/util_file_descriptors.py
 create mode 100644 compiler/annotations_utils/util_mapper.py
 create mode 100644 compiler/annotations_utils/util_parsing.py

diff --git a/README.md b/README.md
index fa5ab805e..ff9ce1d2d 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,11 @@
 ## PaSh: Light-touch Data-Parallel Shell Processing
+**TODO before testing new annotations (temporary fix):**
+
+Connect the new annotations repository to PaSh in the `future_annotations`-branch:
+- clone the `connect_to_pash` branch from the new repository for annotations: git@github.com:binpash/annotations.git
+- Specify the path in `compiler/config.py`
+
 
 > _A system for parallelizing POSIX shell scripts._
 > _Hosted by the [Linux Foundation](https://linuxfoundation.org/press-release/linux-foundation-to-host-the-pash-project-accelerating-shell-scripting-with-automated-parallelization-for-industrial-use-cases/)._
 
diff --git a/TODO.md b/TODO.md
new file mode 100644
index 000000000..67b55f0ea
--- /dev/null
+++ b/TODO.md
@@ -0,0 +1,10 @@
+## TODOs before merging to `future`
+
+- eager
+- aggregation trees
+- r_split
+- cat-split fusion
+- working on all tests
+- Adding annotation library installation and removing ad-hoc import of the latter
+- clean up utils for annotations
+- Changing PaSh flags (making the default be priority r-split and then consecutive chunks), so remove the r_split flag and make defaults be the ones from the OSDI paper (I can do that TODO too)
\ No newline at end of file
diff --git a/compiler/annotations.py b/compiler/annotations.py
index 28d61e541..a7c78be02 100644
--- a/compiler/annotations.py
+++ b/compiler/annotations.py
@@ -284,6 +284,7 @@ def get_command_properties_from_annotations(command, options, annotations):
     return command_ann['properties']
 
 def get_command_aggregator_from_annotations(command, options, annotations):
+    log(f'still used')
     command_ann = get_command_from_annotations(command, options, annotations)
 
     if(command_ann and 'aggregator' in command_ann):
diff --git a/compiler/annotations_utils/util_aggregator.py b/compiler/annotations_utils/util_aggregator.py
new file mode 100644
index 000000000..3382730c6
--- /dev/null
+++ b/compiler/annotations_utils/util_aggregator.py
@@ -0,0 +1,59 @@
+# TODO: this file can properly be deleted
+
+import sys
+from config import get_path_annotation_repo
+sys.path.insert(1, get_path_annotation_repo())
+
+from definitions.ir.dfg_node import DFGNode
+from definitions.ir.nodes.cat import Cat
+from annotations_utils.util_cmd_invocations import get_command_invocation_prefix_from_dfg_node
+from util import log
+from ir_utils import string_to_argument
+from definitions.ir.arg import Arg
+
+def get_aggregator_as_dfg_node_from_node(node, parallelizer, inputs, outputs) -> DFGNode:
+    assert(False)
+    cmd_inv_pref = get_command_invocation_prefix_from_dfg_node(node)
+    log(f'cmdinvpref for agg: {cmd_inv_pref}')
+    aggregator = parallelizer.get_actual_aggregator(cmd_inv_pref)
+    log(f'here agg: {aggregator}')
+    # TODO: this could be simplified once we use the new attributes
+    if aggregator.cmd_name == 'cat':
+        return Cat(inputs=inputs,
+                   outputs=outputs,
+                   com_name=Arg(string_to_argument(aggregator.cmd_name)),
+                   com_options=[], # empty and not taking over from other one
+                   com_category="stateless",
+                   com_redirs=node.com_redirs,
+                   com_assignments=node.com_assignments,
+                   flag_option_list=aggregator.flag_option_list,
+                   positional_config_list=aggregator.positional_config_list,
+                   positional_input_list=None, # TODO: somehow from inputs, future shift
+                   positional_output_list=None # TODO: somehow from outputs, future shift
+                   # TODO:
+                   # implicit_use_of_stdin = False,
+                   # implicit_use_of_stdout = False,
+                   # omitted for now since we do not consider nested parallelization
+                   # parallelizer_list = None,
+                   # cmd_related_properties = None,
+                   )
+    else:
+        log(f'agg_com_name: {aggregator.cmd_name}')
+        log(f'agg_flag_option_list: {aggregator.flag_option_list}')
+        return DFGNode(inputs=inputs,
+                       outputs=outputs,
+                       com_name=Arg(string_to_argument(aggregator.cmd_name)),
+                       com_options=node.com_options,
+                       com_redirs=node.com_redirs,
+                       com_assignments=node.com_assignments,
+                       flag_option_list=aggregator.flag_option_list,
+                       positional_config_list=aggregator.positional_config_list,
+                       positional_input_list=None, # TODO: somehow from inputs, future shift
+                       positional_output_list=None # TODO: somehow from outputs, future shift
+                       # TODO:
+                       # implicit_use_of_stdin = False,
+                       # implicit_use_of_stdout = False,
+                       # omitted for now since we do not consider nested parallelization
+                       # parallelizer_list = None,
+                       # cmd_related_properties = None,
+                       )
diff --git a/compiler/annotations_utils/util_cmd_invocations.py
b/compiler/annotations_utils/util_cmd_invocations.py new file mode 100644 index 000000000..90e5f6c10 --- /dev/null +++ b/compiler/annotations_utils/util_cmd_invocations.py @@ -0,0 +1,96 @@ +import sys + +from datatypes_new.BasicDatatypes import Flag +from datatypes_new.BasicDatatypesWithIO import OptionWithIO +from datatypes_new.CommandInvocationInitial import CommandInvocationInitial +from annotation_generation_new.datatypes.InputOutputInfo import InputOutputInfo +from annotation_generation_new.datatypes.ParallelizabilityInfo import ParallelizabilityInfo +from annotation_generation_new.datatypes.CommandProperties import CommandProperties +from annotation_generation_new.AnnotationGeneration import get_input_output_info_from_cmd_invocation, \ + get_parallelizability_info_from_cmd_invocation + +from util import log + +from config import get_path_annotation_repo +sys.path.insert(1, get_path_annotation_repo()) + +# for typing +from datatypes_new.CommandInvocationPrefix import CommandInvocationPrefix + +from ir_utils import string_to_argument, redir_stdout_to_file, redir_file_to_stdin, make_command + +def get_command_invocation_prefix_from_dfg_node(dfg_node): + return CommandInvocationPrefix(cmd_name = dfg_node.com_name, + flag_option_list = dfg_node.flag_option_list, + positional_config_list = dfg_node.positional_config_list) + +# TODO: ideally methods in the respective classes but requires refactoring of parsing infrastructure +def to_node_cmd_inv_with_io_vars(cmd_inv, edges, redirs, assignments): + log("edges", edges) + ast_cmd_name = string_to_argument(cmd_inv.cmd_name) + log("ast_cmd_name", ast_cmd_name) + ast_flagoptions = [] + for flagoption in cmd_inv.flag_option_list: + ast_flagoptions += to_ast_flagoption(flagoption, edges) + log("flagoptions", cmd_inv.flag_option_list) + log("ast_flagoptions", ast_flagoptions) + ast_operands = [to_ast_operand(operand, edges) for operand in cmd_inv.operand_list] + log("operands", cmd_inv.operand_list) + log("ast_operands", ast_operands) + # log("type of ast_operands [0]", type(ast_operands[0])) # can only be used if there are operands + cmd_asts = [ast_cmd_name] + ast_flagoptions + ast_operands + + # TODO: check for actual stdin + stdin_redir = [] + if cmd_inv.implicit_use_of_streaming_input is not None: + fid, _, _ = edges[cmd_inv.implicit_use_of_streaming_input] + if not (fid.has_file_descriptor_resource() and fid.resource.is_stdin()): + stdin_redir = [redir_file_to_stdin(fid.to_ast())] + + # TODO: check for actual stdout + stdout_redir = [] + if cmd_inv.implicit_use_of_streaming_output is not None: + fid, _, _ = edges[cmd_inv.implicit_use_of_streaming_output] + if not (fid.has_file_descriptor_resource() and fid.resource.is_stdout()): + stdout_redir = [redir_stdout_to_file(fid.to_ast())] + + new_redirs = redirs + stdin_redir + stdout_redir + node = make_command(cmd_asts, redirections=new_redirs, assignments=assignments) + log("node", node) + return node + +def to_ast_flagoption(flagoption, _edges): + if isinstance(flagoption, Flag): + return [string_to_argument(flagoption.get_name())] + elif isinstance(flagoption, OptionWithIO): # retype to IOVar + opt_name_ast = string_to_argument(flagoption.get_name()) + opt_arg_ast = translate_io_var_if_applicable(flagoption.get_arg()) + return [opt_name_ast, opt_arg_ast] + +def to_ast_operand(operand, edges): + return translate_io_var_if_applicable(operand, edges) + +def translate_io_var_if_applicable(pot_io_var, edges): + if isinstance(pot_io_var, int): + return dereference_io_var(pot_io_var, edges) + else: + 
return to_ast_arg_string_type(pot_io_var) + +def to_ast_arg_string_type(arg_string_type): + return arg_string_type.get_name().arg_char_list # is of type Arg + +# assumes io_var is an edge id +def dereference_io_var(io_var, edges): + fid, _, _ = edges[io_var] + log(fid) + return fid.to_ast() + +def get_input_output_info_from_cmd_invocation_util(cmd_invocationInitial : CommandInvocationInitial) -> InputOutputInfo: + return get_input_output_info_from_cmd_invocation(cmd_invocationInitial) + +def get_parallelizability_info_from_cmd_invocation_util(cmd_invocationInitial : CommandInvocationInitial) -> ParallelizabilityInfo: + return get_parallelizability_info_from_cmd_invocation(cmd_invocationInitial) + +def construct_property_container_from_list_of_properties(list_properties): + return CommandProperties(dict(list_properties)) + diff --git a/compiler/annotations_utils/util_file_descriptors.py b/compiler/annotations_utils/util_file_descriptors.py new file mode 100644 index 000000000..910efa632 --- /dev/null +++ b/compiler/annotations_utils/util_file_descriptors.py @@ -0,0 +1,21 @@ +from util import log +from definitions.ir.resource import FileResource, Resource, FileDescriptorResource +import sys +from config import get_path_annotation_repo +sys.path.insert(1, get_path_annotation_repo()) +from datatypes_new.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo + + +def resource_from_file_descriptor(file_descriptor) -> Resource: + if isinstance(file_descriptor, FileNameWithIOInfo): + arg = file_descriptor.get_name() + log(f'filedes name: {file_descriptor.get_name()}') + log(f'filedes name type: {type(file_descriptor.get_name())}') + log(f'arg: {arg}') + return FileResource(file_descriptor.get_name()) + elif isinstance(file_descriptor, StdDescriptorWithIOInfo): + resource = ("fd", file_descriptor.get_type().value) + return FileDescriptorResource(resource) + else: + assert(False) + # unreachable diff --git a/compiler/annotations_utils/util_mapper.py b/compiler/annotations_utils/util_mapper.py new file mode 100644 index 000000000..64657cf03 --- /dev/null +++ b/compiler/annotations_utils/util_mapper.py @@ -0,0 +1,97 @@ +# TODO: this file can properly be deleted + +# imports from annotation framework +import sys +from config import get_path_annotation_repo +sys.path.insert(1, get_path_annotation_repo()) +# for typing +# for use +from annotation_generation_new.datatypes.parallelizability.Mapper import Mapper + +from definitions.ir.dfg_node import DFGNode +from annotations_utils.util_cmd_invocations import get_command_invocation_prefix_from_dfg_node +from util import log + +def get_actual_mapper_from_node(node, parallelizer) -> Mapper: + assert(False) + cmd_inv_pref = get_command_invocation_prefix_from_dfg_node(node) + return parallelizer.get_actual_mapper(cmd_inv_pref) + +def get_mapper_as_dfg_node_from_node(node, parallelizer, inputs, outputs) -> DFGNode: + assert(False) + mapper = get_actual_mapper_from_node(node, parallelizer) + log(f'mapper for cmd_name: {node.com_name}') + log(f'here mapper: {mapper}') + return DFGNode(inputs=inputs, + outputs=outputs, + com_name=mapper.cmd_name, + # com_options=node.com_options, + com_redirs=node.com_redirs, + com_assignments=node.com_assignments, + flag_option_list=mapper.flag_option_list, + positional_config_list=mapper.positional_config_list, + positional_input_list=None, # TODO: somehow from inputs, future shift + positional_output_list=None # TODO: somehow from outputs, future shift + # TODO: + # implicit_use_of_stdin = False, + # 
implicit_use_of_stdout = False, + # omitted for now since we do not consider nested parallelization + # parallelizer_list = None, + # cmd_related_properties = None, + ) + +## MOVED from dfg_node +## Get the file names of the outputs of the map commands. This +## differs if the command is stateless, pure that can be +## written as a map and a reduce, and a pure that can be +## written as a generalized map and reduce. +# BEGIN ANNO +# OLD +# def get_map_output_files(node, input_edge_ids, fileIdGen): +# NEW +def get_map_output_files(node, input_edge_ids, fileIdGen, parallelizer): + assert(False) + assert (node.is_parallelizable()) + # TODO ANNO: How to substitute? @KK + if (node.com_category == "stateless"): + map_output_fids = [fileIdGen.next_ephemeral_file_id() for in_fid in input_edge_ids] + elif (node.is_pure_parallelizable()): + # BEGIN ANNO + # OLD + # map_output_fids = node.pure_get_map_output_files(input_edge_ids, fileIdGen) + # NEW + map_output_fids = pure_get_map_output_files(node, input_edge_ids, fileIdGen, parallelizer) + # END ANNO + else: + log("Unreachable code reached :(") + assert (False) + ## This should be unreachable + + return map_output_fids + +## TODO: Fix this somewhere in the annotations and not in the code +# BEGIN ANNO +# OLD +# def pure_get_map_output_files(node, input_edge_ids, fileIdGen): +# NEW +def pure_get_map_output_files(node, input_edge_ids, fileIdGen, parallelizer): + assert(False) + assert (node.is_pure_parallelizable()) + # BEGIN ANNO + # OLD + ## The number of the mapper outputs defaults to 1 + # if(node.com_mapper is None): + # number_outputs = 1 + # else: + # number_outputs = node.com_mapper.num_outputs + # NEW + # TODO: which parallelizer did we choose? + actual_mapper = get_actual_mapper_from_node(node, parallelizer) + number_outputs = actual_mapper.num_outputs # defaults to 1 in class Mapper + # END ANNO + + new_output_fids = [[fileIdGen.next_ephemeral_file_id() for i in range(number_outputs)] + for in_fid in input_edge_ids] + return new_output_fids + + diff --git a/compiler/annotations_utils/util_parsing.py b/compiler/annotations_utils/util_parsing.py new file mode 100644 index 000000000..19a098403 --- /dev/null +++ b/compiler/annotations_utils/util_parsing.py @@ -0,0 +1,92 @@ +import sys +from typing import Set, List, Any + +from definitions.ir.arg import Arg + +from config import get_path_annotation_repo +sys.path.insert(1, get_path_annotation_repo()) +from datatypes_new.CommandInvocationInitial import CommandInvocationInitial +from datatypes_new.BasicDatatypes import Option, ArgStringType, Flag, Operand +from parser_new.parser import parse, get_set_of_all_flags, get_dict_flag_to_primary_repr, get_set_of_all_options, \ + get_dict_option_to_primary_repr, are_all_individually_flags +from parser_new.util_parser import get_json_data + + +from ir_utils import format_arg_chars, string_to_argument, log + + +def merge_to_single_string_with_space(list_str): + if len(list_str) == 1: + return list_str[0] + else: + return " ".join(list_str) + +def get_command_invocation(command, options) -> CommandInvocationInitial: + command_as_string: str = format_arg_chars(command) + options_and_operands_as_string: str = merge_to_single_string_with_space([format_arg_chars(option) for option in options]) + command_invocation_as_string: str = f'{command_as_string} {options_and_operands_as_string}' + command_invocation: CommandInvocationInitial = parse(command_invocation_as_string) + return command_invocation + +def get_ast_for_flagoption(flagoption): + result = 
string_to_argument(flagoption.get_name()) + if isinstance(flagoption, Option): + # TODO: add argument here as well but eventually also fid + assert False + return result + +def get_ast_for_argstringtype(arg): + return string_to_argument(arg.get_name()) + +# TODO: this is a hack to fix the wrong parsing of " +def fix_parsing_newline(arg): + if arg.get_name() == '\\n': + return ArgStringType(r'"\n"') + else: + return arg + + +def parse_arg_list_to_command_invocation(command, flags_options_operands) -> CommandInvocationInitial: + + cmd_name = format_arg_chars(command) + json_data = get_json_data(cmd_name) + + set_of_all_flags: Set[str] = get_set_of_all_flags(json_data) + dict_flag_to_primary_repr: dict[str, str] = get_dict_flag_to_primary_repr(json_data) + set_of_all_options: Set[str] = get_set_of_all_options(json_data) + dict_option_to_primary_repr: dict[str, str] = get_dict_option_to_primary_repr(json_data) + # we keep the Arg for everything but flag and option names + + # parse list of command invocation terms + flag_option_list: List[Any] = [] + i = 0 + while i < len(flags_options_operands): + potential_flag_or_option_arg = flags_options_operands[i] + potential_flag_or_option_name = format_arg_chars(potential_flag_or_option_arg) + if potential_flag_or_option_name in set_of_all_flags: + flag_name_as_string: str = dict_flag_to_primary_repr.get(potential_flag_or_option_name, potential_flag_or_option_name) + flag: Flag = Flag(flag_name_as_string) + flag_option_list.append(flag) + elif (potential_flag_or_option_name in set_of_all_options) and ((i+1) < len(flags_options_operands)): + option_name_as_string: str = dict_option_to_primary_repr.get(potential_flag_or_option_name, potential_flag_or_option_name) + option_arg_as_arg: Arg = Arg(flags_options_operands[i+1]) + option = Option(option_name_as_string, option_arg_as_arg) + flag_option_list.append(option) + i += 1 # since we consumed another term for the argument + elif are_all_individually_flags(potential_flag_or_option_name, set_of_all_flags): + for split_el in list(potential_flag_or_option_name[1:]): + flag: Flag = Flag(f'-{split_el}') + flag_option_list.append(flag) + else: + break # next one is Operand, and we keep these in separate list + i += 1 + + # we would probably want to skip '--' but then the unparsed command could have a different meaning so we'd need to keep it + # for now, omitted + # if parsed_elements_list[i] == '--': + # i += 1 + + operand_list = [Operand(Arg(operand_arg)) for operand_arg in flags_options_operands[i:]] + # log("type of operand_list[0].get_name()", type(operand_list[0].get_name())) can only be used if there are operands + + return CommandInvocationInitial(cmd_name, flag_option_list, operand_list) diff --git a/compiler/config.py b/compiler/config.py index f5e7648b7..71a9959fc 100644 --- a/compiler/config.py +++ b/compiler/config.py @@ -29,6 +29,15 @@ HDFS_PREFIX = "$HDFS_DATANODE_DIR/" +# move this to `config.json` if possible +PATH_ANNOTATION_REPO="/home/felix/git-repos/MIT/annotations" + +def get_path_annotation_repo(): + if PATH_ANNOTATION_REPO is None: + log("No path for annotation repository given! Specify it in compiler/config.py") + raise Exception("No path for annotation repository given! 
Specify it in compiler/config.py") + return PATH_ANNOTATION_REPO + config = {} annotations = [] pash_args = None diff --git a/compiler/definitions/ir/aggregator_node.py b/compiler/definitions/ir/aggregator_node.py index 511e18c9a..04b2eb8ce 100644 --- a/compiler/definitions/ir/aggregator_node.py +++ b/compiler/definitions/ir/aggregator_node.py @@ -1,10 +1,13 @@ from definitions.ir.dfg_node import * +# from definitions.ir.nodes.arg import Arg +from annotations_utils.util_cmd_invocations import get_command_invocation_prefix_from_dfg_node + ## This class corresponds to a generic n-ary aggregator ## ## TODO: Do we need to do anything special for binary aggregators? class MapperAggregatorNode(DFGNode): - def __init__(self, old_node, input_ids, output_ids, name_string, new_options): + def __init__(self, old_node, input_ids, output_ids, name_string, new_options, flag_option_list): ## The name of the aggregator command name = Arg(string_to_argument(name_string)) @@ -17,36 +20,73 @@ def __init__(self, old_node, input_ids, output_ids, name_string, new_options): super().__init__(input_ids, output_ids, name, - com_category, - com_options=old_node.com_options, + com_category, + # BEGIN ANNO + # OLD + # com_options=old_node.com_options, + # NEW + com_options=new_options, # changed that all are already in there and not appended + flag_option_list=flag_option_list, + # END ANNO com_redirs=com_redirs, com_assignments=old_node.com_assignments) - + ## TODO: This assumes that all options from the old function are copied to the new. ## ## TODO: If we need a behavior where we don't keep the old flags, we can extend this - self.append_options(new_options) + # BEGIN ANNO + # OLD + # self.append_options(new_options) + # END ANNO class AggregatorNode(MapperAggregatorNode): def __init__(self, old_node, input_ids, output_ids): + # BEGIN ANNO + used_parallelizer = old_node.get_used_parallelizer() + cmd_inv_pref = get_command_invocation_prefix_from_dfg_node(old_node) + used_aggregator = used_parallelizer.get_actual_aggregator(cmd_inv_pref) + log(f'used_agg: {used_aggregator}') + log(f'old_node: {old_node}') + # END ANNO + ## Check if an aggregator can be instantiated from the node - if(old_node.com_aggregator is None): + # BEGIN ANNO + # OLD + # if(old_node.com_aggregator is None): + # NEW + if(used_aggregator is None): + # END ANNO log("Error: Node:", old_node, "does not contain information to instantiate an aggregator!") raise Exception('No information to instantiate aggregator') ## The name of the aggregator command - agg_name_string = old_node.com_aggregator.name - new_options = old_node.com_aggregator.options + # BEGIN ANNO + # OLD + # agg_name_string = old_node.com_aggregator.name + # new_options = old_node.com_aggregator.options + # NEW + agg_name_string = used_aggregator.cmd_name + all_options_incl_new = [Arg.string_to_arg(el.get_name()) for el in used_aggregator.flag_option_list + used_aggregator.positional_config_list] + # TODO: zip is nicer + all_options_incl_new_right_format = [(i, all_options_incl_new[i]) for i in range(len(all_options_incl_new))] + # END ANNO - super().__init__(old_node, input_ids, output_ids, agg_name_string, new_options) + # BEGIN ANNO + # OLD + # super().__init__(old_node, input_ids, output_ids, agg_name_string, new_options) + # NEW + super().__init__(old_node, input_ids, output_ids, agg_name_string, all_options_incl_new_right_format, + flag_option_list=used_aggregator.flag_option_list) + # END ANNO log("Generic Aggregator Created:", self) class MapperNode(MapperAggregatorNode): def 
__init__(self, old_node, input_ids, output_ids): + assert(False) ## Check if an mapper can be instantiated from the node if(old_node.com_mapper is None): log("Error: Node:", old_node, "does not contain information to instantiate a mapper!") diff --git a/compiler/definitions/ir/arg.py b/compiler/definitions/ir/arg.py index 40dbcc785..8ca591733 100644 --- a/compiler/definitions/ir/arg.py +++ b/compiler/definitions/ir/arg.py @@ -1,3 +1,4 @@ +from __future__ import annotations from ir_utils import * from util import * @@ -29,3 +30,9 @@ def concatenate(self, other): space = [['C', 32]] self.arg_char_list.extend(space) self.arg_char_list.extend(other.arg_char_list) + + @staticmethod + def string_to_arg(string) -> Arg: + return Arg(string_to_argument(string)) + + diff --git a/compiler/definitions/ir/dfg_node.py b/compiler/definitions/ir/dfg_node.py index a38268082..b9e990fad 100644 --- a/compiler/definitions/ir/dfg_node.py +++ b/compiler/definitions/ir/dfg_node.py @@ -1,68 +1,53 @@ import copy -import annotations from command_categories import * -from util import * -from ir_utils import * from definitions.ir.redirection import * from definitions.ir.resource import * +from annotations_utils.util_cmd_invocations import to_node_cmd_inv_with_io_vars, construct_property_container_from_list_of_properties + +import sys +from config import get_path_annotation_repo +sys.path.insert(1, get_path_annotation_repo()) + +from util import return_empty_list_if_none_else_itself, return_default_if_none_else_itself + ## Assumption: Everything related to a DFGNode must be already expanded. ## TODO: Ensure that this is true with assertions class DFGNode: ## Unique identifier for nodes next_id = 0 - ## inputs : tuple of lists of fid_ids (that can be used to retrieve fid from edges) - ## outputs : list of fid_ids - ## com_name : command name Arg - ## com_category : string denoting category - ## input_consumption_mode : enumeration - ## com_properties : properties such as commutativity - ## com_mapper : a class that contains necessary information to instantiate a mapper (by defaule this corresponds to the command) - ## com_aggregator : a class that contains necessary information to instantiate an aggregator - ## com_options : list of tuples with the option index and the argument Arg + ## cmd_invocation_with_io_vars : command invocation data structure with edge ids as symbolic variables for filenames etc. ## com_redirs : list of redirections ## com_assignments : list of assignments - def __init__(self, inputs, outputs, com_name, com_category, - com_properties = [], - com_mapper = None, - com_aggregator = None, - com_options = [], + ## parallelizer_list : list of parallelizers for this DFGNode + ## cmd_related_properties : dict to store properties like commutativity + def __init__(self, + cmd_invocation_with_io_vars, com_redirs = [], - com_assignments=[]): + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None, + ): + # TODO []: default parameters! + + ## @KK: can this be deleted? Was there another id in the member attributes before? ## Add a unique identifier to each DFGNode since id() is not guaranteed to be unique for objects that have different lifetimes. 
## This leads to issues when nodes are deleted and new ones are created, leading to id() clashes between them self.id = DFGNode.next_id DFGNode.next_id += 1 - self.set_inputs(inputs) - self.outputs = outputs - self.com_name = com_name - self.com_category = com_category - self.com_properties = com_properties - self.com_mapper = com_mapper - self.com_aggregator = com_aggregator - self.com_options = com_options self.com_redirs = [Redirection(redirection) for redirection in com_redirs] self.com_assignments = com_assignments - + self.parallelizer_list = return_empty_list_if_none_else_itself(parallelizer_list) + default_cmd_properties = construct_property_container_from_list_of_properties([]) + self.cmd_related_properties = return_default_if_none_else_itself(cmd_related_properties, default_cmd_properties) + self.cmd_invocation_with_io_vars = cmd_invocation_with_io_vars # log("Node created:", self.id, self) def __repr__(self): - prefix = "Node" - if (self.com_category == "stateless"): - prefix = "Stateless" - elif (self.com_category == "pure"): - prefix = "Pure" - elif (self.is_pure_parallelizable()): - prefix = "Par. Pure" - if (self.is_commutative()): - prefix = 'Commutative ' + prefix - output = "{}: \"{}\" in:{} out:{}".format( - prefix, self.com_name, - self.get_input_list(), - self.outputs) - return output + # TODO: add other attributes + return str(self.cmd_invocation_with_io_vars) ## Generates a dot node for the DFG node def add_dot_node(self, dot, node_id): @@ -73,7 +58,7 @@ def add_dot_node(self, dot, node_id): ## Get the label of the node. By default, it is simply the name def get_dot_label(self) -> str: ## The name could be a full path - name = self.com_name + name = self.cmd_invocation_with_io_vars.cmd_name basename = os.path.basename(str(name)) return basename @@ -90,6 +75,7 @@ def copy(self): ## TODO: Make that a proper class. 
def set_inputs(self, inputs): + assert(False) if(isinstance(inputs, list)): self.inputs = ([], inputs) elif(isinstance(inputs, tuple)): @@ -98,32 +84,48 @@ def set_inputs(self, inputs): raise NotImplementedError() def get_input_list(self): - return (self.inputs[0] + self.inputs[1]) - - def get_standard_inputs(self): - return self.inputs[1] - + inputs = self.cmd_invocation_with_io_vars.generate_inputs() + return inputs.get_all_inputs() + + def get_output_list(self): + return self.cmd_invocation_with_io_vars.generate_outputs() + + def get_streaming_inputs(self): + inputs = self.cmd_invocation_with_io_vars.generate_inputs() + return inputs.get_streaming_inputs() + def get_configuration_inputs(self): - return self.inputs[0] + inputs = self.cmd_invocation_with_io_vars.generate_inputs() + return inputs.get_config_inputs() - def is_at_most_pure(self): - return (self.com_category in ["stateless", "pure", "parallelizable_pure"]) + # def is_at_most_pure(self): + # return (self.com_category in ["stateless", "pure", "parallelizable_pure"]) - def is_parallelizable(self): - return (self.is_pure_parallelizable() or self.is_stateless()) + # def is_parallelizable(self): + # return (self.is_pure_parallelizable() or self.is_stateless()) - def is_stateless(self): - return (self.com_category == "stateless") + # def is_stateless(self): + # return (self.com_category == "stateless") - def is_pure_parallelizable(self): - return (self.com_category == "parallelizable_pure") + # def is_pure_parallelizable(self): + # return (self.com_category == "parallelizable_pure") def is_commutative(self): - return ('commutative' in self.com_properties) + # BEGIN ANNO + # OLD + # return ('commutative' in self.com_properties) + # NEW + val = self.cmd_related_properties.get_property_value('commutative') + if val is not None: + return val + else: + return False + # END ANNO ## kk: 2021-07-23 Not totally sure if that is generally correct. Tests will say ¯\_(ツ)_/¯ ## I think it assumes that new options can be added in the beginning if there are no options already def append_options(self, new_options): + assert(False) # unreachable if(len(self.com_options) > 0): max_opt_index = max([i for i, _opt in self.com_options]) else: @@ -140,6 +142,10 @@ def append_options(self, new_options): ## ## TODO: Abstract this function away to annotations 2.0 def special_to_ast(self, edges): + assert(False) # unreachable + # BEGIN ANNO + return None + # END ANNO ## Every argument should be completely expanded so making it a string should be fine if str(self.com_name) == "cat": redirs = self._to_ast_aux_get_redirs() @@ -166,6 +172,7 @@ def special_to_ast(self, edges): ## This function handles the input fids as arguments. def _to_ast_aux_inputs_as_args(self, edges, stdin_dash=False): + assert(False) # unreachable input_fids = [edges[in_id][0] for in_id in self.get_input_list()] input_arguments = [fid.to_ast(stdin_dash=stdin_dash) @@ -175,6 +182,7 @@ def _to_ast_aux_inputs_as_args(self, edges, stdin_dash=False): ## This function handles the redirections when a command has a single output ## and it can always be stdout. def _to_ast_aux_single_stdout_fid(self, edges): + assert(False) # unreachable output_fids = [edges[out_id][0] for out_id in self.outputs] assert len(output_fids) == 1 output_fid = output_fids[0] @@ -187,6 +195,7 @@ def _to_ast_aux_single_stdout_fid(self, edges): ## Auxiliary method that returns any necessary redirections, ## at the moment it doesn't look necessary. 
def _to_ast_aux_get_redirs(self): + ## still used in to_ast ## TODO: Properly handle redirections ## ## TODO: If one of the redirected outputs or inputs is changed in the IR @@ -204,60 +213,30 @@ def _to_ast_aux_get_redirs(self): return [] - ## TODO: Improve this functio to be separately implemented for different special nodes, + ## TODO: Improve this function to be separately implemented for different special nodes, ## such as cat, eager, split, etc... - def to_ast(self, edges, drain_streams): + ## I do not think this is reasonable anymore since we remodelled nodes in a way that the back-translation is trivial + def to_ast(self, edges, drain_streams): ## TODO: We might not want to implement this at all actually if (drain_streams): raise NotImplementedError() else: + # commented since "see above" ## Handle special node to ast here - node = self.special_to_ast(edges) - if node is not None: - return node - + # node = self.special_to_ast(edges) + # if node is not None: + # return node redirs = self._to_ast_aux_get_redirs() assignments = self.com_assignments - ## Start filling in the arguments - opt_arguments = [] - for i, opt in self.com_options: - ## Pad the argument list with None - opt_arguments = pad(opt_arguments, i) - opt_arguments[i] = opt.to_ast() - com_name_ast = self.com_name.to_ast() - option_asts = [opt.to_ast() for _, opt in self.com_options] - - ## - ## 1. Find the input and output fids - ## 2. Construct the rest of the arguments and input/output redirections according to - ## the command IO - input_fids = [edges[in_id][0] for in_id in self.get_input_list()] - output_fids = [edges[out_id][0] for out_id in self.outputs] - rest_argument_fids, new_redirs = create_command_arguments_redirs(com_name_ast, - option_asts, - input_fids, - output_fids) - - ## Transform the rest of the argument fids to arguments - ## Since some of the rest_arguments can be None (they only contain inputs and outputs) - ## we need to make sure that we don't turn None objects to asts. - ## - ## The None fields need to be filtered out because they are taken care of by the interleave function. - ## - ## TODO: Is this actually OK? - rest_arguments = [fid.to_ast() - for fid in rest_argument_fids - if not fid is None] - - ## Interleave the arguments since options args might contain gaps. - arguments = interleave_args(opt_arguments, rest_arguments) - - all_arguments = [com_name_ast] + arguments - all_redirs = redirs + new_redirs - - node = make_command(all_arguments, redirections=all_redirs, assignments=assignments) + node = to_node_cmd_inv_with_io_vars(self.cmd_invocation_with_io_vars, edges, redirs, assignments) + # TODO: think about redirections + # old code for this: + # rest_argument_fids, new_redirs = create_command_arguments_redirs(com_name_ast, + # option_asts, + # input_fids, + # output_fids) return node ## This method applies the redirections to get the correct, inputs, outputs of a node. @@ -280,8 +259,8 @@ def apply_redirections(self, edges): # log(redirection) file_resource = FileResource(redirection.file_arg) success = False - for i in range(len(self.outputs)): - output_edge_id = self.outputs[i] + for i in range(len(self.get_output_list())): + output_edge_id = self.get_output_list()[i] output_fid = edges[output_edge_id][0] if(output_fid.has_file_descriptor_resource() and output_fid.resource.is_stdout()): @@ -315,12 +294,7 @@ def apply_redirections(self, edges): ## ## TODO: Make this a method of graph to change the from, to too. 
def replace_edge(self, from_id, to_id): - new_config_inputs = self.replace_edge_in_list(self.inputs[0], from_id, to_id) - new_standard_inputs = self.replace_edge_in_list(self.inputs[1], from_id, to_id) - new_outputs = self.replace_edge_in_list(self.outputs, from_id, to_id) - - self.set_inputs((new_config_inputs, new_standard_inputs)) - self.outputs = new_outputs + self.cmd_invocation_with_io_vars.replace_var(from_id, to_id) ## TODO: There must be a lib function to do this. def replace_edge_in_list(self, edge_ids, from_id, to_id): @@ -333,33 +307,24 @@ def replace_edge_in_list(self, edge_ids, from_id, to_id): new_edge_ids.append(new_edge_id) return new_edge_ids - ## Get the file names of the outputs of the map commands. This - ## differs if the command is stateless, pure that can be - ## written as a map and a reduce, and a pure that can be - ## written as a generalized map and reduce. - def get_map_output_files(self, input_edge_ids, fileIdGen): - assert(self.is_parallelizable()) - if(self.com_category == "stateless"): - map_output_fids = [fileIdGen.next_ephemeral_file_id() for in_fid in input_edge_ids] - elif(self.is_pure_parallelizable()): - map_output_fids = self.pure_get_map_output_files(input_edge_ids, fileIdGen) - else: - log("Unreachable code reached :(") - assert(False) - ## This should be unreachable - - return map_output_fids - - ## TODO: Fix this somewhere in the annotations and not in the code - def pure_get_map_output_files(self, input_edge_ids, fileIdGen): - assert(self.is_pure_parallelizable()) - - ## The number of the mapper outputs defaults to 1 - if(self.com_mapper is None): - number_outputs = 1 - else: - number_outputs = self.com_mapper.num_outputs - - new_output_fids = [[fileIdGen.next_ephemeral_file_id() for i in range(number_outputs)] - for in_fid in input_edge_ids] - return new_output_fids + def set_used_parallelizer(self, parallelizer): + assert(False) + # TODO: instantiate in __init__ already in some way + self.used_parallelizer = parallelizer + + def get_used_parallelizer(self): + assert(False) + return self.used_parallelizer + + def get_option_implemented_round_robin_parallelizer(self): + for parallelizer in self.parallelizer_list: + splitter = parallelizer.get_splitter() + mapper_spec = parallelizer.get_mapper_spec() + aggregator_spec = parallelizer.get_aggregator_spec() + if splitter.is_splitter_round_robin() and mapper_spec.is_implemented and aggregator_spec.is_implemented: + return parallelizer + return None + + @staticmethod + def make_simple_dfg_node_from_cmd_inv_with_io_vars(cmd_inv_with_io_vars): + return DFGNode(cmd_inv_with_io_vars) \ No newline at end of file diff --git a/compiler/definitions/ir/nodes/cat.py b/compiler/definitions/ir/nodes/cat.py index a898d2dd8..a27b89f4f 100644 --- a/compiler/definitions/ir/nodes/cat.py +++ b/compiler/definitions/ir/nodes/cat.py @@ -2,13 +2,34 @@ class Cat(DFGNode): def __init__(self, inputs, outputs, com_name, com_category, - com_options = [], com_redirs = [], com_assignments=[]): + com_options = [], com_redirs = [], com_assignments=[], + # BEGIN ANNO + flag_option_list = None, + positional_config_list = None, + positional_input_list = None, + positional_output_list = None, + implicit_use_of_stdin = None, + implicit_use_of_stdout = None, + parallelizer_list = None, + cmd_related_properties = None + # END ANNO + ): + assert(False) assert(str(com_name) == "cat") assert(com_category == "stateless") super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, - 
com_assignments=com_assignments) + com_options=com_options, + flag_option_list=flag_option_list, + com_redirs=com_redirs, + com_assignments=com_assignments, + positional_config_list=positional_config_list, + positional_input_list=positional_input_list, + positional_output_list=positional_output_list, + implicit_use_of_stdin=implicit_use_of_stdin, + implicit_use_of_stdout=implicit_use_of_stdout, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties + ) def make_cat_node(inputs, output): com_name = Arg(string_to_argument("cat")) diff --git a/compiler/definitions/ir/nodes/eager.py b/compiler/definitions/ir/nodes/eager.py index d7c70210c..9cc37315f 100644 --- a/compiler/definitions/ir/nodes/eager.py +++ b/compiler/definitions/ir/nodes/eager.py @@ -1,20 +1,118 @@ from definitions.ir.dfg_node import * +from ir_utils import * class Eager(DFGNode): def __init__(self, inputs, outputs, com_name, com_category, com_options = [], - com_redirs = [], com_assignments=[]): - super().__init__(inputs, outputs, com_name, com_category, + com_redirs = [], com_assignments=[], + intermediate = None): + # BEGIN ANNO : hack for intermediate at the end + self.intermediate = intermediate + # END ANNO + super().__init__(inputs, outputs, com_name, com_category, com_options=com_options, com_redirs=com_redirs, com_assignments=com_assignments) + # BEGIN ANNO : copied from DFG node for hack for intermediate at the end + def to_ast(self, edges, drain_streams): + log(f'do we get here?') + ## TODO: We might not want to implement this at all actually + if (drain_streams): + raise NotImplementedError() + else: + ## Handle special node to ast here + # node = self.special_to_ast(edges) + # if node is not None: + # return node + + redirs = self._to_ast_aux_get_redirs() + assignments = self.com_assignments + ## Start filling in the arguments + opt_arguments = [] + # BEGIN ANNO + # get_command_invocation_prefix_from_dfg_node + log(f'com_name: {self.com_name}') + log(f'edges: {edges}') + log(f'inputs: {self.inputs}') + log(f'outputs: {self.outputs}') + log(f'com_redirs: {self.com_redirs}') + log(f'pos config: {self.positional_config_list}') + log(f'pos input: {self.positional_input_list}') + log(f'pos output: {self.positional_output_list}') + log(f'com_options: {self.com_options}') + log(f'flag_option_list: {self.flag_option_list}') + + # if self.implicit_use_of_stdin: # need to recompute + # cat a list of inputs into it; redirect a single one + # else: + + # OLD + # for i, opt in self.com_options: + # ## Pad the argument list with None + # opt_arguments = pad(opt_arguments, i) + # opt_arguments[i] = opt.to_ast() + # log(f'opt_arguments: {format_args([val for val in opt_arguments if val is not None])}') + # NEW + opt_arguments_new = [get_ast_for_flagoption(flagoption) for flagoption in self.flag_option_list] + opt_arguments_new += [get_ast_for_argstringtype(arg) for arg in self.positional_config_list] + log(f'opt_arguments_new: {format_args(opt_arguments_new)}') + # END ANNO + + com_name_ast = self.com_name.to_ast() + option_asts = [opt.to_ast() for _, opt in self.com_options] + + ## + ## 1. Find the input and output fids + ## 2. 
Construct the rest of the arguments and input/output redirections according to + ## the command IO + input_fids = [edges[in_id][0] for in_id in self.get_input_list()] + output_fids = [edges[out_id][0] for out_id in self.outputs] + rest_argument_fids, new_redirs = create_command_arguments_redirs(com_name_ast, + option_asts, + input_fids, + output_fids) + + ## Transform the rest of the argument fids to arguments + ## Since some of the rest_arguments can be None (they only contain inputs and outputs) + ## we need to make sure that we don't turn None objects to asts. + ## + ## The None fields need to be filtered out because they are taken care of by the interleave function. + ## + ## TODO: Is this actually OK? + rest_arguments = [fid.to_ast() + for fid in rest_argument_fids + if not fid is None] + log(f'rest_arguments: {format_args(rest_arguments)}') + + ## Interleave the arguments since options args might contain gaps. + # BEGIN ANNO + rest_arguments_backup = rest_arguments.copy() + # OLD + # arguments = interleave_args(opt_arguments, rest_arguments) + # log(f'arguments fin: {format_args(arguments)}') + # NEW + arguments_new = opt_arguments_new + rest_arguments_backup + [self.intermediate.to_ast()] + log(f'arguments_new: {format_args(arguments_new)}') + # END ANNO + + all_arguments = [com_name_ast] + arguments_new + all_redirs = redirs + new_redirs + + node = make_command(all_arguments, redirections=all_redirs, assignments=assignments) + return node + + def make_eager_node(input_id, output_id, intermediate_file_id, eager_exec_path): com_name = Arg(string_to_argument(eager_exec_path)) com_category = "pure" ## TODO: In theory the intermediate file id is also an output... - com_options = [(2, Arg(intermediate_file_id.to_ast()))] + # BEGIN ANNO + # OLD + intermediate_identifier = Arg(intermediate_file_id.to_ast()) + com_options = [(2, intermediate_identifier)] return Eager([input_id], [output_id], com_name, com_category, - com_options=com_options) + com_options=com_options, + intermediate=intermediate_identifier) diff --git a/compiler/definitions/ir/nodes/pash_split.py b/compiler/definitions/ir/nodes/pash_split.py index 7c6c8d9ba..9c28267d5 100644 --- a/compiler/definitions/ir/nodes/pash_split.py +++ b/compiler/definitions/ir/nodes/pash_split.py @@ -1,3 +1,6 @@ +from datatypes_new.AccessKind import AccessKind +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars + from definitions.ir.file_id import * from definitions.ir.dfg_node import * from ir_utils import string_to_argument @@ -6,19 +9,30 @@ import os class Split(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, com_options = [], - com_redirs = [], com_assignments=[]): - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, - com_assignments=com_assignments) + def __init__(self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None): + # TODO []: default arguments! 
+ super().__init__(cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties) -## TODO: Make a proper splitter subclass of Node def make_split_file(input_id, out_ids): auto_split_bin = os.path.join(config.PASH_TOP, config.config['runtime']['auto_split_binary']) - com_name = Arg(string_to_argument(auto_split_bin)) - com_category = "pure" - return Split([input_id], - out_ids, - com_name, - com_category) + operand_list = [input_id] + operand_list.extend(out_ids) + access_map = {output_id: AccessKind.make_stream_output() for output_id in out_ids} + access_map[input_id] = AccessKind.make_stream_input() + cmd_inv_with_io_vars = CommandInvocationWithIOVars( + cmd_name=auto_split_bin, + flag_option_list=[], + operand_list=operand_list, + implicit_use_of_streaming_input=None, + implicit_use_of_streaming_output=None, + access_map=access_map) + return Split(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/nodes/r_split.py b/compiler/definitions/ir/nodes/r_split.py index 1c29b89b8..68a889f2f 100644 --- a/compiler/definitions/ir/nodes/r_split.py +++ b/compiler/definitions/ir/nodes/r_split.py @@ -1,4 +1,9 @@ import os + +from datatypes_new.AccessKind import AccessKind +from datatypes_new.BasicDatatypes import Operand +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars + import config from definitions.ir.dfg_node import * @@ -6,15 +11,22 @@ from ir_utils import string_to_argument class RSplit(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, com_options = [], - com_redirs = [], com_assignments=[]): - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, - com_assignments=com_assignments) - + def __init__(self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None): + # TODO []: default arguments! + super().__init__(cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties) + ## TODO: Generalize this code (for this and SortGReduce) to be able to add an option to any command. def add_r_flag(self): + assert(False) assert(len(self.com_options) <= 1) ## Add -r in r_split @@ -24,18 +36,23 @@ def add_r_flag(self): ## This is not a proper option check. It just works if the r_flag is added as a separate option. 
def has_r_flag(self): + assert(False) option_strings = [str(opt) for i, opt in self.com_options] return ("-r" in option_strings) -## TODO: Make a proper splitter subclass of Node def make_r_split(input_id, out_ids, r_split_batch_size): r_split_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_split_binary']) - com_name = Arg(string_to_argument(r_split_bin)) - com_category = "pure" - com_option = (1, Arg(string_to_argument(str(r_split_batch_size)))) - return RSplit([input_id], - out_ids, - com_name, - com_category, - com_options=[com_option]) + operand_list = [input_id, + Operand(Arg(string_to_argument(str(r_split_batch_size))))] + operand_list.extend(out_ids) + access_map = {output_id: AccessKind.make_stream_output() for output_id in out_ids} + access_map[input_id] = AccessKind.make_stream_input() + cmd_inv_with_io_vars = CommandInvocationWithIOVars( + cmd_name=r_split_bin, + flag_option_list=[], + operand_list=operand_list, + implicit_use_of_streaming_input=None, + implicit_use_of_streaming_output=None, + access_map=access_map) + return RSplit(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/resource.py b/compiler/definitions/ir/resource.py index ade86870f..999792cd9 100644 --- a/compiler/definitions/ir/resource.py +++ b/compiler/definitions/ir/resource.py @@ -44,6 +44,7 @@ def is_stdout(self): class FileResource(Resource): ## The uri is the path of the file. def __init__(self, path): + log("class of path", type(path)) assert(isinstance(path, Arg)) ## TODO: Make sure that paths are normalized self.uri = path diff --git a/compiler/ir.py b/compiler/ir.py index eb32479f4..641546cfc 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -1,12 +1,31 @@ -import os +# BEGIN ANNO +import sys + +from config import get_path_annotation_repo +sys.path.insert(1, get_path_annotation_repo()) +# for typing +from datatypes_new.CommandInvocationInitial import CommandInvocationInitial +from datatypes_new.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo +from annotation_generation_new.datatypes.InputOutputInfo import InputOutputInfo +from annotation_generation_new.datatypes.ParallelizabilityInfo import ParallelizabilityInfo +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars + +# for use +# -- + +from annotations_utils.util_parsing import parse_arg_list_to_command_invocation +from annotations_utils.util_cmd_invocations import get_input_output_info_from_cmd_invocation_util, get_parallelizability_info_from_cmd_invocation_util +from annotations_utils.util_mapper import get_mapper_as_dfg_node_from_node, get_map_output_files +from annotations_utils.util_aggregator import get_aggregator_as_dfg_node_from_node +from annotations_utils.util_file_descriptors import resource_from_file_descriptor +# END ANNO + +# BEGIN REMODEL + +# END REMODEL -from definitions.ir.arg import * -from definitions.ir.dfg_node import * -from definitions.ir.aggregator_node import * from definitions.ir.file_id import * -from definitions.ir.resource import * from definitions.ir.nodes.cat import * -from definitions.ir.nodes.hdfs_cat import HDFSCat import definitions.ir.nodes.pash_split as pash_split import definitions.ir.nodes.r_merge as r_merge @@ -14,7 +33,6 @@ import definitions.ir.nodes.r_wrap as r_wrap import definitions.ir.nodes.r_unwrap as r_unwrap -from command_categories import * from ir_utils import * from util import * @@ -94,78 +112,126 @@ def create_edges_from_opt_or_fd_list(opt_or_fd_list, edges_dict, options, fileId new_edge_list.append(fid_id) return 
new_edge_list -def find_input_edges(inputs, dfg_edges, options, fileIdGen): - if(isinstance(inputs, list)): - return create_edges_from_opt_or_fd_list(inputs, dfg_edges, options, fileIdGen) - elif(isinstance(inputs, tuple)): - config_inputs = create_edges_from_opt_or_fd_list(inputs[0], dfg_edges, options, fileIdGen) - standard_inputs = create_edges_from_opt_or_fd_list(inputs[1], dfg_edges, options, fileIdGen) - return (config_inputs, standard_inputs) + +def find_input_edges(positional_input_list, implicit_use_of_stdin, dfg_edges, fileIdGen) -> List[int]: + assert (not implicit_use_of_stdin or len(positional_input_list) == 0) + if implicit_use_of_stdin: + resources = [FileDescriptorResource(("fd", 0))] + else: + resources = [resource_from_file_descriptor(input_el) for input_el in positional_input_list] + file_ids = [create_file_id_for_resource(resource, fileIdGen) for resource in resources] + return get_edge_list_from_file_id_list(dfg_edges, file_ids) + + +def find_output_edges(positional_output_list, implicit_use_of_stdout, dfg_edges, fileIdGen) -> List[int]: + assert (not implicit_use_of_stdout or len(positional_output_list) == 0) + if implicit_use_of_stdout: + resources = [FileDescriptorResource(("fd", 1))] + else: + resources = [resource_from_file_descriptor(input_el) for input_el in positional_output_list] + file_ids = [create_file_id_for_resource(resource, fileIdGen) for resource in resources] + return get_edge_list_from_file_id_list(dfg_edges, file_ids) + + +def get_edge_list_from_file_id_list(dfg_edges, file_ids): + new_edge_list = [] + for file_id in file_ids: + fid_id = file_id.get_ident() + dfg_edges[fid_id] = (file_id, None, None) + new_edge_list.append(fid_id) + return new_edge_list + + +def add_file_id_vars(command_invocation_with_io, fileIdGen): + # make pass over everything and create file_id for everything + # only for operands for now: + dfg_edges = {} + new_operand_list = [] + access_map = dict() + + def add_var_for_descriptor(operand): + resource = resource_from_file_descriptor(operand) + file_id = create_file_id_for_resource(resource, fileIdGen) + fid_id = file_id.get_ident() + dfg_edges[fid_id] = (file_id, None, None) + access_map[fid_id] = operand.get_access() + return fid_id + + for i in range(len(command_invocation_with_io.operand_list)): + operand = command_invocation_with_io.operand_list[i] + if isinstance(operand, FileNameWithIOInfo) or isinstance(operand, StdDescriptorWithIOInfo): + fid_id = add_var_for_descriptor(operand) + new_operand_list.append(fid_id) + else: + new_operand_list.append(operand) + if command_invocation_with_io.implicit_use_of_streaming_input: + new_implicit_use_of_streaming_input = add_var_for_descriptor(command_invocation_with_io.implicit_use_of_streaming_input) + else: + new_implicit_use_of_streaming_input = None + if command_invocation_with_io.implicit_use_of_streaming_output: + new_implicit_use_of_streaming_output = add_var_for_descriptor(command_invocation_with_io.implicit_use_of_streaming_output) else: - raise NotImplementedError() + new_implicit_use_of_streaming_output = None + + # this shall become copy-based + command_invocation_with_io_vars = CommandInvocationWithIOVars.get_from_without_vars(command_invocation_with_io, access_map) + command_invocation_with_io_vars.operand_list = new_operand_list + command_invocation_with_io_vars.implicit_use_of_streaming_input = new_implicit_use_of_streaming_input + command_invocation_with_io_vars.implicit_use_of_streaming_output = new_implicit_use_of_streaming_output + return 
command_invocation_with_io_vars, dfg_edges + -## This function creates a DFG with a single node given a command. def compile_command_to_DFG(fileIdGen, command, options, redirections=[]): - ## TODO: There is no need for this redirection here. We can just straight - ## come up with inputs, outputs, options - inputs, out_stream, opt_indices = find_command_input_output(command, options) - # log("Opt indices:", opt_indices, "options:", options) - category = find_command_category(command, options) - com_properties = find_command_properties(command, options) - com_mapper, com_aggregator = find_command_mapper_aggregator(command, options) + command_invocation: CommandInvocationInitial = parse_arg_list_to_command_invocation(command, options) + io_info: InputOutputInfo = get_input_output_info_from_cmd_invocation_util(command_invocation) + para_info: ParallelizabilityInfo = get_parallelizability_info_from_cmd_invocation_util(command_invocation) + command_invocation_with_io = io_info.apply_input_output_info_to_command_invocation(command_invocation) + parallelizer_list, round_robin_compatible_with_cat, is_commutative = para_info.unpack_info() + property_list = [('round_robin_compatible_with_cat', round_robin_compatible_with_cat), + ('is_commutative', is_commutative)] + cmd_related_properties = construct_property_container_from_list_of_properties(property_list) ## TODO: Make an empty IR and add edges and nodes incrementally (using the methods defined in IR). - dfg_edges = {} ## Add all inputs and outputs to the DFG edges - dfg_inputs = find_input_edges(inputs, dfg_edges, options, fileIdGen) - dfg_outputs = create_edges_from_opt_or_fd_list(out_stream, dfg_edges, options, fileIdGen) - - com_name = Arg(command) - com_category = category - - ## Get the options - dfg_options = [get_option(opt_or_fd, options, fileIdGen) - for opt_or_fd in opt_indices] + cmd_invocation_with_io_vars, dfg_edges = add_file_id_vars(command_invocation_with_io, fileIdGen) com_redirs = redirections ## TODO: Add assignments com_assignments = [] ## TODO: Combine them both in a constructor that decided whether to instantiate Cat or DFGNode - if(str(com_name) == "cat"): - dfg_node = Cat(dfg_inputs, - dfg_outputs, - com_name, - ## TODO: We don't really need to pass category, name, or input_consumption for Cat - com_category, - com_options=dfg_options, - com_redirs=com_redirs, - com_assignments=com_assignments) - elif(str(com_name) == "hdfs" and str(dfg_options[0][1]) == "dfs" and str(dfg_options[1][1]) == "-cat"): - dfg_node = HDFSCat(dfg_inputs, - dfg_outputs, - com_name, - com_category, - com_options=dfg_options, - com_redirs=com_redirs, - com_assignments=com_assignments) - else: + # if(str(com_name) == "cat"): + # dfg_node = Cat(dfg_inputs, + # dfg_outputs, + # com_name, + # ## TODO: We don't really need to pass category, name, or input_consumption for Cat + # com_category, + # com_options=dfg_options, + # com_redirs=com_redirs, + # com_assignments=com_assignments, + # ) + # elif(str(com_name) == "hdfs" and str(dfg_options[0][1]) == "dfs" and str(dfg_options[1][1]) == "-cat"): + # dfg_node = HDFSCat(dfg_inputs, + # dfg_outputs, + # com_name, + # com_category, + # com_options=dfg_options, + # com_redirs=com_redirs, + # com_assignments=com_assignments) + # else: + if(True): ## Assume: Everything must be completely expanded ## TODO: Add an assertion about that. 
- dfg_node = DFGNode(dfg_inputs, - dfg_outputs, - com_name, - com_category, - com_properties=com_properties, - com_mapper=com_mapper, - com_aggregator=com_aggregator, - com_options=dfg_options, + dfg_node = DFGNode(cmd_invocation_with_io_vars, com_redirs=com_redirs, - com_assignments=com_assignments) - - if(not dfg_node.is_at_most_pure()): - raise ValueError() + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties + ) + + # if(not dfg_node.is_at_most_pure()): # which consequences has this check had? + # raise ValueError() node_id = dfg_node.get_id() @@ -175,7 +241,7 @@ def compile_command_to_DFG(fileIdGen, command, options, assert(to_node is None) dfg_edges[fid_id] = (fid, from_node, node_id) - for fid_id in dfg_node.outputs: + for fid_id in dfg_node.get_output_list(): fid, from_node, to_node = dfg_edges[fid_id] assert(from_node is None) dfg_edges[fid_id] = (fid, node_id, to_node) @@ -212,15 +278,8 @@ def make_tee(input, outputs): com_name, com_category) -def make_map_node(node, new_inputs, new_outputs): - ## Some nodes have special map commands - if(not node.com_mapper is None): - new_node = MapperNode(node, new_inputs, new_outputs) - else: - new_node = node.copy() - new_node.inputs = new_inputs - new_node.outputs = new_outputs - return new_node +def make_map_node(node, new_inputs, new_outputs, parallelizer): + return get_mapper_as_dfg_node_from_node(node, parallelizer, new_inputs, new_outputs) ## Makes a wrap node that encloses a map parallel node. ## @@ -623,7 +682,7 @@ def get_node_inputs(self, node_id): return input_edge_ids def get_node_outputs(self, node_id): - output_edge_ids = self.nodes[node_id].outputs + output_edge_ids = self.nodes[node_id].get_output_list() return output_edge_ids def get_next_nodes(self, node_id): @@ -658,7 +717,7 @@ def get_node_input_fids(self, node_id): def get_node_output_ids_fids(self, node_id): node = self.get_node(node_id) - return [(output_edge_id, self.edges[output_edge_id][0]) for output_edge_id in node.outputs] + return [(output_edge_id, self.edges[output_edge_id][0]) for output_edge_id in node.get_output_list()] def get_node_output_ids(self, node_id): return [fid_id for fid_id, _fid in self.get_node_output_ids_fids(node_id)] @@ -681,8 +740,8 @@ def remove_node(self, node_id): ## Remove the node in the edges dictionary for in_id in node.get_input_list(): self.set_edge_to(in_id, None) - - for out_id in node.outputs: + + for out_id in node.get_output_list(): self.set_edge_from(out_id, None) @@ -692,15 +751,19 @@ def add_node(self, node): ## Add the node in the edges dictionary for in_id in node.get_input_list(): self.set_edge_to(in_id, node_id) - - for out_id in node.outputs: + + for out_id in node.get_output_list(): self.set_edge_from(out_id, node_id) + def generate_ephemeral_edges(self, fileIdGen, num_of_edges): + file_ids = [fileIdGen.next_ephemeral_file_id() for _ in range(num_of_edges)] + self.add_edges(file_ids) + return [edge_fid.get_ident() for edge_fid in file_ids] def add_edges(self, edge_fids): for edge_fid in edge_fids: self.add_edge(edge_fid) - + def add_edge(self, edge_fid): fid_id = edge_fid.get_ident() assert(not fid_id in self.edges) @@ -743,14 +806,25 @@ def empty(self): ## ## In this case the stateless command is wrapped with wrap so we cannot actually tee the input (since we do not know apriori how many forks we have). ## However, we can actually write it to a file (not always worth performance wise) and then read it from all at once. 
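The generate_ephemeral_edges helper added above is what the later parallelization passes use to mint fresh intermediate edges in one call. A minimal self-contained model of its effect on the edges dictionary, with a toy counter standing in for the compiler's FileIdGen:

    # Minimal model: allocate n fresh ids, register each as a dangling edge
    # (no producer, no consumer yet), and hand the ids back to the caller.
    # ToyFileIdGen is an invented stand-in, not the compiler's FileIdGen.
    class ToyFileIdGen:
        def __init__(self):
            self._next = 0
        def next_ephemeral_file_id(self):
            self._next += 1
            return self._next

    def generate_ephemeral_edges(edges, fid_gen, num_of_edges):
        new_ids = [fid_gen.next_ephemeral_file_id() for _ in range(num_of_edges)]
        for edge_id in new_ids:
            edges[edge_id] = (edge_id, None, None)   # (fid, from_node, to_node)
        return new_ids

    edges = {}
    print(generate_ephemeral_edges(edges, ToyFileIdGen(), 3))   # [1, 2, 3]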
- ## - ## + ## + ## ## TODO: Eventually delete the fileIdGen from here and always use the graph internal one. ## ## TODO: Eventually this should be tunable to not happen for all inputs (but maybe for less) def parallelize_node(self, node_id, fileIdGen): + assert(False) node = self.get_node(node_id) - assert(node.is_parallelizable()) + # BEGIN ANNO + # OLD + # assert(node.is_parallelizable()) + # NEW + log(f'parallelizers: {node.parallelizer_list}') + rr_parallelizer_list = [parallelizer for parallelizer in node.parallelizer_list if parallelizer.splitter.is_splitter_round_robin()] + assert(len(rr_parallelizer_list) == 1) + rr_parallelizer = rr_parallelizer_list[0] + # to have this info later when the merger is created in a reduce tree + node.set_used_parallelizer(rr_parallelizer) + # END ANNO ## Initialize the new_node list new_nodes = [] @@ -765,13 +839,14 @@ def parallelize_node(self, node_id, fileIdGen): previous_node = self.get_node(previous_node_id) assert(isinstance(previous_node, Cat) or isinstance(previous_node, r_merge.RMerge)) - + ## Determine if the previous node is r_merge to determine which of the three parallelization cases to follow r_merge_flag = isinstance(previous_node, r_merge.RMerge) - ## If the previous node of r_merge is an r_split, then we need to replace it with -r, + ## If the previous node of r_merge is an r_split, then we need to replace it with -r, ## instead of doing unwraps. if(r_merge_flag): + assert(False) assert(isinstance(previous_node, r_merge.RMerge)) r_merge_prev_node_ids = self.get_previous_nodes(previous_node_id) @@ -784,16 +859,16 @@ def parallelize_node(self, node_id, fileIdGen): r_split_before_r_merge_opt_flag = all([isinstance(self.get_node(node_id), r_split.RSplit) for node_id in r_merge_prev_node_ids]) - ## If r_split was right before the r_merge, and the node is pure parallelizable, + ## If r_split was right before the r_merge, and the node is pure parallelizable, ## this means that we will not add unwraps, and therefore we need to add the -r flag to r_split. if (r_split_before_r_merge_opt_flag and node.is_pure_parallelizable()): assert(node.is_commutative()) r_split_id = r_merge_prev_node_ids[0] r_split_node = self.get_node(r_split_id) - + ## Add -r flag in r_split - r_split_node.add_r_flag() + r_split_node.add_r_flag() else: r_split_before_r_merge_opt_flag = False @@ -803,7 +878,7 @@ def parallelize_node(self, node_id, fileIdGen): parallelism = len(parallel_input_ids) ## Identify the output. - node_output_edge_ids = node.outputs + node_output_edge_ids = node.get_output_list() assert(len(node_output_edge_ids) == 1) node_output_edge_id = node_output_edge_ids[0] @@ -818,22 +893,31 @@ def parallelize_node(self, node_id, fileIdGen): parallel_configuration_ids = [[] for _ in range(parallelism)] node_conf_inputs = node.get_configuration_inputs() for conf_edge_id in node_conf_inputs: + assert(False) ## TODO: For now this does not work for r_merge assert(not r_merge_flag) # self.set_edge_to(conf_edge_id, None) tee_id = self.tee_edge(conf_edge_id, parallelism, fileIdGen) tee_node = self.get_node(tee_id) for i in range(parallelism): + # TODO outputs probably non-existent parallel_configuration_ids[i].append(tee_node.outputs[i]) - + ## Create a temporary output edge for each parallel command. 
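The r_merge/r_split handling below is easier to follow with the case split spelled out. This is only a paraphrase of the branches described by the comments in the surrounding hunk (names invented), not the literal control flow:

    # Paraphrase: how each parallel copy of the node is built, depending on the
    # merger currently feeding it and the node's properties.
    def parallel_copy_kind(prev_is_r_merge, is_stateless, is_commutative,
                           r_split_directly_before_r_merge):
        if not prev_is_r_merge:
            return "plain map copy (case 1: previous node is a cat)"
        if is_stateless:
            return "r_wrap around each map copy (case 2)"
        if is_commutative and r_split_directly_before_r_merge:
            return "plain map copy; add -r to the upstream r_split (case 3, optimized)"
        if is_commutative:
            return "r_unwrap before the commutative command (case 3)"
        return "not parallelized"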
- map_output_fids = node.get_map_output_files(parallel_input_ids, fileIdGen) + # BEGIN ANNO + # OLD + # map_output_fids = node.get_map_output_files(parallel_input_ids, fileIdGen) + # NEW (added parameter) + map_output_fids = get_map_output_files(node, parallel_input_ids, fileIdGen, rr_parallelizer) + # END ANNO + assert(len(map_output_fids) == len(parallel_input_ids)) all_map_output_ids = [] ## For each parallel input, create a parallel command for index in range(parallelism): ## Gather inputs and outputs conf_ins = parallel_configuration_ids[index] + assert(len(conf_ins) == 0) standard_in = parallel_input_ids[index] new_inputs = (conf_ins, [standard_in]) map_output_fid = map_output_fids[index] @@ -848,9 +932,10 @@ def parallelize_node(self, node_id, fileIdGen): for output_fid in output_fid_list: self.add_edge(output_fid) - ## If the previous merger is r_merge we need to put wrap around the nodes + ## If the previous merger is r_merge we need to put wrap around the nodes ## or unwrap before a commutative command if(r_merge_flag is True): + assert(False) ## For stateless nodes we are in case (2) and we wrap them if (node.is_stateless()): parallel_node = make_wrap_map_node(node, new_inputs, new_output_ids) @@ -888,23 +973,34 @@ def parallelize_node(self, node_id, fileIdGen): parallel_node = unwrap_node else: ## If we are working with a `cat` (and not an r_merge), then we just make a parallel node - parallel_node = make_map_node(node, new_inputs, new_output_ids) + parallel_node = make_map_node(node, new_inputs, new_output_ids, rr_parallelizer) self.add_node(parallel_node) parallel_node_id = parallel_node.get_id() ## Set the to of all input edges for conf_in in conf_ins: + assert(False) self.set_edge_to(conf_in, parallel_node_id) self.set_edge_to(standard_in, parallel_node_id) if (node.com_category == "stateless"): if(r_merge_flag is True): + assert(False) new_merger = r_merge.make_r_merge_node(flatten_list(all_map_output_ids), node_output_edge_id) else: - new_merger = make_cat_node(flatten_list(all_map_output_ids), node_output_edge_id) - + # BEGIN ANNO + # OLD + # new_merger = make_cat_node(flatten_list(all_map_output_ids), node_output_edge_id) + # log(f'old_new_merger: {new_merger}') + # NEW + log(f'node: {node}') + log(f'rr_parallelizer: {rr_parallelizer}') + new_merger = get_aggregator_as_dfg_node_from_node(node, rr_parallelizer, flatten_list(all_map_output_ids), [node_output_edge_id]) + log(f'new_new_merger: {new_merger}') + # END ANNO + self.add_node(new_merger) new_nodes.append(new_merger) self.set_edge_from(node_output_edge_id, new_merger.get_id()) @@ -977,7 +1073,7 @@ def edge_node_consistency(self): for edge_id, (_, from_node_id, to_node_id) in self.edges.items(): if (not from_node_id is None): from_node = self.get_node(from_node_id) - if(not (edge_id in from_node.outputs)): + if(not (edge_id in from_node.get_output_list())): log("Consistency Error: Edge id:", edge_id, "is not in the node outputs:", from_node) return False if (not to_node_id is None): @@ -992,7 +1088,7 @@ def edge_node_consistency(self): if(not (to_node_id == node_id)): log("Consistency Error: The to_node_id of the input_edge:", edge_id, "of the node:", node, "is equal to:", to_node_id) return False - for edge_id in node.outputs: + for edge_id in node.get_output_list(): _, from_node_id, _ = self.edges[edge_id] if(not (from_node_id == node_id)): log("Consistency Error: The from_node_id of the output_edge:", edge_id, "of the node:", node, "is equal to:", from_node_id) @@ -1015,4 +1111,3 @@ def valid(self): # and not 
self.get_stdin() is None # and not self.get_stdout() is None))) - diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index b5f881119..6426cce06 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -14,6 +14,7 @@ from definitions.ir.aggregator_node import * +from definitions.ir.dfg_node import DFGNode from definitions.ir.nodes.eager import * from definitions.ir.nodes.pash_split import * @@ -284,9 +285,10 @@ def naive_parallelize_stateless_nodes_bfs(graph, fan_out, batch_size, no_cat_spl next_node_ids = graph.get_next_nodes(curr_id) workset += next_node_ids - new_nodes = parallelize_cat(curr_id, graph, fileIdGen, - fan_out, batch_size, no_cat_split_vanish, - r_split_flag, r_split_batch_size) + # function application has side effects on graphs + new_nodes = parallelize_node(curr_id, graph, fileIdGen, + fan_out, batch_size, no_cat_split_vanish, + r_split_flag, r_split_batch_size) ## Assert that the graph stayed valid after the transformation ## TODO: Do not run this everytime in the loop if we are not in debug mode. @@ -413,71 +415,66 @@ def split_hdfs_cat_input(hdfs_cat, next_node, graph, fileIdGen): ## TODO: At the moment we greedily try to add r-splits if possible, so we need to have a better procedure of deciding whether to put them or not. ## For example for non-commutative pure commands. -## If the current command is a cat, and is followed by a node that -## is either stateless or pure parallelizable, commute the cat -## after the node. -def parallelize_cat(curr_id, graph, fileIdGen, fan_out, - batch_size, no_cat_split_vanish, r_split_flag, r_split_batch_size): +## This function takes a node (id) and parallelizes it +def parallelize_node(curr_id, graph, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_flag, r_split_batch_size): curr = graph.get_node(curr_id) new_nodes_for_workset = [] - # log("Check to parallelize curr:", curr) + option_parallelizer_rr = curr.get_option_implemented_round_robin_parallelizer() + + if option_parallelizer_rr is not None: + # TODO: this whole fragment could be moved to the graph after picking a parallelizer + # TODO: we only do consecutive chunks here but from a rr splitter + parallelizer_rr = option_parallelizer_rr + streaming_inputs = curr.get_streaming_inputs() + assert(len(streaming_inputs) == 1) + streaming_input = streaming_inputs[0] + configuration_inputs = curr.get_configuration_inputs() + assert(len(configuration_inputs) == 0) + streaming_outputs = curr.get_output_list() + assert(len(streaming_outputs) == 1) + streaming_output = streaming_outputs[0] + original_cmd_invocation_with_io_vars = curr.cmd_invocation_with_io_vars + + graph.remove_node(curr_id) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph + + out_split_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) + splitter = pash_split.make_split_file(streaming_input, out_split_ids) + graph.set_edge_to(streaming_input, splitter.get_id()) + for out_split_id in out_split_ids: + graph.set_edge_from(out_split_id, splitter.get_id()) + + + in_mapper_ids = out_split_ids + out_mapper_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) + zip_mapper_in_out_ids = zip(in_mapper_ids, out_mapper_ids) + + all_mappers = [] + for (in_id, out_id) in zip_mapper_in_out_ids: + # BEGIN: these 4 lines could be refactored to be a function in graph such that + # creating end point of edges and the creation of edges is not decoupled + mapper_cmd_inv = parallelizer_rr.get_actual_mapper(original_cmd_invocation_with_io_vars, in_id, out_id) + mapper = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(mapper_cmd_inv) + graph.set_edge_to(in_id, mapper.get_id()) + graph.set_edge_from(out_id, mapper.get_id()) + # END + all_mappers.append(mapper) + + in_aggregator_ids = out_mapper_ids + out_aggregator_id = streaming_output + ## TODO: This could potentially be an aggregator tree (at least in the old PaSh versions) + ## We need to extend the annotations/parallelizers to support this (e.g., for sort) + aggregator_cmd_inv = parallelizer_rr.get_actual_aggregator(original_cmd_invocation_with_io_vars, in_aggregator_ids, out_aggregator_id) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + for in_aggregator_id in in_aggregator_ids: + graph.set_edge_to(in_aggregator_id, aggregator.get_id()) + graph.set_edge_from(streaming_output, aggregator.get_id()) - ## Get next nodes in the graph - next_node_ids = graph.get_next_nodes(curr_id) - - ## We try to parallelize for all the edges that go out from the current node and into another node - for next_node_id in next_node_ids: - next_node = graph.get_node(next_node_id) - # log("|-- its next node is:", next_node) - new_curr = curr - new_curr_id = curr_id - - ## If the next node can be parallelized, then we should try to parallelize - ## - ## If the user has provided the r_split flag (they want to use r_split), - ## then parallelizability depends on commutativity (if a command is pure parallelizable but not commutative) - ## then it can't be parallelized. Therefore we do not parallelize non-commutative pure parallelizable commands. - ## - ## TODO: We need to extend PaSh to have a mode where it can have both r_splits and auto_split if a command is not - ## commutative. This can be added as an option to the r_split flag, e.g., r_split="no" | "yes" | "optimal". - if(next_node.is_parallelizable() - and not isinstance(next_node, Cat) - and (not r_split_flag - or (next_node.is_commutative() - or next_node.is_stateless()))): - ## If the current node is not a merger, it means that we need - ## to generate a merger using a splitter (auto_split or r_split) - if (isinstance(curr, HDFSCat) and config.pash_args.distributed_exec): - new_curr = split_hdfs_cat_input(curr, next_node, graph, fileIdGen) # Cat merger - new_curr_id = new_curr.get_id() - ## no_cat_split_vanish shortcircuits this and inserts a split even if the current node is a cat. 
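Whatever the command, the replacement subgraph built here has one shape: a splitter feeding fan_out mappers, whose outputs feed a single aggregator. A tiny self-contained rendering of that shape for fan_out = 3 (node and edge names invented; the real nodes come from pash_split.make_split_file, get_actual_mapper and get_actual_aggregator):

    fan_out = 3
    split_outs = [f"split_out_{i}" for i in range(fan_out)]
    map_outs = [f"map_out_{i}" for i in range(fan_out)]

    subgraph = [("splitter", ["streaming_input"], split_outs)]
    subgraph += [(f"mapper_{i}", [si], [mo])
                 for i, (si, mo) in enumerate(zip(split_outs, map_outs))]
    subgraph += [("aggregator", map_outs, ["streaming_output"])]

    for name, ins, outs in subgraph:
        print(f"{name}: {ins} -> {outs}")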
- elif (fan_out > 1 - and (no_cat_split_vanish - or (not (isinstance(curr, Cat) - or isinstance(curr, r_merge.RMerge)) - or ((isinstance(curr, Cat) - or isinstance(curr, r_merge.RMerge)) - and len(curr.get_input_list()) < fan_out)))): - new_merger = split_command_input(next_node, graph, fileIdGen, fan_out, batch_size, r_split_flag, r_split_batch_size) - ## After split has succeeded we know that the curr node (previous of the next) - ## has changed. Therefore we need to retrieve it again. - if (not new_merger is None): - new_curr_id = new_merger.get_id() - new_curr = new_merger - assert(isinstance(new_curr, Cat) - or isinstance(new_curr, r_merge.RMerge)) - - ## If curr is cat, it means that split suceeded, or it was - ## already a cat. In any case, we can proceed with the - ## parallelization. - ## - ## Both Cat and RMerge can be "commuted" with parallelizable nodes - if(isinstance(new_curr, Cat) - or isinstance(new_curr, r_merge.RMerge)): - new_nodes = check_parallelize_dfg_node(new_curr_id, next_node_id, graph, fileIdGen) - # log("New nodes:", new_nodes) - new_nodes_for_workset += new_nodes + ## Add the merge commands in the graph + new_nodes = [splitter] + all_mappers + [aggregator] + for new_node in new_nodes: + graph.add_node(new_node) return new_nodes_for_workset @@ -491,6 +488,7 @@ def parallelize_cat(curr_id, graph, fileIdGen, fan_out, ## ## TODO: We need to check if the previous node is a cat or a merge def check_parallelize_dfg_node(merger_id, node_id, graph, fileIdGen): + assert(False) ## Get merger inputs (cat or r_merge). merger_input_edge_ids = graph.get_node_input_ids(merger_id) @@ -511,6 +509,7 @@ def check_parallelize_dfg_node(merger_id, node_id, graph, fileIdGen): return new_nodes def parallelize_dfg_node(old_merger_id, node_id, graph, fileIdGen): + assert(False) node = graph.get_node(node_id) assert(node.is_parallelizable()) @@ -749,18 +748,18 @@ def add_eager_nodes(graph, use_dgsh_tee): add_eager(curr_input_id, graph, fileIdGen, intermediateFileIdGen, use_dgsh_tee) if(isinstance(curr, Split)): - eager_input_ids = curr.outputs[:-1] + eager_input_ids = curr.get_output_list()[:-1] for edge_id in eager_input_ids: add_eager(edge_id, graph, fileIdGen, intermediateFileIdGen, use_dgsh_tee) ## Add an eager after r_unwrap if(isinstance(curr, r_unwrap.RUnwrap)): - eager_input_id = curr.outputs[0] + eager_input_id = curr.get_output_list()[0] add_eager(eager_input_id, graph, fileIdGen, intermediateFileIdGen, use_dgsh_tee) ## Add an eager after r_split if(isinstance(curr, r_split.RSplit)): - eager_input_ids = curr.outputs + eager_input_ids = curr.get_output_list() for edge_id in eager_input_ids: add_eager(edge_id, graph, fileIdGen, intermediateFileIdGen, use_dgsh_tee) diff --git a/compiler/util.py b/compiler/util.py index a6b3857a9..1c4b30ace 100644 --- a/compiler/util.py +++ b/compiler/util.py @@ -1,4 +1,6 @@ from datetime import timedelta +from typing import Optional, TypeVar, Union, List, Any +TType = TypeVar("TType") import os import sys import config @@ -45,3 +47,16 @@ def ptempfile(): ## TODO: Get a name without opening the fd too if possible os.close(fd) return name + +def return_empty_list_if_none_else_itself(arg: Optional[TType]) -> Union[TType, List[Any]]: #list always empty + if arg is None: + return [] + else: + return arg + +def return_default_if_none_else_itself(arg: Optional[TType], default: TType) -> TType: + if arg is None: + return default + else: + return arg + From 8407ca51927d1a3b41ed4627db7c361cbdcaf48a Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Mon, 27 Jun 
2022 14:56:08 -0400 Subject: [PATCH 02/64] Add support for eager-nodes (#589) * Add support for eager-nodes Signed-off-by: Felix Stutz * Remove eager from TODO-list Signed-off-by: Felix Stutz * Addressed comments from PR Signed-off-by: Felix Stutz --- TODO.md | 2 +- compiler/definitions/ir/nodes/eager.py | 136 +++++-------------------- compiler/ir.py | 4 + compiler/pash_runtime.py | 10 +- 4 files changed, 37 insertions(+), 115 deletions(-) diff --git a/TODO.md b/TODO.md index 67b55f0ea..08bb235b8 100644 --- a/TODO.md +++ b/TODO.md @@ -1,9 +1,9 @@ ## TODOs before merging to `future` -- eager - aggregation trees - r_split - cat-split fusion +- dgsh_tee - working on all tests - Adding annotation library installation and removing ad-hoc import of the latter - clean up utils for annotations diff --git a/compiler/definitions/ir/nodes/eager.py b/compiler/definitions/ir/nodes/eager.py index 9cc37315f..2a5ee5aa9 100644 --- a/compiler/definitions/ir/nodes/eager.py +++ b/compiler/definitions/ir/nodes/eager.py @@ -1,118 +1,32 @@ +from datatypes_new.AccessKind import AccessKind +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars + from definitions.ir.dfg_node import * -from ir_utils import * class Eager(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, com_options = [], - com_redirs = [], com_assignments=[], - intermediate = None): - # BEGIN ANNO : hack for intermediate at the end - self.intermediate = intermediate - # END ANNO - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, + def __init__(self, + cmd_invocation_with_io_vars, + com_redirs=[], com_assignments=[] + ): + # TODO []: default + super().__init__( + cmd_invocation_with_io_vars, + com_redirs=com_redirs, com_assignments=com_assignments) - # BEGIN ANNO : copied from DFG node for hack for intermediate at the end - def to_ast(self, edges, drain_streams): - log(f'do we get here?') - ## TODO: We might not want to implement this at all actually - if (drain_streams): - raise NotImplementedError() - else: - ## Handle special node to ast here - # node = self.special_to_ast(edges) - # if node is not None: - # return node - - redirs = self._to_ast_aux_get_redirs() - assignments = self.com_assignments - ## Start filling in the arguments - opt_arguments = [] - # BEGIN ANNO - # get_command_invocation_prefix_from_dfg_node - log(f'com_name: {self.com_name}') - log(f'edges: {edges}') - log(f'inputs: {self.inputs}') - log(f'outputs: {self.outputs}') - log(f'com_redirs: {self.com_redirs}') - log(f'pos config: {self.positional_config_list}') - log(f'pos input: {self.positional_input_list}') - log(f'pos output: {self.positional_output_list}') - log(f'com_options: {self.com_options}') - log(f'flag_option_list: {self.flag_option_list}') - - # if self.implicit_use_of_stdin: # need to recompute - # cat a list of inputs into it; redirect a single one - # else: - - # OLD - # for i, opt in self.com_options: - # ## Pad the argument list with None - # opt_arguments = pad(opt_arguments, i) - # opt_arguments[i] = opt.to_ast() - # log(f'opt_arguments: {format_args([val for val in opt_arguments if val is not None])}') - # NEW - opt_arguments_new = [get_ast_for_flagoption(flagoption) for flagoption in self.flag_option_list] - opt_arguments_new += [get_ast_for_argstringtype(arg) for arg in self.positional_config_list] - log(f'opt_arguments_new: {format_args(opt_arguments_new)}') - # END ANNO - - com_name_ast = self.com_name.to_ast() - option_asts = [opt.to_ast() for 
_, opt in self.com_options] - - ## - ## 1. Find the input and output fids - ## 2. Construct the rest of the arguments and input/output redirections according to - ## the command IO - input_fids = [edges[in_id][0] for in_id in self.get_input_list()] - output_fids = [edges[out_id][0] for out_id in self.outputs] - rest_argument_fids, new_redirs = create_command_arguments_redirs(com_name_ast, - option_asts, - input_fids, - output_fids) - - ## Transform the rest of the argument fids to arguments - ## Since some of the rest_arguments can be None (they only contain inputs and outputs) - ## we need to make sure that we don't turn None objects to asts. - ## - ## The None fields need to be filtered out because they are taken care of by the interleave function. - ## - ## TODO: Is this actually OK? - rest_arguments = [fid.to_ast() - for fid in rest_argument_fids - if not fid is None] - log(f'rest_arguments: {format_args(rest_arguments)}') - - ## Interleave the arguments since options args might contain gaps. - # BEGIN ANNO - rest_arguments_backup = rest_arguments.copy() - # OLD - # arguments = interleave_args(opt_arguments, rest_arguments) - # log(f'arguments fin: {format_args(arguments)}') - # NEW - arguments_new = opt_arguments_new + rest_arguments_backup + [self.intermediate.to_ast()] - log(f'arguments_new: {format_args(arguments_new)}') - # END ANNO - - all_arguments = [com_name_ast] + arguments_new - all_redirs = redirs + new_redirs - - node = make_command(all_arguments, redirections=all_redirs, assignments=assignments) - return node - def make_eager_node(input_id, output_id, intermediate_file_id, eager_exec_path): - com_name = Arg(string_to_argument(eager_exec_path)) - com_category = "pure" - ## TODO: In theory the intermediate file id is also an output... - # BEGIN ANNO - # OLD - intermediate_identifier = Arg(intermediate_file_id.to_ast()) - com_options = [(2, intermediate_identifier)] - return Eager([input_id], - [output_id], - com_name, - com_category, - com_options=com_options, - intermediate=intermediate_identifier) + eager_name = eager_exec_path + intermediate_file_id_id = intermediate_file_id.get_ident() + operand_list = [input_id, output_id, intermediate_file_id_id] + access_map = {output_id: AccessKind.make_stream_output(), + input_id: AccessKind.make_stream_input(), + intermediate_file_id_id: AccessKind.make_other_output()} + cmd_inv_with_io_vars = CommandInvocationWithIOVars( + cmd_name=eager_name, + flag_option_list=[], + operand_list=operand_list, + implicit_use_of_streaming_input=None, + implicit_use_of_streaming_output=None, + access_map=access_map) + return Eager(cmd_inv_with_io_vars) diff --git a/compiler/ir.py b/compiler/ir.py index 641546cfc..d5a395462 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -72,6 +72,10 @@ def next_ephemeral_file_id(self): fileId.make_ephemeral() return fileId + def bump_counter_to_value_of(self, OtherFileIdGen): + # TODO: find a better solution to make unique numbers, currently: set to max-value + 1 + self.next = OtherFileIdGen.next + 1 + ## Returns the resource or file descriptor related to this specific opt_or_fd ## NOTE: Assumes that everything is expanded. 
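The rewritten make_eager_node above no longer needs a node-specific to_ast: it lists all three operands (input, output, intermediate buffer file) and records, per edge id, how the command accesses each one, so the generic back-translation can emit the right words and redirections. Illustratively, with invented ids and plain strings standing in for the AccessKind values:

    # Operand layout of the eager node: [input, output, intermediate buffer file].
    input_id, output_id, intermediate_id = 11, 12, 13
    operand_list = [input_id, output_id, intermediate_id]
    access_map = {
        input_id:        "stream input",    # read as the streaming input
        output_id:       "stream output",   # written as the streaming output
        intermediate_id: "other output",    # buffer file: written, but not a stream edge
    }
    print(operand_list, access_map)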
def get_option_or_fd(opt_or_fd, options, fileIdGen): diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index 6426cce06..06d66c129 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -218,8 +218,7 @@ def optimize_irs(asts_and_irs, args, compiler_config): args.no_cat_split_vanish, args.r_split, args.r_split_batch_size) # pr.print_stats() - # log(distributed_graph) - + # Eagers are added in remote notes when using distributed exec if(not args.no_eager and not args.distributed_exec): eager_distributed_graph = add_eager_nodes(distributed_graph, args.dgsh_tee) @@ -231,7 +230,6 @@ def optimize_irs(asts_and_irs, args, compiler_config): ## Print statistics of output nodes print_graph_statistics(eager_distributed_graph) - # log(eager_distributed_graph) optimized_asts_and_irs.append(eager_distributed_graph) else: @@ -686,12 +684,18 @@ def add_eager(eager_input_id, graph, fileIdGen, intermediateFileIdGen, use_dgsh_ new_id = new_fid.get_ident() if use_dgsh_tee: + assert(False) ## TODO: seperate to better use dgsh-tee params and maybe deprecate eager eager_node = dgsh_tee.make_dgsh_tee_node(eager_input_id, new_id) else: ## TODO: Remove the line below if eager creates its intermediate file ## on its own. + # TODO: find a better solution to make unique numbers, currently: set to max-value + 1 + intermediateFileIdGen.bump_counter_to_value_of(fileIdGen) intermediate_fid = intermediateFileIdGen.next_temporary_file_id() + # TODO: this edge will never have to since eager is set to output even though it reads from it + graph.add_edge(intermediate_fid) + fileIdGen.bump_counter_to_value_of(intermediateFileIdGen) eager_exec_path = '{}/{}'.format(config.PASH_TOP, runtime_config['eager_executable_path']) From d92cbba6fe30c6c9a703fb25818e6d76cc7c1911 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Mon, 27 Jun 2022 19:03:28 -0400 Subject: [PATCH 03/64] Add reduce/aggregation trees (#590) * Add reduce trees Signed-off-by: Felix Stutz * Remove task in TODO Signed-off-by: Felix Stutz * Remove log statements Signed-off-by: Felix Stutz --- TODO.md | 1 - .../annotations_utils/util_cmd_invocations.py | 11 --- compiler/ir.py | 89 +++++++++++++++-- compiler/pash_runtime.py | 98 +++++-------------- 4 files changed, 106 insertions(+), 93 deletions(-) diff --git a/TODO.md b/TODO.md index 08bb235b8..9edb7d878 100644 --- a/TODO.md +++ b/TODO.md @@ -1,6 +1,5 @@ ## TODOs before merging to `future` -- aggregation trees - r_split - cat-split fusion - dgsh_tee diff --git a/compiler/annotations_utils/util_cmd_invocations.py b/compiler/annotations_utils/util_cmd_invocations.py index 90e5f6c10..85cc36b41 100644 --- a/compiler/annotations_utils/util_cmd_invocations.py +++ b/compiler/annotations_utils/util_cmd_invocations.py @@ -9,8 +9,6 @@ from annotation_generation_new.AnnotationGeneration import get_input_output_info_from_cmd_invocation, \ get_parallelizability_info_from_cmd_invocation -from util import log - from config import get_path_annotation_repo sys.path.insert(1, get_path_annotation_repo()) @@ -26,18 +24,11 @@ def get_command_invocation_prefix_from_dfg_node(dfg_node): # TODO: ideally methods in the respective classes but requires refactoring of parsing infrastructure def to_node_cmd_inv_with_io_vars(cmd_inv, edges, redirs, assignments): - log("edges", edges) ast_cmd_name = string_to_argument(cmd_inv.cmd_name) - log("ast_cmd_name", ast_cmd_name) ast_flagoptions = [] for flagoption in cmd_inv.flag_option_list: ast_flagoptions += to_ast_flagoption(flagoption, edges) - log("flagoptions", 
cmd_inv.flag_option_list) - log("ast_flagoptions", ast_flagoptions) ast_operands = [to_ast_operand(operand, edges) for operand in cmd_inv.operand_list] - log("operands", cmd_inv.operand_list) - log("ast_operands", ast_operands) - # log("type of ast_operands [0]", type(ast_operands[0])) # can only be used if there are operands cmd_asts = [ast_cmd_name] + ast_flagoptions + ast_operands # TODO: check for actual stdin @@ -56,7 +47,6 @@ def to_node_cmd_inv_with_io_vars(cmd_inv, edges, redirs, assignments): new_redirs = redirs + stdin_redir + stdout_redir node = make_command(cmd_asts, redirections=new_redirs, assignments=assignments) - log("node", node) return node def to_ast_flagoption(flagoption, _edges): @@ -82,7 +72,6 @@ def to_ast_arg_string_type(arg_string_type): # assumes io_var is an edge id def dereference_io_var(io_var, edges): fid, _, _ = edges[io_var] - log(fid) return fid.to_ast() def get_input_output_info_from_cmd_invocation_util(cmd_invocationInitial : CommandInvocationInitial) -> InputOutputInfo: diff --git a/compiler/ir.py b/compiler/ir.py index d5a395462..0b8938c0f 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -289,8 +289,6 @@ def make_map_node(node, new_inputs, new_outputs, parallelizer): ## ## At the moment it only works with one input and one output since wrap cannot redirect input in the command. def make_wrap_map_node(node, new_inputs, new_outputs): - # log("Inputs:", new_inputs) - # log("Outputs:", new_outputs) assert(is_single_input(new_inputs)) assert(len(new_outputs) == 1) @@ -318,8 +316,6 @@ def __init__(self, nodes, edges, background = False): self.nodes = nodes self.edges = edges self.background = background - # log("Nodes:", self.nodes) - # log("Edges:", self.edges) ## Apply the redirections for each separate node. ## This needs to be called here because nodes do not @@ -822,7 +818,6 @@ def parallelize_node(self, node_id, fileIdGen): # OLD # assert(node.is_parallelizable()) # NEW - log(f'parallelizers: {node.parallelizer_list}') rr_parallelizer_list = [parallelizer for parallelizer in node.parallelizer_list if parallelizer.splitter.is_splitter_round_robin()] assert(len(rr_parallelizer_list) == 1) rr_parallelizer = rr_parallelizer_list[0] @@ -997,12 +992,8 @@ def parallelize_node(self, node_id, fileIdGen): # BEGIN ANNO # OLD # new_merger = make_cat_node(flatten_list(all_map_output_ids), node_output_edge_id) - # log(f'old_new_merger: {new_merger}') # NEW - log(f'node: {node}') - log(f'rr_parallelizer: {rr_parallelizer}') new_merger = get_aggregator_as_dfg_node_from_node(node, rr_parallelizer, flatten_list(all_map_output_ids), [node_output_edge_id]) - log(f'new_new_merger: {new_merger}') # END ANNO self.add_node(new_merger) @@ -1115,3 +1106,83 @@ def valid(self): # and not self.get_stdin() is None # and not self.get_stdout() is None))) + ## This is a function that creates a reduce tree for a given node + def create_generic_aggregator_tree(self, curr_node, parallelizer, input_ids_for_aggregators, out_aggregator_id, fileIdGen): + def function_to_get_binary_aggregator(in_ids, out_ids): + assert(len(out_ids) == 1) + aggregator_cmd_inv = parallelizer.get_actual_aggregator(curr_node.cmd_invocation_with_io_vars, in_ids, out_ids[0]) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + return aggregator + ## The Aggregator node takes a sequence of input ids and an output id + all_aggregators, new_edges, final_output_id = self.create_reduce_tree(lambda in_ids, out_ids: function_to_get_binary_aggregator(in_ids, out_ids), + 
input_ids_for_aggregators, fileIdGen) + ## Add the edges in the graph + self.add_edges(new_edges) + ## Add the merge commands in the graph + for new_node in all_aggregators: + self.add_node(new_node) + + ## Replace the previous final_output_id with the previous id + node_output_edge_id = out_aggregator_id + final_merge_node_id = self.edges[final_output_id][1] + final_merge_node = self.get_node(final_merge_node_id) + final_merge_node.replace_edge(final_output_id, node_output_edge_id) + self.set_edge_from(node_output_edge_id, final_merge_node_id) + self.set_edge_from(final_output_id, None) + + ## This function creates the reduce tree. Both input and output file + ## ids must be lists of lists, as the input file ids and the output + ## file ids might contain auxiliary files. + def create_reduce_tree(self, init_func, input_ids, fileIdGen): + tree = [] + new_edges = [] + curr_ids = input_ids + while(len(curr_ids) > 1): + new_level, curr_ids, new_fids = self.create_reduce_tree_level(init_func, curr_ids, fileIdGen) + tree += new_level + new_edges += new_fids + + # Find the final output (provided with parameter) + final_output_id = curr_ids[0][0] + + ## Drain the final auxiliary outputs + final_auxiliary_outputs = curr_ids[0][1:] + drain_fids = [fileIdGen.next_file_id() + for final_auxiliary_output in final_auxiliary_outputs] + for drain_fid in drain_fids: + drain_fid.set_resource(FileResource(Arg(string_to_argument('/dev/null')))) + new_edges.append(drain_fid) + drain_ids = [fid.get_ident() for fid in drain_fids] + + drain_cat_commands = [make_cat_node([final_auxiliary_output], drain_id) + for final_auxiliary_output, drain_id in zip(final_auxiliary_outputs, drain_ids)] + return (tree + drain_cat_commands), new_edges, final_output_id + + @staticmethod + ## This function creates a level of the reduce tree. Both input and + ## output file ids must be lists of lists, as the input file ids and + ## the output file ids might contain auxiliary files. 
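create_reduce_tree pairs up intermediate outputs level by level until a single output remains, carrying the first id over whenever a level has an odd number of inputs. The following is a self-contained model of just that pairing (plain ints instead of edge-id lists, a string instead of the binary aggregator node, and no auxiliary outputs):

    def reduce_tree_levels(ids):
        levels = []
        while len(ids) > 1:
            carry = [] if len(ids) % 2 == 0 else [ids[0]]
            rest = ids[len(carry):]
            ids = carry + [f"agg({a},{b})" for a, b in zip(rest[0::2], rest[1::2])]
            levels.append(ids)
        return levels

    print(reduce_tree_levels([1, 2, 3, 4, 5]))
    # first level: [1, 'agg(2,3)', 'agg(4,5)']; the last level is the single combined output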
+ def create_reduce_tree_level(init_func, input_ids, fileIdGen): + if(len(input_ids) % 2 == 0): + output_ids = [] + even_input_ids = input_ids + else: + output_ids = [input_ids[0]] + even_input_ids = input_ids[1:] + + new_fids = [] + level = [] + for i in range(0, len(even_input_ids), 2): + new_out_fids = [fileIdGen.next_ephemeral_file_id() for _ in input_ids[i]] + new_fids += new_out_fids + new_out_ids = [fid.get_ident() for fid in new_out_fids] + output_ids.append(new_out_ids) + new_node = IR.create_reduce_node(init_func, even_input_ids[i:i+2], new_out_ids) + level.append(new_node) + return (level, output_ids, new_fids) + + @staticmethod + ## This function creates one node of the reduce tree + def create_reduce_node(init_func, input_ids, output_ids): + return init_func(flatten_list(input_ids), output_ids) + # TODO: this is where we need to use our aggregator spec/node diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index 06d66c129..f8045d5e8 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -4,6 +4,8 @@ import traceback from datetime import datetime +from annotation_generation_new.datatypes.parallelizability.AggregatorKind import AggregatorKindEnum + import config from ir import * from ast_to_ir import compile_asts @@ -442,7 +444,7 @@ def parallelize_node(curr_id, graph, fileIdGen, fan_out, graph.set_edge_to(streaming_input, splitter.get_id()) for out_split_id in out_split_ids: graph.set_edge_from(out_split_id, splitter.get_id()) - + graph.add_node(splitter) in_mapper_ids = out_split_ids out_mapper_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) @@ -458,21 +460,32 @@ def parallelize_node(curr_id, graph, fileIdGen, fan_out, graph.set_edge_from(out_id, mapper.get_id()) # END all_mappers.append(mapper) + for new_node in all_mappers: + graph.add_node(new_node) in_aggregator_ids = out_mapper_ids out_aggregator_id = streaming_output ## TODO: This could potentially be an aggregator tree (at least in the old PaSh versions) ## We need to extend the annotations/parallelizers to support this (e.g., for sort) - aggregator_cmd_inv = parallelizer_rr.get_actual_aggregator(original_cmd_invocation_with_io_vars, in_aggregator_ids, out_aggregator_id) - aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) - for in_aggregator_id in in_aggregator_ids: - graph.set_edge_to(in_aggregator_id, aggregator.get_id()) - graph.set_edge_from(streaming_output, aggregator.get_id()) + aggregator_spec = parallelizer_rr.get_aggregator_spec() + aggregator_kind = aggregator_spec.get_kind() + if aggregator_kind == AggregatorKindEnum.CONCATENATE or aggregator_kind == AggregatorKindEnum.CUSTOM_N_ARY: + aggregator_cmd_inv = parallelizer_rr.get_actual_aggregator(original_cmd_invocation_with_io_vars, in_aggregator_ids, out_aggregator_id) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + for in_aggregator_id in in_aggregator_ids: + graph.set_edge_to(in_aggregator_id, aggregator.get_id()) + graph.set_edge_from(streaming_output, aggregator.get_id()) + all_aggregators = [aggregator] + ## Add the merge commands in the graph + for new_node in all_aggregators: + graph.add_node(new_node) + elif aggregator_kind == AggregatorKindEnum.CUSTOM_2_ARY: + # TODO: we simplify and assume that every mapper produces a single output for now: + map_in_aggregator_ids = [[id] for id in in_aggregator_ids] + graph.create_generic_aggregator_tree(curr, parallelizer_rr, map_in_aggregator_ids, out_aggregator_id, fileIdGen) + else: + raise 
Exception("aggregator kind not yet implemented") - ## Add the merge commands in the graph - new_nodes = [splitter] + all_mappers + [aggregator] - for new_node in new_nodes: - graph.add_node(new_node) return new_nodes_for_workset @@ -579,28 +592,24 @@ def parallelize_dfg_node(old_merger_id, node_id, graph, fileIdGen): ## ## TODO: Make that generic to work through annotations def create_merge_commands(curr, new_output_ids, fileIdGen): + assert(False) if(str(curr.com_name) == "uniq"): return create_uniq_merge_commands(curr, new_output_ids, fileIdGen) else: return create_generic_aggregator_tree(curr, new_output_ids, fileIdGen) -## This is a function that creates a reduce tree for a generic function -def create_generic_aggregator_tree(curr, new_output_ids, fileIdGen): - ## The Aggregator node takes a sequence of input ids and an output id - output = create_reduce_tree(lambda in_ids, out_ids: AggregatorNode(curr, in_ids, out_ids), - new_output_ids, fileIdGen) - return output - ## TODO: These must be generated using some file information ## ## TODO: Find a better place to put these functions def create_sort_merge_commands(curr, new_output_ids, fileIdGen): + assert(False) output = create_reduce_tree(lambda ids: SortGReduce(curr, ids), new_output_ids, fileIdGen) return output ## Instead of creating a tree, we just create a single level reducer for uniq def create_uniq_merge_commands(curr, new_output_ids, fileIdGen): + assert(False) ## Make an intermediate cat node intermediate_fid = fileIdGen.next_ephemeral_file_id() intermediate_id = intermediate_fid.get_ident() @@ -622,61 +631,6 @@ def create_uniq_merge_commands(curr, new_output_ids, fileIdGen): return ([new_cat, node], [intermediate_fid, new_out_fid], new_out_id) -## This function creates the reduce tree. Both input and output file -## ids must be lists of lists, as the input file ids and the output -## file ids might contain auxiliary files. -def create_reduce_tree(init_func, input_ids, fileIdGen): - tree = [] - new_edges = [] - curr_ids = input_ids - while(len(curr_ids) > 1): - new_level, curr_ids, new_fids = create_reduce_tree_level(init_func, curr_ids, fileIdGen) - tree += new_level - new_edges += new_fids - - ## Find the final output - final_output_id = curr_ids[0][0] - - ## Drain the final auxiliary outputs - final_auxiliary_outputs = curr_ids[0][1:] - drain_fids = [fileIdGen.next_file_id() - for final_auxiliary_output in final_auxiliary_outputs] - for drain_fid in drain_fids: - drain_fid.set_resource(FileResource(Arg(string_to_argument('/dev/null')))) - new_edges.append(drain_fid) - drain_ids = [fid.get_ident() for fid in drain_fids] - - drain_cat_commands = [make_cat_node([final_auxiliary_output], drain_id) - for final_auxiliary_output, drain_id in zip(final_auxiliary_outputs, drain_ids)] - return (tree + drain_cat_commands), new_edges, final_output_id - - -## This function creates a level of the reduce tree. Both input and -## output file ids must be lists of lists, as the input file ids and -## the output file ids might contain auxiliary files. 
-def create_reduce_tree_level(init_func, input_ids, fileIdGen): - if(len(input_ids) % 2 == 0): - output_ids = [] - even_input_ids = input_ids - else: - output_ids = [input_ids[0]] - even_input_ids = input_ids[1:] - - new_fids = [] - level = [] - for i in range(0, len(even_input_ids), 2): - new_out_fids = [fileIdGen.next_ephemeral_file_id() for _ in input_ids[i]] - new_fids += new_out_fids - new_out_ids = [fid.get_ident() for fid in new_out_fids] - output_ids.append(new_out_ids) - new_node = create_reduce_node(init_func, even_input_ids[i:i+2], new_out_ids) - level.append(new_node) - return (level, output_ids, new_fids) - -## This function creates one node of the reduce tree -def create_reduce_node(init_func, input_ids, output_ids): - return init_func(flatten_list(input_ids), output_ids) - ## This functions adds an eager on a given edge. def add_eager(eager_input_id, graph, fileIdGen, intermediateFileIdGen, use_dgsh_tee): From 2b9935d487fa716aa7a4bb7466f2e2953127c702 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Tue, 28 Jun 2022 16:09:34 -0400 Subject: [PATCH 04/64] Add support for round-robin parallelization, including unwrap for commutative commands (#591) Signed-off-by: Felix Stutz --- TODO.md | 4 +- .../annotations_utils/util_cmd_invocations.py | 57 ++++++++++- compiler/definitions/ir/dfg_node.py | 11 +-- compiler/definitions/ir/nodes/eager.py | 3 +- compiler/definitions/ir/nodes/r_merge.py | 38 +++++--- compiler/definitions/ir/nodes/r_unwrap.py | 39 +++++--- compiler/definitions/ir/nodes/r_wrap.py | 90 ++++++++++-------- compiler/ir.py | 2 +- compiler/ir_utils.py | 1 + compiler/pash_runtime.py | 94 +++++++++++++++++-- 10 files changed, 254 insertions(+), 85 deletions(-) diff --git a/TODO.md b/TODO.md index 9edb7d878..135a4cf03 100644 --- a/TODO.md +++ b/TODO.md @@ -1,8 +1,8 @@ ## TODOs before merging to `future` -- r_split -- cat-split fusion - dgsh_tee +- cat-split fusion +- r-unwrap-commutative fusion - working on all tests - Adding annotation library installation and removing ad-hoc import of the latter - clean up utils for annotations diff --git a/compiler/annotations_utils/util_cmd_invocations.py b/compiler/annotations_utils/util_cmd_invocations.py index 85cc36b41..19d2d6c0f 100644 --- a/compiler/annotations_utils/util_cmd_invocations.py +++ b/compiler/annotations_utils/util_cmd_invocations.py @@ -1,6 +1,6 @@ import sys -from datatypes_new.BasicDatatypes import Flag +from datatypes_new.BasicDatatypes import Flag, ArgStringType, Operand from datatypes_new.BasicDatatypesWithIO import OptionWithIO from datatypes_new.CommandInvocationInitial import CommandInvocationInitial from annotation_generation_new.datatypes.InputOutputInfo import InputOutputInfo @@ -8,8 +8,11 @@ from annotation_generation_new.datatypes.CommandProperties import CommandProperties from annotation_generation_new.AnnotationGeneration import get_input_output_info_from_cmd_invocation, \ get_parallelizability_info_from_cmd_invocation +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars from config import get_path_annotation_repo +from definitions.ir.arg import Arg + sys.path.insert(1, get_path_annotation_repo()) # for typing @@ -23,6 +26,7 @@ def get_command_invocation_prefix_from_dfg_node(dfg_node): positional_config_list = dfg_node.positional_config_list) # TODO: ideally methods in the respective classes but requires refactoring of parsing infrastructure +# TODO: isn't this `to_ast`? 
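The function defined just below rebuilds a shell command from a CommandInvocationWithIOVars by emitting the command name, flags/options and operands, and turning implicit streaming edges into redirections. Its effect can be shown with a throwaway string-based renderer (the real code produces AST nodes and resolves edge ids through the edges dictionary; file names here are invented):

    # Toy renderer: edge ids are looked up in `edges`; an implicit streaming output
    # becomes a `>` redirection, mirroring the stdout redirection added in the real code.
    def render(cmd_name, flags, operands, edges, implicit_stdout=None):
        words = [cmd_name] + flags + [edges.get(op, str(op)) for op in operands]
        redir = f" > {edges[implicit_stdout]}" if implicit_stdout is not None else ""
        return " ".join(words) + redir

    edges = {3: "part_1.txt", 4: "part_2.txt", 9: "merged.txt"}
    print(render("sort", ["-m"], [3, 4], edges, implicit_stdout=9))
    # sort -m part_1.txt part_2.txt > merged.txt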
def to_node_cmd_inv_with_io_vars(cmd_inv, edges, redirs, assignments): ast_cmd_name = string_to_argument(cmd_inv.cmd_name) ast_flagoptions = [] @@ -58,13 +62,24 @@ def to_ast_flagoption(flagoption, _edges): return [opt_name_ast, opt_arg_ast] def to_ast_operand(operand, edges): + if isinstance(operand, Operand): + return translate_io_var_if_applicable(operand.get_name(), edges) return translate_io_var_if_applicable(operand, edges) def translate_io_var_if_applicable(pot_io_var, edges): + # TODO: this is currently a hack but eventually every possible type gets their own to_ast-function if isinstance(pot_io_var, int): return dereference_io_var(pot_io_var, edges) - else: + elif isinstance(pot_io_var, ArgStringType): return to_ast_arg_string_type(pot_io_var) + elif isinstance(pot_io_var, CommandInvocationWithIOVars): + assert(False) + # only happens as r-wrapped node + return to_node_cmd_inv_with_io_vars(pot_io_var, edges, [], []) + elif isinstance(pot_io_var, Arg): + return pot_io_var.to_ast() + else: + raise Exception("Unhandled type for operand in to_ast!") def to_ast_arg_string_type(arg_string_type): return arg_string_type.get_name().arg_char_list # is of type Arg @@ -83,3 +98,41 @@ def get_parallelizability_info_from_cmd_invocation_util(cmd_invocationInitial : def construct_property_container_from_list_of_properties(list_properties): return CommandProperties(dict(list_properties)) +# this function is needed to wrap a node in `r_wrap` +def to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping(cmd_inv, edges): + # we already expand here + whole_cmd = Arg(string_to_argument("\'")) + arg_cmd_name = Arg(string_to_argument(cmd_inv.cmd_name)) + arg_flagoptions = [] + for flagoption in cmd_inv.flag_option_list: + arg_flagoptions += to_arg_flagoption(flagoption, edges) + arg_operands = [to_arg_operand(operand, edges) for operand in cmd_inv.operand_list] + all_cmd_parts_arg = [arg_cmd_name] + all_cmd_parts_arg.extend(arg_flagoptions) + all_cmd_parts_arg.extend(arg_operands) + for part in all_cmd_parts_arg: + whole_cmd.concatenate(part) + whole_cmd.concatenate(Arg(string_to_argument("\'"))) + return whole_cmd + +def to_arg_flagoption(flagoption, _edges): + if isinstance(flagoption, Flag): + return [Arg(string_to_argument(flagoption.get_name()))] + elif isinstance(flagoption, OptionWithIO): + opt_name_arg = Arg(string_to_argument(flagoption.get_name())) + opt_arg_arg = translate_io_var_to_arg_if_applicable(flagoption.get_arg()) + return [opt_name_arg, opt_arg_arg] + +def to_arg_operand(operand, edges): + if isinstance(operand, Operand): + return translate_io_var_to_arg_if_applicable(operand.get_name(), edges) + return translate_io_var_to_arg_if_applicable(operand, edges) + +def translate_io_var_to_arg_if_applicable(pot_io_var, edges): + if isinstance(pot_io_var, int): + return Arg(dereference_io_var(pot_io_var, edges)) + elif isinstance(pot_io_var, ArgStringType): + result = pot_io_var.get_name() # is of type Arg + return result + else: + raise Exception("Unhandled type for operand in to_arg!") diff --git a/compiler/definitions/ir/dfg_node.py b/compiler/definitions/ir/dfg_node.py index b9e990fad..3564e5928 100644 --- a/compiler/definitions/ir/dfg_node.py +++ b/compiler/definitions/ir/dfg_node.py @@ -111,16 +111,11 @@ def get_configuration_inputs(self): # return (self.com_category == "parallelizable_pure") def is_commutative(self): - # BEGIN ANNO - # OLD - # return ('commutative' in self.com_properties) - # NEW - val = 
self.cmd_related_properties.get_property_value('commutative') + val = self.cmd_related_properties.get_property_value('is_commutative') if val is not None: return val else: return False - # END ANNO ## kk: 2021-07-23 Not totally sure if that is generally correct. Tests will say ¯\_(ツ)_/¯ ## I think it assumes that new options can be added in the beginning if there are no options already @@ -215,7 +210,9 @@ def _to_ast_aux_get_redirs(self): ## TODO: Improve this function to be separately implemented for different special nodes, ## such as cat, eager, split, etc... - ## I do not think this is reasonable anymore since we remodelled nodes in a way that the back-translation is trivial + ## I do not think this separation is reasonable anymore since we remodelled nodes in a way that the back-translation is trivial + ## One exception: r_wrap; currently, the wrapped command is translated at creation of the r_wrap already and + ## hence assumes that non-streaming inputs/outputs will not change def to_ast(self, edges, drain_streams): ## TODO: We might not want to implement this at all actually if (drain_streams): diff --git a/compiler/definitions/ir/nodes/eager.py b/compiler/definitions/ir/nodes/eager.py index 2a5ee5aa9..ac49a576e 100644 --- a/compiler/definitions/ir/nodes/eager.py +++ b/compiler/definitions/ir/nodes/eager.py @@ -9,8 +9,7 @@ def __init__(self, com_redirs=[], com_assignments=[] ): # TODO []: default - super().__init__( - cmd_invocation_with_io_vars, + super().__init__(cmd_invocation_with_io_vars, com_redirs=com_redirs, com_assignments=com_assignments) diff --git a/compiler/definitions/ir/nodes/r_merge.py b/compiler/definitions/ir/nodes/r_merge.py index 4eee7285d..f587a94fc 100644 --- a/compiler/definitions/ir/nodes/r_merge.py +++ b/compiler/definitions/ir/nodes/r_merge.py @@ -1,18 +1,32 @@ +from datatypes_new.AccessKind import AccessKind +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars + from definitions.ir.dfg_node import * class RMerge(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, - com_options = [], com_redirs = [], com_assignments=[]): - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, - com_assignments=com_assignments) + def __init__(self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None): + # TODO []: default arguments! 
+ super().__init__(cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties) def make_r_merge_node(inputs, output): r_merge_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_merge_binary']) - com_name = Arg(string_to_argument(r_merge_bin)) - com_category = "pure" - return RMerge(inputs, - [output], - com_name, - com_category) + # TODO: assume that the inputs and output is provided as operands + access_map = {input_id: AccessKind.make_stream_input() for input_id in inputs} + access_map[output] = AccessKind.make_stream_output() + cmd_inv_with_io_vars = CommandInvocationWithIOVars( + cmd_name=r_merge_bin, + flag_option_list=[], + operand_list=inputs, + implicit_use_of_streaming_input=None, + implicit_use_of_streaming_output=output, + access_map=access_map) + return RMerge(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/nodes/r_unwrap.py b/compiler/definitions/ir/nodes/r_unwrap.py index f3baa6eae..38cb03dcc 100644 --- a/compiler/definitions/ir/nodes/r_unwrap.py +++ b/compiler/definitions/ir/nodes/r_unwrap.py @@ -1,20 +1,33 @@ +from datatypes_new.AccessKind import AccessKind +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars + from definitions.ir.dfg_node import * from ir_utils import * class RUnwrap(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, - com_options = [], com_redirs = [], com_assignments=[]): - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, - com_assignments=com_assignments) + def __init__(self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None): + # TODO []: default + super().__init__(cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties) def make_unwrap_node(inputs, output): - assert(is_single_input(inputs)) + assert(len(inputs) == 1) + input_id = inputs[0] + access_map = {input_id: AccessKind.make_stream_input(), output: AccessKind.make_stream_output()} r_unwrap_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_unwrap_binary']) - com_name = Arg(string_to_argument(r_unwrap_bin)) - com_category = "pure" - return RUnwrap(inputs, - [output], - com_name, - com_category) + cmd_inv_with_io_vars = CommandInvocationWithIOVars( + cmd_name=r_unwrap_bin, + flag_option_list=[], + operand_list=[], + implicit_use_of_streaming_input=input_id, + implicit_use_of_streaming_output=output, + access_map=access_map) + return RUnwrap(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/nodes/r_wrap.py b/compiler/definitions/ir/nodes/r_wrap.py index 62913b1d5..8fd44f6ca 100644 --- a/compiler/definitions/ir/nodes/r_wrap.py +++ b/compiler/definitions/ir/nodes/r_wrap.py @@ -1,66 +1,76 @@ +from datatypes_new.AccessKind import AccessKind +from datatypes_new.BasicDatatypes import ArgStringType +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars + +from annotations_utils.util_cmd_invocations import to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping from definitions.ir.dfg_node import * from ir_utils import * class RWrap(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, - com_options = [], com_redirs = [], com_assignments=[], 
wrapped_node_name=None): - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, - com_assignments=com_assignments) + def __init__(self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None, + wrapped_node_name=None): + # TODO []: default self.wrapped_node_name = wrapped_node_name - + super().__init__(cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties) + ## Get the label of the node. By default, it is simply the name def get_dot_label(self) -> str: ## The name could be a full path - name = self.com_name + name = self.cmd_invocation_with_io_vars.cmd_name basename = os.path.basename(str(name)) wrapped_node_name = self.wrapped_node_name return f'{basename}({wrapped_node_name})' -def wrap_node(node): +def wrap_node(node: DFGNode, edges): r_wrap_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_wrap_binary']) - com_name = Arg(string_to_argument(r_wrap_bin)) - ## TODO: Is it actually pure? What is it? - com_category = "pure" - ## At the moment we can only wrap a node that takes its input from stdin - ## and outputs to stdout. Therefore the node needs to have only one input and one output. - inputs = node.inputs - assert(is_single_input(inputs)) - outputs = node.outputs + ## At the moment we can only wrap a node that takes its input from stdin + ## and outputs to stdout. Therefore the node needs to have only one input and one output. + ## TO CHECK: with the remodelling also other cases should be handled + inputs = node.get_input_list() + assert(len(inputs) == 1) + input_id = inputs[0] + outputs = node.get_output_list() ## TODO: Would it make sense for outputs to be less than one? - assert(len(outputs) <= 1) - - ## TODO: For now we can only wrap stateless commands - assert(node.com_category == "stateless") - - ## TODO: All arguments must be options, otherwise there must be - ## special handling in the wrap node2ast code. - single_quote = Arg(string_to_argument("\'")) - cmd = Arg(string_to_argument("")) + ## TODO: changed this from <= to == 1 to simplify reasoning later for now + assert(len(outputs) == 1) + output_id = outputs[0] + access_map = {input_id: AccessKind.make_stream_input(), output_id: AccessKind.make_stream_output()} #create bash -c argument - cmd.concatenate(single_quote) - cmd.concatenate(node.com_name) - for i, opt in node.com_options: - cmd.concatenate(opt) - cmd.concatenate(single_quote) + cmd_inv_with_io_vars: CommandInvocationWithIOVars = node.cmd_invocation_with_io_vars + # do we need to copy here? currently, it seems fine + cmd_inv_with_io_vars.remove_streaming_inputs() + cmd_inv_with_io_vars.remove_streaming_outputs() + # any non-streaming inputs or outputs are converted here already! 
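To build the r_wrap invocation, the wrapped command is flattened into a single quoted word and handed to `bash -c`; r_wrap then runs that command over the round-robin blocks it reads from its streaming input. A rough, self-contained sketch of the flattening (the real code concatenates Arg values, after the streaming input and output have been removed from the invocation and any non-streaming file operands substituted):

    def flatten_for_wrapping(cmd_name, flags, operands):
        # single-quote the whole command so it survives as one bash -c argument
        return "'" + " ".join([cmd_name] + flags + operands) + "'"

    cmd = flatten_for_wrapping("grep", ["-v"], ["foo"])
    print("r_wrap", "bash -c", cmd)   # r_wrap bash -c 'grep -v foo'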
+ cmd = to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping(cmd_inv_with_io_vars, edges) + + bash_command_arg = [Arg(string_to_argument("bash -c"))] + operand_list = bash_command_arg + [cmd] - wrapped_command_arg = [(1, cmd)] - bash_command_arg = [(0, Arg(string_to_argument("bash -c")))] - options = bash_command_arg + wrapped_command_arg + cmd_inv_with_io_vars = CommandInvocationWithIOVars( + cmd_name=r_wrap_bin, + flag_option_list=[], + operand_list=operand_list, + implicit_use_of_streaming_input=input_id, + implicit_use_of_streaming_output=output_id, + access_map=access_map) ## TODO: It is not clear if it is safe to just pass redirections and assignments down the line as is redirs = node.com_redirs assignments = node.com_assignments - return RWrap(inputs, - outputs, - com_name, - com_category, - com_options=options, + return RWrap(cmd_inv_with_io_vars, com_redirs=redirs, com_assignments=assignments, - wrapped_node_name=node.com_name) + wrapped_node_name=node.cmd_invocation_with_io_vars.cmd_name) diff --git a/compiler/ir.py b/compiler/ir.py index 0b8938c0f..0e387ed47 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -289,7 +289,7 @@ def make_map_node(node, new_inputs, new_outputs, parallelizer): ## ## At the moment it only works with one input and one output since wrap cannot redirect input in the command. def make_wrap_map_node(node, new_inputs, new_outputs): - assert(is_single_input(new_inputs)) + assert(len(new_inputs) == 1) assert(len(new_outputs) == 1) new_node = make_map_node(node, new_inputs, new_outputs) diff --git a/compiler/ir_utils.py b/compiler/ir_utils.py index 32a5f474f..ef42e875f 100644 --- a/compiler/ir_utils.py +++ b/compiler/ir_utils.py @@ -145,6 +145,7 @@ def format_expanded_arg_char(arg_char): ## These functions check tuple inputs (configuration and streaming ones) def is_single_input(inputs): + assert(False) assert(isinstance(inputs, tuple)) conf_inputs = inputs[0] streaming_inputs = inputs[1] diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index f8045d5e8..9ec83aa09 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -421,11 +421,96 @@ def parallelize_node(curr_id, graph, fileIdGen, fan_out, curr = graph.get_node(curr_id) new_nodes_for_workset = [] + # TODO: this whole fragment could be moved to the graph after picking a parallelizer option_parallelizer_rr = curr.get_option_implemented_round_robin_parallelizer() + # for now, we use the `r_split_flag` here again: + if r_split_flag and option_parallelizer_rr is not None: + parallelizer_rr = option_parallelizer_rr + streaming_inputs = curr.get_streaming_inputs() + assert(len(streaming_inputs) == 1) + streaming_input = streaming_inputs[0] + configuration_inputs = curr.get_configuration_inputs() + assert(len(configuration_inputs) == 0) + streaming_outputs = curr.get_output_list() + assert(len(streaming_outputs) == 1) + streaming_output = streaming_outputs[0] + original_cmd_invocation_with_io_vars = curr.cmd_invocation_with_io_vars + + graph.remove_node(curr_id) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph + + out_split_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) + splitter = r_split.make_r_split(streaming_input, out_split_ids, r_split_batch_size) + graph.set_edge_to(streaming_input, splitter.get_id()) + for out_split_id in out_split_ids: + graph.set_edge_from(out_split_id, splitter.get_id()) + graph.add_node(splitter) + + in_mapper_ids = out_split_ids + out_mapper_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) + zip_mapper_in_out_ids = zip(in_mapper_ids, out_mapper_ids) + + aggregator_spec = parallelizer_rr.get_aggregator_spec() + aggregator_kind = aggregator_spec.get_kind() + if aggregator_kind == AggregatorKindEnum.CONCATENATE: # is turned into an r_merge + all_mappers = [] + for (in_id, out_id) in zip_mapper_in_out_ids: + # BEGIN: these 4 lines could be refactored to be a function in graph such that + # creating end point of edges and the creation of edges is not decoupled + mapper_cmd_inv = parallelizer_rr.get_actual_mapper(original_cmd_invocation_with_io_vars, in_id, out_id) + mapper = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(mapper_cmd_inv) + # add r_wrap here: + mapper_r_wrapped = r_wrap.wrap_node(mapper, graph.edges) + graph.set_edge_to(in_id, mapper_r_wrapped.get_id()) + graph.set_edge_from(out_id, mapper_r_wrapped.get_id()) + # END + all_mappers.append(mapper_r_wrapped) + for new_node in all_mappers: + graph.add_node(new_node) - if option_parallelizer_rr is not None: - # TODO: this whole fragment could be moved to the graph after picking a parallelizer - # TODO: we only do consecutive chunks here but from a rr splitter + in_aggregator_ids = out_mapper_ids + out_aggregator_id = streaming_output + aggregator = r_merge.make_r_merge_node(in_aggregator_ids, out_aggregator_id) + for in_aggregator_id in in_aggregator_ids: + graph.set_edge_to(in_aggregator_id, aggregator.get_id()) + graph.set_edge_from(streaming_output, aggregator.get_id()) + all_aggregators = [aggregator] + ## Add the merge commands in the graph + for new_node in all_aggregators: + graph.add_node(new_node) + elif curr.is_commutative(): # we can apply RR and do r_unwrap before the aggregator + all_mappers = [] + for (in_id, out_id) in zip_mapper_in_out_ids: + # generate ephemeral edge for wrap to unwrap + [wrap_to_unwrap_id] = graph.generate_ephemeral_edges(fileIdGen, 1) + # BEGIN: these 4 lines could be refactored to be a function in graph such that + # creating end point of edges and the creation of edges is not decoupled + mapper_cmd_inv = parallelizer_rr.get_actual_mapper(original_cmd_invocation_with_io_vars, in_id, wrap_to_unwrap_id) + mapper = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(mapper_cmd_inv) + # add r_wrap here: + mapper_r_wrapped = r_wrap.wrap_node(mapper, graph.edges) + graph.set_edge_to(in_id, mapper_r_wrapped.get_id()) + graph.set_edge_from(wrap_to_unwrap_id, mapper_r_wrapped.get_id()) + # add unwrap as the command is commutative + unwrap = r_unwrap.make_unwrap_node([wrap_to_unwrap_id], out_id) + graph.set_edge_to(wrap_to_unwrap_id, unwrap.get_id()) + graph.set_edge_from(out_id, unwrap.get_id()) + # END + all_mappers.append(mapper_r_wrapped) + all_mappers.append(unwrap) + for new_node in all_mappers: + graph.add_node(new_node) + + in_aggregator_ids = out_mapper_ids + out_aggregator_id = streaming_output + if aggregator_kind == AggregatorKindEnum.CUSTOM_2_ARY: + # TODO: we simplify and assume that every mapper produces a single output for now: + map_in_aggregator_ids = [[id] for id in 
in_aggregator_ids] + graph.create_generic_aggregator_tree(curr, parallelizer_rr, map_in_aggregator_ids, out_aggregator_id, + fileIdGen) + else: + raise Exception("aggregator kind not yet implemented") + elif option_parallelizer_rr is not None: # do consecutive chunks + # TODO: we do consecutive chunks here but from a rr splitter parallelizer_rr = option_parallelizer_rr streaming_inputs = curr.get_streaming_inputs() assert(len(streaming_inputs) == 1) @@ -465,8 +550,6 @@ def parallelize_node(curr_id, graph, fileIdGen, fan_out, in_aggregator_ids = out_mapper_ids out_aggregator_id = streaming_output - ## TODO: This could potentially be an aggregator tree (at least in the old PaSh versions) - ## We need to extend the annotations/parallelizers to support this (e.g., for sort) aggregator_spec = parallelizer_rr.get_aggregator_spec() aggregator_kind = aggregator_spec.get_kind() if aggregator_kind == AggregatorKindEnum.CONCATENATE or aggregator_kind == AggregatorKindEnum.CUSTOM_N_ARY: @@ -486,7 +569,6 @@ def parallelize_node(curr_id, graph, fileIdGen, fan_out, else: raise Exception("aggregator kind not yet implemented") - return new_nodes_for_workset ## TODO: Instead of moving a cat after a node, we need to parallelize cat, From 77da7070d63a463e8e30754baa03a07afa625a71 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Tue, 28 Jun 2022 17:31:07 -0400 Subject: [PATCH 05/64] Fix unwrap and commutative interplay (#593) Signed-off-by: Felix Stutz --- compiler/pash_runtime.py | 78 ++++++++++++---------------------------- 1 file changed, 23 insertions(+), 55 deletions(-) diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index 9ec83aa09..f3daa3378 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -426,32 +426,32 @@ def parallelize_node(curr_id, graph, fileIdGen, fan_out, # for now, we use the `r_split_flag` here again: if r_split_flag and option_parallelizer_rr is not None: parallelizer_rr = option_parallelizer_rr - streaming_inputs = curr.get_streaming_inputs() - assert(len(streaming_inputs) == 1) - streaming_input = streaming_inputs[0] - configuration_inputs = curr.get_configuration_inputs() - assert(len(configuration_inputs) == 0) - streaming_outputs = curr.get_output_list() - assert(len(streaming_outputs) == 1) - streaming_output = streaming_outputs[0] - original_cmd_invocation_with_io_vars = curr.cmd_invocation_with_io_vars - - graph.remove_node(curr_id) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph - - out_split_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) - splitter = r_split.make_r_split(streaming_input, out_split_ids, r_split_batch_size) - graph.set_edge_to(streaming_input, splitter.get_id()) - for out_split_id in out_split_ids: - graph.set_edge_from(out_split_id, splitter.get_id()) - graph.add_node(splitter) - - in_mapper_ids = out_split_ids - out_mapper_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) - zip_mapper_in_out_ids = zip(in_mapper_ids, out_mapper_ids) - aggregator_spec = parallelizer_rr.get_aggregator_spec() aggregator_kind = aggregator_spec.get_kind() if aggregator_kind == AggregatorKindEnum.CONCATENATE: # is turned into an r_merge + streaming_inputs = curr.get_streaming_inputs() + assert(len(streaming_inputs) == 1) + streaming_input = streaming_inputs[0] + configuration_inputs = curr.get_configuration_inputs() + assert(len(configuration_inputs) == 0) + streaming_outputs = curr.get_output_list() + assert(len(streaming_outputs) == 1) + streaming_output = streaming_outputs[0] + original_cmd_invocation_with_io_vars = curr.cmd_invocation_with_io_vars + + graph.remove_node(curr_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph + + out_split_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) + splitter = r_split.make_r_split(streaming_input, out_split_ids, r_split_batch_size) + graph.set_edge_to(streaming_input, splitter.get_id()) + for out_split_id in out_split_ids: + graph.set_edge_from(out_split_id, splitter.get_id()) + graph.add_node(splitter) + + in_mapper_ids = out_split_ids + out_mapper_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) + zip_mapper_in_out_ids = zip(in_mapper_ids, out_mapper_ids) + all_mappers = [] for (in_id, out_id) in zip_mapper_in_out_ids: # BEGIN: these 4 lines could be refactored to be a function in graph such that @@ -477,38 +477,6 @@ def parallelize_node(curr_id, graph, fileIdGen, fan_out, ## Add the merge commands in the graph for new_node in all_aggregators: graph.add_node(new_node) - elif curr.is_commutative(): # we can apply RR and do r_unwrap before the aggregator - all_mappers = [] - for (in_id, out_id) in zip_mapper_in_out_ids: - # generate ephemeral edge for wrap to unwrap - [wrap_to_unwrap_id] = graph.generate_ephemeral_edges(fileIdGen, 1) - # BEGIN: these 4 lines could be refactored to be a function in graph such that - # creating end point of edges and the creation of edges is not decoupled - mapper_cmd_inv = parallelizer_rr.get_actual_mapper(original_cmd_invocation_with_io_vars, in_id, wrap_to_unwrap_id) - mapper = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(mapper_cmd_inv) - # add r_wrap here: - mapper_r_wrapped = r_wrap.wrap_node(mapper, graph.edges) - graph.set_edge_to(in_id, mapper_r_wrapped.get_id()) - graph.set_edge_from(wrap_to_unwrap_id, mapper_r_wrapped.get_id()) - # add unwrap as the command is commutative - unwrap = r_unwrap.make_unwrap_node([wrap_to_unwrap_id], out_id) - graph.set_edge_to(wrap_to_unwrap_id, unwrap.get_id()) - graph.set_edge_from(out_id, unwrap.get_id()) - # END - all_mappers.append(mapper_r_wrapped) - all_mappers.append(unwrap) - for new_node in all_mappers: - graph.add_node(new_node) - - in_aggregator_ids = out_mapper_ids - out_aggregator_id = streaming_output - if aggregator_kind == AggregatorKindEnum.CUSTOM_2_ARY: - # TODO: we simplify and assume that every mapper produces a single output for now: - map_in_aggregator_ids 
= [[id] for id in in_aggregator_ids] - graph.create_generic_aggregator_tree(curr, parallelizer_rr, map_in_aggregator_ids, out_aggregator_id, - fileIdGen) - else: - raise Exception("aggregator kind not yet implemented") elif option_parallelizer_rr is not None: # do consecutive chunks # TODO: we do consecutive chunks here but from a rr splitter parallelizer_rr = option_parallelizer_rr From a1c87a5b03c84df47d8e1058a78cdda89ed34e20 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Tue, 28 Jun 2022 17:58:21 -0400 Subject: [PATCH 06/64] Add support for dgsh_tee nodes (#594) Signed-off-by: Felix Stutz --- TODO.md | 1 + .../annotations_utils/util_cmd_invocations.py | 4 +- compiler/definitions/ir/dfg_node.py | 6 +- compiler/definitions/ir/nodes/dgsh_tee.py | 66 ++++++++++++++----- compiler/pash_runtime.py | 1 - 5 files changed, 56 insertions(+), 22 deletions(-) diff --git a/TODO.md b/TODO.md index 135a4cf03..b28ed019d 100644 --- a/TODO.md +++ b/TODO.md @@ -6,4 +6,5 @@ - working on all tests - Adding annotation library installation and removing ad-hoc import of the latter - clean up utils for annotations +- graphviz - Changing PaSh flags (making the default be priority r-split and then consecutive chunks), so remove the r_split flag and make defaults be the ones from the OSDI paper (I can do that TODO too) \ No newline at end of file diff --git a/compiler/annotations_utils/util_cmd_invocations.py b/compiler/annotations_utils/util_cmd_invocations.py index 19d2d6c0f..d624affe6 100644 --- a/compiler/annotations_utils/util_cmd_invocations.py +++ b/compiler/annotations_utils/util_cmd_invocations.py @@ -53,12 +53,12 @@ def to_node_cmd_inv_with_io_vars(cmd_inv, edges, redirs, assignments): node = make_command(cmd_asts, redirections=new_redirs, assignments=assignments) return node -def to_ast_flagoption(flagoption, _edges): +def to_ast_flagoption(flagoption, edges): if isinstance(flagoption, Flag): return [string_to_argument(flagoption.get_name())] elif isinstance(flagoption, OptionWithIO): # retype to IOVar opt_name_ast = string_to_argument(flagoption.get_name()) - opt_arg_ast = translate_io_var_if_applicable(flagoption.get_arg()) + opt_arg_ast = translate_io_var_if_applicable(flagoption.get_arg(), edges) return [opt_name_ast, opt_arg_ast] def to_ast_operand(operand, edges): diff --git a/compiler/definitions/ir/dfg_node.py b/compiler/definitions/ir/dfg_node.py index 3564e5928..45ea066d0 100644 --- a/compiler/definitions/ir/dfg_node.py +++ b/compiler/definitions/ir/dfg_node.py @@ -211,8 +211,10 @@ def _to_ast_aux_get_redirs(self): ## TODO: Improve this function to be separately implemented for different special nodes, ## such as cat, eager, split, etc... 
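For orientation: after the remodelling, the generic back-translation that the comment below describes presumably reduces to delegating to `to_node_cmd_inv_with_io_vars` from `util_cmd_invocations`, in the same shape as the dgsh_tee-specific `to_ast` added later in this commit. A hedged sketch (assumed shape only; the actual body in `dfg_node.py`, including its `drain_streams` handling, is not part of this hunk):

```python
# Hedged sketch of the generic DFGNode.to_ast after the remodelling
# (assumed shape; the real drain_streams handling is omitted here).
def to_ast(self, edges, drain_streams):
    if drain_streams:
        raise NotImplementedError()
    redirs = self._to_ast_aux_get_redirs()
    assignments = self.com_assignments
    return to_node_cmd_inv_with_io_vars(self.cmd_invocation_with_io_vars,
                                        edges, redirs, assignments)
```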
## I do not think this separation is reasonable anymore since we remodelled nodes in a way that the back-translation is trivial - ## One exception: r_wrap; currently, the wrapped command is translated at creation of the r_wrap already and - ## hence assumes that non-streaming inputs/outputs will not change + ## Two exceptions: + ## - r_wrap; currently, the wrapped command is translated at creation of the r_wrap already and + ## hence assumes that non-streaming inputs/outputs will not change + ## - dgsh_tee: it requires the operands to appear before the flags/options (not XBD standard compliant) def to_ast(self, edges, drain_streams): ## TODO: We might not want to implement this at all actually if (drain_streams): diff --git a/compiler/definitions/ir/nodes/dgsh_tee.py b/compiler/definitions/ir/nodes/dgsh_tee.py index 772d79e73..f449e4aa1 100644 --- a/compiler/definitions/ir/nodes/dgsh_tee.py +++ b/compiler/definitions/ir/nodes/dgsh_tee.py @@ -1,27 +1,59 @@ +from datatypes_new.AccessKind import AccessKind +from datatypes_new.BasicDatatypes import Flag, ArgStringType +from datatypes_new.BasicDatatypesWithIO import OptionWithIO +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars + +from annotations_utils.util_cmd_invocations import to_ast_flagoption, to_ast_operand from definitions.ir.dfg_node import * class DGSHTee(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, com_options = [], - com_redirs = [], com_assignments=[]): - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, + def __init__(self, + cmd_invocation_with_io_vars, + com_redirs=[], com_assignments=[] + ): + # TODO []: default + super().__init__(cmd_invocation_with_io_vars, + com_redirs=com_redirs, com_assignments=com_assignments) + # TODO: this is only needed since dgsh.sh does not comply with the XBD standard + def to_ast(self, edges, drain_streams): + if (drain_streams): + raise NotImplementedError() + else: + redirs = self._to_ast_aux_get_redirs() + assignments = self.com_assignments + node = to_node_cmd_inv_with_io_vars_for_dgsh_tee(self.cmd_invocation_with_io_vars, edges, redirs, assignments) + return node + def make_dgsh_tee_node(input_id, output_id): dgsh_tee_bin = os.path.join(config.PASH_TOP, config.config['runtime']['dgsh_tee_binary']) - com_name = Arg(string_to_argument(dgsh_tee_bin)) - com_category = "pure" + operand_list = [input_id, output_id] + access_map = {output_id: AccessKind.make_stream_output(), + input_id: AccessKind.make_stream_input()} + + flag_option_list = [Flag("-I"), + Flag("-f"), + OptionWithIO("-b", ArgStringType(Arg(string_to_argument(str(config.config['runtime']['dgsh_buffer_size'])))))] - ## TODO: add as command line arguments - com_options = [(2, Arg(string_to_argument("-I")))] # Eager functionality - com_options.append((3, Arg(string_to_argument("-f")))) # use file on disk when buffer reaches maximum - com_options.append((4, Arg(string_to_argument(f"-b {config.config['runtime']['dgsh_buffer_size']}")))) # set buffer size - # com_options.append(4, Arg(string_to_argument("−m batch_size"))) # set the + cmd_inv_with_io_vars = CommandInvocationWithIOVars( + cmd_name=dgsh_tee_bin, + flag_option_list=flag_option_list, + operand_list=operand_list, + implicit_use_of_streaming_input=None, + implicit_use_of_streaming_output=None, + access_map=access_map) + return DGSHTee(cmd_inv_with_io_vars) - return DGSHTee([input_id], - [output_id], - com_name, - com_category, - com_options=com_options) +def 
to_node_cmd_inv_with_io_vars_for_dgsh_tee(cmd_inv, edges, redirs, assignments): + ast_cmd_name = string_to_argument(cmd_inv.cmd_name) + ast_flagoptions = [] + for flagoption in cmd_inv.flag_option_list: + ast_flagoptions += to_ast_flagoption(flagoption, edges) + ast_operands = [to_ast_operand(operand, edges) for operand in cmd_inv.operand_list] + # This is where it differs ... in the order + cmd_asts = [ast_cmd_name] + ast_operands + ast_flagoptions + # we omit stuff for stdin and stdout as we know it does not exist + node = make_command(cmd_asts, redirections=redirs, assignments=assignments) + return node diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index f3daa3378..bf730919b 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -688,7 +688,6 @@ def add_eager(eager_input_id, graph, fileIdGen, intermediateFileIdGen, use_dgsh_ new_id = new_fid.get_ident() if use_dgsh_tee: - assert(False) ## TODO: seperate to better use dgsh-tee params and maybe deprecate eager eager_node = dgsh_tee.make_dgsh_tee_node(eager_input_id, new_id) else: From 034f41dc341d1c647854eeeff74abdce0fe1a4e1 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Wed, 29 Jun 2022 09:50:12 -0400 Subject: [PATCH 07/64] Update TODOs before merging to future Signed-off-by: Felix Stutz --- TODO.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TODO.md b/TODO.md index b28ed019d..9cbc2104a 100644 --- a/TODO.md +++ b/TODO.md @@ -1,10 +1,10 @@ ## TODOs before merging to `future` -- dgsh_tee +- separate checking and application of parallelization - cat-split fusion - r-unwrap-commutative fusion - working on all tests - Adding annotation library installation and removing ad-hoc import of the latter - clean up utils for annotations - graphviz -- Changing PaSh flags (making the default be priority r-split and then consecutive chunks), so remove the r_split flag and make defaults be the ones from the OSDI paper (I can do that TODO too) \ No newline at end of file +- Changing PaSh flags (making the default be priority r-split and then consecutive chunks), so remove the r_split flag and make defaults be the ones from the OSDI paper \ No newline at end of file From f6a3df14c6e87c678b819bb115df62bfc926ba13 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Wed, 29 Jun 2022 10:39:17 -0400 Subject: [PATCH 08/64] Modify dgsh-wrapper to not require operands before options but pass options for input and output directly (#595) Signed-off-by: Felix Stutz --- compiler/definitions/ir/dfg_node.py | 5 ++-- compiler/definitions/ir/nodes/dgsh_tee.py | 29 ++++------------------- runtime/dgsh_tee.sh | 12 ++++++---- 3 files changed, 14 insertions(+), 32 deletions(-) diff --git a/compiler/definitions/ir/dfg_node.py b/compiler/definitions/ir/dfg_node.py index 45ea066d0..f09c17303 100644 --- a/compiler/definitions/ir/dfg_node.py +++ b/compiler/definitions/ir/dfg_node.py @@ -211,10 +211,9 @@ def _to_ast_aux_get_redirs(self): ## TODO: Improve this function to be separately implemented for different special nodes, ## such as cat, eager, split, etc... 
## I do not think this separation is reasonable anymore since we remodelled nodes in a way that the back-translation is trivial - ## Two exceptions: + ## One exception: ## - r_wrap; currently, the wrapped command is translated at creation of the r_wrap already and - ## hence assumes that non-streaming inputs/outputs will not change - ## - dgsh_tee: it requires the operands to appear before the flags/options (not XBD standard compliant) + ## hence assumes that non-streaming inputs/outputs will not change; with a special to_ast, we could circumvent this def to_ast(self, edges, drain_streams): ## TODO: We might not want to implement this at all actually if (drain_streams): diff --git a/compiler/definitions/ir/nodes/dgsh_tee.py b/compiler/definitions/ir/nodes/dgsh_tee.py index f449e4aa1..c417b8f58 100644 --- a/compiler/definitions/ir/nodes/dgsh_tee.py +++ b/compiler/definitions/ir/nodes/dgsh_tee.py @@ -16,44 +16,23 @@ def __init__(self, com_redirs=com_redirs, com_assignments=com_assignments) - # TODO: this is only needed since dgsh.sh does not comply with the XBD standard - def to_ast(self, edges, drain_streams): - if (drain_streams): - raise NotImplementedError() - else: - redirs = self._to_ast_aux_get_redirs() - assignments = self.com_assignments - node = to_node_cmd_inv_with_io_vars_for_dgsh_tee(self.cmd_invocation_with_io_vars, edges, redirs, assignments) - return node - def make_dgsh_tee_node(input_id, output_id): dgsh_tee_bin = os.path.join(config.PASH_TOP, config.config['runtime']['dgsh_tee_binary']) - operand_list = [input_id, output_id] access_map = {output_id: AccessKind.make_stream_output(), input_id: AccessKind.make_stream_input()} - flag_option_list = [Flag("-I"), + flag_option_list = [OptionWithIO("-i", input_id), + OptionWithIO("-o", output_id), + Flag("-I"), Flag("-f"), OptionWithIO("-b", ArgStringType(Arg(string_to_argument(str(config.config['runtime']['dgsh_buffer_size'])))))] cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=dgsh_tee_bin, flag_option_list=flag_option_list, - operand_list=operand_list, + operand_list=[], implicit_use_of_streaming_input=None, implicit_use_of_streaming_output=None, access_map=access_map) return DGSHTee(cmd_inv_with_io_vars) - -def to_node_cmd_inv_with_io_vars_for_dgsh_tee(cmd_inv, edges, redirs, assignments): - ast_cmd_name = string_to_argument(cmd_inv.cmd_name) - ast_flagoptions = [] - for flagoption in cmd_inv.flag_option_list: - ast_flagoptions += to_ast_flagoption(flagoption, edges) - ast_operands = [to_ast_operand(operand, edges) for operand in cmd_inv.operand_list] - # This is where it differs ... 
in the order - cmd_asts = [ast_cmd_name] + ast_operands + ast_flagoptions - # we omit stuff for stdin and stdout as we know it does not exist - node = make_command(cmd_asts, redirections=redirs, assignments=assignments) - return node diff --git a/runtime/dgsh_tee.sh b/runtime/dgsh_tee.sh index 7fc992a7b..ce4ab4081 100755 --- a/runtime/dgsh_tee.sh +++ b/runtime/dgsh_tee.sh @@ -1,8 +1,9 @@ #!/usr/bin/env bash -input=${1?"ERROR: dgsh-tee: No input file given"} -output=${2?"ERROR: dgsh-tee: No output file given"} -args=("${@:3}") +# input and output properly provided in original args already now +# input=${1?"ERROR: dgsh-tee: No input file given"} +# output=${2?"ERROR: dgsh-tee: No output file given"} +args=("${@:1}") # Set a default DISH_TOP in this directory if it doesn't exist PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} @@ -18,4 +19,7 @@ PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} # $PASH_TOP/runtime/dgsh-tee -i "$input" -o "$output" $args & # dgsh_tee_pid=$! # wait $dgsh_tee_pid -"$PASH_TOP"/runtime/dgsh-tee -i "$input" -o "$output" "${args[@]}" +#"$PASH_TOP"/runtime/dgsh-tee -i "$input" -o "$output" "${args[@]}" + +# input and output properly provided in original args already now +"$PASH_TOP"/runtime/dgsh-tee "${args[@]}" From 558dc45cac61709c7adfea7df49af3cdb8be3fdb Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Wed, 29 Jun 2022 11:04:03 -0400 Subject: [PATCH 09/64] Remove dgsh-tee wrapper and call dgsh-tee directly (#596) Signed-off-by: Felix Stutz --- compiler/config.json | 2 +- runtime/dgsh_tee.sh | 25 ------------------------- 2 files changed, 1 insertion(+), 26 deletions(-) delete mode 100755 runtime/dgsh_tee.sh diff --git a/compiler/config.json b/compiler/config.json index 2968b663d..4bed7462b 100644 --- a/compiler/config.json +++ b/compiler/config.json @@ -6,7 +6,7 @@ "r_merge_binary": "runtime/r_merge", "r_wrap_binary": "runtime/r_wrap", "r_unwrap_binary": "runtime/r_unwrap", - "dgsh_tee_binary": "runtime/dgsh_tee.sh", + "dgsh_tee_binary": "runtime/dgsh-tee", "remote_read_binary": "runtime/dspash/remote_read.sh", "remote_write_binary": "runtime/dspash/remote_write.sh", "dfs_split_reader_binary": "runtime/dspash/dfs_split_reader.sh", diff --git a/runtime/dgsh_tee.sh b/runtime/dgsh_tee.sh deleted file mode 100755 index ce4ab4081..000000000 --- a/runtime/dgsh_tee.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -# input and output properly provided in original args already now -# input=${1?"ERROR: dgsh-tee: No input file given"} -# output=${2?"ERROR: dgsh-tee: No output file given"} -args=("${@:1}") - -# Set a default DISH_TOP in this directory if it doesn't exist -PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} - -# TODO: Doable check if this is still needed. Turned off for distributed exection. -# PR https://github.com/binpash/pash/pull/495 might've resolved it. -# cleanup() -# { -# kill -SIGTERM $dgsh_tee_pid > /dev/null 2>&1 -# } -# trap cleanup EXIT - -# $PASH_TOP/runtime/dgsh-tee -i "$input" -o "$output" $args & -# dgsh_tee_pid=$! 
-# wait $dgsh_tee_pid -#"$PASH_TOP"/runtime/dgsh-tee -i "$input" -o "$output" "${args[@]}" - -# input and output properly provided in original args already now -"$PASH_TOP"/runtime/dgsh-tee "${args[@]}" From a77d4eef2739a59010fbd58a90157ae122f7b84f Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Thu, 30 Jun 2022 08:56:54 -0400 Subject: [PATCH 10/64] Separate parallelization into choose and apply phases (#597) * Refactor parallelizing transformations in separate choose and apply phases Signed-off-by: Felix Stutz --- TODO.md | 3 +- compiler/definitions/ir/dfg_node.py | 33 ++++--- compiler/ir.py | 145 +++++++++++++++++++++++++++- compiler/pash_runtime.py | 54 ++++++++++- 4 files changed, 217 insertions(+), 18 deletions(-) diff --git a/TODO.md b/TODO.md index 9cbc2104a..35ffdfc11 100644 --- a/TODO.md +++ b/TODO.md @@ -7,4 +7,5 @@ - Adding annotation library installation and removing ad-hoc import of the latter - clean up utils for annotations - graphviz -- Changing PaSh flags (making the default be priority r-split and then consecutive chunks), so remove the r_split flag and make defaults be the ones from the OSDI paper \ No newline at end of file +- Changing PaSh flags (making the default be priority r-split and then consecutive chunks), so remove the r_split flag and make defaults be the ones from the OSDI paper +- Remove code which got obsolete due to the changes \ No newline at end of file diff --git a/compiler/definitions/ir/dfg_node.py b/compiler/definitions/ir/dfg_node.py index f09c17303..e3e56631c 100644 --- a/compiler/definitions/ir/dfg_node.py +++ b/compiler/definitions/ir/dfg_node.py @@ -305,24 +305,31 @@ def replace_edge_in_list(self, edge_ids, from_id, to_id): new_edge_ids.append(new_edge_id) return new_edge_ids - def set_used_parallelizer(self, parallelizer): - assert(False) - # TODO: instantiate in __init__ already in some way - self.used_parallelizer = parallelizer - - def get_used_parallelizer(self): - assert(False) - return self.used_parallelizer - def get_option_implemented_round_robin_parallelizer(self): for parallelizer in self.parallelizer_list: splitter = parallelizer.get_splitter() - mapper_spec = parallelizer.get_mapper_spec() - aggregator_spec = parallelizer.get_aggregator_spec() - if splitter.is_splitter_round_robin() and mapper_spec.is_implemented and aggregator_spec.is_implemented: + if splitter.is_splitter_round_robin() and parallelizer.are_all_parts_implemented(): + return parallelizer + return None + + def get_option_implemented_consecutive_chunks_parallelizer(self): + for parallelizer in self.parallelizer_list: + splitter = parallelizer.get_splitter() + if splitter.is_splitter_consec_chunks() and parallelizer.are_all_parts_implemented(): return parallelizer return None @staticmethod def make_simple_dfg_node_from_cmd_inv_with_io_vars(cmd_inv_with_io_vars): - return DFGNode(cmd_inv_with_io_vars) \ No newline at end of file + return DFGNode(cmd_inv_with_io_vars) + + def get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization(self): + streaming_inputs = self.get_streaming_inputs() + assert (len(streaming_inputs) == 1) + streaming_input = streaming_inputs[0] + configuration_inputs = self.get_configuration_inputs() + assert (len(configuration_inputs) == 0) + streaming_outputs = self.get_output_list() + assert (len(streaming_outputs) == 1) + streaming_output = streaming_outputs[0] + return streaming_input, streaming_output, configuration_inputs diff --git a/compiler/ir.py b/compiler/ir.py index 0e387ed47..11e305bf5 100644 --- 
a/compiler/ir.py +++ b/compiler/ir.py @@ -774,6 +774,147 @@ def add_edge(self, edge_fid): def empty(self): return (len(self.nodes) == 0) + def apply_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_batch_size): + splitter = parallelizer.get_splitter() + if splitter.is_splitter_round_robin(): + # TODO: for both functions, check which parameters are needed + self.apply_round_robin_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_batch_size) + elif splitter.is_splitter_consec_chunks(): + self.apply_consecutive_chunks_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_batch_size) + else: + raise Exception("Splitter not yet implemented") + + def apply_round_robin_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_batch_size): + # TODO: this control flow should move done to aggregators once we implement them; + # currently, this cannot be done since splitter etc. would be added... + aggregator_spec = parallelizer.get_aggregator_spec() + if aggregator_spec.is_aggregator_spec_adj_lines_merge(): + raise Exception("adj_lines_merge not yet implemented in PaSh") + elif aggregator_spec.is_aggregator_spec_adj_lines_seq(): + raise Exception("adj_lines_seq not yet implemented in PaSh") + elif aggregator_spec.is_aggregator_spec_adj_lines_func(): + raise Exception("adj_lines_func not yet implemented in PaSh") + # END of what to move + + node = self.get_node(node_id) + # get info from node, and delete it from graph + streaming_input, streaming_output, configuration_inputs = \ + node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() + original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars + self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph + + # splitter + round_robin_splitter_generator = lambda input_id, output_ids: r_split.make_r_split(input_id, output_ids, r_split_batch_size) + out_split_ids = self.introduce_splitter(round_robin_splitter_generator, fan_out, fileIdGen, streaming_input) + + # mappers + in_mapper_ids = out_split_ids + out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, + parallelizer) + + # aggregator(s) + self.introduce_aggregator_for_round_robin(out_mapper_ids, parallelizer, streaming_output) + + def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_batch_size): + node = self.get_node(node_id) + streaming_input, streaming_output, configuration_inputs = \ + node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() + original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars + self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph + + # splitter + consec_chunks_splitter_generator = lambda input_id, output_ids: pash_split.make_split_file(input_id, output_ids) + out_split_ids = self.introduce_splitter(consec_chunks_splitter_generator, fan_out, fileIdGen, streaming_input) + + # mappers + in_mapper_ids = out_split_ids + out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, + parallelizer) + + # aggregators + in_aggregator_ids = out_mapper_ids + out_aggregator_id = streaming_output + self.introduce_aggregators_for_consec_chunks(fileIdGen, in_aggregator_ids, + original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, + streaming_output) + + def introduce_splitter(self, splitter_generator, fan_out, fileIdGen, streaming_input): + out_split_ids = self.generate_ephemeral_edges(fileIdGen, fan_out) + splitter = splitter_generator(streaming_input, out_split_ids) + self.set_edge_to(streaming_input, splitter.get_id()) + for out_split_id in out_split_ids: + self.set_edge_from(out_split_id, splitter.get_id()) + self.add_node(splitter) + return out_split_ids + + def introduce_mappers(self, fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, parallelizer): + out_mapper_ids = self.generate_ephemeral_edges(fileIdGen, fan_out) + zip_mapper_in_out_ids = zip(in_mapper_ids, out_mapper_ids) + all_mappers = [] + for (in_id, out_id) in zip_mapper_in_out_ids: + # BEGIN: these 4 lines could be refactored to be a function in graph such that + # creating end point of edges and the creation of edges is not decoupled + mapper_cmd_inv = parallelizer.get_actual_mapper(original_cmd_invocation_with_io_vars, in_id, out_id) + mapper = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(mapper_cmd_inv) + self.set_edge_to(in_id, mapper.get_id()) + self.set_edge_from(out_id, mapper.get_id()) + # END + splitter = parallelizer.get_splitter() + if splitter.is_splitter_round_robin(): + mapper_r_wrapped = r_wrap.wrap_node(mapper, self.edges) + self.set_edge_to(in_id, mapper_r_wrapped.get_id()) + self.set_edge_from(out_id, mapper_r_wrapped.get_id()) + mapper = mapper_r_wrapped + all_mappers.append(mapper) + for new_node in all_mappers: + self.add_node(new_node) + return out_mapper_ids + + def introduce_aggregators_for_consec_chunks(self, fileIdGen, in_aggregator_ids, + original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, + streaming_output): + aggregator_spec = parallelizer.get_aggregator_spec() + if aggregator_spec.is_aggregator_spec_concatenate() or aggregator_spec.is_aggregator_spec_custom_n_ary(): + aggregator_cmd_inv = parallelizer.get_actual_aggregator(original_cmd_invocation_with_io_vars, + in_aggregator_ids, out_aggregator_id) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + for in_aggregator_id in in_aggregator_ids: + self.set_edge_to(in_aggregator_id, aggregator.get_id()) + self.set_edge_from(streaming_output, aggregator.get_id()) + all_aggregators = [aggregator] + ## Add the merge commands in the graph + for new_node in all_aggregators: + self.add_node(new_node) + elif aggregator_spec.is_aggregator_spec_custom_2_ary(): + # TODO: we simplify and assume that every mapper produces a single output for now + map_in_aggregator_ids = [[id] for id in in_aggregator_ids] + # TODO: turn node into cmd_invocation_with_io_vars since this is the only thing required in this function + self.create_generic_aggregator_tree(original_cmd_invocation_with_io_vars, 
parallelizer, map_in_aggregator_ids, out_aggregator_id, fileIdGen) + else: + raise Exception("aggregator kind not yet implemented") + + def introduce_aggregator_for_round_robin(self, out_mapper_ids, parallelizer, streaming_output): + aggregator_spec = parallelizer.get_aggregator_spec() + if aggregator_spec.is_aggregator_spec_concatenate(): + in_aggregator_ids = out_mapper_ids + out_aggregator_id = streaming_output + aggregator = r_merge.make_r_merge_node(in_aggregator_ids, out_aggregator_id) + for in_aggregator_id in in_aggregator_ids: + self.set_edge_to(in_aggregator_id, aggregator.get_id()) + self.set_edge_from(streaming_output, aggregator.get_id()) + all_aggregators = [aggregator] + ## Add the aggregator node(s) in the graph + for new_node in all_aggregators: + self.add_node(new_node) + else: + # TODO: this is where the other cases for aggregators need to be added + pass + ## This function parallelizes a merger followed by a parallelizable node ## @@ -1107,10 +1248,10 @@ def valid(self): # and not self.get_stdout() is None))) ## This is a function that creates a reduce tree for a given node - def create_generic_aggregator_tree(self, curr_node, parallelizer, input_ids_for_aggregators, out_aggregator_id, fileIdGen): + def create_generic_aggregator_tree(self, cmd_invocation_with_io_vars, parallelizer, input_ids_for_aggregators, out_aggregator_id, fileIdGen): def function_to_get_binary_aggregator(in_ids, out_ids): assert(len(out_ids) == 1) - aggregator_cmd_inv = parallelizer.get_actual_aggregator(curr_node.cmd_invocation_with_io_vars, in_ids, out_ids[0]) + aggregator_cmd_inv = parallelizer.get_actual_aggregator(cmd_invocation_with_io_vars, in_ids, out_ids[0]) aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) return aggregator ## The Aggregator node takes a sequence of input ids and an output id diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index bf730919b..698cff210 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -215,7 +215,7 @@ def optimize_irs(asts_and_irs, args, compiler_config): # log(ir_node) # with cProfile.Profile() as pr: - distributed_graph = naive_parallelize_stateless_nodes_bfs(ast_or_ir, compiler_config.width, + distributed_graph = choose_and_apply_parallelizing_transformations(ast_or_ir, compiler_config.width, runtime_config['batch_size'], args.no_cat_split_vanish, args.r_split, args.r_split_batch_size) @@ -252,6 +252,54 @@ def print_graph_statistics(graph): log("Cat nodes:", len(cat_nodes)) log("Eager nodes:", len(eager_nodes)) + +def choose_and_apply_parallelizing_transformations(graph, fan_out, batch_size, no_cat_split_vanish, + r_split_flag, r_split_batch_size): + parallelizer_map = choose_parallelizing_transformations(graph, r_split_flag) + apply_parallelizing_transformations(graph, parallelizer_map, fan_out, batch_size, no_cat_split_vanish, + r_split_flag, r_split_batch_size) + return graph + + +def choose_parallelizing_transformations(graph, r_split_flag): # shall return map + source_node_ids = graph.source_nodes() + parallelizer_map = {} + workset = source_node_ids + visited = set() + # We apply a modified BFS such that we ensure that we know which parallelizer was chosen for all previous nodes + # and assume that the decision for any subsequent node will exploit any potential synergy effects + while (len(workset) > 0): + curr_id = workset.pop(0) + assert(isinstance(curr_id, int)) + all_previous_nodes_visited = all(prev in visited for prev in graph.get_previous_nodes(curr_id)) + if not 
all_previous_nodes_visited: + workset.append(curr_id) + elif not curr_id in visited: + next_node_ids = graph.get_next_nodes(curr_id) + workset += next_node_ids + parallelizer_map[curr_id] = choose_parallelizing_transformation(curr_id, graph, r_split_flag) + visited.add(curr_id) + return parallelizer_map + + +def choose_parallelizing_transformation(curr_id, graph, r_split_flag): # shall return map entry + # TODO: here we can implement more sophisticated techniques to decide how to parallelize + curr = graph.get_node(curr_id) + if r_split_flag: + option_parallelizer = curr.get_option_implemented_round_robin_parallelizer() + else: + option_parallelizer = curr.get_option_implemented_consecutive_chunks_parallelizer() + return option_parallelizer + + +def apply_parallelizing_transformations(graph, parallelizer_map, fan_out, batch_size, no_cat_split_vanish, + r_split_flag, r_split_batch_size): + fileIdGen = graph.get_file_id_gen() + node_id_non_none_parallelizer_list = [(node_id, parallelizer) for (node_id, parallelizer) in parallelizer_map.items() + if parallelizer is not None] + for (node_id, parallelizer) in node_id_non_none_parallelizer_list: + graph.apply_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_batch_size) ## This is a simplistic planner, that pushes the available ## parallelization from the inputs in file stateless commands. The ## planner starts from the sources of the graph, and pushes @@ -261,6 +309,7 @@ def print_graph_statistics(graph): ## be scheduled depending on the available computational resources. def naive_parallelize_stateless_nodes_bfs(graph, fan_out, batch_size, no_cat_split_vanish, r_split_flag, r_split_batch_size): + assert(False) source_node_ids = graph.source_nodes() ## Generate a fileIdGen from a graph, that doesn't clash with the @@ -418,6 +467,7 @@ def split_hdfs_cat_input(hdfs_cat, next_node, graph, fileIdGen): ## This function takes a node (id) and parallelizes it def parallelize_node(curr_id, graph, fileIdGen, fan_out, batch_size, no_cat_split_vanish, r_split_flag, r_split_batch_size): + assert(False) curr = graph.get_node(curr_id) new_nodes_for_workset = [] @@ -533,7 +583,7 @@ def parallelize_node(curr_id, graph, fileIdGen, fan_out, elif aggregator_kind == AggregatorKindEnum.CUSTOM_2_ARY: # TODO: we simplify and assume that every mapper produces a single output for now: map_in_aggregator_ids = [[id] for id in in_aggregator_ids] - graph.create_generic_aggregator_tree(curr, parallelizer_rr, map_in_aggregator_ids, out_aggregator_id, fileIdGen) + graph.create_generic_aggregator_tree(original_cmd_invocation_with_io_vars, parallelizer_rr, map_in_aggregator_ids, out_aggregator_id, fileIdGen) else: raise Exception("aggregator kind not yet implemented") From f1221ffc21f887a679d2955b965322906f0b8e03 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Thu, 30 Jun 2022 10:45:19 -0400 Subject: [PATCH 11/64] Fuse cat and subsequent split (#599) * Fuse cat and subsequent split Signed-off-by: Felix Stutz --- TODO.md | 1 - compiler/ir.py | 40 +++++++++++++++++++++++++--------------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/TODO.md b/TODO.md index 35ffdfc11..5725529a9 100644 --- a/TODO.md +++ b/TODO.md @@ -1,6 +1,5 @@ ## TODOs before merging to `future` -- separate checking and application of parallelization - cat-split fusion - r-unwrap-commutative fusion - working on all tests diff --git a/compiler/ir.py b/compiler/ir.py index 11e305bf5..795893e0a 100644 --- a/compiler/ir.py +++ 
b/compiler/ir.py @@ -1,28 +1,18 @@ -# BEGIN ANNO import sys from config import get_path_annotation_repo sys.path.insert(1, get_path_annotation_repo()) -# for typing from datatypes_new.CommandInvocationInitial import CommandInvocationInitial from datatypes_new.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo from annotation_generation_new.datatypes.InputOutputInfo import InputOutputInfo from annotation_generation_new.datatypes.ParallelizabilityInfo import ParallelizabilityInfo from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars -# for use -# -- - from annotations_utils.util_parsing import parse_arg_list_to_command_invocation from annotations_utils.util_cmd_invocations import get_input_output_info_from_cmd_invocation_util, get_parallelizability_info_from_cmd_invocation_util from annotations_utils.util_mapper import get_mapper_as_dfg_node_from_node, get_map_output_files from annotations_utils.util_aggregator import get_aggregator_as_dfg_node_from_node from annotations_utils.util_file_descriptors import resource_from_file_descriptor -# END ANNO - -# BEGIN REMODEL - -# END REMODEL from definitions.ir.file_id import * from definitions.ir.nodes.cat import * @@ -816,23 +806,43 @@ def apply_round_robin_parallelization_to_node(self, node_id, parallelizer, fileI out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, parallelizer) - # aggregator(s) + # aggregator self.introduce_aggregator_for_round_robin(out_mapper_ids, parallelizer, streaming_output) def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, batch_size, no_cat_split_vanish, r_split_batch_size): + # check whether we can fuse with previous node's parallelization: + # we can do so if the previous node's parallelization is the same, and the aggregator is concatenation + # Assumption: it suffices to check that the previous node is an aggregator node of type concatenate + # as this is unique for consecutive chunk parallelization (for now, this is true) node = self.get_node(node_id) streaming_input, streaming_output, configuration_inputs = \ node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars + + prev_nodes = self.get_previous_nodes(node_id) + assert(len(prev_nodes) > 0) + # get info about first one but also ensure that it is the only one if we fuse + first_pred_id = prev_nodes[0] + first_pred_node = self.get_node(first_pred_id) + first_pred_cmd_inv = first_pred_node.cmd_invocation_with_io_vars + + # remove node to be parallelized self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph - # splitter - consec_chunks_splitter_generator = lambda input_id, output_ids: pash_split.make_split_file(input_id, output_ids) - out_split_ids = self.introduce_splitter(consec_chunks_splitter_generator, fan_out, fileIdGen, streaming_input) + if len(prev_nodes) == 1 and first_pred_cmd_inv.is_aggregator_concatenate(): + # can be fused + self.remove_node(first_pred_id) # also sets respective edge to's and from's to None + in_mapper_ids = first_pred_cmd_inv.operand_list + else: # cannot be fused so introduce splitter + # splitter + consec_chunks_splitter_generator = lambda input_id, output_ids: pash_split.make_split_file(input_id, + output_ids) + out_split_ids = self.introduce_splitter(consec_chunks_splitter_generator, fan_out, fileIdGen, + streaming_input) + in_mapper_ids = out_split_ids # mappers - in_mapper_ids = out_split_ids out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, parallelizer) From 334091f6c8b18a841c0d01ee7d5c66b9a35dc93f Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Thu, 30 Jun 2022 11:12:38 -0400 Subject: [PATCH 12/64] Fuse r_merge and subsequent r_split (#600) Signed-off-by: Felix Stutz --- compiler/ir.py | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/compiler/ir.py b/compiler/ir.py index 795893e0a..2bd6fccb1 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -795,14 +795,24 @@ def apply_round_robin_parallelization_to_node(self, node_id, parallelizer, fileI streaming_input, streaming_output, configuration_inputs = \ node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars + + prev_nodes = self.get_previous_nodes(node_id) + first_pred_node, first_pred_cmd_inv = self.get_first_previous_node_and_first_previous_cmd_invocation(prev_nodes) + + # remove node to be parallelized self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph - # splitter - round_robin_splitter_generator = lambda input_id, output_ids: r_split.make_r_split(input_id, output_ids, r_split_batch_size) - out_split_ids = self.introduce_splitter(round_robin_splitter_generator, fan_out, fileIdGen, streaming_input) + if len(prev_nodes) == 1 and isinstance(first_pred_node, r_merge.RMerge): + # can be fused + self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None + in_mapper_ids = first_pred_cmd_inv.operand_list + else: # cannot be fused so introduce splitter + # splitter + round_robin_splitter_generator = lambda input_id, output_ids: r_split.make_r_split(input_id, output_ids, r_split_batch_size) + out_split_ids = self.introduce_splitter(round_robin_splitter_generator, fan_out, fileIdGen, streaming_input) + in_mapper_ids = out_split_ids # mappers - in_mapper_ids = out_split_ids out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, parallelizer) @@ -821,18 +831,15 @@ def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars prev_nodes = self.get_previous_nodes(node_id) - assert(len(prev_nodes) > 0) - # get info about first one but also ensure that it is the only one if we fuse - first_pred_id = prev_nodes[0] - first_pred_node = self.get_node(first_pred_id) - first_pred_cmd_inv = first_pred_node.cmd_invocation_with_io_vars + first_pred_node, first_pred_cmd_inv = self.get_first_previous_node_and_first_previous_cmd_invocation(prev_nodes) # remove node to be parallelized self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph + # TODO: change to check on Node (first_pred_node) and not cmd_inv if len(prev_nodes) == 1 and first_pred_cmd_inv.is_aggregator_concatenate(): # can be fused - self.remove_node(first_pred_id) # also sets respective edge to's and from's to None + self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None in_mapper_ids = first_pred_cmd_inv.operand_list else: # cannot be fused so introduce splitter # splitter @@ -853,6 +860,14 @@ def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, streaming_output) + def get_first_previous_node_and_first_previous_cmd_invocation(self, prev_nodes): + assert (len(prev_nodes) > 0) + # get info about first one but also ensure that it is the only one if we fuse + first_pred_id = prev_nodes[0] + first_pred_node = self.get_node(first_pred_id) + first_pred_cmd_inv = first_pred_node.cmd_invocation_with_io_vars + return first_pred_node, first_pred_cmd_inv + def introduce_splitter(self, splitter_generator, fan_out, fileIdGen, streaming_input): out_split_ids = self.generate_ephemeral_edges(fileIdGen, fan_out) splitter = splitter_generator(streaming_input, out_split_ids) From 58fbd29a4a31cdba2f66319e0ffdc6e63d62b60a Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Thu, 30 Jun 2022 11:55:37 -0400 Subject: [PATCH 13/64] Fuse r_merge and subsequent commutative command (#601) Signed-off-by: Felix Stutz --- compiler/ir.py | 19 ++++++++++++++++++- compiler/pash_runtime.py | 18 ++++++++++++------ 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/compiler/ir.py b/compiler/ir.py index 2bd6fccb1..c2f36543e 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -836,11 
+836,17 @@ def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer # remove node to be parallelized self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph - # TODO: change to check on Node (first_pred_node) and not cmd_inv + # TODO: change first check to first_pred_node and not cmd_inv if len(prev_nodes) == 1 and first_pred_cmd_inv.is_aggregator_concatenate(): # can be fused self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None in_mapper_ids = first_pred_cmd_inv.operand_list + elif len(prev_nodes) == 1 and isinstance(first_pred_node, r_merge.RMerge) and node.is_commutative(): + self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None + + in_unwrap_ids = first_pred_cmd_inv.operand_list + out_unwrap_ids = self.introduce_unwraps(fileIdGen, in_unwrap_ids) + in_mapper_ids = out_unwrap_ids else: # cannot be fused so introduce splitter # splitter consec_chunks_splitter_generator = lambda input_id, output_ids: pash_split.make_split_file(input_id, @@ -900,6 +906,17 @@ def introduce_mappers(self, fan_out, fileIdGen, in_mapper_ids, original_cmd_invo self.add_node(new_node) return out_mapper_ids + def introduce_unwraps(self, fileIdGen, in_unwrap_ids): + unwrap_to_commutative_mappers_ids = self.generate_ephemeral_edges(fileIdGen, len(in_unwrap_ids)) + in_out_unwrap_ids = zip(in_unwrap_ids, unwrap_to_commutative_mappers_ids) + for in_unwrap, out_unwrap in in_out_unwrap_ids: + unwrap = r_unwrap.make_unwrap_node([in_unwrap], out_unwrap) + self.add_node(unwrap) + self.set_edge_to(in_unwrap, unwrap.get_id()) # from are still (wrapped) mappers + self.set_edge_from(out_unwrap, unwrap.get_id()) # to will be set to mappers of current node + in_mapper_ids = unwrap_to_commutative_mappers_ids + return in_mapper_ids + def introduce_aggregators_for_consec_chunks(self, fileIdGen, in_aggregator_ids, original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, streaming_output): diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index 698cff210..d5ec78bca 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -283,13 +283,19 @@ def choose_parallelizing_transformations(graph, r_split_flag): # shall return ma def choose_parallelizing_transformation(curr_id, graph, r_split_flag): # shall return map entry - # TODO: here we can implement more sophisticated techniques to decide how to parallelize + # here we can implement more sophisticated techniques to decide how to parallelize curr = graph.get_node(curr_id) - if r_split_flag: - option_parallelizer = curr.get_option_implemented_round_robin_parallelizer() - else: - option_parallelizer = curr.get_option_implemented_consecutive_chunks_parallelizer() - return option_parallelizer + # we ignore `r_split_flag` here as we want to exploit r_merge followed by commutative command + # which only works if the a parallelizer for the latter is chosen (sort does not have RR-parallelizer) + # we prioritize round robin over consecutive chunks: + return return_default_if_none_else_itself(curr.get_option_implemented_round_robin_parallelizer(), + curr.get_option_implemented_consecutive_chunks_parallelizer()) + # When `r_split_flag` should be used: + # if r_split_flag: + # option_parallelizer = curr.get_option_implemented_round_robin_parallelizer() + # else: + # option_parallelizer = curr.get_option_implemented_consecutive_chunks_parallelizer() + # return option_parallelizer 
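The `return_default_if_none_else_itself` helper used above lives in `compiler/util.py` and is not shown in this patch; from its name, this call site, and the prioritization comment, it presumably behaves as in the following sketch (an assumption, included only to make the round-robin-first fallback explicit):

```python
# Presumed behaviour of the helper from compiler/util.py (not shown in this patch):
# prefer the round-robin parallelizer and fall back to the consecutive-chunks one
# only when no round-robin parallelizer is implemented for the node.
def return_default_if_none_else_itself(maybe_value, default):
    return default if maybe_value is None else maybe_value
```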
def apply_parallelizing_transformations(graph, parallelizer_map, fan_out, batch_size, no_cat_split_vanish, From 4a0e5a4287d95af0d642a493c51c3587f0e5c48f Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Thu, 30 Jun 2022 11:56:08 -0400 Subject: [PATCH 14/64] Add TODO Signed-off-by: Felix Stutz --- TODO.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/TODO.md b/TODO.md index 5725529a9..e604dbdd9 100644 --- a/TODO.md +++ b/TODO.md @@ -1,7 +1,6 @@ ## TODOs before merging to `future` -- cat-split fusion -- r-unwrap-commutative fusion +- support for RR with unwrap for commutative commands - working on all tests - Adding annotation library installation and removing ad-hoc import of the latter - clean up utils for annotations From cd00918c106ab5c6cb309624e6ee8ca9f079a018 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Thu, 30 Jun 2022 14:35:37 -0400 Subject: [PATCH 15/64] Support round-robin parallelization for commutative commands (#602) Signed-off-by: Felix Stutz --- compiler/definitions/ir/dfg_node.py | 8 ++++ compiler/definitions/ir/nodes/r_split.py | 22 ++++------- compiler/ir.py | 48 +++++++++++++++++++++--- compiler/pash_runtime.py | 8 ++-- 4 files changed, 62 insertions(+), 24 deletions(-) diff --git a/compiler/definitions/ir/dfg_node.py b/compiler/definitions/ir/dfg_node.py index e3e56631c..587622680 100644 --- a/compiler/definitions/ir/dfg_node.py +++ b/compiler/definitions/ir/dfg_node.py @@ -312,6 +312,14 @@ def get_option_implemented_round_robin_parallelizer(self): return parallelizer return None + def get_option_implemented_round_robin_with_unwrap_parallelizer(self): + for parallelizer in self.parallelizer_list: + splitter = parallelizer.get_splitter() + if splitter.is_splitter_round_robin_with_unwrap_flag() and parallelizer.are_all_parts_implemented(): + return parallelizer + return None + + def get_option_implemented_consecutive_chunks_parallelizer(self): for parallelizer in self.parallelizer_list: splitter = parallelizer.get_splitter() diff --git a/compiler/definitions/ir/nodes/r_split.py b/compiler/definitions/ir/nodes/r_split.py index 68a889f2f..011df0559 100644 --- a/compiler/definitions/ir/nodes/r_split.py +++ b/compiler/definitions/ir/nodes/r_split.py @@ -1,7 +1,7 @@ import os from datatypes_new.AccessKind import AccessKind -from datatypes_new.BasicDatatypes import Operand +from datatypes_new.BasicDatatypes import Operand, Flag from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars import config @@ -24,21 +24,8 @@ def __init__(self, parallelizer_list=parallelizer_list, cmd_related_properties=cmd_related_properties) - ## TODO: Generalize this code (for this and SortGReduce) to be able to add an option to any command. def add_r_flag(self): - assert(False) - assert(len(self.com_options) <= 1) - - ## Add -r in r_split - new_opt = (0, Arg(string_to_argument("-r"))) - shifted_options = [(i+1, opt) for i, opt in self.com_options] - self.com_options = [new_opt] + shifted_options - - ## This is not a proper option check. It just works if the r_flag is added as a separate option. 
- def has_r_flag(self): - assert(False) - option_strings = [str(opt) for i, opt in self.com_options] - return ("-r" in option_strings) + self.cmd_invocation_with_io_vars.flag_option_list.append(Flag("-r")) def make_r_split(input_id, out_ids, r_split_batch_size): @@ -56,3 +43,8 @@ def make_r_split(input_id, out_ids, r_split_batch_size): implicit_use_of_streaming_output=None, access_map=access_map) return RSplit(cmd_inv_with_io_vars) + +def make_r_split_with_unwrap_flag(input_id, out_ids, r_split_batch_size): + standard_r_split = make_r_split(input_id, out_ids, r_split_batch_size) + standard_r_split.add_r_flag() + return standard_r_split diff --git a/compiler/ir.py b/compiler/ir.py index c2f36543e..e5cd5c423 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -771,6 +771,9 @@ def apply_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_ou # TODO: for both functions, check which parameters are needed self.apply_round_robin_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, batch_size, no_cat_split_vanish, r_split_batch_size) + elif splitter.is_splitter_round_robin_with_unwrap_flag(): + self.apply_round_robin_with_unwrap_flag_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_batch_size) elif splitter.is_splitter_consec_chunks(): self.apply_consecutive_chunks_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, batch_size, no_cat_split_vanish, r_split_batch_size) @@ -819,6 +822,45 @@ def apply_round_robin_parallelization_to_node(self, node_id, parallelizer, fileI # aggregator self.introduce_aggregator_for_round_robin(out_mapper_ids, parallelizer, streaming_output) + def apply_round_robin_with_unwrap_flag_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_batch_size): + # round robin with unwrap flag is an inferred parallelizer which ensures that + # the command is commutative and has an aggregator for consecutive chunks; + # thus we can check whether we can re-open a previous "RR"-parallelization ending with `r_merge` + node = self.get_node(node_id) + streaming_input, streaming_output, configuration_inputs = \ + node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() + original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars + + prev_nodes = self.get_previous_nodes(node_id) + first_pred_node, first_pred_cmd_inv = self.get_first_previous_node_and_first_previous_cmd_invocation(prev_nodes) + + # remove node to be parallelized + self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph + + if len(prev_nodes) == 1 and isinstance(first_pred_node, r_merge.RMerge): + # and node.is_commutative(): implied by how this kind of splitter is inferred + self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None + + in_unwrap_ids = first_pred_cmd_inv.operand_list + out_unwrap_ids = self.introduce_unwraps(fileIdGen, in_unwrap_ids) + in_mapper_ids = out_unwrap_ids + else: + # splitter + round_robin_with_unwrap_flag_splitter_generator = lambda input_id, output_ids: r_split.make_r_split_with_unwrap_flag(input_id, output_ids, r_split_batch_size) + out_split_ids = self.introduce_splitter(round_robin_with_unwrap_flag_splitter_generator, fan_out, fileIdGen, streaming_input) + in_mapper_ids = out_split_ids + + # mappers + out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, + parallelizer) + + in_aggregator_ids = out_mapper_ids + out_aggregator_id = streaming_output + self.introduce_aggregators_for_consec_chunks(fileIdGen, in_aggregator_ids, + original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, + streaming_output) + def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, batch_size, no_cat_split_vanish, r_split_batch_size): # check whether we can fuse with previous node's parallelization: @@ -841,12 +883,6 @@ def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer # can be fused self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None in_mapper_ids = first_pred_cmd_inv.operand_list - elif len(prev_nodes) == 1 and isinstance(first_pred_node, r_merge.RMerge) and node.is_commutative(): - self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None - - in_unwrap_ids = first_pred_cmd_inv.operand_list - out_unwrap_ids = self.introduce_unwraps(fileIdGen, in_unwrap_ids) - in_mapper_ids = out_unwrap_ids else: # cannot be fused so introduce splitter # splitter consec_chunks_splitter_generator = lambda input_id, output_ids: pash_split.make_split_file(input_id, diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index d5ec78bca..27217a971 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -287,9 +287,11 @@ def choose_parallelizing_transformation(curr_id, graph, r_split_flag): # shall r curr = graph.get_node(curr_id) # we ignore `r_split_flag` here as we want to exploit r_merge followed by commutative command # which only works if the a parallelizer for the latter is chosen (sort does not have RR-parallelizer) - # we prioritize round robin over consecutive chunks: - return return_default_if_none_else_itself(curr.get_option_implemented_round_robin_parallelizer(), - curr.get_option_implemented_consecutive_chunks_parallelizer()) + # we prioritize round robin over round robin with unwrap over consecutive chunks: + list_all_parallelizers_in_priority = [curr.get_option_implemented_round_robin_parallelizer(), + curr.get_option_implemented_round_robin_with_unwrap_parallelizer(), + curr.get_option_implemented_consecutive_chunks_parallelizer()] + return next((item for item in list_all_parallelizers_in_priority if item is not None), None) # When `r_split_flag` should be used: # if r_split_flag: # option_parallelizer = curr.get_option_implemented_round_robin_parallelizer() From dfa03a760f9e0d16081b6c45d3519fd888b35ce7 Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Fri, 1 Jul 2022 15:59:43 -0400 
Subject: [PATCH 16/64] Install annotations lib using `pip` (#603) * Add a proper installation of the annotation lib Signed-off-by: Konstantinos Kallas * Remove unnecessary sys.path Signed-off-by: Konstantinos Kallas * Fix bug in setup Signed-off-by: Konstantinos Kallas * fix setup script Signed-off-by: Konstantinos Kallas --- TODO.md | 2 +- compiler/annotations_utils/util_aggregator.py | 4 ---- compiler/annotations_utils/util_cmd_invocations.py | 5 ----- compiler/annotations_utils/util_file_descriptors.py | 3 --- compiler/annotations_utils/util_mapper.py | 3 --- compiler/annotations_utils/util_parsing.py | 3 --- compiler/config.py | 9 --------- compiler/definitions/ir/dfg_node.py | 2 -- compiler/ir.py | 2 -- scripts/setup-pash.sh | 4 ++++ 10 files changed, 5 insertions(+), 32 deletions(-) diff --git a/TODO.md b/TODO.md index e604dbdd9..a6aa0a7d4 100644 --- a/TODO.md +++ b/TODO.md @@ -2,8 +2,8 @@ - support for RR with unwrap for commutative commands - working on all tests -- Adding annotation library installation and removing ad-hoc import of the latter - clean up utils for annotations - graphviz - Changing PaSh flags (making the default be priority r-split and then consecutive chunks), so remove the r_split flag and make defaults be the ones from the OSDI paper +- Fixing annotation library installation to a specific commit - Remove code which got obsolete due to the changes \ No newline at end of file diff --git a/compiler/annotations_utils/util_aggregator.py b/compiler/annotations_utils/util_aggregator.py index 3382730c6..f51a5ab42 100644 --- a/compiler/annotations_utils/util_aggregator.py +++ b/compiler/annotations_utils/util_aggregator.py @@ -1,9 +1,5 @@ # TODO: this file can properly be deleted -import sys -from config import get_path_annotation_repo -sys.path.insert(1, get_path_annotation_repo()) - from definitions.ir.dfg_node import DFGNode from definitions.ir.nodes.cat import Cat from annotations_utils.util_cmd_invocations import get_command_invocation_prefix_from_dfg_node diff --git a/compiler/annotations_utils/util_cmd_invocations.py b/compiler/annotations_utils/util_cmd_invocations.py index d624affe6..5d6e206ee 100644 --- a/compiler/annotations_utils/util_cmd_invocations.py +++ b/compiler/annotations_utils/util_cmd_invocations.py @@ -1,5 +1,3 @@ -import sys - from datatypes_new.BasicDatatypes import Flag, ArgStringType, Operand from datatypes_new.BasicDatatypesWithIO import OptionWithIO from datatypes_new.CommandInvocationInitial import CommandInvocationInitial @@ -10,11 +8,8 @@ get_parallelizability_info_from_cmd_invocation from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars -from config import get_path_annotation_repo from definitions.ir.arg import Arg -sys.path.insert(1, get_path_annotation_repo()) - # for typing from datatypes_new.CommandInvocationPrefix import CommandInvocationPrefix diff --git a/compiler/annotations_utils/util_file_descriptors.py b/compiler/annotations_utils/util_file_descriptors.py index 910efa632..fe68ed9fb 100644 --- a/compiler/annotations_utils/util_file_descriptors.py +++ b/compiler/annotations_utils/util_file_descriptors.py @@ -1,8 +1,5 @@ from util import log from definitions.ir.resource import FileResource, Resource, FileDescriptorResource -import sys -from config import get_path_annotation_repo -sys.path.insert(1, get_path_annotation_repo()) from datatypes_new.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo diff --git a/compiler/annotations_utils/util_mapper.py 
b/compiler/annotations_utils/util_mapper.py index 64657cf03..14bd965d1 100644 --- a/compiler/annotations_utils/util_mapper.py +++ b/compiler/annotations_utils/util_mapper.py @@ -1,9 +1,6 @@ # TODO: this file can properly be deleted # imports from annotation framework -import sys -from config import get_path_annotation_repo -sys.path.insert(1, get_path_annotation_repo()) # for typing # for use from annotation_generation_new.datatypes.parallelizability.Mapper import Mapper diff --git a/compiler/annotations_utils/util_parsing.py b/compiler/annotations_utils/util_parsing.py index 19a098403..516c43da7 100644 --- a/compiler/annotations_utils/util_parsing.py +++ b/compiler/annotations_utils/util_parsing.py @@ -1,10 +1,7 @@ -import sys from typing import Set, List, Any from definitions.ir.arg import Arg -from config import get_path_annotation_repo -sys.path.insert(1, get_path_annotation_repo()) from datatypes_new.CommandInvocationInitial import CommandInvocationInitial from datatypes_new.BasicDatatypes import Option, ArgStringType, Flag, Operand from parser_new.parser import parse, get_set_of_all_flags, get_dict_flag_to_primary_repr, get_set_of_all_options, \ diff --git a/compiler/config.py b/compiler/config.py index 71a9959fc..f5e7648b7 100644 --- a/compiler/config.py +++ b/compiler/config.py @@ -29,15 +29,6 @@ HDFS_PREFIX = "$HDFS_DATANODE_DIR/" -# move this to `config.json` if possible -PATH_ANNOTATION_REPO="/home/felix/git-repos/MIT/annotations" - -def get_path_annotation_repo(): - if PATH_ANNOTATION_REPO is None: - log("No path for annotation repository given! Specify it in compiler/config.py") - raise Exception("No path for annotation repository given! Specify it in compiler/config.py") - return PATH_ANNOTATION_REPO - config = {} annotations = [] pash_args = None diff --git a/compiler/definitions/ir/dfg_node.py b/compiler/definitions/ir/dfg_node.py index 587622680..fe1559194 100644 --- a/compiler/definitions/ir/dfg_node.py +++ b/compiler/definitions/ir/dfg_node.py @@ -6,8 +6,6 @@ from annotations_utils.util_cmd_invocations import to_node_cmd_inv_with_io_vars, construct_property_container_from_list_of_properties import sys -from config import get_path_annotation_repo -sys.path.insert(1, get_path_annotation_repo()) from util import return_empty_list_if_none_else_itself, return_default_if_none_else_itself diff --git a/compiler/ir.py b/compiler/ir.py index e5cd5c423..a3dc8a1ed 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -1,7 +1,5 @@ import sys -from config import get_path_annotation_repo -sys.path.insert(1, get_path_annotation_repo()) from datatypes_new.CommandInvocationInitial import CommandInvocationInitial from datatypes_new.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo from annotation_generation_new.datatypes.InputOutputInfo import InputOutputInfo diff --git a/scripts/setup-pash.sh b/scripts/setup-pash.sh index 719450796..78721c656 100755 --- a/scripts/setup-pash.sh +++ b/scripts/setup-pash.sh @@ -93,6 +93,10 @@ python3 -m pip install graphviz --root $PYTHON_PKG_DIR --ignore-installed #&> $L python3 -m pip install numpy --root $PYTHON_PKG_DIR --ignore-installed #&> $LOG_DIR/pip_install_numpy.log python3 -m pip install matplotlib --root $PYTHON_PKG_DIR --ignore-installed #&> $LOG_DIR/pip_install_matplotlib.log +## TODO: Fix a specific version somehow, maybe commit? 
+git clone https://github.com/binpash/annotations.git ./annotations_repo +python3 -m pip install ./annotations_repo --root $PYTHON_PKG_DIR --ignore-installed #&> $LOG_DIR/pip_install_annotations.log + # clean the python packages cd $PYTHON_PKG_DIR # can we find a better alternative to that From 624d8171a1a67002a3bb6a86b379ccff47ac379f Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Tue, 5 Jul 2022 06:17:00 -0700 Subject: [PATCH 17/64] Add a whitespace to trigger CI Signed-off-by: Konstantinos Kallas --- compiler/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler/config.py b/compiler/config.py index f5e7648b7..053320f98 100644 --- a/compiler/config.py +++ b/compiler/config.py @@ -29,6 +29,7 @@ HDFS_PREFIX = "$HDFS_DATANODE_DIR/" + config = {} annotations = [] pash_args = None From 3bd0cf63442246fed043458b59a2d4c9cfd1016d Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Tue, 12 Jul 2022 14:48:04 -0400 Subject: [PATCH 18/64] Refactored to remove __future__ from annotations library (#609) Signed-off-by: Felix Stutz --- compiler/definitions/ir/nodes/dgsh_tee.py | 6 +++--- compiler/definitions/ir/nodes/eager.py | 8 ++++---- compiler/definitions/ir/nodes/r_merge.py | 6 +++--- compiler/definitions/ir/nodes/r_split.py | 6 +++--- compiler/definitions/ir/nodes/r_unwrap.py | 4 ++-- compiler/definitions/ir/nodes/r_wrap.py | 4 ++-- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/compiler/definitions/ir/nodes/dgsh_tee.py b/compiler/definitions/ir/nodes/dgsh_tee.py index c417b8f58..cacdd94c9 100644 --- a/compiler/definitions/ir/nodes/dgsh_tee.py +++ b/compiler/definitions/ir/nodes/dgsh_tee.py @@ -1,4 +1,4 @@ -from datatypes_new.AccessKind import AccessKind +from datatypes_new.AccessKind import make_stream_output, make_stream_input from datatypes_new.BasicDatatypes import Flag, ArgStringType from datatypes_new.BasicDatatypesWithIO import OptionWithIO from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars @@ -19,8 +19,8 @@ def __init__(self, def make_dgsh_tee_node(input_id, output_id): dgsh_tee_bin = os.path.join(config.PASH_TOP, config.config['runtime']['dgsh_tee_binary']) - access_map = {output_id: AccessKind.make_stream_output(), - input_id: AccessKind.make_stream_input()} + access_map = {output_id: make_stream_output(), + input_id: make_stream_input()} flag_option_list = [OptionWithIO("-i", input_id), OptionWithIO("-o", output_id), diff --git a/compiler/definitions/ir/nodes/eager.py b/compiler/definitions/ir/nodes/eager.py index ac49a576e..ae931b486 100644 --- a/compiler/definitions/ir/nodes/eager.py +++ b/compiler/definitions/ir/nodes/eager.py @@ -1,4 +1,4 @@ -from datatypes_new.AccessKind import AccessKind +from datatypes_new.AccessKind import AccessKind, make_stream_output, make_stream_input, make_other_output from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars from definitions.ir.dfg_node import * @@ -18,9 +18,9 @@ def make_eager_node(input_id, output_id, intermediate_file_id, eager_exec_path): eager_name = eager_exec_path intermediate_file_id_id = intermediate_file_id.get_ident() operand_list = [input_id, output_id, intermediate_file_id_id] - access_map = {output_id: AccessKind.make_stream_output(), - input_id: AccessKind.make_stream_input(), - intermediate_file_id_id: AccessKind.make_other_output()} + access_map = {output_id: make_stream_output(), + input_id: make_stream_input(), + intermediate_file_id_id: make_other_output()} cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=eager_name, 
flag_option_list=[], diff --git a/compiler/definitions/ir/nodes/r_merge.py b/compiler/definitions/ir/nodes/r_merge.py index f587a94fc..453f0c01f 100644 --- a/compiler/definitions/ir/nodes/r_merge.py +++ b/compiler/definitions/ir/nodes/r_merge.py @@ -1,4 +1,4 @@ -from datatypes_new.AccessKind import AccessKind +from datatypes_new.AccessKind import make_stream_input, make_stream_output from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars from definitions.ir.dfg_node import * @@ -20,8 +20,8 @@ def __init__(self, def make_r_merge_node(inputs, output): r_merge_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_merge_binary']) # TODO: assume that the inputs and output is provided as operands - access_map = {input_id: AccessKind.make_stream_input() for input_id in inputs} - access_map[output] = AccessKind.make_stream_output() + access_map = {input_id: make_stream_input() for input_id in inputs} + access_map[output] = make_stream_output() cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=r_merge_bin, flag_option_list=[], diff --git a/compiler/definitions/ir/nodes/r_split.py b/compiler/definitions/ir/nodes/r_split.py index 011df0559..05900a1d9 100644 --- a/compiler/definitions/ir/nodes/r_split.py +++ b/compiler/definitions/ir/nodes/r_split.py @@ -1,6 +1,6 @@ import os -from datatypes_new.AccessKind import AccessKind +from datatypes_new.AccessKind import AccessKind, make_stream_input, make_stream_output from datatypes_new.BasicDatatypes import Operand, Flag from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars @@ -33,8 +33,8 @@ def make_r_split(input_id, out_ids, r_split_batch_size): operand_list = [input_id, Operand(Arg(string_to_argument(str(r_split_batch_size))))] operand_list.extend(out_ids) - access_map = {output_id: AccessKind.make_stream_output() for output_id in out_ids} - access_map[input_id] = AccessKind.make_stream_input() + access_map = {output_id: make_stream_output() for output_id in out_ids} + access_map[input_id] = make_stream_input() cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=r_split_bin, flag_option_list=[], diff --git a/compiler/definitions/ir/nodes/r_unwrap.py b/compiler/definitions/ir/nodes/r_unwrap.py index 38cb03dcc..0a2aec195 100644 --- a/compiler/definitions/ir/nodes/r_unwrap.py +++ b/compiler/definitions/ir/nodes/r_unwrap.py @@ -1,4 +1,4 @@ -from datatypes_new.AccessKind import AccessKind +from datatypes_new.AccessKind import make_stream_input, make_stream_output from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars from definitions.ir.dfg_node import * @@ -21,7 +21,7 @@ def __init__(self, def make_unwrap_node(inputs, output): assert(len(inputs) == 1) input_id = inputs[0] - access_map = {input_id: AccessKind.make_stream_input(), output: AccessKind.make_stream_output()} + access_map = {input_id: make_stream_input(), output: make_stream_output()} r_unwrap_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_unwrap_binary']) cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=r_unwrap_bin, diff --git a/compiler/definitions/ir/nodes/r_wrap.py b/compiler/definitions/ir/nodes/r_wrap.py index 8fd44f6ca..316e81f33 100644 --- a/compiler/definitions/ir/nodes/r_wrap.py +++ b/compiler/definitions/ir/nodes/r_wrap.py @@ -1,4 +1,4 @@ -from datatypes_new.AccessKind import AccessKind +from datatypes_new.AccessKind import make_stream_output, make_stream_input from datatypes_new.BasicDatatypes import ArgStringType from 
datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars @@ -45,7 +45,7 @@ def wrap_node(node: DFGNode, edges): ## TODO: changed this from <= to == 1 to simplify reasoning later for now assert(len(outputs) == 1) output_id = outputs[0] - access_map = {input_id: AccessKind.make_stream_input(), output_id: AccessKind.make_stream_output()} + access_map = {input_id: make_stream_input(), output_id: make_stream_output()} #create bash -c argument cmd_inv_with_io_vars: CommandInvocationWithIOVars = node.cmd_invocation_with_io_vars From 44017dc6af2c1cc8ea14b5dd010eaf3f4e66d61b Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Wed, 20 Jul 2022 14:24:42 -0400 Subject: [PATCH 19/64] Cover all but one test case from compiler/test_evaluation_scripts.sh (#612) * Fix bug in parser to switch from flag to operand mode when reading hyphen Signed-off-by: Felix Stutz * Adapt shortest_scripts.sh to work with parser Signed-off-by: Felix Stutz * Cover more test cases from script_microbenchmarks Signed-off-by: Felix Stutz * Parallelize spell-grep as done in `future`, i.e., not RR but CC for `set_diff` Signed-off-by: Felix Stutz * Clean up and clarifying comment in parser Signed-off-by: Felix Stutz * Simplify control flow in parallelization Signed-off-by: Felix Stutz --- TODO.md | 13 +++-- .../annotations_utils/util_cmd_invocations.py | 4 +- compiler/annotations_utils/util_parsing.py | 2 + compiler/definitions/ir/nodes/pash_split.py | 6 +-- compiler/ir.py | 48 ++++++++++++------- evaluation/tests/shortest_scripts.sh | 4 +- 6 files changed, 50 insertions(+), 27 deletions(-) diff --git a/TODO.md b/TODO.md index a6aa0a7d4..0ffd62f55 100644 --- a/TODO.md +++ b/TODO.md @@ -1,9 +1,16 @@ ## TODOs before merging to `future` -- support for RR with unwrap for commutative commands -- working on all tests +- fix tests from compiler/test_evaluation_scripts.sh: + + bigrams - clean up utils for annotations - graphviz - Changing PaSh flags (making the default be priority r-split and then consecutive chunks), so remove the r_split flag and make defaults be the ones from the OSDI paper - Fixing annotation library installation to a specific commit -- Remove code which got obsolete due to the changes \ No newline at end of file +- Remove code which got obsolete due to the changes +- Room for optimization: basically disable parallelization after a tr which squeezes all new lines since there are no sequences of data to parallelize anyway for the moment. + Long-term, we could allow parallelization but with a adj_line_merge aggregator. +- Changes to scripts: + + `shortest_scripts.sh`: here I only needed to modify the script slightly: + (1) option arguments for `cut` with whitespace as the parser cannot deal with them otherwise currently but we might want to change this in the future, + (2) `head -n 15` instead of `head -15` which might be a bit harder to support. I did not really see how the man-page supports this actually when skimming but I might have missed that. +- tr_test.sh: Outside the testing script, the outputs are the same but somehow it still shows different outputs. Checked this with Konstantinos and he will check the testing script later. 
diff --git a/compiler/annotations_utils/util_cmd_invocations.py b/compiler/annotations_utils/util_cmd_invocations.py index 5d6e206ee..1be87e28b 100644 --- a/compiler/annotations_utils/util_cmd_invocations.py +++ b/compiler/annotations_utils/util_cmd_invocations.py @@ -110,12 +110,12 @@ def to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wra whole_cmd.concatenate(Arg(string_to_argument("\'"))) return whole_cmd -def to_arg_flagoption(flagoption, _edges): +def to_arg_flagoption(flagoption, edges): if isinstance(flagoption, Flag): return [Arg(string_to_argument(flagoption.get_name()))] elif isinstance(flagoption, OptionWithIO): opt_name_arg = Arg(string_to_argument(flagoption.get_name())) - opt_arg_arg = translate_io_var_to_arg_if_applicable(flagoption.get_arg()) + opt_arg_arg = translate_io_var_to_arg_if_applicable(flagoption.get_arg(), edges) return [opt_name_arg, opt_arg_arg] def to_arg_operand(operand, edges): diff --git a/compiler/annotations_utils/util_parsing.py b/compiler/annotations_utils/util_parsing.py index 516c43da7..cef464f79 100644 --- a/compiler/annotations_utils/util_parsing.py +++ b/compiler/annotations_utils/util_parsing.py @@ -70,6 +70,8 @@ def parse_arg_list_to_command_invocation(command, flags_options_operands) -> Com option = Option(option_name_as_string, option_arg_as_arg) flag_option_list.append(option) i += 1 # since we consumed another term for the argument + elif potential_flag_or_option_name == "-": # switch to operand mode (interpreted as hyphen-stdin) + break elif are_all_individually_flags(potential_flag_or_option_name, set_of_all_flags): for split_el in list(potential_flag_or_option_name[1:]): flag: Flag = Flag(f'-{split_el}') diff --git a/compiler/definitions/ir/nodes/pash_split.py b/compiler/definitions/ir/nodes/pash_split.py index 9c28267d5..e21de4b1d 100644 --- a/compiler/definitions/ir/nodes/pash_split.py +++ b/compiler/definitions/ir/nodes/pash_split.py @@ -1,4 +1,4 @@ -from datatypes_new.AccessKind import AccessKind +from datatypes_new.AccessKind import make_stream_input, make_stream_output from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars from definitions.ir.file_id import * @@ -26,8 +26,8 @@ def make_split_file(input_id, out_ids): auto_split_bin = os.path.join(config.PASH_TOP, config.config['runtime']['auto_split_binary']) operand_list = [input_id] operand_list.extend(out_ids) - access_map = {output_id: AccessKind.make_stream_output() for output_id in out_ids} - access_map[input_id] = AccessKind.make_stream_input() + access_map = {output_id: make_stream_output() for output_id in out_ids} + access_map[input_id] = make_stream_input() cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=auto_split_bin, flag_option_list=[], diff --git a/compiler/ir.py b/compiler/ir.py index a3dc8a1ed..0816a663f 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -177,6 +177,10 @@ def compile_command_to_DFG(fileIdGen, command, options, redirections=[]): command_invocation: CommandInvocationInitial = parse_arg_list_to_command_invocation(command, options) io_info: InputOutputInfo = get_input_output_info_from_cmd_invocation_util(command_invocation) + if io_info is None: + raise Exception(f"InputOutputInformation for {format_arg_chars(command)} not provided so considered side-effectful.") + if io_info.has_other_outputs(): + raise Exception(f"Command {format_arg_chars(command)} has outputs other than streaming.") para_info: ParallelizabilityInfo = 
get_parallelizability_info_from_cmd_invocation_util(command_invocation) command_invocation_with_io = io_info.apply_input_output_info_to_command_invocation(command_invocation) parallelizer_list, round_robin_compatible_with_cat, is_commutative = para_info.unpack_info() @@ -766,7 +770,6 @@ def apply_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_ou batch_size, no_cat_split_vanish, r_split_batch_size): splitter = parallelizer.get_splitter() if splitter.is_splitter_round_robin(): - # TODO: for both functions, check which parameters are needed self.apply_round_robin_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, batch_size, no_cat_split_vanish, r_split_batch_size) elif splitter.is_splitter_round_robin_with_unwrap_flag(): @@ -797,14 +800,19 @@ def apply_round_robin_parallelization_to_node(self, node_id, parallelizer, fileI node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars + + can_be_fused_with_prev = False prev_nodes = self.get_previous_nodes(node_id) - first_pred_node, first_pred_cmd_inv = self.get_first_previous_node_and_first_previous_cmd_invocation(prev_nodes) + if len(prev_nodes) == 1: + first_pred_node, first_pred_cmd_inv = \ + self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) + if isinstance(first_pred_node, r_merge.RMerge): + can_be_fused_with_prev = True # remove node to be parallelized self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph - if len(prev_nodes) == 1 and isinstance(first_pred_node, r_merge.RMerge): - # can be fused + if can_be_fused_with_prev: self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None in_mapper_ids = first_pred_cmd_inv.operand_list else: # cannot be fused so introduce splitter @@ -830,16 +838,19 @@ def apply_round_robin_with_unwrap_flag_parallelization_to_node(self, node_id, pa node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars + can_be_fused_with_prev = False prev_nodes = self.get_previous_nodes(node_id) - first_pred_node, first_pred_cmd_inv = self.get_first_previous_node_and_first_previous_cmd_invocation(prev_nodes) + if len(prev_nodes) == 1: + first_pred_node, first_pred_cmd_inv = \ + self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) + if isinstance(first_pred_node, r_merge.RMerge): + can_be_fused_with_prev = True # remove node to be parallelized self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph - if len(prev_nodes) == 1 and isinstance(first_pred_node, r_merge.RMerge): - # and node.is_commutative(): implied by how this kind of splitter is inferred + if can_be_fused_with_prev: # and node.is_commutative(): implied by how this kind of splitter is inferred self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None - in_unwrap_ids = first_pred_cmd_inv.operand_list out_unwrap_ids = self.introduce_unwraps(fileIdGen, in_unwrap_ids) in_mapper_ids = out_unwrap_ids @@ -870,23 +881,24 @@ def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars + can_be_fused_with_prev = False prev_nodes = self.get_previous_nodes(node_id) - first_pred_node, first_pred_cmd_inv = self.get_first_previous_node_and_first_previous_cmd_invocation(prev_nodes) + if len(prev_nodes) == 1: + first_pred_node, first_pred_cmd_inv = \ + self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) + if first_pred_cmd_inv.is_aggregator_concatenate(): + can_be_fused_with_prev = True # remove node to be parallelized self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph - # TODO: change first check to first_pred_node and not cmd_inv - if len(prev_nodes) == 1 and first_pred_cmd_inv.is_aggregator_concatenate(): - # can be fused + if can_be_fused_with_prev: self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None in_mapper_ids = first_pred_cmd_inv.operand_list else: # cannot be fused so introduce splitter # splitter - consec_chunks_splitter_generator = lambda input_id, output_ids: pash_split.make_split_file(input_id, - output_ids) - out_split_ids = self.introduce_splitter(consec_chunks_splitter_generator, fan_out, fileIdGen, - streaming_input) + consec_chunks_splitter_generator = lambda input_id, output_ids: pash_split.make_split_file(input_id, output_ids) + out_split_ids = self.introduce_splitter(consec_chunks_splitter_generator, fan_out, fileIdGen, streaming_input) in_mapper_ids = out_split_ids # mappers @@ -900,9 +912,9 @@ def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, streaming_output) - def get_first_previous_node_and_first_previous_cmd_invocation(self, prev_nodes): - assert (len(prev_nodes) > 0) + def get_only_previous_node_and_only_previous_cmd_invocation(self, prev_nodes): # get info about first one but also ensure that it is the only one if we fuse + assert len(prev_nodes) == 1 first_pred_id = prev_nodes[0] first_pred_node = self.get_node(first_pred_id) first_pred_cmd_inv = first_pred_node.cmd_invocation_with_io_vars diff --git a/evaluation/tests/shortest_scripts.sh b/evaluation/tests/shortest_scripts.sh index 0d3913119..7321d775e 100644 --- a/evaluation/tests/shortest_scripts.sh +++ b/evaluation/tests/shortest_scripts.sh @@ -4,4 +4,6 @@ # +p.95 multiple sed # +p.XX crawler -cat $IN | xargs file | grep "shell script" | cut -d: -f1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -15 +# cut -d: -f1 -> cut -d : -f 1; as parser recognizes option arguments only if given with whitespace +# head -15 -> head -n 15; not documented in man page +cat $IN | xargs file | grep "shell script" | cut -d : -f 1 | xargs -L 1 wc 
-l | grep -v '^0$' | sort -n | head -n 15 From a75378cd9b4f2039d2b5155fe79f98d2ef3c3d06 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Mon, 25 Jul 2022 13:44:53 -0400 Subject: [PATCH 20/64] Add support for bigrams (#614) Signed-off-by: Felix Stutz --- compiler/definitions/ir/nodes/cat.py | 11 ++-- compiler/ir.py | 95 +++++++++++++++++++--------- 2 files changed, 70 insertions(+), 36 deletions(-) diff --git a/compiler/definitions/ir/nodes/cat.py b/compiler/definitions/ir/nodes/cat.py index a27b89f4f..28df3920e 100644 --- a/compiler/definitions/ir/nodes/cat.py +++ b/compiler/definitions/ir/nodes/cat.py @@ -1,4 +1,5 @@ -from definitions.ir.dfg_node import * +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from definitions.ir.dfg_node import DFGNode class Cat(DFGNode): def __init__(self, inputs, outputs, com_name, com_category, @@ -32,9 +33,5 @@ def __init__(self, inputs, outputs, com_name, com_category, ) def make_cat_node(inputs, output): - com_name = Arg(string_to_argument("cat")) - com_category = "stateless" - return Cat(inputs, - [output], - com_name, - com_category) + cmd_inv_cat = CommandInvocationWithIOVars.make_cat_command_invocation_with_io_vars(inputs, output) + return DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(cmd_inv_cat) diff --git a/compiler/ir.py b/compiler/ir.py index 0816a663f..49bd57a91 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -4,6 +4,7 @@ from datatypes_new.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo from annotation_generation_new.datatypes.InputOutputInfo import InputOutputInfo from annotation_generation_new.datatypes.ParallelizabilityInfo import ParallelizabilityInfo +from annotation_generation_new.datatypes.CommandProperties import CommandProperties from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars from annotations_utils.util_parsing import parse_arg_list_to_command_invocation @@ -184,9 +185,9 @@ def compile_command_to_DFG(fileIdGen, command, options, para_info: ParallelizabilityInfo = get_parallelizability_info_from_cmd_invocation_util(command_invocation) command_invocation_with_io = io_info.apply_input_output_info_to_command_invocation(command_invocation) parallelizer_list, round_robin_compatible_with_cat, is_commutative = para_info.unpack_info() - property_list = [('round_robin_compatible_with_cat', round_robin_compatible_with_cat), - ('is_commutative', is_commutative)] - cmd_related_properties = construct_property_container_from_list_of_properties(property_list) + property_dict = [{'round_robin_compatible_with_cat': round_robin_compatible_with_cat, + 'is_commutative': is_commutative}] + cmd_related_properties = CommandProperties(property_dict) ## TODO: Make an empty IR and add edges and nodes incrementally (using the methods defined in IR). 
@@ -824,6 +825,7 @@ def apply_round_robin_parallelization_to_node(self, node_id, parallelizer, fileI # mappers out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, parallelizer) + out_mapper_ids = [out_ids[0] for out_ids in out_mapper_ids] # since we get list of list back for potential aux info # aggregator self.introduce_aggregator_for_round_robin(out_mapper_ids, parallelizer, streaming_output) @@ -913,6 +915,7 @@ def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer streaming_output) def get_only_previous_node_and_only_previous_cmd_invocation(self, prev_nodes): + assert (len(prev_nodes) > 0) # get info about first one but also ensure that it is the only one if we fuse assert len(prev_nodes) == 1 first_pred_id = prev_nodes[0] @@ -930,16 +933,26 @@ def introduce_splitter(self, splitter_generator, fan_out, fileIdGen, streaming_i return out_split_ids def introduce_mappers(self, fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, parallelizer): - out_mapper_ids = self.generate_ephemeral_edges(fileIdGen, fan_out) + # -> [[input, aux1, aux2], [...], [...], ...] + num_aux_mapper_to_aggregator = parallelizer.info_mapper_aggregator + out_mapper_ids = [] + for _ in range(0,fan_out): + out_mapper_ids.append(self.generate_ephemeral_edges(fileIdGen, num_aux_mapper_to_aggregator+1)) + # TODO: Fix that we use different ones here! + # list of output, aux_output_1, aux_output_2, ... zip_mapper_in_out_ids = zip(in_mapper_ids, out_mapper_ids) all_mappers = [] - for (in_id, out_id) in zip_mapper_in_out_ids: + for (in_id, out_ids) in zip_mapper_in_out_ids: # BEGIN: these 4 lines could be refactored to be a function in graph such that # creating end point of edges and the creation of edges is not decoupled - mapper_cmd_inv = parallelizer.get_actual_mapper(original_cmd_invocation_with_io_vars, in_id, out_id) + out_id = out_ids[0] + aux_out_ids = out_ids[1:] + mapper_cmd_inv = parallelizer.get_actual_mapper(original_cmd_invocation_with_io_vars, in_id, out_id, aux_out_ids) mapper = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(mapper_cmd_inv) self.set_edge_to(in_id, mapper.get_id()) self.set_edge_from(out_id, mapper.get_id()) + for aux_out_id in aux_out_ids: + self.set_edge_from(aux_out_id, mapper.get_id()) # END splitter = parallelizer.get_splitter() if splitter.is_splitter_round_robin(): @@ -966,25 +979,34 @@ def introduce_unwraps(self, fileIdGen, in_unwrap_ids): def introduce_aggregators_for_consec_chunks(self, fileIdGen, in_aggregator_ids, original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, streaming_output): - aggregator_spec = parallelizer.get_aggregator_spec() - if aggregator_spec.is_aggregator_spec_concatenate() or aggregator_spec.is_aggregator_spec_custom_n_ary(): - aggregator_cmd_inv = parallelizer.get_actual_aggregator(original_cmd_invocation_with_io_vars, - in_aggregator_ids, out_aggregator_id) - aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) - for in_aggregator_id in in_aggregator_ids: - self.set_edge_to(in_aggregator_id, aggregator.get_id()) - self.set_edge_from(streaming_output, aggregator.get_id()) - all_aggregators = [aggregator] - ## Add the merge commands in the graph - for new_node in all_aggregators: - self.add_node(new_node) - elif aggregator_spec.is_aggregator_spec_custom_2_ary(): - # TODO: we simplify and assume that every mapper produces a single output for now - map_in_aggregator_ids = [[id] for id in 
in_aggregator_ids] - # TODO: turn node into cmd_invocation_with_io_vars since this is the only thing required in this function - self.create_generic_aggregator_tree(original_cmd_invocation_with_io_vars, parallelizer, map_in_aggregator_ids, out_aggregator_id, fileIdGen) - else: - raise Exception("aggregator kind not yet implemented") + # in_aggregator_ids: [[input, aux1, aux2, ...], [...], [...], ...] + if parallelizer.info_mapper_aggregator == 0: + in_aggregator_ids = [in_ids[0] for in_ids in in_aggregator_ids] # since we get list of list back for potential aux info + aggregator_spec = parallelizer.get_aggregator_spec() + if aggregator_spec.is_aggregator_spec_concatenate() or aggregator_spec.is_aggregator_spec_custom_n_ary(): + aggregator_cmd_inv = parallelizer.get_actual_aggregator(original_cmd_invocation_with_io_vars, + in_aggregator_ids, out_aggregator_id) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + for in_aggregator_id in in_aggregator_ids: + self.set_edge_to(in_aggregator_id, aggregator.get_id()) + self.set_edge_from(streaming_output, aggregator.get_id()) + all_aggregators = [aggregator] + ## Add the merge commands in the graph + for new_node in all_aggregators: + self.add_node(new_node) + elif aggregator_spec.is_aggregator_spec_custom_2_ary(): + # TODO: we simplify and assume that every mapper produces a single output for now + map_in_aggregator_ids = [[id] for id in in_aggregator_ids] + # TODO: turn node into cmd_invocation_with_io_vars since this is the only thing required in this function + self.create_generic_aggregator_tree(original_cmd_invocation_with_io_vars, parallelizer, map_in_aggregator_ids, out_aggregator_id, fileIdGen) + else: + raise Exception("aggregator kind not yet implemented") + else: # we got auxiliary information + assert(parallelizer.core_aggregator_spec.is_aggregator_spec_custom_2_ary()) + map_in_aggregator_ids = in_aggregator_ids + self.create_generic_aggregator_tree(original_cmd_invocation_with_io_vars, parallelizer, + map_in_aggregator_ids, out_aggregator_id, fileIdGen) + def introduce_aggregator_for_round_robin(self, out_mapper_ids, parallelizer, streaming_output): aggregator_spec = parallelizer.get_aggregator_spec() @@ -1338,10 +1360,25 @@ def valid(self): ## This is a function that creates a reduce tree for a given node def create_generic_aggregator_tree(self, cmd_invocation_with_io_vars, parallelizer, input_ids_for_aggregators, out_aggregator_id, fileIdGen): def function_to_get_binary_aggregator(in_ids, out_ids): - assert(len(out_ids) == 1) - aggregator_cmd_inv = parallelizer.get_actual_aggregator(cmd_invocation_with_io_vars, in_ids, out_ids[0]) - aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) - return aggregator + if len(out_ids) == 1: + aggregator_cmd_inv = parallelizer.get_actual_aggregator(cmd_invocation_with_io_vars, in_ids, out_ids[0]) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + return aggregator + else: + # list has been flattened ... 
+ num_input_ids = len(in_ids) + assert(num_input_ids % 2 == 0) + fst_normal_input = in_ids[0] + fst_aux_inputs_from = in_ids[1:int(num_input_ids/2)] + snd_normal_input = in_ids[int(num_input_ids/2)] + snd_aux_inputs_from = in_ids[int(num_input_ids/2)+1:] + output_to = out_ids[0] + aux_outputs_to = out_ids[1:] + aggregator_cmd_inv = parallelizer.get_actual_2_ary_aggregator_with_aux( + fst_normal_input, fst_aux_inputs_from, snd_normal_input, snd_aux_inputs_from, + output_to, aux_outputs_to) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + return aggregator ## The Aggregator node takes a sequence of input ids and an output id all_aggregators, new_edges, final_output_id = self.create_reduce_tree(lambda in_ids, out_ids: function_to_get_binary_aggregator(in_ids, out_ids), input_ids_for_aggregators, fileIdGen) From 3f79626bbe45f3831fa40b8fab45b771f226d265 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Mon, 25 Jul 2022 16:46:38 -0400 Subject: [PATCH 21/64] Refactor to have defaults for AnnotationInfo (#615) Signed-off-by: Felix Stutz --- compiler/ir.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler/ir.py b/compiler/ir.py index 49bd57a91..c7c1afea4 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -183,6 +183,8 @@ def compile_command_to_DFG(fileIdGen, command, options, if io_info.has_other_outputs(): raise Exception(f"Command {format_arg_chars(command)} has outputs other than streaming.") para_info: ParallelizabilityInfo = get_parallelizability_info_from_cmd_invocation_util(command_invocation) + if para_info is None: + para_info = ParallelizabilityInfo() # defaults to no parallelizer's and all properties False command_invocation_with_io = io_info.apply_input_output_info_to_command_invocation(command_invocation) parallelizer_list, round_robin_compatible_with_cat, is_commutative = para_info.unpack_info() property_dict = [{'round_robin_compatible_with_cat': round_robin_compatible_with_cat, From b5bd70bcf2194964ebc3d35d8785c00d71a0e038 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Tue, 30 Aug 2022 05:33:33 -0400 Subject: [PATCH 22/64] Minor changes due to typing in annotations repository (#622) Signed-off-by: Felix Stutz --- compiler/ir.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/compiler/ir.py b/compiler/ir.py index c7c1afea4..0ca5d8453 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -1,7 +1,8 @@ import sys from datatypes_new.CommandInvocationInitial import CommandInvocationInitial -from datatypes_new.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo +from datatypes_new.BasicDatatypes import ArgStringType +from datatypes_new.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo, OptionWithIO from annotation_generation_new.datatypes.InputOutputInfo import InputOutputInfo from annotation_generation_new.datatypes.ParallelizabilityInfo import ParallelizabilityInfo from annotation_generation_new.datatypes.CommandProperties import CommandProperties @@ -139,6 +140,7 @@ def add_file_id_vars(command_invocation_with_io, fileIdGen): # make pass over everything and create file_id for everything # only for operands for now: dfg_edges = {} + new_flagoption_list = [] new_operand_list = [] access_map = dict() @@ -150,6 +152,15 @@ def add_var_for_descriptor(operand): access_map[fid_id] = operand.get_access() return fid_id + for i in range(len(command_invocation_with_io.flag_option_list)): + flagoption = command_invocation_with_io.flag_option_list[i] + if 
isinstance(flagoption, OptionWithIO) and not isinstance(flagoption.option_arg, ArgStringType): + fid_id = add_var_for_descriptor(flagoption.option_arg) + new_option = OptionWithIOVar(flagoption.name, fid_id) + new_flagoption_list.append(new_option) + else: # Flag + new_flagoption_list.append(flagoption) + for i in range(len(command_invocation_with_io.operand_list)): operand = command_invocation_with_io.operand_list[i] if isinstance(operand, FileNameWithIOInfo) or isinstance(operand, StdDescriptorWithIOInfo): @@ -166,11 +177,12 @@ def add_var_for_descriptor(operand): else: new_implicit_use_of_streaming_output = None - # this shall become copy-based - command_invocation_with_io_vars = CommandInvocationWithIOVars.get_from_without_vars(command_invocation_with_io, access_map) - command_invocation_with_io_vars.operand_list = new_operand_list - command_invocation_with_io_vars.implicit_use_of_streaming_input = new_implicit_use_of_streaming_input - command_invocation_with_io_vars.implicit_use_of_streaming_output = new_implicit_use_of_streaming_output + command_invocation_with_io_vars = CommandInvocationWithIOVars(cmd_name=command_invocation_with_io.cmd_name, + flag_option_list=new_flagoption_list, + operand_list=new_operand_list, + implicit_use_of_streaming_input=new_implicit_use_of_streaming_input, + implicit_use_of_streaming_output=new_implicit_use_of_streaming_output, + access_map=access_map) return command_invocation_with_io_vars, dfg_edges @@ -186,6 +198,8 @@ def compile_command_to_DFG(fileIdGen, command, options, if para_info is None: para_info = ParallelizabilityInfo() # defaults to no parallelizer's and all properties False command_invocation_with_io = io_info.apply_input_output_info_to_command_invocation(command_invocation) + if para_info is None: + para_info = ParallelizabilityInfo() # defaults to no parallelizer's and all properties False parallelizer_list, round_robin_compatible_with_cat, is_commutative = para_info.unpack_info() property_dict = [{'round_robin_compatible_with_cat': round_robin_compatible_with_cat, 'is_commutative': is_commutative}] From 1ee23bc83e554eac7a9be58e1fad1f0b3a4849a9 Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Mon, 5 Sep 2022 12:24:23 -0400 Subject: [PATCH 23/64] revert setup merge issue Signed-off-by: Konstantinos Kallas --- scripts/setup-pash.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/setup-pash.sh b/scripts/setup-pash.sh index 2668b5188..8e04219ec 100755 --- a/scripts/setup-pash.sh +++ b/scripts/setup-pash.sh @@ -28,6 +28,10 @@ python3 -m pip install matplotlib --root $PYTHON_PKG_DIR --ignore-installed #&> # TODO 2022-08-01 if libdash wheel isn't available, we need autmake etc. python3 -m pip install libdash --root $PYTHON_PKG_DIR --ignore-installed #&> $LOG_DIR/pip_install_libdash.log +## TODO: Fix a specific version somehow, maybe commit? 
+git clone https://github.com/binpash/annotations.git ./annotations_repo +python3 -m pip install ./annotations_repo --root $PYTHON_PKG_DIR --ignore-installed #&> $LOG_DIR/pip_install_annotations.log + # clean the python packages cd $PYTHON_PKG_DIR # can we find a better alternative to that From 92364f6a2ba4adf57c216a9e2dfbbf338d704472 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Thu, 8 Sep 2022 13:56:01 +0200 Subject: [PATCH 24/64] Remove old annotations and move them to annotations repository Signed-off-by: Felix Stutz --- annotations/README.md | 222 ------- annotations/alt_bigram_aux_reduce.json | 12 - annotations/alt_bigrams_aux.json | 17 - annotations/auto-split.json | 12 - annotations/awk.json | 12 - annotations/bc.json | 12 - annotations/bigram_aux_map.json | 12 - annotations/bigram_aux_reduce.json | 12 - annotations/bigrams_aux.json | 23 - annotations/c_stats/README.md | 15 - annotations/c_stats/builtins.txt | 108 ---- annotations/c_stats/coreutils.txt | 106 ---- annotations/c_stats/linux_command_list.txt | 667 --------------------- annotations/c_stats/output.txt | 667 --------------------- annotations/c_stats/plan9.txt | 180 ------ annotations/c_stats/pman.txt | 667 --------------------- annotations/c_stats/posix.txt | 160 ----- annotations/cat.json | 37 -- annotations/chmod.json | 12 - annotations/col.json | 12 - annotations/comm.json | 13 - annotations/convert.json | 13 - annotations/custom_aggregators/cat.py.json | 12 - annotations/custom_aggregators/concat.json | 12 - annotations/custom_sort.json | 29 - annotations/custom_tr.json | 12 - annotations/cut.json | 33 - annotations/date.json | 12 - annotations/dd.json | 12 - annotations/dfs_split_reader.json | 12 - annotations/dgsh_tee.json | 12 - annotations/diff.json | 12 - annotations/eager-no-task-par.json | 12 - annotations/eager.json | 12 - annotations/echo.json | 12 - annotations/export.json | 12 - annotations/extract_text.json | 13 - annotations/ffmpeg.json | 13 - annotations/find.json | 12 - annotations/fmt.json | 22 - annotations/grep.json | 47 -- annotations/groff.json | 12 - annotations/gunzip.json | 12 - annotations/gzip.json | 14 - annotations/hdfs.json | 23 - annotations/head.json | 13 - annotations/history.json | 12 - annotations/iconv.json | 12 - annotations/jobs.json | 12 - annotations/ls.json | 12 - annotations/mkfifo.json | 12 - annotations/mktemp.json | 12 - annotations/multiply.json | 12 - annotations/nc.json | 12 - annotations/nl.json | 13 - annotations/notes.md | 30 - annotations/p_stats/README.md | 39 -- annotations/p_stats/coreutils-summary.txt | 104 ---- annotations/p_stats/coreutils.txt | 112 ---- annotations/p_stats/get-summary.sh | 14 - annotations/p_stats/mr.md | 19 - annotations/p_stats/output.txt | 667 --------------------- annotations/p_stats/posix-summary.txt | 94 --- annotations/p_stats/posix_mandatory1.txt | 57 -- annotations/p_stats/posix_mandatory2.txt | 37 -- annotations/p_stats/statistics.sh | 27 - annotations/package_build_aux.json | 12 - annotations/pandoc.json | 12 - annotations/paste.json | 13 - annotations/pr.json | 36 -- annotations/process_bio_s_line.json | 13 - annotations/ps.json | 12 - annotations/pwd.json | 12 - annotations/r_merge.json | 13 - annotations/r_split.json | 12 - annotations/r_unwrap.json | 12 - annotations/r_wrap.json | 12 - annotations/read.json | 12 - annotations/readelf.json | 12 - annotations/remote_read.json | 12 - annotations/remote_write.json | 12 - annotations/resize.json | 13 - annotations/rev.json | 12 - annotations/rm.json | 12 - 
annotations/run_tests.json | 12 - annotations/sed.json | 13 - annotations/seq.json | 12 - annotations/set.json | 12 - annotations/set_diff.json | 16 - annotations/sha256sum.json | 12 - annotations/shuf.json | 12 - annotations/sort.json | 30 - annotations/split.json | 12 - annotations/stem-words.json | 13 - annotations/tac.json | 14 - annotations/tail.json | 13 - annotations/tee.json | 12 - annotations/test_one.json | 16 - annotations/test_two.json | 17 - annotations/test_uniq_1.json | 17 - annotations/test_uniq_2.json | 18 - annotations/tr.json | 37 -- annotations/trigrams_aux.json | 13 - annotations/uniq.json | 14 - annotations/wc.json | 13 - annotations/xargs.json | 26 - annotations/xxd.json | 12 - 107 files changed, 5297 deletions(-) delete mode 100644 annotations/README.md delete mode 100644 annotations/alt_bigram_aux_reduce.json delete mode 100644 annotations/alt_bigrams_aux.json delete mode 100644 annotations/auto-split.json delete mode 100644 annotations/awk.json delete mode 100644 annotations/bc.json delete mode 100644 annotations/bigram_aux_map.json delete mode 100644 annotations/bigram_aux_reduce.json delete mode 100644 annotations/bigrams_aux.json delete mode 100644 annotations/c_stats/README.md delete mode 100644 annotations/c_stats/builtins.txt delete mode 100644 annotations/c_stats/coreutils.txt delete mode 100644 annotations/c_stats/linux_command_list.txt delete mode 100644 annotations/c_stats/output.txt delete mode 100644 annotations/c_stats/plan9.txt delete mode 100644 annotations/c_stats/pman.txt delete mode 100644 annotations/c_stats/posix.txt delete mode 100644 annotations/cat.json delete mode 100644 annotations/chmod.json delete mode 100644 annotations/col.json delete mode 100644 annotations/comm.json delete mode 100644 annotations/convert.json delete mode 100644 annotations/custom_aggregators/cat.py.json delete mode 100644 annotations/custom_aggregators/concat.json delete mode 100644 annotations/custom_sort.json delete mode 100644 annotations/custom_tr.json delete mode 100644 annotations/cut.json delete mode 100644 annotations/date.json delete mode 100644 annotations/dd.json delete mode 100644 annotations/dfs_split_reader.json delete mode 100644 annotations/dgsh_tee.json delete mode 100644 annotations/diff.json delete mode 100644 annotations/eager-no-task-par.json delete mode 100644 annotations/eager.json delete mode 100644 annotations/echo.json delete mode 100644 annotations/export.json delete mode 100644 annotations/extract_text.json delete mode 100644 annotations/ffmpeg.json delete mode 100644 annotations/find.json delete mode 100644 annotations/fmt.json delete mode 100644 annotations/grep.json delete mode 100644 annotations/groff.json delete mode 100644 annotations/gunzip.json delete mode 100644 annotations/gzip.json delete mode 100644 annotations/hdfs.json delete mode 100644 annotations/head.json delete mode 100644 annotations/history.json delete mode 100644 annotations/iconv.json delete mode 100644 annotations/jobs.json delete mode 100644 annotations/ls.json delete mode 100644 annotations/mkfifo.json delete mode 100644 annotations/mktemp.json delete mode 100644 annotations/multiply.json delete mode 100644 annotations/nc.json delete mode 100644 annotations/nl.json delete mode 100644 annotations/notes.md delete mode 100644 annotations/p_stats/README.md delete mode 100644 annotations/p_stats/coreutils-summary.txt delete mode 100644 annotations/p_stats/coreutils.txt delete mode 100755 annotations/p_stats/get-summary.sh delete mode 100644 annotations/p_stats/mr.md 
 delete mode 100644 annotations/p_stats/output.txt
 delete mode 100644 annotations/p_stats/posix-summary.txt
 delete mode 100644 annotations/p_stats/posix_mandatory1.txt
 delete mode 100644 annotations/p_stats/posix_mandatory2.txt
 delete mode 100755 annotations/p_stats/statistics.sh
 delete mode 100644 annotations/package_build_aux.json
 delete mode 100644 annotations/pandoc.json
 delete mode 100644 annotations/paste.json
 delete mode 100644 annotations/pr.json
 delete mode 100644 annotations/process_bio_s_line.json
 delete mode 100644 annotations/ps.json
 delete mode 100644 annotations/pwd.json
 delete mode 100644 annotations/r_merge.json
 delete mode 100644 annotations/r_split.json
 delete mode 100644 annotations/r_unwrap.json
 delete mode 100644 annotations/r_wrap.json
 delete mode 100644 annotations/read.json
 delete mode 100644 annotations/readelf.json
 delete mode 100644 annotations/remote_read.json
 delete mode 100644 annotations/remote_write.json
 delete mode 100644 annotations/resize.json
 delete mode 100644 annotations/rev.json
 delete mode 100644 annotations/rm.json
 delete mode 100644 annotations/run_tests.json
 delete mode 100644 annotations/sed.json
 delete mode 100644 annotations/seq.json
 delete mode 100644 annotations/set.json
 delete mode 100644 annotations/set_diff.json
 delete mode 100644 annotations/sha256sum.json
 delete mode 100644 annotations/shuf.json
 delete mode 100644 annotations/sort.json
 delete mode 100644 annotations/split.json
 delete mode 100644 annotations/stem-words.json
 delete mode 100644 annotations/tac.json
 delete mode 100644 annotations/tail.json
 delete mode 100644 annotations/tee.json
 delete mode 100644 annotations/test_one.json
 delete mode 100644 annotations/test_two.json
 delete mode 100644 annotations/test_uniq_1.json
 delete mode 100644 annotations/test_uniq_2.json
 delete mode 100644 annotations/tr.json
 delete mode 100644 annotations/trigrams_aux.json
 delete mode 100644 annotations/uniq.json
 delete mode 100644 annotations/wc.json
 delete mode 100644 annotations/xargs.json
 delete mode 100644 annotations/xxd.json

diff --git a/annotations/README.md b/annotations/README.md
deleted file mode 100644
index be3578524..000000000
--- a/annotations/README.md
+++ /dev/null
@@ -1,222 +0,0 @@
-# Parallelizability Study & Annotation Language
-Quick Jump: [Parallelizability](#main-parallelizability-classes) | [study](#parallelizability-study-of-commands-in-gnu--posix) | [example 1](#a-simple-example-chmod) | [example 2](#another-example-cut) | [howto](#how-to-annotate-a-command) | [issues](#Issues)
-
-PaSh includes
-  (i) a parallelizability study of commands in POSIX and GNU Coreutils, and
-  (ii) an annotation language for describing the parallelizability properties of individual commands.
-The parallelizability study informed the design of the annotation language, which was in turn used to capture the key parallelizability characteristics of many of these commands.
-
-> _N.b.: We welcome contributions to the study and annotations for common commands._
-
-## Main Parallelizability Classes
-
-PaSh introduces four major parallelizability classes:
-
-* _Stateless Commands:_
-The first class, `stateless`, contains commands that operate on individual line elements of their input, without maintaining state across invocations.
-These are commands that can be expressed as a purely functional `map` or `filter` -- _e.g.,_ `grep` filters out individual lines and `basename` removes a path prefix from a string.
-They may produce multiple elements -- _e.g.,_ `tr` may insert `NL` tokens -- but always return empty output for empty input.
-Workloads that use only stateless commands are trivial to parallelize:
-  they do not require any synchronization to maintain correctness, nor caution about where to split inputs.
-
-* _Parallelizable Pure Commands:_
-The second class, `parallelizable_pure`, contains commands that respect functional purity -- _i.e.,_ same outputs for same inputs -- but maintain internal state across their entire pass.
-The details of this state and its propagation during element processing affect their parallelizability characteristics.
-Some commands are easy to parallelize because they maintain trivial state and are commutative -- _e.g.,_ `wc` simply maintains a counter.
-Other commands, such as `sort`, maintain more complex invariants that have to be taken into account when merging partial results.
-
-* _Non-parallelizable Pure Commands:_
-The third class, `pure`, contains commands that, while purely functional, cannot be parallelized within a single data stream.
-This is because their internal state depends on prior state in non-trivial ways over the same pass.
-For example, hashing commands such as `sha1sum` maintain complex state that has to be updated sequentially.
-If parallelized on a single input, each stage would need to wait on the results of all previous stages, forgoing any parallelism benefits.
-
-* _Side-effectful Commands:_
-The last class, `side-effectful`, contains commands that have side-effects across the system -- for example, updating environment variables, interacting with the filesystem, and accessing the network.
-Such commands are not parallelizable without finer-grained concurrency control mechanisms that can detect side-effects across the system.
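To build intuition for the first two classes, the following minimal hand-written sketch (plain shell, not PaSh's actual runtime machinery; file names and the two-way split are illustrative only) shows how a stateless command can be parallelized by splitting its input, and why a `parallelizable_pure` command additionally needs an aggregation step:

```sh
# Split the input into two line-aligned chunks (GNU split).
split -n l/2 input.txt chunk_            # produces chunk_aa and chunk_ab

# Stateless (e.g., grep): run on each chunk independently and concatenate
# the partial outputs in input order -- equivalent to the sequential run.
grep 'pattern' chunk_aa > out_aa &
grep 'pattern' chunk_ab > out_ab &
wait
cat out_aa out_ab > out.txt

# Parallelizable pure (e.g., wc -l): partial results must be combined by an
# aggregator -- here, summing the per-chunk line counts.
{ wc -l < chunk_aa & wc -l < chunk_ab & wait; } | paste -sd+ - | bc
```

In PaSh itself the split degree and the aggregators are not hard-coded like this; they are derived from the annotations described below.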
-
-## Parallelizability Study of Commands in GNU & POSIX
-
-The parallelizability study of commands in GNU and POSIX comprises two parts: a coarse-grained parallelizability study, and a set of annotations for commands.
-
-The main results of the parallelizability study are summarized in the [PaSh EuroSys'21 paper (Sec. 3.1 and Tab. 1)](https://arxiv.org/pdf/2007.09436.pdf).
-To see the results of the parallelizability study, run [./p_stats](./p_stats).
-
-Annotations for about 60 popular commands are stored in this directory encoded as JSON files (about 14 lines per annotation on average, for a total of 846 lines of annotations).
-Annotations can be thought of as defining a bidirectional correspondence between a command and a node in the dataflow graph---the abstraction used by the PaSh compiler.
-Since command behaviors (and correspondence) can change based on their arguments, annotations contain a sequence of predicates.
-Each predicate is accompanied by information that instantiates the correspondence between a command and a dataflow node.
-
-## A Simple Example: `chmod`
-
-As a first example, below we present the annotations for `chmod`.
-
-```json
-{
-  "command": "chmod",
-  "cases": [
-    {
-      "predicate": "default",
-      "class": "side-effectful"
-    }
-  ]
-}
-```
-
-The annotation for `chmod` is very simple, since it only needs to establish that `chmod` is side-effectful and therefore cannot be translated to a dataflow node.
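Annotations for commands in the other classes follow the same structure. As a rough sketch -- the actual `wc.json` in this directory may differ in its details (for instance, it may also name an aggregator for merging partial counts) -- an annotation marking `wc` as `parallelizable_pure` in the default case could look roughly like this:

```json
{
  "command": "wc",
  "cases": [
    {
      "predicate": "default",
      "class": "parallelizable_pure",
      "inputs": [ "args[:]" ],
      "outputs": [ "stdout" ]
    }
  ],
  "options": [ "stdin-hyphen", "empty-args-stdin" ]
}
```

The more interesting annotations are those for commands whose class depends on their flags, as in the next example.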
-
-## Another Example: `cut`
-
-As another example, below we present the annotations for `cut`.
-
-```json
-{
-  "command": "cut",
-  "cases": [
-    {
-      "predicate": {
-        "operator": "or",
-        "operands": [
-          {
-            "operator": "val_opt_eq",
-            "operands": [
-              "-d",
-              "\n"
-            ]
-          },
-          {
-            "operator": "exists",
-            "operands": [
-              "-z"
-            ]
-          }
-        ]
-      },
-      "class": "pure",
-      "inputs": [
-        "args[:]"
-      ],
-      "outputs": [
-        "stdout"
-      ]
-    },
-    {
-      "predicate": "default",
-      "class": "stateless",
-      "inputs": [
-        "args[:]"
-      ],
-      "outputs": [
-        "stdout"
-      ]
-    }
-  ],
-  "options": [
-    "stdin-hyphen",
-    "empty-args-stdin"
-  ],
-  "short-long": [
-    {
-      "short": "-d",
-      "long": "--delimiter"
-    },
-    {
-      "short": "-z",
-      "long": "--zero-terminated"
-    }
-  ]
-}
-```
-
-The annotation for `cut` has two cases, each of which consists of a predicate on its arguments, and then an assignment of its parallelizability class, inputs, and outputs.
-The first predicate indicates that `cut` is "pure" -- _i.e._, not parallelizable but representable as a dataflow node -- if the value accompanying the `-d` option is `\n` or if it was used with the `-z` flag.
-In both of these cases, newlines do not represent data item boundaries, but are rather used internally by the command, making it unsafe to parallelize by splitting on line boundaries.
-In all other cases (see the "default" case) the command is stateless.
-Inputs are always assigned to the non-option arguments and the output is always stdout.
-The option "stdin-hyphen" indicates that a non-option argument that is just a dash `-` represents stdin, and the option "empty-args-stdin" indicates that if the non-option arguments are empty, then the command reads from stdin.
-The list identified by "short-long" contains a correspondence of short and long argument names for this command.
-
-## How to Annotate a Command
-
-The first step to annotating a command is to identify its default class: `stateless`, `parallelizable_pure`, `pure`, or `side-effectful`. How does the command behave without any inputs?
-The next step is to identify the set of inputs and their order.
-
-This process then has to be repeated for every set of arguments, which have to be expressed as first-order-logic predicates (see the examples above).
-This can be (and currently is) achieved in an incremental fashion: a few flags at a time.
-
-For more details, here is an early version of the annotation language:
-
-```