From 613d090b3c46c36e4e503ed6463201f87bc09378 Mon Sep 17 00:00:00 2001
From: Felix Stutz
Date: Mon, 27 Jun 2022 13:38:38 -0400
Subject: [PATCH 01/64] First PR for new annotations and new model for DFG nodes (#583)

* WIP: first changes for new annotations

* Make parallelization use mapper info from new annotations

* Make parallelization use mapper info from new annotations

* Delete com_mapper field in dfg_node and use the info from new annotations

* Use input info from new annotations

* Use output info from new annotations

* Make dfg_options use the information from new annotations and completely remove use of old aggregator com_aggregator

* Make to_ast work for wf.sh with some hacks, e.g. to_ast for eager to handle special case of intermediate file as last operand and fixed parsing issue for newline

* Remove com_mapper and com_aggregator from DFGNode

* WIP: incorporating remodelled command invocations

* Parsing with new dataflow node model works

* WIP: parallelization

* Rudimentary parallelization with new annotations works

* Add way to specify where to find repository for annotations repository

* Do not require flag `r_split` since we do consecutive chunks for now

* 1st part of changes due to comments for PR

* minor fix

* 2nd part of changes due to comments for PR

Co-authored-by: Felix Stutz
---
 README.md                                     |   6 +
 TODO.md                                       |  10 +
 compiler/annotations.py                       |   1 +
 compiler/annotations_utils/util_aggregator.py |  59 ++++
 .../annotations_utils/util_cmd_invocations.py |  96 ++++++
 .../util_file_descriptors.py                  |  21 ++
 compiler/annotations_utils/util_mapper.py     |  97 ++++++
 compiler/annotations_utils/util_parsing.py    |  92 ++++++
 compiler/config.py                            |   9 +
 compiler/definitions/ir/aggregator_node.py    |  58 +++-
 compiler/definitions/ir/arg.py                |   7 +
 compiler/definitions/ir/dfg_node.py           | 243 +++++++--------
 compiler/definitions/ir/nodes/cat.py          |  29 +-
 compiler/definitions/ir/nodes/eager.py        | 106 ++++++-
 compiler/definitions/ir/nodes/pash_split.py   |  40 ++-
 compiler/definitions/ir/nodes/r_split.py      |  49 ++-
 compiler/definitions/ir/resource.py           |   1 +
 compiler/ir.py                                | 291 ++++++++++++------
 compiler/pash_runtime.py                      | 133 ++++----
 compiler/util.py                              |  15 +
 20 files changed, 1013 insertions(+), 350 deletions(-)
 create mode 100644 TODO.md
 create mode 100644 compiler/annotations_utils/util_aggregator.py
 create mode 100644 compiler/annotations_utils/util_cmd_invocations.py
 create mode 100644 compiler/annotations_utils/util_file_descriptors.py
 create mode 100644 compiler/annotations_utils/util_mapper.py
 create mode 100644 compiler/annotations_utils/util_parsing.py

diff --git a/README.md b/README.md
index fa5ab805e..ff9ce1d2d 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,11 @@
 ## PaSh: Light-touch Data-Parallel Shell Processing
+**TODO before testing new annotations (temporary fix):**
+
+Connect the new annotations repository to PaSh in the `future_annotations`-branch:
+- clone the `connect_to_pash` branch from the new repository for annotations: git@github.com:binpash/annotations.git
+- Specify the path in `compiler/config.py`
+
 
 > _A system for parallelizing POSIX shell scripts._
 > _Hosted by the [Linux Foundation](https://linuxfoundation.org/press-release/linux-foundation-to-host-the-pash-project-accelerating-shell-scripting-with-automated-parallelization-for-industrial-use-cases/)._
 
diff --git a/TODO.md b/TODO.md
new file mode 100644
index 000000000..67b55f0ea
--- /dev/null
+++ b/TODO.md
@@ -0,0 +1,10 @@
+## TODOs before merging to `future`
+
+- eager
+- aggregation trees
+- r_split
+- cat-split fusion
+- working on all tests
+- Adding annotation library installation and removing ad-hoc import of the latter
+- clean up utils for annotations
+- Changing PaSh flags (making the default be priority r-split and then consecutive chunks), so remove the r_split flag and make defaults be the ones from the OSDI paper (I can do that TODO too)
\ No newline at end of file
diff --git a/compiler/annotations.py b/compiler/annotations.py
index 28d61e541..a7c78be02 100644
--- a/compiler/annotations.py
+++ b/compiler/annotations.py
@@ -284,6 +284,7 @@ def get_command_properties_from_annotations(command, options, annotations):
     return command_ann['properties']
 
 def get_command_aggregator_from_annotations(command, options, annotations):
+    log(f'still used')
     command_ann = get_command_from_annotations(command, options, annotations)
 
     if(command_ann and 'aggregator' in command_ann):
diff --git a/compiler/annotations_utils/util_aggregator.py b/compiler/annotations_utils/util_aggregator.py
new file mode 100644
index 000000000..3382730c6
--- /dev/null
+++ b/compiler/annotations_utils/util_aggregator.py
@@ -0,0 +1,59 @@
+# TODO: this file can properly be deleted
+
+import sys
+from config import get_path_annotation_repo
+sys.path.insert(1, get_path_annotation_repo())
+
+from definitions.ir.dfg_node import DFGNode
+from definitions.ir.nodes.cat import Cat
+from annotations_utils.util_cmd_invocations import get_command_invocation_prefix_from_dfg_node
+from util import log
+from ir_utils import string_to_argument
+from definitions.ir.arg import Arg
+
+def get_aggregator_as_dfg_node_from_node(node, parallelizer, inputs, outputs) -> DFGNode:
+    assert(False)
+    cmd_inv_pref = get_command_invocation_prefix_from_dfg_node(node)
+    log(f'cmdinvpref for agg: {cmd_inv_pref}')
+    aggregator = parallelizer.get_actual_aggregator(cmd_inv_pref)
+    log(f'here agg: {aggregator}')
+    # TODO: this could be simplified once we use the new attributes
+    if aggregator.cmd_name == 'cat':
+        return Cat(inputs=inputs,
+                   outputs=outputs,
+                   com_name=Arg(string_to_argument(aggregator.cmd_name)),
+                   com_options=[], # empty and not taking over from other one
+                   com_category="stateless",
+                   com_redirs=node.com_redirs,
+                   com_assignments=node.com_assignments,
+                   flag_option_list=aggregator.flag_option_list,
+                   positional_config_list=aggregator.positional_config_list,
+                   positional_input_list=None, # TODO: somehow from inputs, future shift
+                   positional_output_list=None # TODO: somehow from outputs, future shift
+                   # TODO:
+                   # implicit_use_of_stdin = False,
+                   # implicit_use_of_stdout = False,
+                   # omitted for now since we do not consider nested parallelization
+                   # parallelizer_list = None,
+                   # cmd_related_properties = None,
+                   )
+    else:
+        log(f'agg_com_name: {aggregator.cmd_name}')
+        log(f'agg_flag_option_list: {aggregator.flag_option_list}')
+        return DFGNode(inputs=inputs,
+                       outputs=outputs,
+                       com_name=Arg(string_to_argument(aggregator.cmd_name)),
+                       com_options=node.com_options,
+                       com_redirs=node.com_redirs,
+                       com_assignments=node.com_assignments,
+                       flag_option_list=aggregator.flag_option_list,
+                       positional_config_list=aggregator.positional_config_list,
+                       positional_input_list=None, # TODO: somehow from inputs, future shift
+                       positional_output_list=None # TODO: somehow from outputs, future shift
+                       # TODO:
+                       # implicit_use_of_stdin = False,
+                       # implicit_use_of_stdout = False,
+                       # omitted for now since we do not consider nested parallelization
+                       # parallelizer_list = None,
+                       # cmd_related_properties = None,
+                       )
diff --git a/compiler/annotations_utils/util_cmd_invocations.py
b/compiler/annotations_utils/util_cmd_invocations.py new file mode 100644 index 000000000..90e5f6c10 --- /dev/null +++ b/compiler/annotations_utils/util_cmd_invocations.py @@ -0,0 +1,96 @@ +import sys + +from datatypes_new.BasicDatatypes import Flag +from datatypes_new.BasicDatatypesWithIO import OptionWithIO +from datatypes_new.CommandInvocationInitial import CommandInvocationInitial +from annotation_generation_new.datatypes.InputOutputInfo import InputOutputInfo +from annotation_generation_new.datatypes.ParallelizabilityInfo import ParallelizabilityInfo +from annotation_generation_new.datatypes.CommandProperties import CommandProperties +from annotation_generation_new.AnnotationGeneration import get_input_output_info_from_cmd_invocation, \ + get_parallelizability_info_from_cmd_invocation + +from util import log + +from config import get_path_annotation_repo +sys.path.insert(1, get_path_annotation_repo()) + +# for typing +from datatypes_new.CommandInvocationPrefix import CommandInvocationPrefix + +from ir_utils import string_to_argument, redir_stdout_to_file, redir_file_to_stdin, make_command + +def get_command_invocation_prefix_from_dfg_node(dfg_node): + return CommandInvocationPrefix(cmd_name = dfg_node.com_name, + flag_option_list = dfg_node.flag_option_list, + positional_config_list = dfg_node.positional_config_list) + +# TODO: ideally methods in the respective classes but requires refactoring of parsing infrastructure +def to_node_cmd_inv_with_io_vars(cmd_inv, edges, redirs, assignments): + log("edges", edges) + ast_cmd_name = string_to_argument(cmd_inv.cmd_name) + log("ast_cmd_name", ast_cmd_name) + ast_flagoptions = [] + for flagoption in cmd_inv.flag_option_list: + ast_flagoptions += to_ast_flagoption(flagoption, edges) + log("flagoptions", cmd_inv.flag_option_list) + log("ast_flagoptions", ast_flagoptions) + ast_operands = [to_ast_operand(operand, edges) for operand in cmd_inv.operand_list] + log("operands", cmd_inv.operand_list) + log("ast_operands", ast_operands) + # log("type of ast_operands [0]", type(ast_operands[0])) # can only be used if there are operands + cmd_asts = [ast_cmd_name] + ast_flagoptions + ast_operands + + # TODO: check for actual stdin + stdin_redir = [] + if cmd_inv.implicit_use_of_streaming_input is not None: + fid, _, _ = edges[cmd_inv.implicit_use_of_streaming_input] + if not (fid.has_file_descriptor_resource() and fid.resource.is_stdin()): + stdin_redir = [redir_file_to_stdin(fid.to_ast())] + + # TODO: check for actual stdout + stdout_redir = [] + if cmd_inv.implicit_use_of_streaming_output is not None: + fid, _, _ = edges[cmd_inv.implicit_use_of_streaming_output] + if not (fid.has_file_descriptor_resource() and fid.resource.is_stdout()): + stdout_redir = [redir_stdout_to_file(fid.to_ast())] + + new_redirs = redirs + stdin_redir + stdout_redir + node = make_command(cmd_asts, redirections=new_redirs, assignments=assignments) + log("node", node) + return node + +def to_ast_flagoption(flagoption, _edges): + if isinstance(flagoption, Flag): + return [string_to_argument(flagoption.get_name())] + elif isinstance(flagoption, OptionWithIO): # retype to IOVar + opt_name_ast = string_to_argument(flagoption.get_name()) + opt_arg_ast = translate_io_var_if_applicable(flagoption.get_arg()) + return [opt_name_ast, opt_arg_ast] + +def to_ast_operand(operand, edges): + return translate_io_var_if_applicable(operand, edges) + +def translate_io_var_if_applicable(pot_io_var, edges): + if isinstance(pot_io_var, int): + return dereference_io_var(pot_io_var, edges) + else: + 
return to_ast_arg_string_type(pot_io_var) + +def to_ast_arg_string_type(arg_string_type): + return arg_string_type.get_name().arg_char_list # is of type Arg + +# assumes io_var is an edge id +def dereference_io_var(io_var, edges): + fid, _, _ = edges[io_var] + log(fid) + return fid.to_ast() + +def get_input_output_info_from_cmd_invocation_util(cmd_invocationInitial : CommandInvocationInitial) -> InputOutputInfo: + return get_input_output_info_from_cmd_invocation(cmd_invocationInitial) + +def get_parallelizability_info_from_cmd_invocation_util(cmd_invocationInitial : CommandInvocationInitial) -> ParallelizabilityInfo: + return get_parallelizability_info_from_cmd_invocation(cmd_invocationInitial) + +def construct_property_container_from_list_of_properties(list_properties): + return CommandProperties(dict(list_properties)) + diff --git a/compiler/annotations_utils/util_file_descriptors.py b/compiler/annotations_utils/util_file_descriptors.py new file mode 100644 index 000000000..910efa632 --- /dev/null +++ b/compiler/annotations_utils/util_file_descriptors.py @@ -0,0 +1,21 @@ +from util import log +from definitions.ir.resource import FileResource, Resource, FileDescriptorResource +import sys +from config import get_path_annotation_repo +sys.path.insert(1, get_path_annotation_repo()) +from datatypes_new.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo + + +def resource_from_file_descriptor(file_descriptor) -> Resource: + if isinstance(file_descriptor, FileNameWithIOInfo): + arg = file_descriptor.get_name() + log(f'filedes name: {file_descriptor.get_name()}') + log(f'filedes name type: {type(file_descriptor.get_name())}') + log(f'arg: {arg}') + return FileResource(file_descriptor.get_name()) + elif isinstance(file_descriptor, StdDescriptorWithIOInfo): + resource = ("fd", file_descriptor.get_type().value) + return FileDescriptorResource(resource) + else: + assert(False) + # unreachable diff --git a/compiler/annotations_utils/util_mapper.py b/compiler/annotations_utils/util_mapper.py new file mode 100644 index 000000000..64657cf03 --- /dev/null +++ b/compiler/annotations_utils/util_mapper.py @@ -0,0 +1,97 @@ +# TODO: this file can properly be deleted + +# imports from annotation framework +import sys +from config import get_path_annotation_repo +sys.path.insert(1, get_path_annotation_repo()) +# for typing +# for use +from annotation_generation_new.datatypes.parallelizability.Mapper import Mapper + +from definitions.ir.dfg_node import DFGNode +from annotations_utils.util_cmd_invocations import get_command_invocation_prefix_from_dfg_node +from util import log + +def get_actual_mapper_from_node(node, parallelizer) -> Mapper: + assert(False) + cmd_inv_pref = get_command_invocation_prefix_from_dfg_node(node) + return parallelizer.get_actual_mapper(cmd_inv_pref) + +def get_mapper_as_dfg_node_from_node(node, parallelizer, inputs, outputs) -> DFGNode: + assert(False) + mapper = get_actual_mapper_from_node(node, parallelizer) + log(f'mapper for cmd_name: {node.com_name}') + log(f'here mapper: {mapper}') + return DFGNode(inputs=inputs, + outputs=outputs, + com_name=mapper.cmd_name, + # com_options=node.com_options, + com_redirs=node.com_redirs, + com_assignments=node.com_assignments, + flag_option_list=mapper.flag_option_list, + positional_config_list=mapper.positional_config_list, + positional_input_list=None, # TODO: somehow from inputs, future shift + positional_output_list=None # TODO: somehow from outputs, future shift + # TODO: + # implicit_use_of_stdin = False, + # 
implicit_use_of_stdout = False, + # omitted for now since we do not consider nested parallelization + # parallelizer_list = None, + # cmd_related_properties = None, + ) + +## MOVED from dfg_node +## Get the file names of the outputs of the map commands. This +## differs if the command is stateless, pure that can be +## written as a map and a reduce, and a pure that can be +## written as a generalized map and reduce. +# BEGIN ANNO +# OLD +# def get_map_output_files(node, input_edge_ids, fileIdGen): +# NEW +def get_map_output_files(node, input_edge_ids, fileIdGen, parallelizer): + assert(False) + assert (node.is_parallelizable()) + # TODO ANNO: How to substitute? @KK + if (node.com_category == "stateless"): + map_output_fids = [fileIdGen.next_ephemeral_file_id() for in_fid in input_edge_ids] + elif (node.is_pure_parallelizable()): + # BEGIN ANNO + # OLD + # map_output_fids = node.pure_get_map_output_files(input_edge_ids, fileIdGen) + # NEW + map_output_fids = pure_get_map_output_files(node, input_edge_ids, fileIdGen, parallelizer) + # END ANNO + else: + log("Unreachable code reached :(") + assert (False) + ## This should be unreachable + + return map_output_fids + +## TODO: Fix this somewhere in the annotations and not in the code +# BEGIN ANNO +# OLD +# def pure_get_map_output_files(node, input_edge_ids, fileIdGen): +# NEW +def pure_get_map_output_files(node, input_edge_ids, fileIdGen, parallelizer): + assert(False) + assert (node.is_pure_parallelizable()) + # BEGIN ANNO + # OLD + ## The number of the mapper outputs defaults to 1 + # if(node.com_mapper is None): + # number_outputs = 1 + # else: + # number_outputs = node.com_mapper.num_outputs + # NEW + # TODO: which parallelizer did we choose? + actual_mapper = get_actual_mapper_from_node(node, parallelizer) + number_outputs = actual_mapper.num_outputs # defaults to 1 in class Mapper + # END ANNO + + new_output_fids = [[fileIdGen.next_ephemeral_file_id() for i in range(number_outputs)] + for in_fid in input_edge_ids] + return new_output_fids + + diff --git a/compiler/annotations_utils/util_parsing.py b/compiler/annotations_utils/util_parsing.py new file mode 100644 index 000000000..19a098403 --- /dev/null +++ b/compiler/annotations_utils/util_parsing.py @@ -0,0 +1,92 @@ +import sys +from typing import Set, List, Any + +from definitions.ir.arg import Arg + +from config import get_path_annotation_repo +sys.path.insert(1, get_path_annotation_repo()) +from datatypes_new.CommandInvocationInitial import CommandInvocationInitial +from datatypes_new.BasicDatatypes import Option, ArgStringType, Flag, Operand +from parser_new.parser import parse, get_set_of_all_flags, get_dict_flag_to_primary_repr, get_set_of_all_options, \ + get_dict_option_to_primary_repr, are_all_individually_flags +from parser_new.util_parser import get_json_data + + +from ir_utils import format_arg_chars, string_to_argument, log + + +def merge_to_single_string_with_space(list_str): + if len(list_str) == 1: + return list_str[0] + else: + return " ".join(list_str) + +def get_command_invocation(command, options) -> CommandInvocationInitial: + command_as_string: str = format_arg_chars(command) + options_and_operands_as_string: str = merge_to_single_string_with_space([format_arg_chars(option) for option in options]) + command_invocation_as_string: str = f'{command_as_string} {options_and_operands_as_string}' + command_invocation: CommandInvocationInitial = parse(command_invocation_as_string) + return command_invocation + +def get_ast_for_flagoption(flagoption): + result = 
string_to_argument(flagoption.get_name()) + if isinstance(flagoption, Option): + # TODO: add argument here as well but eventually also fid + assert False + return result + +def get_ast_for_argstringtype(arg): + return string_to_argument(arg.get_name()) + +# TODO: this is a hack to fix the wrong parsing of " +def fix_parsing_newline(arg): + if arg.get_name() == '\\n': + return ArgStringType(r'"\n"') + else: + return arg + + +def parse_arg_list_to_command_invocation(command, flags_options_operands) -> CommandInvocationInitial: + + cmd_name = format_arg_chars(command) + json_data = get_json_data(cmd_name) + + set_of_all_flags: Set[str] = get_set_of_all_flags(json_data) + dict_flag_to_primary_repr: dict[str, str] = get_dict_flag_to_primary_repr(json_data) + set_of_all_options: Set[str] = get_set_of_all_options(json_data) + dict_option_to_primary_repr: dict[str, str] = get_dict_option_to_primary_repr(json_data) + # we keep the Arg for everything but flag and option names + + # parse list of command invocation terms + flag_option_list: List[Any] = [] + i = 0 + while i < len(flags_options_operands): + potential_flag_or_option_arg = flags_options_operands[i] + potential_flag_or_option_name = format_arg_chars(potential_flag_or_option_arg) + if potential_flag_or_option_name in set_of_all_flags: + flag_name_as_string: str = dict_flag_to_primary_repr.get(potential_flag_or_option_name, potential_flag_or_option_name) + flag: Flag = Flag(flag_name_as_string) + flag_option_list.append(flag) + elif (potential_flag_or_option_name in set_of_all_options) and ((i+1) < len(flags_options_operands)): + option_name_as_string: str = dict_option_to_primary_repr.get(potential_flag_or_option_name, potential_flag_or_option_name) + option_arg_as_arg: Arg = Arg(flags_options_operands[i+1]) + option = Option(option_name_as_string, option_arg_as_arg) + flag_option_list.append(option) + i += 1 # since we consumed another term for the argument + elif are_all_individually_flags(potential_flag_or_option_name, set_of_all_flags): + for split_el in list(potential_flag_or_option_name[1:]): + flag: Flag = Flag(f'-{split_el}') + flag_option_list.append(flag) + else: + break # next one is Operand, and we keep these in separate list + i += 1 + + # we would probably want to skip '--' but then the unparsed command could have a different meaning so we'd need to keep it + # for now, omitted + # if parsed_elements_list[i] == '--': + # i += 1 + + operand_list = [Operand(Arg(operand_arg)) for operand_arg in flags_options_operands[i:]] + # log("type of operand_list[0].get_name()", type(operand_list[0].get_name())) can only be used if there are operands + + return CommandInvocationInitial(cmd_name, flag_option_list, operand_list) diff --git a/compiler/config.py b/compiler/config.py index f5e7648b7..71a9959fc 100644 --- a/compiler/config.py +++ b/compiler/config.py @@ -29,6 +29,15 @@ HDFS_PREFIX = "$HDFS_DATANODE_DIR/" +# move this to `config.json` if possible +PATH_ANNOTATION_REPO="/home/felix/git-repos/MIT/annotations" + +def get_path_annotation_repo(): + if PATH_ANNOTATION_REPO is None: + log("No path for annotation repository given! Specify it in compiler/config.py") + raise Exception("No path for annotation repository given! 
Specify it in compiler/config.py") + return PATH_ANNOTATION_REPO + config = {} annotations = [] pash_args = None diff --git a/compiler/definitions/ir/aggregator_node.py b/compiler/definitions/ir/aggregator_node.py index 511e18c9a..04b2eb8ce 100644 --- a/compiler/definitions/ir/aggregator_node.py +++ b/compiler/definitions/ir/aggregator_node.py @@ -1,10 +1,13 @@ from definitions.ir.dfg_node import * +# from definitions.ir.nodes.arg import Arg +from annotations_utils.util_cmd_invocations import get_command_invocation_prefix_from_dfg_node + ## This class corresponds to a generic n-ary aggregator ## ## TODO: Do we need to do anything special for binary aggregators? class MapperAggregatorNode(DFGNode): - def __init__(self, old_node, input_ids, output_ids, name_string, new_options): + def __init__(self, old_node, input_ids, output_ids, name_string, new_options, flag_option_list): ## The name of the aggregator command name = Arg(string_to_argument(name_string)) @@ -17,36 +20,73 @@ def __init__(self, old_node, input_ids, output_ids, name_string, new_options): super().__init__(input_ids, output_ids, name, - com_category, - com_options=old_node.com_options, + com_category, + # BEGIN ANNO + # OLD + # com_options=old_node.com_options, + # NEW + com_options=new_options, # changed that all are already in there and not appended + flag_option_list=flag_option_list, + # END ANNO com_redirs=com_redirs, com_assignments=old_node.com_assignments) - + ## TODO: This assumes that all options from the old function are copied to the new. ## ## TODO: If we need a behavior where we don't keep the old flags, we can extend this - self.append_options(new_options) + # BEGIN ANNO + # OLD + # self.append_options(new_options) + # END ANNO class AggregatorNode(MapperAggregatorNode): def __init__(self, old_node, input_ids, output_ids): + # BEGIN ANNO + used_parallelizer = old_node.get_used_parallelizer() + cmd_inv_pref = get_command_invocation_prefix_from_dfg_node(old_node) + used_aggregator = used_parallelizer.get_actual_aggregator(cmd_inv_pref) + log(f'used_agg: {used_aggregator}') + log(f'old_node: {old_node}') + # END ANNO + ## Check if an aggregator can be instantiated from the node - if(old_node.com_aggregator is None): + # BEGIN ANNO + # OLD + # if(old_node.com_aggregator is None): + # NEW + if(used_aggregator is None): + # END ANNO log("Error: Node:", old_node, "does not contain information to instantiate an aggregator!") raise Exception('No information to instantiate aggregator') ## The name of the aggregator command - agg_name_string = old_node.com_aggregator.name - new_options = old_node.com_aggregator.options + # BEGIN ANNO + # OLD + # agg_name_string = old_node.com_aggregator.name + # new_options = old_node.com_aggregator.options + # NEW + agg_name_string = used_aggregator.cmd_name + all_options_incl_new = [Arg.string_to_arg(el.get_name()) for el in used_aggregator.flag_option_list + used_aggregator.positional_config_list] + # TODO: zip is nicer + all_options_incl_new_right_format = [(i, all_options_incl_new[i]) for i in range(len(all_options_incl_new))] + # END ANNO - super().__init__(old_node, input_ids, output_ids, agg_name_string, new_options) + # BEGIN ANNO + # OLD + # super().__init__(old_node, input_ids, output_ids, agg_name_string, new_options) + # NEW + super().__init__(old_node, input_ids, output_ids, agg_name_string, all_options_incl_new_right_format, + flag_option_list=used_aggregator.flag_option_list) + # END ANNO log("Generic Aggregator Created:", self) class MapperNode(MapperAggregatorNode): def 
__init__(self, old_node, input_ids, output_ids): + assert(False) ## Check if an mapper can be instantiated from the node if(old_node.com_mapper is None): log("Error: Node:", old_node, "does not contain information to instantiate a mapper!") diff --git a/compiler/definitions/ir/arg.py b/compiler/definitions/ir/arg.py index 40dbcc785..8ca591733 100644 --- a/compiler/definitions/ir/arg.py +++ b/compiler/definitions/ir/arg.py @@ -1,3 +1,4 @@ +from __future__ import annotations from ir_utils import * from util import * @@ -29,3 +30,9 @@ def concatenate(self, other): space = [['C', 32]] self.arg_char_list.extend(space) self.arg_char_list.extend(other.arg_char_list) + + @staticmethod + def string_to_arg(string) -> Arg: + return Arg(string_to_argument(string)) + + diff --git a/compiler/definitions/ir/dfg_node.py b/compiler/definitions/ir/dfg_node.py index a38268082..b9e990fad 100644 --- a/compiler/definitions/ir/dfg_node.py +++ b/compiler/definitions/ir/dfg_node.py @@ -1,68 +1,53 @@ import copy -import annotations from command_categories import * -from util import * -from ir_utils import * from definitions.ir.redirection import * from definitions.ir.resource import * +from annotations_utils.util_cmd_invocations import to_node_cmd_inv_with_io_vars, construct_property_container_from_list_of_properties + +import sys +from config import get_path_annotation_repo +sys.path.insert(1, get_path_annotation_repo()) + +from util import return_empty_list_if_none_else_itself, return_default_if_none_else_itself + ## Assumption: Everything related to a DFGNode must be already expanded. ## TODO: Ensure that this is true with assertions class DFGNode: ## Unique identifier for nodes next_id = 0 - ## inputs : tuple of lists of fid_ids (that can be used to retrieve fid from edges) - ## outputs : list of fid_ids - ## com_name : command name Arg - ## com_category : string denoting category - ## input_consumption_mode : enumeration - ## com_properties : properties such as commutativity - ## com_mapper : a class that contains necessary information to instantiate a mapper (by defaule this corresponds to the command) - ## com_aggregator : a class that contains necessary information to instantiate an aggregator - ## com_options : list of tuples with the option index and the argument Arg + ## cmd_invocation_with_io_vars : command invocation data structure with edge ids as symbolic variables for filenames etc. ## com_redirs : list of redirections ## com_assignments : list of assignments - def __init__(self, inputs, outputs, com_name, com_category, - com_properties = [], - com_mapper = None, - com_aggregator = None, - com_options = [], + ## parallelizer_list : list of parallelizers for this DFGNode + ## cmd_related_properties : dict to store properties like commutativity + def __init__(self, + cmd_invocation_with_io_vars, com_redirs = [], - com_assignments=[]): + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None, + ): + # TODO []: default parameters! + + ## @KK: can this be deleted? Was there another id in the member attributes before? ## Add a unique identifier to each DFGNode since id() is not guaranteed to be unique for objects that have different lifetimes. 
## This leads to issues when nodes are deleted and new ones are created, leading to id() clashes between them self.id = DFGNode.next_id DFGNode.next_id += 1 - self.set_inputs(inputs) - self.outputs = outputs - self.com_name = com_name - self.com_category = com_category - self.com_properties = com_properties - self.com_mapper = com_mapper - self.com_aggregator = com_aggregator - self.com_options = com_options self.com_redirs = [Redirection(redirection) for redirection in com_redirs] self.com_assignments = com_assignments - + self.parallelizer_list = return_empty_list_if_none_else_itself(parallelizer_list) + default_cmd_properties = construct_property_container_from_list_of_properties([]) + self.cmd_related_properties = return_default_if_none_else_itself(cmd_related_properties, default_cmd_properties) + self.cmd_invocation_with_io_vars = cmd_invocation_with_io_vars # log("Node created:", self.id, self) def __repr__(self): - prefix = "Node" - if (self.com_category == "stateless"): - prefix = "Stateless" - elif (self.com_category == "pure"): - prefix = "Pure" - elif (self.is_pure_parallelizable()): - prefix = "Par. Pure" - if (self.is_commutative()): - prefix = 'Commutative ' + prefix - output = "{}: \"{}\" in:{} out:{}".format( - prefix, self.com_name, - self.get_input_list(), - self.outputs) - return output + # TODO: add other attributes + return str(self.cmd_invocation_with_io_vars) ## Generates a dot node for the DFG node def add_dot_node(self, dot, node_id): @@ -73,7 +58,7 @@ def add_dot_node(self, dot, node_id): ## Get the label of the node. By default, it is simply the name def get_dot_label(self) -> str: ## The name could be a full path - name = self.com_name + name = self.cmd_invocation_with_io_vars.cmd_name basename = os.path.basename(str(name)) return basename @@ -90,6 +75,7 @@ def copy(self): ## TODO: Make that a proper class. 
def set_inputs(self, inputs): + assert(False) if(isinstance(inputs, list)): self.inputs = ([], inputs) elif(isinstance(inputs, tuple)): @@ -98,32 +84,48 @@ def set_inputs(self, inputs): raise NotImplementedError() def get_input_list(self): - return (self.inputs[0] + self.inputs[1]) - - def get_standard_inputs(self): - return self.inputs[1] - + inputs = self.cmd_invocation_with_io_vars.generate_inputs() + return inputs.get_all_inputs() + + def get_output_list(self): + return self.cmd_invocation_with_io_vars.generate_outputs() + + def get_streaming_inputs(self): + inputs = self.cmd_invocation_with_io_vars.generate_inputs() + return inputs.get_streaming_inputs() + def get_configuration_inputs(self): - return self.inputs[0] + inputs = self.cmd_invocation_with_io_vars.generate_inputs() + return inputs.get_config_inputs() - def is_at_most_pure(self): - return (self.com_category in ["stateless", "pure", "parallelizable_pure"]) + # def is_at_most_pure(self): + # return (self.com_category in ["stateless", "pure", "parallelizable_pure"]) - def is_parallelizable(self): - return (self.is_pure_parallelizable() or self.is_stateless()) + # def is_parallelizable(self): + # return (self.is_pure_parallelizable() or self.is_stateless()) - def is_stateless(self): - return (self.com_category == "stateless") + # def is_stateless(self): + # return (self.com_category == "stateless") - def is_pure_parallelizable(self): - return (self.com_category == "parallelizable_pure") + # def is_pure_parallelizable(self): + # return (self.com_category == "parallelizable_pure") def is_commutative(self): - return ('commutative' in self.com_properties) + # BEGIN ANNO + # OLD + # return ('commutative' in self.com_properties) + # NEW + val = self.cmd_related_properties.get_property_value('commutative') + if val is not None: + return val + else: + return False + # END ANNO ## kk: 2021-07-23 Not totally sure if that is generally correct. Tests will say ¯\_(ツ)_/¯ ## I think it assumes that new options can be added in the beginning if there are no options already def append_options(self, new_options): + assert(False) # unreachable if(len(self.com_options) > 0): max_opt_index = max([i for i, _opt in self.com_options]) else: @@ -140,6 +142,10 @@ def append_options(self, new_options): ## ## TODO: Abstract this function away to annotations 2.0 def special_to_ast(self, edges): + assert(False) # unreachable + # BEGIN ANNO + return None + # END ANNO ## Every argument should be completely expanded so making it a string should be fine if str(self.com_name) == "cat": redirs = self._to_ast_aux_get_redirs() @@ -166,6 +172,7 @@ def special_to_ast(self, edges): ## This function handles the input fids as arguments. def _to_ast_aux_inputs_as_args(self, edges, stdin_dash=False): + assert(False) # unreachable input_fids = [edges[in_id][0] for in_id in self.get_input_list()] input_arguments = [fid.to_ast(stdin_dash=stdin_dash) @@ -175,6 +182,7 @@ def _to_ast_aux_inputs_as_args(self, edges, stdin_dash=False): ## This function handles the redirections when a command has a single output ## and it can always be stdout. def _to_ast_aux_single_stdout_fid(self, edges): + assert(False) # unreachable output_fids = [edges[out_id][0] for out_id in self.outputs] assert len(output_fids) == 1 output_fid = output_fids[0] @@ -187,6 +195,7 @@ def _to_ast_aux_single_stdout_fid(self, edges): ## Auxiliary method that returns any necessary redirections, ## at the moment it doesn't look necessary. 
def _to_ast_aux_get_redirs(self): + ## still used in to_ast ## TODO: Properly handle redirections ## ## TODO: If one of the redirected outputs or inputs is changed in the IR @@ -204,60 +213,30 @@ def _to_ast_aux_get_redirs(self): return [] - ## TODO: Improve this functio to be separately implemented for different special nodes, + ## TODO: Improve this function to be separately implemented for different special nodes, ## such as cat, eager, split, etc... - def to_ast(self, edges, drain_streams): + ## I do not think this is reasonable anymore since we remodelled nodes in a way that the back-translation is trivial + def to_ast(self, edges, drain_streams): ## TODO: We might not want to implement this at all actually if (drain_streams): raise NotImplementedError() else: + # commented since "see above" ## Handle special node to ast here - node = self.special_to_ast(edges) - if node is not None: - return node - + # node = self.special_to_ast(edges) + # if node is not None: + # return node redirs = self._to_ast_aux_get_redirs() assignments = self.com_assignments - ## Start filling in the arguments - opt_arguments = [] - for i, opt in self.com_options: - ## Pad the argument list with None - opt_arguments = pad(opt_arguments, i) - opt_arguments[i] = opt.to_ast() - com_name_ast = self.com_name.to_ast() - option_asts = [opt.to_ast() for _, opt in self.com_options] - - ## - ## 1. Find the input and output fids - ## 2. Construct the rest of the arguments and input/output redirections according to - ## the command IO - input_fids = [edges[in_id][0] for in_id in self.get_input_list()] - output_fids = [edges[out_id][0] for out_id in self.outputs] - rest_argument_fids, new_redirs = create_command_arguments_redirs(com_name_ast, - option_asts, - input_fids, - output_fids) - - ## Transform the rest of the argument fids to arguments - ## Since some of the rest_arguments can be None (they only contain inputs and outputs) - ## we need to make sure that we don't turn None objects to asts. - ## - ## The None fields need to be filtered out because they are taken care of by the interleave function. - ## - ## TODO: Is this actually OK? - rest_arguments = [fid.to_ast() - for fid in rest_argument_fids - if not fid is None] - - ## Interleave the arguments since options args might contain gaps. - arguments = interleave_args(opt_arguments, rest_arguments) - - all_arguments = [com_name_ast] + arguments - all_redirs = redirs + new_redirs - - node = make_command(all_arguments, redirections=all_redirs, assignments=assignments) + node = to_node_cmd_inv_with_io_vars(self.cmd_invocation_with_io_vars, edges, redirs, assignments) + # TODO: think about redirections + # old code for this: + # rest_argument_fids, new_redirs = create_command_arguments_redirs(com_name_ast, + # option_asts, + # input_fids, + # output_fids) return node ## This method applies the redirections to get the correct, inputs, outputs of a node. @@ -280,8 +259,8 @@ def apply_redirections(self, edges): # log(redirection) file_resource = FileResource(redirection.file_arg) success = False - for i in range(len(self.outputs)): - output_edge_id = self.outputs[i] + for i in range(len(self.get_output_list())): + output_edge_id = self.get_output_list()[i] output_fid = edges[output_edge_id][0] if(output_fid.has_file_descriptor_resource() and output_fid.resource.is_stdout()): @@ -315,12 +294,7 @@ def apply_redirections(self, edges): ## ## TODO: Make this a method of graph to change the from, to too. 
def replace_edge(self, from_id, to_id): - new_config_inputs = self.replace_edge_in_list(self.inputs[0], from_id, to_id) - new_standard_inputs = self.replace_edge_in_list(self.inputs[1], from_id, to_id) - new_outputs = self.replace_edge_in_list(self.outputs, from_id, to_id) - - self.set_inputs((new_config_inputs, new_standard_inputs)) - self.outputs = new_outputs + self.cmd_invocation_with_io_vars.replace_var(from_id, to_id) ## TODO: There must be a lib function to do this. def replace_edge_in_list(self, edge_ids, from_id, to_id): @@ -333,33 +307,24 @@ def replace_edge_in_list(self, edge_ids, from_id, to_id): new_edge_ids.append(new_edge_id) return new_edge_ids - ## Get the file names of the outputs of the map commands. This - ## differs if the command is stateless, pure that can be - ## written as a map and a reduce, and a pure that can be - ## written as a generalized map and reduce. - def get_map_output_files(self, input_edge_ids, fileIdGen): - assert(self.is_parallelizable()) - if(self.com_category == "stateless"): - map_output_fids = [fileIdGen.next_ephemeral_file_id() for in_fid in input_edge_ids] - elif(self.is_pure_parallelizable()): - map_output_fids = self.pure_get_map_output_files(input_edge_ids, fileIdGen) - else: - log("Unreachable code reached :(") - assert(False) - ## This should be unreachable - - return map_output_fids - - ## TODO: Fix this somewhere in the annotations and not in the code - def pure_get_map_output_files(self, input_edge_ids, fileIdGen): - assert(self.is_pure_parallelizable()) - - ## The number of the mapper outputs defaults to 1 - if(self.com_mapper is None): - number_outputs = 1 - else: - number_outputs = self.com_mapper.num_outputs - - new_output_fids = [[fileIdGen.next_ephemeral_file_id() for i in range(number_outputs)] - for in_fid in input_edge_ids] - return new_output_fids + def set_used_parallelizer(self, parallelizer): + assert(False) + # TODO: instantiate in __init__ already in some way + self.used_parallelizer = parallelizer + + def get_used_parallelizer(self): + assert(False) + return self.used_parallelizer + + def get_option_implemented_round_robin_parallelizer(self): + for parallelizer in self.parallelizer_list: + splitter = parallelizer.get_splitter() + mapper_spec = parallelizer.get_mapper_spec() + aggregator_spec = parallelizer.get_aggregator_spec() + if splitter.is_splitter_round_robin() and mapper_spec.is_implemented and aggregator_spec.is_implemented: + return parallelizer + return None + + @staticmethod + def make_simple_dfg_node_from_cmd_inv_with_io_vars(cmd_inv_with_io_vars): + return DFGNode(cmd_inv_with_io_vars) \ No newline at end of file diff --git a/compiler/definitions/ir/nodes/cat.py b/compiler/definitions/ir/nodes/cat.py index a898d2dd8..a27b89f4f 100644 --- a/compiler/definitions/ir/nodes/cat.py +++ b/compiler/definitions/ir/nodes/cat.py @@ -2,13 +2,34 @@ class Cat(DFGNode): def __init__(self, inputs, outputs, com_name, com_category, - com_options = [], com_redirs = [], com_assignments=[]): + com_options = [], com_redirs = [], com_assignments=[], + # BEGIN ANNO + flag_option_list = None, + positional_config_list = None, + positional_input_list = None, + positional_output_list = None, + implicit_use_of_stdin = None, + implicit_use_of_stdout = None, + parallelizer_list = None, + cmd_related_properties = None + # END ANNO + ): + assert(False) assert(str(com_name) == "cat") assert(com_category == "stateless") super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, - 
com_assignments=com_assignments) + com_options=com_options, + flag_option_list=flag_option_list, + com_redirs=com_redirs, + com_assignments=com_assignments, + positional_config_list=positional_config_list, + positional_input_list=positional_input_list, + positional_output_list=positional_output_list, + implicit_use_of_stdin=implicit_use_of_stdin, + implicit_use_of_stdout=implicit_use_of_stdout, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties + ) def make_cat_node(inputs, output): com_name = Arg(string_to_argument("cat")) diff --git a/compiler/definitions/ir/nodes/eager.py b/compiler/definitions/ir/nodes/eager.py index d7c70210c..9cc37315f 100644 --- a/compiler/definitions/ir/nodes/eager.py +++ b/compiler/definitions/ir/nodes/eager.py @@ -1,20 +1,118 @@ from definitions.ir.dfg_node import * +from ir_utils import * class Eager(DFGNode): def __init__(self, inputs, outputs, com_name, com_category, com_options = [], - com_redirs = [], com_assignments=[]): - super().__init__(inputs, outputs, com_name, com_category, + com_redirs = [], com_assignments=[], + intermediate = None): + # BEGIN ANNO : hack for intermediate at the end + self.intermediate = intermediate + # END ANNO + super().__init__(inputs, outputs, com_name, com_category, com_options=com_options, com_redirs=com_redirs, com_assignments=com_assignments) + # BEGIN ANNO : copied from DFG node for hack for intermediate at the end + def to_ast(self, edges, drain_streams): + log(f'do we get here?') + ## TODO: We might not want to implement this at all actually + if (drain_streams): + raise NotImplementedError() + else: + ## Handle special node to ast here + # node = self.special_to_ast(edges) + # if node is not None: + # return node + + redirs = self._to_ast_aux_get_redirs() + assignments = self.com_assignments + ## Start filling in the arguments + opt_arguments = [] + # BEGIN ANNO + # get_command_invocation_prefix_from_dfg_node + log(f'com_name: {self.com_name}') + log(f'edges: {edges}') + log(f'inputs: {self.inputs}') + log(f'outputs: {self.outputs}') + log(f'com_redirs: {self.com_redirs}') + log(f'pos config: {self.positional_config_list}') + log(f'pos input: {self.positional_input_list}') + log(f'pos output: {self.positional_output_list}') + log(f'com_options: {self.com_options}') + log(f'flag_option_list: {self.flag_option_list}') + + # if self.implicit_use_of_stdin: # need to recompute + # cat a list of inputs into it; redirect a single one + # else: + + # OLD + # for i, opt in self.com_options: + # ## Pad the argument list with None + # opt_arguments = pad(opt_arguments, i) + # opt_arguments[i] = opt.to_ast() + # log(f'opt_arguments: {format_args([val for val in opt_arguments if val is not None])}') + # NEW + opt_arguments_new = [get_ast_for_flagoption(flagoption) for flagoption in self.flag_option_list] + opt_arguments_new += [get_ast_for_argstringtype(arg) for arg in self.positional_config_list] + log(f'opt_arguments_new: {format_args(opt_arguments_new)}') + # END ANNO + + com_name_ast = self.com_name.to_ast() + option_asts = [opt.to_ast() for _, opt in self.com_options] + + ## + ## 1. Find the input and output fids + ## 2. 
Construct the rest of the arguments and input/output redirections according to + ## the command IO + input_fids = [edges[in_id][0] for in_id in self.get_input_list()] + output_fids = [edges[out_id][0] for out_id in self.outputs] + rest_argument_fids, new_redirs = create_command_arguments_redirs(com_name_ast, + option_asts, + input_fids, + output_fids) + + ## Transform the rest of the argument fids to arguments + ## Since some of the rest_arguments can be None (they only contain inputs and outputs) + ## we need to make sure that we don't turn None objects to asts. + ## + ## The None fields need to be filtered out because they are taken care of by the interleave function. + ## + ## TODO: Is this actually OK? + rest_arguments = [fid.to_ast() + for fid in rest_argument_fids + if not fid is None] + log(f'rest_arguments: {format_args(rest_arguments)}') + + ## Interleave the arguments since options args might contain gaps. + # BEGIN ANNO + rest_arguments_backup = rest_arguments.copy() + # OLD + # arguments = interleave_args(opt_arguments, rest_arguments) + # log(f'arguments fin: {format_args(arguments)}') + # NEW + arguments_new = opt_arguments_new + rest_arguments_backup + [self.intermediate.to_ast()] + log(f'arguments_new: {format_args(arguments_new)}') + # END ANNO + + all_arguments = [com_name_ast] + arguments_new + all_redirs = redirs + new_redirs + + node = make_command(all_arguments, redirections=all_redirs, assignments=assignments) + return node + + def make_eager_node(input_id, output_id, intermediate_file_id, eager_exec_path): com_name = Arg(string_to_argument(eager_exec_path)) com_category = "pure" ## TODO: In theory the intermediate file id is also an output... - com_options = [(2, Arg(intermediate_file_id.to_ast()))] + # BEGIN ANNO + # OLD + intermediate_identifier = Arg(intermediate_file_id.to_ast()) + com_options = [(2, intermediate_identifier)] return Eager([input_id], [output_id], com_name, com_category, - com_options=com_options) + com_options=com_options, + intermediate=intermediate_identifier) diff --git a/compiler/definitions/ir/nodes/pash_split.py b/compiler/definitions/ir/nodes/pash_split.py index 7c6c8d9ba..9c28267d5 100644 --- a/compiler/definitions/ir/nodes/pash_split.py +++ b/compiler/definitions/ir/nodes/pash_split.py @@ -1,3 +1,6 @@ +from datatypes_new.AccessKind import AccessKind +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars + from definitions.ir.file_id import * from definitions.ir.dfg_node import * from ir_utils import string_to_argument @@ -6,19 +9,30 @@ import os class Split(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, com_options = [], - com_redirs = [], com_assignments=[]): - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, - com_assignments=com_assignments) + def __init__(self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None): + # TODO []: default arguments! 
+ super().__init__(cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties) -## TODO: Make a proper splitter subclass of Node def make_split_file(input_id, out_ids): auto_split_bin = os.path.join(config.PASH_TOP, config.config['runtime']['auto_split_binary']) - com_name = Arg(string_to_argument(auto_split_bin)) - com_category = "pure" - return Split([input_id], - out_ids, - com_name, - com_category) + operand_list = [input_id] + operand_list.extend(out_ids) + access_map = {output_id: AccessKind.make_stream_output() for output_id in out_ids} + access_map[input_id] = AccessKind.make_stream_input() + cmd_inv_with_io_vars = CommandInvocationWithIOVars( + cmd_name=auto_split_bin, + flag_option_list=[], + operand_list=operand_list, + implicit_use_of_streaming_input=None, + implicit_use_of_streaming_output=None, + access_map=access_map) + return Split(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/nodes/r_split.py b/compiler/definitions/ir/nodes/r_split.py index 1c29b89b8..68a889f2f 100644 --- a/compiler/definitions/ir/nodes/r_split.py +++ b/compiler/definitions/ir/nodes/r_split.py @@ -1,4 +1,9 @@ import os + +from datatypes_new.AccessKind import AccessKind +from datatypes_new.BasicDatatypes import Operand +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars + import config from definitions.ir.dfg_node import * @@ -6,15 +11,22 @@ from ir_utils import string_to_argument class RSplit(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, com_options = [], - com_redirs = [], com_assignments=[]): - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, - com_assignments=com_assignments) - + def __init__(self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None): + # TODO []: default arguments! + super().__init__(cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties) + ## TODO: Generalize this code (for this and SortGReduce) to be able to add an option to any command. def add_r_flag(self): + assert(False) assert(len(self.com_options) <= 1) ## Add -r in r_split @@ -24,18 +36,23 @@ def add_r_flag(self): ## This is not a proper option check. It just works if the r_flag is added as a separate option. 
def has_r_flag(self): + assert(False) option_strings = [str(opt) for i, opt in self.com_options] return ("-r" in option_strings) -## TODO: Make a proper splitter subclass of Node def make_r_split(input_id, out_ids, r_split_batch_size): r_split_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_split_binary']) - com_name = Arg(string_to_argument(r_split_bin)) - com_category = "pure" - com_option = (1, Arg(string_to_argument(str(r_split_batch_size)))) - return RSplit([input_id], - out_ids, - com_name, - com_category, - com_options=[com_option]) + operand_list = [input_id, + Operand(Arg(string_to_argument(str(r_split_batch_size))))] + operand_list.extend(out_ids) + access_map = {output_id: AccessKind.make_stream_output() for output_id in out_ids} + access_map[input_id] = AccessKind.make_stream_input() + cmd_inv_with_io_vars = CommandInvocationWithIOVars( + cmd_name=r_split_bin, + flag_option_list=[], + operand_list=operand_list, + implicit_use_of_streaming_input=None, + implicit_use_of_streaming_output=None, + access_map=access_map) + return RSplit(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/resource.py b/compiler/definitions/ir/resource.py index ade86870f..999792cd9 100644 --- a/compiler/definitions/ir/resource.py +++ b/compiler/definitions/ir/resource.py @@ -44,6 +44,7 @@ def is_stdout(self): class FileResource(Resource): ## The uri is the path of the file. def __init__(self, path): + log("class of path", type(path)) assert(isinstance(path, Arg)) ## TODO: Make sure that paths are normalized self.uri = path diff --git a/compiler/ir.py b/compiler/ir.py index eb32479f4..641546cfc 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -1,12 +1,31 @@ -import os +# BEGIN ANNO +import sys + +from config import get_path_annotation_repo +sys.path.insert(1, get_path_annotation_repo()) +# for typing +from datatypes_new.CommandInvocationInitial import CommandInvocationInitial +from datatypes_new.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo +from annotation_generation_new.datatypes.InputOutputInfo import InputOutputInfo +from annotation_generation_new.datatypes.ParallelizabilityInfo import ParallelizabilityInfo +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars + +# for use +# -- + +from annotations_utils.util_parsing import parse_arg_list_to_command_invocation +from annotations_utils.util_cmd_invocations import get_input_output_info_from_cmd_invocation_util, get_parallelizability_info_from_cmd_invocation_util +from annotations_utils.util_mapper import get_mapper_as_dfg_node_from_node, get_map_output_files +from annotations_utils.util_aggregator import get_aggregator_as_dfg_node_from_node +from annotations_utils.util_file_descriptors import resource_from_file_descriptor +# END ANNO + +# BEGIN REMODEL + +# END REMODEL -from definitions.ir.arg import * -from definitions.ir.dfg_node import * -from definitions.ir.aggregator_node import * from definitions.ir.file_id import * -from definitions.ir.resource import * from definitions.ir.nodes.cat import * -from definitions.ir.nodes.hdfs_cat import HDFSCat import definitions.ir.nodes.pash_split as pash_split import definitions.ir.nodes.r_merge as r_merge @@ -14,7 +33,6 @@ import definitions.ir.nodes.r_wrap as r_wrap import definitions.ir.nodes.r_unwrap as r_unwrap -from command_categories import * from ir_utils import * from util import * @@ -94,78 +112,126 @@ def create_edges_from_opt_or_fd_list(opt_or_fd_list, edges_dict, options, fileId new_edge_list.append(fid_id) return 
new_edge_list -def find_input_edges(inputs, dfg_edges, options, fileIdGen): - if(isinstance(inputs, list)): - return create_edges_from_opt_or_fd_list(inputs, dfg_edges, options, fileIdGen) - elif(isinstance(inputs, tuple)): - config_inputs = create_edges_from_opt_or_fd_list(inputs[0], dfg_edges, options, fileIdGen) - standard_inputs = create_edges_from_opt_or_fd_list(inputs[1], dfg_edges, options, fileIdGen) - return (config_inputs, standard_inputs) + +def find_input_edges(positional_input_list, implicit_use_of_stdin, dfg_edges, fileIdGen) -> List[int]: + assert (not implicit_use_of_stdin or len(positional_input_list) == 0) + if implicit_use_of_stdin: + resources = [FileDescriptorResource(("fd", 0))] + else: + resources = [resource_from_file_descriptor(input_el) for input_el in positional_input_list] + file_ids = [create_file_id_for_resource(resource, fileIdGen) for resource in resources] + return get_edge_list_from_file_id_list(dfg_edges, file_ids) + + +def find_output_edges(positional_output_list, implicit_use_of_stdout, dfg_edges, fileIdGen) -> List[int]: + assert (not implicit_use_of_stdout or len(positional_output_list) == 0) + if implicit_use_of_stdout: + resources = [FileDescriptorResource(("fd", 1))] + else: + resources = [resource_from_file_descriptor(input_el) for input_el in positional_output_list] + file_ids = [create_file_id_for_resource(resource, fileIdGen) for resource in resources] + return get_edge_list_from_file_id_list(dfg_edges, file_ids) + + +def get_edge_list_from_file_id_list(dfg_edges, file_ids): + new_edge_list = [] + for file_id in file_ids: + fid_id = file_id.get_ident() + dfg_edges[fid_id] = (file_id, None, None) + new_edge_list.append(fid_id) + return new_edge_list + + +def add_file_id_vars(command_invocation_with_io, fileIdGen): + # make pass over everything and create file_id for everything + # only for operands for now: + dfg_edges = {} + new_operand_list = [] + access_map = dict() + + def add_var_for_descriptor(operand): + resource = resource_from_file_descriptor(operand) + file_id = create_file_id_for_resource(resource, fileIdGen) + fid_id = file_id.get_ident() + dfg_edges[fid_id] = (file_id, None, None) + access_map[fid_id] = operand.get_access() + return fid_id + + for i in range(len(command_invocation_with_io.operand_list)): + operand = command_invocation_with_io.operand_list[i] + if isinstance(operand, FileNameWithIOInfo) or isinstance(operand, StdDescriptorWithIOInfo): + fid_id = add_var_for_descriptor(operand) + new_operand_list.append(fid_id) + else: + new_operand_list.append(operand) + if command_invocation_with_io.implicit_use_of_streaming_input: + new_implicit_use_of_streaming_input = add_var_for_descriptor(command_invocation_with_io.implicit_use_of_streaming_input) + else: + new_implicit_use_of_streaming_input = None + if command_invocation_with_io.implicit_use_of_streaming_output: + new_implicit_use_of_streaming_output = add_var_for_descriptor(command_invocation_with_io.implicit_use_of_streaming_output) else: - raise NotImplementedError() + new_implicit_use_of_streaming_output = None + + # this shall become copy-based + command_invocation_with_io_vars = CommandInvocationWithIOVars.get_from_without_vars(command_invocation_with_io, access_map) + command_invocation_with_io_vars.operand_list = new_operand_list + command_invocation_with_io_vars.implicit_use_of_streaming_input = new_implicit_use_of_streaming_input + command_invocation_with_io_vars.implicit_use_of_streaming_output = new_implicit_use_of_streaming_output + return 
command_invocation_with_io_vars, dfg_edges + -## This function creates a DFG with a single node given a command. def compile_command_to_DFG(fileIdGen, command, options, redirections=[]): - ## TODO: There is no need for this redirection here. We can just straight - ## come up with inputs, outputs, options - inputs, out_stream, opt_indices = find_command_input_output(command, options) - # log("Opt indices:", opt_indices, "options:", options) - category = find_command_category(command, options) - com_properties = find_command_properties(command, options) - com_mapper, com_aggregator = find_command_mapper_aggregator(command, options) + command_invocation: CommandInvocationInitial = parse_arg_list_to_command_invocation(command, options) + io_info: InputOutputInfo = get_input_output_info_from_cmd_invocation_util(command_invocation) + para_info: ParallelizabilityInfo = get_parallelizability_info_from_cmd_invocation_util(command_invocation) + command_invocation_with_io = io_info.apply_input_output_info_to_command_invocation(command_invocation) + parallelizer_list, round_robin_compatible_with_cat, is_commutative = para_info.unpack_info() + property_list = [('round_robin_compatible_with_cat', round_robin_compatible_with_cat), + ('is_commutative', is_commutative)] + cmd_related_properties = construct_property_container_from_list_of_properties(property_list) ## TODO: Make an empty IR and add edges and nodes incrementally (using the methods defined in IR). - dfg_edges = {} ## Add all inputs and outputs to the DFG edges - dfg_inputs = find_input_edges(inputs, dfg_edges, options, fileIdGen) - dfg_outputs = create_edges_from_opt_or_fd_list(out_stream, dfg_edges, options, fileIdGen) - - com_name = Arg(command) - com_category = category - - ## Get the options - dfg_options = [get_option(opt_or_fd, options, fileIdGen) - for opt_or_fd in opt_indices] + cmd_invocation_with_io_vars, dfg_edges = add_file_id_vars(command_invocation_with_io, fileIdGen) com_redirs = redirections ## TODO: Add assignments com_assignments = [] ## TODO: Combine them both in a constructor that decided whether to instantiate Cat or DFGNode - if(str(com_name) == "cat"): - dfg_node = Cat(dfg_inputs, - dfg_outputs, - com_name, - ## TODO: We don't really need to pass category, name, or input_consumption for Cat - com_category, - com_options=dfg_options, - com_redirs=com_redirs, - com_assignments=com_assignments) - elif(str(com_name) == "hdfs" and str(dfg_options[0][1]) == "dfs" and str(dfg_options[1][1]) == "-cat"): - dfg_node = HDFSCat(dfg_inputs, - dfg_outputs, - com_name, - com_category, - com_options=dfg_options, - com_redirs=com_redirs, - com_assignments=com_assignments) - else: + # if(str(com_name) == "cat"): + # dfg_node = Cat(dfg_inputs, + # dfg_outputs, + # com_name, + # ## TODO: We don't really need to pass category, name, or input_consumption for Cat + # com_category, + # com_options=dfg_options, + # com_redirs=com_redirs, + # com_assignments=com_assignments, + # ) + # elif(str(com_name) == "hdfs" and str(dfg_options[0][1]) == "dfs" and str(dfg_options[1][1]) == "-cat"): + # dfg_node = HDFSCat(dfg_inputs, + # dfg_outputs, + # com_name, + # com_category, + # com_options=dfg_options, + # com_redirs=com_redirs, + # com_assignments=com_assignments) + # else: + if(True): ## Assume: Everything must be completely expanded ## TODO: Add an assertion about that. 
- dfg_node = DFGNode(dfg_inputs, - dfg_outputs, - com_name, - com_category, - com_properties=com_properties, - com_mapper=com_mapper, - com_aggregator=com_aggregator, - com_options=dfg_options, + dfg_node = DFGNode(cmd_invocation_with_io_vars, com_redirs=com_redirs, - com_assignments=com_assignments) - - if(not dfg_node.is_at_most_pure()): - raise ValueError() + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties + ) + + # if(not dfg_node.is_at_most_pure()): # which consequences has this check had? + # raise ValueError() node_id = dfg_node.get_id() @@ -175,7 +241,7 @@ def compile_command_to_DFG(fileIdGen, command, options, assert(to_node is None) dfg_edges[fid_id] = (fid, from_node, node_id) - for fid_id in dfg_node.outputs: + for fid_id in dfg_node.get_output_list(): fid, from_node, to_node = dfg_edges[fid_id] assert(from_node is None) dfg_edges[fid_id] = (fid, node_id, to_node) @@ -212,15 +278,8 @@ def make_tee(input, outputs): com_name, com_category) -def make_map_node(node, new_inputs, new_outputs): - ## Some nodes have special map commands - if(not node.com_mapper is None): - new_node = MapperNode(node, new_inputs, new_outputs) - else: - new_node = node.copy() - new_node.inputs = new_inputs - new_node.outputs = new_outputs - return new_node +def make_map_node(node, new_inputs, new_outputs, parallelizer): + return get_mapper_as_dfg_node_from_node(node, parallelizer, new_inputs, new_outputs) ## Makes a wrap node that encloses a map parallel node. ## @@ -623,7 +682,7 @@ def get_node_inputs(self, node_id): return input_edge_ids def get_node_outputs(self, node_id): - output_edge_ids = self.nodes[node_id].outputs + output_edge_ids = self.nodes[node_id].get_output_list() return output_edge_ids def get_next_nodes(self, node_id): @@ -658,7 +717,7 @@ def get_node_input_fids(self, node_id): def get_node_output_ids_fids(self, node_id): node = self.get_node(node_id) - return [(output_edge_id, self.edges[output_edge_id][0]) for output_edge_id in node.outputs] + return [(output_edge_id, self.edges[output_edge_id][0]) for output_edge_id in node.get_output_list()] def get_node_output_ids(self, node_id): return [fid_id for fid_id, _fid in self.get_node_output_ids_fids(node_id)] @@ -681,8 +740,8 @@ def remove_node(self, node_id): ## Remove the node in the edges dictionary for in_id in node.get_input_list(): self.set_edge_to(in_id, None) - - for out_id in node.outputs: + + for out_id in node.get_output_list(): self.set_edge_from(out_id, None) @@ -692,15 +751,19 @@ def add_node(self, node): ## Add the node in the edges dictionary for in_id in node.get_input_list(): self.set_edge_to(in_id, node_id) - - for out_id in node.outputs: + + for out_id in node.get_output_list(): self.set_edge_from(out_id, node_id) + def generate_ephemeral_edges(self, fileIdGen, num_of_edges): + file_ids = [fileIdGen.next_ephemeral_file_id() for _ in range(num_of_edges)] + self.add_edges(file_ids) + return [edge_fid.get_ident() for edge_fid in file_ids] def add_edges(self, edge_fids): for edge_fid in edge_fids: self.add_edge(edge_fid) - + def add_edge(self, edge_fid): fid_id = edge_fid.get_ident() assert(not fid_id in self.edges) @@ -743,14 +806,25 @@ def empty(self): ## ## In this case the stateless command is wrapped with wrap so we cannot actually tee the input (since we do not know apriori how many forks we have). ## However, we can actually write it to a file (not always worth performance wise) and then read it from all at once. 
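The generate_ephemeral_edges helper added above is what the later parallelization passes use to mint fresh intermediate edges in one call. A minimal self-contained model of its effect on the edges dictionary, with a toy counter standing in for the compiler's FileIdGen:

    # Minimal model: allocate n fresh ids, register each as a dangling edge
    # (no producer, no consumer yet), and hand the ids back to the caller.
    # ToyFileIdGen is an invented stand-in, not the compiler's FileIdGen.
    class ToyFileIdGen:
        def __init__(self):
            self._next = 0
        def next_ephemeral_file_id(self):
            self._next += 1
            return self._next

    def generate_ephemeral_edges(edges, fid_gen, num_of_edges):
        new_ids = [fid_gen.next_ephemeral_file_id() for _ in range(num_of_edges)]
        for edge_id in new_ids:
            edges[edge_id] = (edge_id, None, None)   # (fid, from_node, to_node)
        return new_ids

    edges = {}
    print(generate_ephemeral_edges(edges, ToyFileIdGen(), 3))   # [1, 2, 3]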
- ## - ## + ## + ## ## TODO: Eventually delete the fileIdGen from here and always use the graph internal one. ## ## TODO: Eventually this should be tunable to not happen for all inputs (but maybe for less) def parallelize_node(self, node_id, fileIdGen): + assert(False) node = self.get_node(node_id) - assert(node.is_parallelizable()) + # BEGIN ANNO + # OLD + # assert(node.is_parallelizable()) + # NEW + log(f'parallelizers: {node.parallelizer_list}') + rr_parallelizer_list = [parallelizer for parallelizer in node.parallelizer_list if parallelizer.splitter.is_splitter_round_robin()] + assert(len(rr_parallelizer_list) == 1) + rr_parallelizer = rr_parallelizer_list[0] + # to have this info later when the merger is created in a reduce tree + node.set_used_parallelizer(rr_parallelizer) + # END ANNO ## Initialize the new_node list new_nodes = [] @@ -765,13 +839,14 @@ def parallelize_node(self, node_id, fileIdGen): previous_node = self.get_node(previous_node_id) assert(isinstance(previous_node, Cat) or isinstance(previous_node, r_merge.RMerge)) - + ## Determine if the previous node is r_merge to determine which of the three parallelization cases to follow r_merge_flag = isinstance(previous_node, r_merge.RMerge) - ## If the previous node of r_merge is an r_split, then we need to replace it with -r, + ## If the previous node of r_merge is an r_split, then we need to replace it with -r, ## instead of doing unwraps. if(r_merge_flag): + assert(False) assert(isinstance(previous_node, r_merge.RMerge)) r_merge_prev_node_ids = self.get_previous_nodes(previous_node_id) @@ -784,16 +859,16 @@ def parallelize_node(self, node_id, fileIdGen): r_split_before_r_merge_opt_flag = all([isinstance(self.get_node(node_id), r_split.RSplit) for node_id in r_merge_prev_node_ids]) - ## If r_split was right before the r_merge, and the node is pure parallelizable, + ## If r_split was right before the r_merge, and the node is pure parallelizable, ## this means that we will not add unwraps, and therefore we need to add the -r flag to r_split. if (r_split_before_r_merge_opt_flag and node.is_pure_parallelizable()): assert(node.is_commutative()) r_split_id = r_merge_prev_node_ids[0] r_split_node = self.get_node(r_split_id) - + ## Add -r flag in r_split - r_split_node.add_r_flag() + r_split_node.add_r_flag() else: r_split_before_r_merge_opt_flag = False @@ -803,7 +878,7 @@ def parallelize_node(self, node_id, fileIdGen): parallelism = len(parallel_input_ids) ## Identify the output. - node_output_edge_ids = node.outputs + node_output_edge_ids = node.get_output_list() assert(len(node_output_edge_ids) == 1) node_output_edge_id = node_output_edge_ids[0] @@ -818,22 +893,31 @@ def parallelize_node(self, node_id, fileIdGen): parallel_configuration_ids = [[] for _ in range(parallelism)] node_conf_inputs = node.get_configuration_inputs() for conf_edge_id in node_conf_inputs: + assert(False) ## TODO: For now this does not work for r_merge assert(not r_merge_flag) # self.set_edge_to(conf_edge_id, None) tee_id = self.tee_edge(conf_edge_id, parallelism, fileIdGen) tee_node = self.get_node(tee_id) for i in range(parallelism): + # TODO outputs probably non-existent parallel_configuration_ids[i].append(tee_node.outputs[i]) - + ## Create a temporary output edge for each parallel command. 
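The r_merge/r_split handling below is easier to follow with the case split spelled out. This is only a paraphrase of the branches described by the comments in the surrounding hunk (names invented), not the literal control flow:

    # Paraphrase: how each parallel copy of the node is built, depending on the
    # merger currently feeding it and the node's properties.
    def parallel_copy_kind(prev_is_r_merge, is_stateless, is_commutative,
                           r_split_directly_before_r_merge):
        if not prev_is_r_merge:
            return "plain map copy (case 1: previous node is a cat)"
        if is_stateless:
            return "r_wrap around each map copy (case 2)"
        if is_commutative and r_split_directly_before_r_merge:
            return "plain map copy; add -r to the upstream r_split (case 3, optimized)"
        if is_commutative:
            return "r_unwrap before the commutative command (case 3)"
        return "not parallelized"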
- map_output_fids = node.get_map_output_files(parallel_input_ids, fileIdGen) + # BEGIN ANNO + # OLD + # map_output_fids = node.get_map_output_files(parallel_input_ids, fileIdGen) + # NEW (added parameter) + map_output_fids = get_map_output_files(node, parallel_input_ids, fileIdGen, rr_parallelizer) + # END ANNO + assert(len(map_output_fids) == len(parallel_input_ids)) all_map_output_ids = [] ## For each parallel input, create a parallel command for index in range(parallelism): ## Gather inputs and outputs conf_ins = parallel_configuration_ids[index] + assert(len(conf_ins) == 0) standard_in = parallel_input_ids[index] new_inputs = (conf_ins, [standard_in]) map_output_fid = map_output_fids[index] @@ -848,9 +932,10 @@ def parallelize_node(self, node_id, fileIdGen): for output_fid in output_fid_list: self.add_edge(output_fid) - ## If the previous merger is r_merge we need to put wrap around the nodes + ## If the previous merger is r_merge we need to put wrap around the nodes ## or unwrap before a commutative command if(r_merge_flag is True): + assert(False) ## For stateless nodes we are in case (2) and we wrap them if (node.is_stateless()): parallel_node = make_wrap_map_node(node, new_inputs, new_output_ids) @@ -888,23 +973,34 @@ def parallelize_node(self, node_id, fileIdGen): parallel_node = unwrap_node else: ## If we are working with a `cat` (and not an r_merge), then we just make a parallel node - parallel_node = make_map_node(node, new_inputs, new_output_ids) + parallel_node = make_map_node(node, new_inputs, new_output_ids, rr_parallelizer) self.add_node(parallel_node) parallel_node_id = parallel_node.get_id() ## Set the to of all input edges for conf_in in conf_ins: + assert(False) self.set_edge_to(conf_in, parallel_node_id) self.set_edge_to(standard_in, parallel_node_id) if (node.com_category == "stateless"): if(r_merge_flag is True): + assert(False) new_merger = r_merge.make_r_merge_node(flatten_list(all_map_output_ids), node_output_edge_id) else: - new_merger = make_cat_node(flatten_list(all_map_output_ids), node_output_edge_id) - + # BEGIN ANNO + # OLD + # new_merger = make_cat_node(flatten_list(all_map_output_ids), node_output_edge_id) + # log(f'old_new_merger: {new_merger}') + # NEW + log(f'node: {node}') + log(f'rr_parallelizer: {rr_parallelizer}') + new_merger = get_aggregator_as_dfg_node_from_node(node, rr_parallelizer, flatten_list(all_map_output_ids), [node_output_edge_id]) + log(f'new_new_merger: {new_merger}') + # END ANNO + self.add_node(new_merger) new_nodes.append(new_merger) self.set_edge_from(node_output_edge_id, new_merger.get_id()) @@ -977,7 +1073,7 @@ def edge_node_consistency(self): for edge_id, (_, from_node_id, to_node_id) in self.edges.items(): if (not from_node_id is None): from_node = self.get_node(from_node_id) - if(not (edge_id in from_node.outputs)): + if(not (edge_id in from_node.get_output_list())): log("Consistency Error: Edge id:", edge_id, "is not in the node outputs:", from_node) return False if (not to_node_id is None): @@ -992,7 +1088,7 @@ def edge_node_consistency(self): if(not (to_node_id == node_id)): log("Consistency Error: The to_node_id of the input_edge:", edge_id, "of the node:", node, "is equal to:", to_node_id) return False - for edge_id in node.outputs: + for edge_id in node.get_output_list(): _, from_node_id, _ = self.edges[edge_id] if(not (from_node_id == node_id)): log("Consistency Error: The from_node_id of the output_edge:", edge_id, "of the node:", node, "is equal to:", from_node_id) @@ -1015,4 +1111,3 @@ def valid(self): # and not 
self.get_stdin() is None # and not self.get_stdout() is None))) - diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index b5f881119..6426cce06 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -14,6 +14,7 @@ from definitions.ir.aggregator_node import * +from definitions.ir.dfg_node import DFGNode from definitions.ir.nodes.eager import * from definitions.ir.nodes.pash_split import * @@ -284,9 +285,10 @@ def naive_parallelize_stateless_nodes_bfs(graph, fan_out, batch_size, no_cat_spl next_node_ids = graph.get_next_nodes(curr_id) workset += next_node_ids - new_nodes = parallelize_cat(curr_id, graph, fileIdGen, - fan_out, batch_size, no_cat_split_vanish, - r_split_flag, r_split_batch_size) + # function application has side effects on graphs + new_nodes = parallelize_node(curr_id, graph, fileIdGen, + fan_out, batch_size, no_cat_split_vanish, + r_split_flag, r_split_batch_size) ## Assert that the graph stayed valid after the transformation ## TODO: Do not run this everytime in the loop if we are not in debug mode. @@ -413,71 +415,66 @@ def split_hdfs_cat_input(hdfs_cat, next_node, graph, fileIdGen): ## TODO: At the moment we greedily try to add r-splits if possible, so we need to have a better procedure of deciding whether to put them or not. ## For example for non-commutative pure commands. -## If the current command is a cat, and is followed by a node that -## is either stateless or pure parallelizable, commute the cat -## after the node. -def parallelize_cat(curr_id, graph, fileIdGen, fan_out, - batch_size, no_cat_split_vanish, r_split_flag, r_split_batch_size): +## This function takes a node (id) and parallelizes it +def parallelize_node(curr_id, graph, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_flag, r_split_batch_size): curr = graph.get_node(curr_id) new_nodes_for_workset = [] - # log("Check to parallelize curr:", curr) + option_parallelizer_rr = curr.get_option_implemented_round_robin_parallelizer() + + if option_parallelizer_rr is not None: + # TODO: this whole fragment could be moved to the graph after picking a parallelizer + # TODO: we only do consecutive chunks here but from a rr splitter + parallelizer_rr = option_parallelizer_rr + streaming_inputs = curr.get_streaming_inputs() + assert(len(streaming_inputs) == 1) + streaming_input = streaming_inputs[0] + configuration_inputs = curr.get_configuration_inputs() + assert(len(configuration_inputs) == 0) + streaming_outputs = curr.get_output_list() + assert(len(streaming_outputs) == 1) + streaming_output = streaming_outputs[0] + original_cmd_invocation_with_io_vars = curr.cmd_invocation_with_io_vars + + graph.remove_node(curr_id) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph + + out_split_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) + splitter = pash_split.make_split_file(streaming_input, out_split_ids) + graph.set_edge_to(streaming_input, splitter.get_id()) + for out_split_id in out_split_ids: + graph.set_edge_from(out_split_id, splitter.get_id()) + + + in_mapper_ids = out_split_ids + out_mapper_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) + zip_mapper_in_out_ids = zip(in_mapper_ids, out_mapper_ids) + + all_mappers = [] + for (in_id, out_id) in zip_mapper_in_out_ids: + # BEGIN: these 4 lines could be refactored to be a function in graph such that + # creating end point of edges and the creation of edges is not decoupled + mapper_cmd_inv = parallelizer_rr.get_actual_mapper(original_cmd_invocation_with_io_vars, in_id, out_id) + mapper = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(mapper_cmd_inv) + graph.set_edge_to(in_id, mapper.get_id()) + graph.set_edge_from(out_id, mapper.get_id()) + # END + all_mappers.append(mapper) + + in_aggregator_ids = out_mapper_ids + out_aggregator_id = streaming_output + ## TODO: This could potentially be an aggregator tree (at least in the old PaSh versions) + ## We need to extend the annotations/parallelizers to support this (e.g., for sort) + aggregator_cmd_inv = parallelizer_rr.get_actual_aggregator(original_cmd_invocation_with_io_vars, in_aggregator_ids, out_aggregator_id) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + for in_aggregator_id in in_aggregator_ids: + graph.set_edge_to(in_aggregator_id, aggregator.get_id()) + graph.set_edge_from(streaming_output, aggregator.get_id()) - ## Get next nodes in the graph - next_node_ids = graph.get_next_nodes(curr_id) - - ## We try to parallelize for all the edges that go out from the current node and into another node - for next_node_id in next_node_ids: - next_node = graph.get_node(next_node_id) - # log("|-- its next node is:", next_node) - new_curr = curr - new_curr_id = curr_id - - ## If the next node can be parallelized, then we should try to parallelize - ## - ## If the user has provided the r_split flag (they want to use r_split), - ## then parallelizability depends on commutativity (if a command is pure parallelizable but not commutative) - ## then it can't be parallelized. Therefore we do not parallelize non-commutative pure parallelizable commands. - ## - ## TODO: We need to extend PaSh to have a mode where it can have both r_splits and auto_split if a command is not - ## commutative. This can be added as an option to the r_split flag, e.g., r_split="no" | "yes" | "optimal". - if(next_node.is_parallelizable() - and not isinstance(next_node, Cat) - and (not r_split_flag - or (next_node.is_commutative() - or next_node.is_stateless()))): - ## If the current node is not a merger, it means that we need - ## to generate a merger using a splitter (auto_split or r_split) - if (isinstance(curr, HDFSCat) and config.pash_args.distributed_exec): - new_curr = split_hdfs_cat_input(curr, next_node, graph, fileIdGen) # Cat merger - new_curr_id = new_curr.get_id() - ## no_cat_split_vanish shortcircuits this and inserts a split even if the current node is a cat. 
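Whatever the command, the replacement subgraph built here has one shape: a splitter feeding fan_out mappers, whose outputs feed a single aggregator. A tiny self-contained rendering of that shape for fan_out = 3 (node and edge names invented; the real nodes come from pash_split.make_split_file, get_actual_mapper and get_actual_aggregator):

    fan_out = 3
    split_outs = [f"split_out_{i}" for i in range(fan_out)]
    map_outs = [f"map_out_{i}" for i in range(fan_out)]

    subgraph = [("splitter", ["streaming_input"], split_outs)]
    subgraph += [(f"mapper_{i}", [si], [mo])
                 for i, (si, mo) in enumerate(zip(split_outs, map_outs))]
    subgraph += [("aggregator", map_outs, ["streaming_output"])]

    for name, ins, outs in subgraph:
        print(f"{name}: {ins} -> {outs}")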
- elif (fan_out > 1 - and (no_cat_split_vanish - or (not (isinstance(curr, Cat) - or isinstance(curr, r_merge.RMerge)) - or ((isinstance(curr, Cat) - or isinstance(curr, r_merge.RMerge)) - and len(curr.get_input_list()) < fan_out)))): - new_merger = split_command_input(next_node, graph, fileIdGen, fan_out, batch_size, r_split_flag, r_split_batch_size) - ## After split has succeeded we know that the curr node (previous of the next) - ## has changed. Therefore we need to retrieve it again. - if (not new_merger is None): - new_curr_id = new_merger.get_id() - new_curr = new_merger - assert(isinstance(new_curr, Cat) - or isinstance(new_curr, r_merge.RMerge)) - - ## If curr is cat, it means that split suceeded, or it was - ## already a cat. In any case, we can proceed with the - ## parallelization. - ## - ## Both Cat and RMerge can be "commuted" with parallelizable nodes - if(isinstance(new_curr, Cat) - or isinstance(new_curr, r_merge.RMerge)): - new_nodes = check_parallelize_dfg_node(new_curr_id, next_node_id, graph, fileIdGen) - # log("New nodes:", new_nodes) - new_nodes_for_workset += new_nodes + ## Add the merge commands in the graph + new_nodes = [splitter] + all_mappers + [aggregator] + for new_node in new_nodes: + graph.add_node(new_node) return new_nodes_for_workset @@ -491,6 +488,7 @@ def parallelize_cat(curr_id, graph, fileIdGen, fan_out, ## ## TODO: We need to check if the previous node is a cat or a merge def check_parallelize_dfg_node(merger_id, node_id, graph, fileIdGen): + assert(False) ## Get merger inputs (cat or r_merge). merger_input_edge_ids = graph.get_node_input_ids(merger_id) @@ -511,6 +509,7 @@ def check_parallelize_dfg_node(merger_id, node_id, graph, fileIdGen): return new_nodes def parallelize_dfg_node(old_merger_id, node_id, graph, fileIdGen): + assert(False) node = graph.get_node(node_id) assert(node.is_parallelizable()) @@ -749,18 +748,18 @@ def add_eager_nodes(graph, use_dgsh_tee): add_eager(curr_input_id, graph, fileIdGen, intermediateFileIdGen, use_dgsh_tee) if(isinstance(curr, Split)): - eager_input_ids = curr.outputs[:-1] + eager_input_ids = curr.get_output_list()[:-1] for edge_id in eager_input_ids: add_eager(edge_id, graph, fileIdGen, intermediateFileIdGen, use_dgsh_tee) ## Add an eager after r_unwrap if(isinstance(curr, r_unwrap.RUnwrap)): - eager_input_id = curr.outputs[0] + eager_input_id = curr.get_output_list()[0] add_eager(eager_input_id, graph, fileIdGen, intermediateFileIdGen, use_dgsh_tee) ## Add an eager after r_split if(isinstance(curr, r_split.RSplit)): - eager_input_ids = curr.outputs + eager_input_ids = curr.get_output_list() for edge_id in eager_input_ids: add_eager(edge_id, graph, fileIdGen, intermediateFileIdGen, use_dgsh_tee) diff --git a/compiler/util.py b/compiler/util.py index a6b3857a9..1c4b30ace 100644 --- a/compiler/util.py +++ b/compiler/util.py @@ -1,4 +1,6 @@ from datetime import timedelta +from typing import Optional, TypeVar, Union, List, Any +TType = TypeVar("TType") import os import sys import config @@ -45,3 +47,16 @@ def ptempfile(): ## TODO: Get a name without opening the fd too if possible os.close(fd) return name + +def return_empty_list_if_none_else_itself(arg: Optional[TType]) -> Union[TType, List[Any]]: #list always empty + if arg is None: + return [] + else: + return arg + +def return_default_if_none_else_itself(arg: Optional[TType], default: TType) -> TType: + if arg is None: + return default + else: + return arg + From 8407ca51927d1a3b41ed4627db7c361cbdcaf48a Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Mon, 27 Jun 
2022 14:56:08 -0400 Subject: [PATCH 02/64] Add support for eager-nodes (#589) * Add support for eager-nodes Signed-off-by: Felix Stutz * Remove eager from TODO-list Signed-off-by: Felix Stutz * Addressed comments from PR Signed-off-by: Felix Stutz --- TODO.md | 2 +- compiler/definitions/ir/nodes/eager.py | 136 +++++-------------------- compiler/ir.py | 4 + compiler/pash_runtime.py | 10 +- 4 files changed, 37 insertions(+), 115 deletions(-) diff --git a/TODO.md b/TODO.md index 67b55f0ea..08bb235b8 100644 --- a/TODO.md +++ b/TODO.md @@ -1,9 +1,9 @@ ## TODOs before merging to `future` -- eager - aggregation trees - r_split - cat-split fusion +- dgsh_tee - working on all tests - Adding annotation library installation and removing ad-hoc import of the latter - clean up utils for annotations diff --git a/compiler/definitions/ir/nodes/eager.py b/compiler/definitions/ir/nodes/eager.py index 9cc37315f..2a5ee5aa9 100644 --- a/compiler/definitions/ir/nodes/eager.py +++ b/compiler/definitions/ir/nodes/eager.py @@ -1,118 +1,32 @@ +from datatypes_new.AccessKind import AccessKind +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars + from definitions.ir.dfg_node import * -from ir_utils import * class Eager(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, com_options = [], - com_redirs = [], com_assignments=[], - intermediate = None): - # BEGIN ANNO : hack for intermediate at the end - self.intermediate = intermediate - # END ANNO - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, + def __init__(self, + cmd_invocation_with_io_vars, + com_redirs=[], com_assignments=[] + ): + # TODO []: default + super().__init__( + cmd_invocation_with_io_vars, + com_redirs=com_redirs, com_assignments=com_assignments) - # BEGIN ANNO : copied from DFG node for hack for intermediate at the end - def to_ast(self, edges, drain_streams): - log(f'do we get here?') - ## TODO: We might not want to implement this at all actually - if (drain_streams): - raise NotImplementedError() - else: - ## Handle special node to ast here - # node = self.special_to_ast(edges) - # if node is not None: - # return node - - redirs = self._to_ast_aux_get_redirs() - assignments = self.com_assignments - ## Start filling in the arguments - opt_arguments = [] - # BEGIN ANNO - # get_command_invocation_prefix_from_dfg_node - log(f'com_name: {self.com_name}') - log(f'edges: {edges}') - log(f'inputs: {self.inputs}') - log(f'outputs: {self.outputs}') - log(f'com_redirs: {self.com_redirs}') - log(f'pos config: {self.positional_config_list}') - log(f'pos input: {self.positional_input_list}') - log(f'pos output: {self.positional_output_list}') - log(f'com_options: {self.com_options}') - log(f'flag_option_list: {self.flag_option_list}') - - # if self.implicit_use_of_stdin: # need to recompute - # cat a list of inputs into it; redirect a single one - # else: - - # OLD - # for i, opt in self.com_options: - # ## Pad the argument list with None - # opt_arguments = pad(opt_arguments, i) - # opt_arguments[i] = opt.to_ast() - # log(f'opt_arguments: {format_args([val for val in opt_arguments if val is not None])}') - # NEW - opt_arguments_new = [get_ast_for_flagoption(flagoption) for flagoption in self.flag_option_list] - opt_arguments_new += [get_ast_for_argstringtype(arg) for arg in self.positional_config_list] - log(f'opt_arguments_new: {format_args(opt_arguments_new)}') - # END ANNO - - com_name_ast = self.com_name.to_ast() - option_asts = [opt.to_ast() for 
_, opt in self.com_options] - - ## - ## 1. Find the input and output fids - ## 2. Construct the rest of the arguments and input/output redirections according to - ## the command IO - input_fids = [edges[in_id][0] for in_id in self.get_input_list()] - output_fids = [edges[out_id][0] for out_id in self.outputs] - rest_argument_fids, new_redirs = create_command_arguments_redirs(com_name_ast, - option_asts, - input_fids, - output_fids) - - ## Transform the rest of the argument fids to arguments - ## Since some of the rest_arguments can be None (they only contain inputs and outputs) - ## we need to make sure that we don't turn None objects to asts. - ## - ## The None fields need to be filtered out because they are taken care of by the interleave function. - ## - ## TODO: Is this actually OK? - rest_arguments = [fid.to_ast() - for fid in rest_argument_fids - if not fid is None] - log(f'rest_arguments: {format_args(rest_arguments)}') - - ## Interleave the arguments since options args might contain gaps. - # BEGIN ANNO - rest_arguments_backup = rest_arguments.copy() - # OLD - # arguments = interleave_args(opt_arguments, rest_arguments) - # log(f'arguments fin: {format_args(arguments)}') - # NEW - arguments_new = opt_arguments_new + rest_arguments_backup + [self.intermediate.to_ast()] - log(f'arguments_new: {format_args(arguments_new)}') - # END ANNO - - all_arguments = [com_name_ast] + arguments_new - all_redirs = redirs + new_redirs - - node = make_command(all_arguments, redirections=all_redirs, assignments=assignments) - return node - def make_eager_node(input_id, output_id, intermediate_file_id, eager_exec_path): - com_name = Arg(string_to_argument(eager_exec_path)) - com_category = "pure" - ## TODO: In theory the intermediate file id is also an output... - # BEGIN ANNO - # OLD - intermediate_identifier = Arg(intermediate_file_id.to_ast()) - com_options = [(2, intermediate_identifier)] - return Eager([input_id], - [output_id], - com_name, - com_category, - com_options=com_options, - intermediate=intermediate_identifier) + eager_name = eager_exec_path + intermediate_file_id_id = intermediate_file_id.get_ident() + operand_list = [input_id, output_id, intermediate_file_id_id] + access_map = {output_id: AccessKind.make_stream_output(), + input_id: AccessKind.make_stream_input(), + intermediate_file_id_id: AccessKind.make_other_output()} + cmd_inv_with_io_vars = CommandInvocationWithIOVars( + cmd_name=eager_name, + flag_option_list=[], + operand_list=operand_list, + implicit_use_of_streaming_input=None, + implicit_use_of_streaming_output=None, + access_map=access_map) + return Eager(cmd_inv_with_io_vars) diff --git a/compiler/ir.py b/compiler/ir.py index 641546cfc..d5a395462 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -72,6 +72,10 @@ def next_ephemeral_file_id(self): fileId.make_ephemeral() return fileId + def bump_counter_to_value_of(self, OtherFileIdGen): + # TODO: find a better solution to make unique numbers, currently: set to max-value + 1 + self.next = OtherFileIdGen.next + 1 + ## Returns the resource or file descriptor related to this specific opt_or_fd ## NOTE: Assumes that everything is expanded. 
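The rewritten make_eager_node above no longer needs a node-specific to_ast: it lists all three operands (input, output, intermediate buffer file) and records, per edge id, how the command accesses each one, so the generic back-translation can emit the right words and redirections. Illustratively, with invented ids and plain strings standing in for the AccessKind values:

    # Operand layout of the eager node: [input, output, intermediate buffer file].
    input_id, output_id, intermediate_id = 11, 12, 13
    operand_list = [input_id, output_id, intermediate_id]
    access_map = {
        input_id:        "stream input",    # read as the streaming input
        output_id:       "stream output",   # written as the streaming output
        intermediate_id: "other output",    # buffer file: written, but not a stream edge
    }
    print(operand_list, access_map)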
def get_option_or_fd(opt_or_fd, options, fileIdGen): diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index 6426cce06..06d66c129 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -218,8 +218,7 @@ def optimize_irs(asts_and_irs, args, compiler_config): args.no_cat_split_vanish, args.r_split, args.r_split_batch_size) # pr.print_stats() - # log(distributed_graph) - + # Eagers are added in remote notes when using distributed exec if(not args.no_eager and not args.distributed_exec): eager_distributed_graph = add_eager_nodes(distributed_graph, args.dgsh_tee) @@ -231,7 +230,6 @@ def optimize_irs(asts_and_irs, args, compiler_config): ## Print statistics of output nodes print_graph_statistics(eager_distributed_graph) - # log(eager_distributed_graph) optimized_asts_and_irs.append(eager_distributed_graph) else: @@ -686,12 +684,18 @@ def add_eager(eager_input_id, graph, fileIdGen, intermediateFileIdGen, use_dgsh_ new_id = new_fid.get_ident() if use_dgsh_tee: + assert(False) ## TODO: seperate to better use dgsh-tee params and maybe deprecate eager eager_node = dgsh_tee.make_dgsh_tee_node(eager_input_id, new_id) else: ## TODO: Remove the line below if eager creates its intermediate file ## on its own. + # TODO: find a better solution to make unique numbers, currently: set to max-value + 1 + intermediateFileIdGen.bump_counter_to_value_of(fileIdGen) intermediate_fid = intermediateFileIdGen.next_temporary_file_id() + # TODO: this edge will never have to since eager is set to output even though it reads from it + graph.add_edge(intermediate_fid) + fileIdGen.bump_counter_to_value_of(intermediateFileIdGen) eager_exec_path = '{}/{}'.format(config.PASH_TOP, runtime_config['eager_executable_path']) From d92cbba6fe30c6c9a703fb25818e6d76cc7c1911 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Mon, 27 Jun 2022 19:03:28 -0400 Subject: [PATCH 03/64] Add reduce/aggregation trees (#590) * Add reduce trees Signed-off-by: Felix Stutz * Remove task in TODO Signed-off-by: Felix Stutz * Remove log statements Signed-off-by: Felix Stutz --- TODO.md | 1 - .../annotations_utils/util_cmd_invocations.py | 11 --- compiler/ir.py | 89 +++++++++++++++-- compiler/pash_runtime.py | 98 +++++-------------- 4 files changed, 106 insertions(+), 93 deletions(-) diff --git a/TODO.md b/TODO.md index 08bb235b8..9edb7d878 100644 --- a/TODO.md +++ b/TODO.md @@ -1,6 +1,5 @@ ## TODOs before merging to `future` -- aggregation trees - r_split - cat-split fusion - dgsh_tee diff --git a/compiler/annotations_utils/util_cmd_invocations.py b/compiler/annotations_utils/util_cmd_invocations.py index 90e5f6c10..85cc36b41 100644 --- a/compiler/annotations_utils/util_cmd_invocations.py +++ b/compiler/annotations_utils/util_cmd_invocations.py @@ -9,8 +9,6 @@ from annotation_generation_new.AnnotationGeneration import get_input_output_info_from_cmd_invocation, \ get_parallelizability_info_from_cmd_invocation -from util import log - from config import get_path_annotation_repo sys.path.insert(1, get_path_annotation_repo()) @@ -26,18 +24,11 @@ def get_command_invocation_prefix_from_dfg_node(dfg_node): # TODO: ideally methods in the respective classes but requires refactoring of parsing infrastructure def to_node_cmd_inv_with_io_vars(cmd_inv, edges, redirs, assignments): - log("edges", edges) ast_cmd_name = string_to_argument(cmd_inv.cmd_name) - log("ast_cmd_name", ast_cmd_name) ast_flagoptions = [] for flagoption in cmd_inv.flag_option_list: ast_flagoptions += to_ast_flagoption(flagoption, edges) - log("flagoptions", 
cmd_inv.flag_option_list) - log("ast_flagoptions", ast_flagoptions) ast_operands = [to_ast_operand(operand, edges) for operand in cmd_inv.operand_list] - log("operands", cmd_inv.operand_list) - log("ast_operands", ast_operands) - # log("type of ast_operands [0]", type(ast_operands[0])) # can only be used if there are operands cmd_asts = [ast_cmd_name] + ast_flagoptions + ast_operands # TODO: check for actual stdin @@ -56,7 +47,6 @@ def to_node_cmd_inv_with_io_vars(cmd_inv, edges, redirs, assignments): new_redirs = redirs + stdin_redir + stdout_redir node = make_command(cmd_asts, redirections=new_redirs, assignments=assignments) - log("node", node) return node def to_ast_flagoption(flagoption, _edges): @@ -82,7 +72,6 @@ def to_ast_arg_string_type(arg_string_type): # assumes io_var is an edge id def dereference_io_var(io_var, edges): fid, _, _ = edges[io_var] - log(fid) return fid.to_ast() def get_input_output_info_from_cmd_invocation_util(cmd_invocationInitial : CommandInvocationInitial) -> InputOutputInfo: diff --git a/compiler/ir.py b/compiler/ir.py index d5a395462..0b8938c0f 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -289,8 +289,6 @@ def make_map_node(node, new_inputs, new_outputs, parallelizer): ## ## At the moment it only works with one input and one output since wrap cannot redirect input in the command. def make_wrap_map_node(node, new_inputs, new_outputs): - # log("Inputs:", new_inputs) - # log("Outputs:", new_outputs) assert(is_single_input(new_inputs)) assert(len(new_outputs) == 1) @@ -318,8 +316,6 @@ def __init__(self, nodes, edges, background = False): self.nodes = nodes self.edges = edges self.background = background - # log("Nodes:", self.nodes) - # log("Edges:", self.edges) ## Apply the redirections for each separate node. ## This needs to be called here because nodes do not @@ -822,7 +818,6 @@ def parallelize_node(self, node_id, fileIdGen): # OLD # assert(node.is_parallelizable()) # NEW - log(f'parallelizers: {node.parallelizer_list}') rr_parallelizer_list = [parallelizer for parallelizer in node.parallelizer_list if parallelizer.splitter.is_splitter_round_robin()] assert(len(rr_parallelizer_list) == 1) rr_parallelizer = rr_parallelizer_list[0] @@ -997,12 +992,8 @@ def parallelize_node(self, node_id, fileIdGen): # BEGIN ANNO # OLD # new_merger = make_cat_node(flatten_list(all_map_output_ids), node_output_edge_id) - # log(f'old_new_merger: {new_merger}') # NEW - log(f'node: {node}') - log(f'rr_parallelizer: {rr_parallelizer}') new_merger = get_aggregator_as_dfg_node_from_node(node, rr_parallelizer, flatten_list(all_map_output_ids), [node_output_edge_id]) - log(f'new_new_merger: {new_merger}') # END ANNO self.add_node(new_merger) @@ -1115,3 +1106,83 @@ def valid(self): # and not self.get_stdin() is None # and not self.get_stdout() is None))) + ## This is a function that creates a reduce tree for a given node + def create_generic_aggregator_tree(self, curr_node, parallelizer, input_ids_for_aggregators, out_aggregator_id, fileIdGen): + def function_to_get_binary_aggregator(in_ids, out_ids): + assert(len(out_ids) == 1) + aggregator_cmd_inv = parallelizer.get_actual_aggregator(curr_node.cmd_invocation_with_io_vars, in_ids, out_ids[0]) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + return aggregator + ## The Aggregator node takes a sequence of input ids and an output id + all_aggregators, new_edges, final_output_id = self.create_reduce_tree(lambda in_ids, out_ids: function_to_get_binary_aggregator(in_ids, out_ids), + 
input_ids_for_aggregators, fileIdGen) + ## Add the edges in the graph + self.add_edges(new_edges) + ## Add the merge commands in the graph + for new_node in all_aggregators: + self.add_node(new_node) + + ## Replace the previous final_output_id with the previous id + node_output_edge_id = out_aggregator_id + final_merge_node_id = self.edges[final_output_id][1] + final_merge_node = self.get_node(final_merge_node_id) + final_merge_node.replace_edge(final_output_id, node_output_edge_id) + self.set_edge_from(node_output_edge_id, final_merge_node_id) + self.set_edge_from(final_output_id, None) + + ## This function creates the reduce tree. Both input and output file + ## ids must be lists of lists, as the input file ids and the output + ## file ids might contain auxiliary files. + def create_reduce_tree(self, init_func, input_ids, fileIdGen): + tree = [] + new_edges = [] + curr_ids = input_ids + while(len(curr_ids) > 1): + new_level, curr_ids, new_fids = self.create_reduce_tree_level(init_func, curr_ids, fileIdGen) + tree += new_level + new_edges += new_fids + + # Find the final output (provided with parameter) + final_output_id = curr_ids[0][0] + + ## Drain the final auxiliary outputs + final_auxiliary_outputs = curr_ids[0][1:] + drain_fids = [fileIdGen.next_file_id() + for final_auxiliary_output in final_auxiliary_outputs] + for drain_fid in drain_fids: + drain_fid.set_resource(FileResource(Arg(string_to_argument('/dev/null')))) + new_edges.append(drain_fid) + drain_ids = [fid.get_ident() for fid in drain_fids] + + drain_cat_commands = [make_cat_node([final_auxiliary_output], drain_id) + for final_auxiliary_output, drain_id in zip(final_auxiliary_outputs, drain_ids)] + return (tree + drain_cat_commands), new_edges, final_output_id + + @staticmethod + ## This function creates a level of the reduce tree. Both input and + ## output file ids must be lists of lists, as the input file ids and + ## the output file ids might contain auxiliary files. 
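create_reduce_tree pairs up intermediate outputs level by level until a single output remains, carrying the first id over whenever a level has an odd number of inputs. The following is a self-contained model of just that pairing (plain ints instead of edge-id lists, a string instead of the binary aggregator node, and no auxiliary outputs):

    def reduce_tree_levels(ids):
        levels = []
        while len(ids) > 1:
            carry = [] if len(ids) % 2 == 0 else [ids[0]]
            rest = ids[len(carry):]
            ids = carry + [f"agg({a},{b})" for a, b in zip(rest[0::2], rest[1::2])]
            levels.append(ids)
        return levels

    print(reduce_tree_levels([1, 2, 3, 4, 5]))
    # first level: [1, 'agg(2,3)', 'agg(4,5)']; the last level is the single combined output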
+ def create_reduce_tree_level(init_func, input_ids, fileIdGen): + if(len(input_ids) % 2 == 0): + output_ids = [] + even_input_ids = input_ids + else: + output_ids = [input_ids[0]] + even_input_ids = input_ids[1:] + + new_fids = [] + level = [] + for i in range(0, len(even_input_ids), 2): + new_out_fids = [fileIdGen.next_ephemeral_file_id() for _ in input_ids[i]] + new_fids += new_out_fids + new_out_ids = [fid.get_ident() for fid in new_out_fids] + output_ids.append(new_out_ids) + new_node = IR.create_reduce_node(init_func, even_input_ids[i:i+2], new_out_ids) + level.append(new_node) + return (level, output_ids, new_fids) + + @staticmethod + ## This function creates one node of the reduce tree + def create_reduce_node(init_func, input_ids, output_ids): + return init_func(flatten_list(input_ids), output_ids) + # TODO: this is where we need to use our aggregator spec/node diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index 06d66c129..f8045d5e8 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -4,6 +4,8 @@ import traceback from datetime import datetime +from annotation_generation_new.datatypes.parallelizability.AggregatorKind import AggregatorKindEnum + import config from ir import * from ast_to_ir import compile_asts @@ -442,7 +444,7 @@ def parallelize_node(curr_id, graph, fileIdGen, fan_out, graph.set_edge_to(streaming_input, splitter.get_id()) for out_split_id in out_split_ids: graph.set_edge_from(out_split_id, splitter.get_id()) - + graph.add_node(splitter) in_mapper_ids = out_split_ids out_mapper_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) @@ -458,21 +460,32 @@ def parallelize_node(curr_id, graph, fileIdGen, fan_out, graph.set_edge_from(out_id, mapper.get_id()) # END all_mappers.append(mapper) + for new_node in all_mappers: + graph.add_node(new_node) in_aggregator_ids = out_mapper_ids out_aggregator_id = streaming_output ## TODO: This could potentially be an aggregator tree (at least in the old PaSh versions) ## We need to extend the annotations/parallelizers to support this (e.g., for sort) - aggregator_cmd_inv = parallelizer_rr.get_actual_aggregator(original_cmd_invocation_with_io_vars, in_aggregator_ids, out_aggregator_id) - aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) - for in_aggregator_id in in_aggregator_ids: - graph.set_edge_to(in_aggregator_id, aggregator.get_id()) - graph.set_edge_from(streaming_output, aggregator.get_id()) + aggregator_spec = parallelizer_rr.get_aggregator_spec() + aggregator_kind = aggregator_spec.get_kind() + if aggregator_kind == AggregatorKindEnum.CONCATENATE or aggregator_kind == AggregatorKindEnum.CUSTOM_N_ARY: + aggregator_cmd_inv = parallelizer_rr.get_actual_aggregator(original_cmd_invocation_with_io_vars, in_aggregator_ids, out_aggregator_id) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + for in_aggregator_id in in_aggregator_ids: + graph.set_edge_to(in_aggregator_id, aggregator.get_id()) + graph.set_edge_from(streaming_output, aggregator.get_id()) + all_aggregators = [aggregator] + ## Add the merge commands in the graph + for new_node in all_aggregators: + graph.add_node(new_node) + elif aggregator_kind == AggregatorKindEnum.CUSTOM_2_ARY: + # TODO: we simplify and assume that every mapper produces a single output for now: + map_in_aggregator_ids = [[id] for id in in_aggregator_ids] + graph.create_generic_aggregator_tree(curr, parallelizer_rr, map_in_aggregator_ids, out_aggregator_id, fileIdGen) + else: + raise 
Exception("aggregator kind not yet implemented") - ## Add the merge commands in the graph - new_nodes = [splitter] + all_mappers + [aggregator] - for new_node in new_nodes: - graph.add_node(new_node) return new_nodes_for_workset @@ -579,28 +592,24 @@ def parallelize_dfg_node(old_merger_id, node_id, graph, fileIdGen): ## ## TODO: Make that generic to work through annotations def create_merge_commands(curr, new_output_ids, fileIdGen): + assert(False) if(str(curr.com_name) == "uniq"): return create_uniq_merge_commands(curr, new_output_ids, fileIdGen) else: return create_generic_aggregator_tree(curr, new_output_ids, fileIdGen) -## This is a function that creates a reduce tree for a generic function -def create_generic_aggregator_tree(curr, new_output_ids, fileIdGen): - ## The Aggregator node takes a sequence of input ids and an output id - output = create_reduce_tree(lambda in_ids, out_ids: AggregatorNode(curr, in_ids, out_ids), - new_output_ids, fileIdGen) - return output - ## TODO: These must be generated using some file information ## ## TODO: Find a better place to put these functions def create_sort_merge_commands(curr, new_output_ids, fileIdGen): + assert(False) output = create_reduce_tree(lambda ids: SortGReduce(curr, ids), new_output_ids, fileIdGen) return output ## Instead of creating a tree, we just create a single level reducer for uniq def create_uniq_merge_commands(curr, new_output_ids, fileIdGen): + assert(False) ## Make an intermediate cat node intermediate_fid = fileIdGen.next_ephemeral_file_id() intermediate_id = intermediate_fid.get_ident() @@ -622,61 +631,6 @@ def create_uniq_merge_commands(curr, new_output_ids, fileIdGen): return ([new_cat, node], [intermediate_fid, new_out_fid], new_out_id) -## This function creates the reduce tree. Both input and output file -## ids must be lists of lists, as the input file ids and the output -## file ids might contain auxiliary files. -def create_reduce_tree(init_func, input_ids, fileIdGen): - tree = [] - new_edges = [] - curr_ids = input_ids - while(len(curr_ids) > 1): - new_level, curr_ids, new_fids = create_reduce_tree_level(init_func, curr_ids, fileIdGen) - tree += new_level - new_edges += new_fids - - ## Find the final output - final_output_id = curr_ids[0][0] - - ## Drain the final auxiliary outputs - final_auxiliary_outputs = curr_ids[0][1:] - drain_fids = [fileIdGen.next_file_id() - for final_auxiliary_output in final_auxiliary_outputs] - for drain_fid in drain_fids: - drain_fid.set_resource(FileResource(Arg(string_to_argument('/dev/null')))) - new_edges.append(drain_fid) - drain_ids = [fid.get_ident() for fid in drain_fids] - - drain_cat_commands = [make_cat_node([final_auxiliary_output], drain_id) - for final_auxiliary_output, drain_id in zip(final_auxiliary_outputs, drain_ids)] - return (tree + drain_cat_commands), new_edges, final_output_id - - -## This function creates a level of the reduce tree. Both input and -## output file ids must be lists of lists, as the input file ids and -## the output file ids might contain auxiliary files. 
-def create_reduce_tree_level(init_func, input_ids, fileIdGen): - if(len(input_ids) % 2 == 0): - output_ids = [] - even_input_ids = input_ids - else: - output_ids = [input_ids[0]] - even_input_ids = input_ids[1:] - - new_fids = [] - level = [] - for i in range(0, len(even_input_ids), 2): - new_out_fids = [fileIdGen.next_ephemeral_file_id() for _ in input_ids[i]] - new_fids += new_out_fids - new_out_ids = [fid.get_ident() for fid in new_out_fids] - output_ids.append(new_out_ids) - new_node = create_reduce_node(init_func, even_input_ids[i:i+2], new_out_ids) - level.append(new_node) - return (level, output_ids, new_fids) - -## This function creates one node of the reduce tree -def create_reduce_node(init_func, input_ids, output_ids): - return init_func(flatten_list(input_ids), output_ids) - ## This functions adds an eager on a given edge. def add_eager(eager_input_id, graph, fileIdGen, intermediateFileIdGen, use_dgsh_tee): From 2b9935d487fa716aa7a4bb7466f2e2953127c702 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Tue, 28 Jun 2022 16:09:34 -0400 Subject: [PATCH 04/64] Add support for round-robin parallelization, including unwrap for commutative commands (#591) Signed-off-by: Felix Stutz --- TODO.md | 4 +- .../annotations_utils/util_cmd_invocations.py | 57 ++++++++++- compiler/definitions/ir/dfg_node.py | 11 +-- compiler/definitions/ir/nodes/eager.py | 3 +- compiler/definitions/ir/nodes/r_merge.py | 38 +++++--- compiler/definitions/ir/nodes/r_unwrap.py | 39 +++++--- compiler/definitions/ir/nodes/r_wrap.py | 90 ++++++++++-------- compiler/ir.py | 2 +- compiler/ir_utils.py | 1 + compiler/pash_runtime.py | 94 +++++++++++++++++-- 10 files changed, 254 insertions(+), 85 deletions(-) diff --git a/TODO.md b/TODO.md index 9edb7d878..135a4cf03 100644 --- a/TODO.md +++ b/TODO.md @@ -1,8 +1,8 @@ ## TODOs before merging to `future` -- r_split -- cat-split fusion - dgsh_tee +- cat-split fusion +- r-unwrap-commutative fusion - working on all tests - Adding annotation library installation and removing ad-hoc import of the latter - clean up utils for annotations diff --git a/compiler/annotations_utils/util_cmd_invocations.py b/compiler/annotations_utils/util_cmd_invocations.py index 85cc36b41..19d2d6c0f 100644 --- a/compiler/annotations_utils/util_cmd_invocations.py +++ b/compiler/annotations_utils/util_cmd_invocations.py @@ -1,6 +1,6 @@ import sys -from datatypes_new.BasicDatatypes import Flag +from datatypes_new.BasicDatatypes import Flag, ArgStringType, Operand from datatypes_new.BasicDatatypesWithIO import OptionWithIO from datatypes_new.CommandInvocationInitial import CommandInvocationInitial from annotation_generation_new.datatypes.InputOutputInfo import InputOutputInfo @@ -8,8 +8,11 @@ from annotation_generation_new.datatypes.CommandProperties import CommandProperties from annotation_generation_new.AnnotationGeneration import get_input_output_info_from_cmd_invocation, \ get_parallelizability_info_from_cmd_invocation +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars from config import get_path_annotation_repo +from definitions.ir.arg import Arg + sys.path.insert(1, get_path_annotation_repo()) # for typing @@ -23,6 +26,7 @@ def get_command_invocation_prefix_from_dfg_node(dfg_node): positional_config_list = dfg_node.positional_config_list) # TODO: ideally methods in the respective classes but requires refactoring of parsing infrastructure +# TODO: isn't this `to_ast`? 
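The function defined just below rebuilds a shell command from a CommandInvocationWithIOVars by emitting the command name, flags/options and operands, and turning implicit streaming edges into redirections. Its effect can be shown with a throwaway string-based renderer (the real code produces AST nodes and resolves edge ids through the edges dictionary; file names here are invented):

    # Toy renderer: edge ids are looked up in `edges`; an implicit streaming output
    # becomes a `>` redirection, mirroring the stdout redirection added in the real code.
    def render(cmd_name, flags, operands, edges, implicit_stdout=None):
        words = [cmd_name] + flags + [edges.get(op, str(op)) for op in operands]
        redir = f" > {edges[implicit_stdout]}" if implicit_stdout is not None else ""
        return " ".join(words) + redir

    edges = {3: "part_1.txt", 4: "part_2.txt", 9: "merged.txt"}
    print(render("sort", ["-m"], [3, 4], edges, implicit_stdout=9))
    # sort -m part_1.txt part_2.txt > merged.txt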
def to_node_cmd_inv_with_io_vars(cmd_inv, edges, redirs, assignments): ast_cmd_name = string_to_argument(cmd_inv.cmd_name) ast_flagoptions = [] @@ -58,13 +62,24 @@ def to_ast_flagoption(flagoption, _edges): return [opt_name_ast, opt_arg_ast] def to_ast_operand(operand, edges): + if isinstance(operand, Operand): + return translate_io_var_if_applicable(operand.get_name(), edges) return translate_io_var_if_applicable(operand, edges) def translate_io_var_if_applicable(pot_io_var, edges): + # TODO: this is currently a hack but eventually every possible type gets their own to_ast-function if isinstance(pot_io_var, int): return dereference_io_var(pot_io_var, edges) - else: + elif isinstance(pot_io_var, ArgStringType): return to_ast_arg_string_type(pot_io_var) + elif isinstance(pot_io_var, CommandInvocationWithIOVars): + assert(False) + # only happens as r-wrapped node + return to_node_cmd_inv_with_io_vars(pot_io_var, edges, [], []) + elif isinstance(pot_io_var, Arg): + return pot_io_var.to_ast() + else: + raise Exception("Unhandled type for operand in to_ast!") def to_ast_arg_string_type(arg_string_type): return arg_string_type.get_name().arg_char_list # is of type Arg @@ -83,3 +98,41 @@ def get_parallelizability_info_from_cmd_invocation_util(cmd_invocationInitial : def construct_property_container_from_list_of_properties(list_properties): return CommandProperties(dict(list_properties)) +# this function is needed to wrap a node in `r_wrap` +def to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping(cmd_inv, edges): + # we already expand here + whole_cmd = Arg(string_to_argument("\'")) + arg_cmd_name = Arg(string_to_argument(cmd_inv.cmd_name)) + arg_flagoptions = [] + for flagoption in cmd_inv.flag_option_list: + arg_flagoptions += to_arg_flagoption(flagoption, edges) + arg_operands = [to_arg_operand(operand, edges) for operand in cmd_inv.operand_list] + all_cmd_parts_arg = [arg_cmd_name] + all_cmd_parts_arg.extend(arg_flagoptions) + all_cmd_parts_arg.extend(arg_operands) + for part in all_cmd_parts_arg: + whole_cmd.concatenate(part) + whole_cmd.concatenate(Arg(string_to_argument("\'"))) + return whole_cmd + +def to_arg_flagoption(flagoption, _edges): + if isinstance(flagoption, Flag): + return [Arg(string_to_argument(flagoption.get_name()))] + elif isinstance(flagoption, OptionWithIO): + opt_name_arg = Arg(string_to_argument(flagoption.get_name())) + opt_arg_arg = translate_io_var_to_arg_if_applicable(flagoption.get_arg()) + return [opt_name_arg, opt_arg_arg] + +def to_arg_operand(operand, edges): + if isinstance(operand, Operand): + return translate_io_var_to_arg_if_applicable(operand.get_name(), edges) + return translate_io_var_to_arg_if_applicable(operand, edges) + +def translate_io_var_to_arg_if_applicable(pot_io_var, edges): + if isinstance(pot_io_var, int): + return Arg(dereference_io_var(pot_io_var, edges)) + elif isinstance(pot_io_var, ArgStringType): + result = pot_io_var.get_name() # is of type Arg + return result + else: + raise Exception("Unhandled type for operand in to_arg!") diff --git a/compiler/definitions/ir/dfg_node.py b/compiler/definitions/ir/dfg_node.py index b9e990fad..3564e5928 100644 --- a/compiler/definitions/ir/dfg_node.py +++ b/compiler/definitions/ir/dfg_node.py @@ -111,16 +111,11 @@ def get_configuration_inputs(self): # return (self.com_category == "parallelizable_pure") def is_commutative(self): - # BEGIN ANNO - # OLD - # return ('commutative' in self.com_properties) - # NEW - val = 
self.cmd_related_properties.get_property_value('commutative') + val = self.cmd_related_properties.get_property_value('is_commutative') if val is not None: return val else: return False - # END ANNO ## kk: 2021-07-23 Not totally sure if that is generally correct. Tests will say ¯\_(ツ)_/¯ ## I think it assumes that new options can be added in the beginning if there are no options already @@ -215,7 +210,9 @@ def _to_ast_aux_get_redirs(self): ## TODO: Improve this function to be separately implemented for different special nodes, ## such as cat, eager, split, etc... - ## I do not think this is reasonable anymore since we remodelled nodes in a way that the back-translation is trivial + ## I do not think this separation is reasonable anymore since we remodelled nodes in a way that the back-translation is trivial + ## One exception: r_wrap; currently, the wrapped command is translated at creation of the r_wrap already and + ## hence assumes that non-streaming inputs/outputs will not change def to_ast(self, edges, drain_streams): ## TODO: We might not want to implement this at all actually if (drain_streams): diff --git a/compiler/definitions/ir/nodes/eager.py b/compiler/definitions/ir/nodes/eager.py index 2a5ee5aa9..ac49a576e 100644 --- a/compiler/definitions/ir/nodes/eager.py +++ b/compiler/definitions/ir/nodes/eager.py @@ -9,8 +9,7 @@ def __init__(self, com_redirs=[], com_assignments=[] ): # TODO []: default - super().__init__( - cmd_invocation_with_io_vars, + super().__init__(cmd_invocation_with_io_vars, com_redirs=com_redirs, com_assignments=com_assignments) diff --git a/compiler/definitions/ir/nodes/r_merge.py b/compiler/definitions/ir/nodes/r_merge.py index 4eee7285d..f587a94fc 100644 --- a/compiler/definitions/ir/nodes/r_merge.py +++ b/compiler/definitions/ir/nodes/r_merge.py @@ -1,18 +1,32 @@ +from datatypes_new.AccessKind import AccessKind +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars + from definitions.ir.dfg_node import * class RMerge(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, - com_options = [], com_redirs = [], com_assignments=[]): - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, - com_assignments=com_assignments) + def __init__(self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None): + # TODO []: default arguments! 
+ super().__init__(cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties) def make_r_merge_node(inputs, output): r_merge_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_merge_binary']) - com_name = Arg(string_to_argument(r_merge_bin)) - com_category = "pure" - return RMerge(inputs, - [output], - com_name, - com_category) + # TODO: assume that the inputs and output is provided as operands + access_map = {input_id: AccessKind.make_stream_input() for input_id in inputs} + access_map[output] = AccessKind.make_stream_output() + cmd_inv_with_io_vars = CommandInvocationWithIOVars( + cmd_name=r_merge_bin, + flag_option_list=[], + operand_list=inputs, + implicit_use_of_streaming_input=None, + implicit_use_of_streaming_output=output, + access_map=access_map) + return RMerge(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/nodes/r_unwrap.py b/compiler/definitions/ir/nodes/r_unwrap.py index f3baa6eae..38cb03dcc 100644 --- a/compiler/definitions/ir/nodes/r_unwrap.py +++ b/compiler/definitions/ir/nodes/r_unwrap.py @@ -1,20 +1,33 @@ +from datatypes_new.AccessKind import AccessKind +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars + from definitions.ir.dfg_node import * from ir_utils import * class RUnwrap(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, - com_options = [], com_redirs = [], com_assignments=[]): - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, - com_assignments=com_assignments) + def __init__(self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None): + # TODO []: default + super().__init__(cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties) def make_unwrap_node(inputs, output): - assert(is_single_input(inputs)) + assert(len(inputs) == 1) + input_id = inputs[0] + access_map = {input_id: AccessKind.make_stream_input(), output: AccessKind.make_stream_output()} r_unwrap_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_unwrap_binary']) - com_name = Arg(string_to_argument(r_unwrap_bin)) - com_category = "pure" - return RUnwrap(inputs, - [output], - com_name, - com_category) + cmd_inv_with_io_vars = CommandInvocationWithIOVars( + cmd_name=r_unwrap_bin, + flag_option_list=[], + operand_list=[], + implicit_use_of_streaming_input=input_id, + implicit_use_of_streaming_output=output, + access_map=access_map) + return RUnwrap(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/nodes/r_wrap.py b/compiler/definitions/ir/nodes/r_wrap.py index 62913b1d5..8fd44f6ca 100644 --- a/compiler/definitions/ir/nodes/r_wrap.py +++ b/compiler/definitions/ir/nodes/r_wrap.py @@ -1,66 +1,76 @@ +from datatypes_new.AccessKind import AccessKind +from datatypes_new.BasicDatatypes import ArgStringType +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars + +from annotations_utils.util_cmd_invocations import to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping from definitions.ir.dfg_node import * from ir_utils import * class RWrap(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, - com_options = [], com_redirs = [], com_assignments=[], 
wrapped_node_name=None): - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, - com_assignments=com_assignments) + def __init__(self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None, + wrapped_node_name=None): + # TODO []: default self.wrapped_node_name = wrapped_node_name - + super().__init__(cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties) + ## Get the label of the node. By default, it is simply the name def get_dot_label(self) -> str: ## The name could be a full path - name = self.com_name + name = self.cmd_invocation_with_io_vars.cmd_name basename = os.path.basename(str(name)) wrapped_node_name = self.wrapped_node_name return f'{basename}({wrapped_node_name})' -def wrap_node(node): +def wrap_node(node: DFGNode, edges): r_wrap_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_wrap_binary']) - com_name = Arg(string_to_argument(r_wrap_bin)) - ## TODO: Is it actually pure? What is it? - com_category = "pure" - ## At the moment we can only wrap a node that takes its input from stdin - ## and outputs to stdout. Therefore the node needs to have only one input and one output. - inputs = node.inputs - assert(is_single_input(inputs)) - outputs = node.outputs + ## At the moment we can only wrap a node that takes its input from stdin + ## and outputs to stdout. Therefore the node needs to have only one input and one output. + ## TO CHECK: with the remodelling also other cases should be handled + inputs = node.get_input_list() + assert(len(inputs) == 1) + input_id = inputs[0] + outputs = node.get_output_list() ## TODO: Would it make sense for outputs to be less than one? - assert(len(outputs) <= 1) - - ## TODO: For now we can only wrap stateless commands - assert(node.com_category == "stateless") - - ## TODO: All arguments must be options, otherwise there must be - ## special handling in the wrap node2ast code. - single_quote = Arg(string_to_argument("\'")) - cmd = Arg(string_to_argument("")) + ## TODO: changed this from <= to == 1 to simplify reasoning later for now + assert(len(outputs) == 1) + output_id = outputs[0] + access_map = {input_id: AccessKind.make_stream_input(), output_id: AccessKind.make_stream_output()} #create bash -c argument - cmd.concatenate(single_quote) - cmd.concatenate(node.com_name) - for i, opt in node.com_options: - cmd.concatenate(opt) - cmd.concatenate(single_quote) + cmd_inv_with_io_vars: CommandInvocationWithIOVars = node.cmd_invocation_with_io_vars + # do we need to copy here? currently, it seems fine + cmd_inv_with_io_vars.remove_streaming_inputs() + cmd_inv_with_io_vars.remove_streaming_outputs() + # any non-streaming inputs or outputs are converted here already! 
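To build the r_wrap invocation, the wrapped command is flattened into a single quoted word and handed to `bash -c`; r_wrap then runs that command over the round-robin blocks it reads from its streaming input. A rough, self-contained sketch of the flattening (the real code concatenates Arg values, after the streaming input and output have been removed from the invocation and any non-streaming file operands substituted):

    def flatten_for_wrapping(cmd_name, flags, operands):
        # single-quote the whole command so it survives as one bash -c argument
        return "'" + " ".join([cmd_name] + flags + operands) + "'"

    cmd = flatten_for_wrapping("grep", ["-v"], ["foo"])
    print("r_wrap", "bash -c", cmd)   # r_wrap bash -c 'grep -v foo'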
+ cmd = to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping(cmd_inv_with_io_vars, edges) + + bash_command_arg = [Arg(string_to_argument("bash -c"))] + operand_list = bash_command_arg + [cmd] - wrapped_command_arg = [(1, cmd)] - bash_command_arg = [(0, Arg(string_to_argument("bash -c")))] - options = bash_command_arg + wrapped_command_arg + cmd_inv_with_io_vars = CommandInvocationWithIOVars( + cmd_name=r_wrap_bin, + flag_option_list=[], + operand_list=operand_list, + implicit_use_of_streaming_input=input_id, + implicit_use_of_streaming_output=output_id, + access_map=access_map) ## TODO: It is not clear if it is safe to just pass redirections and assignments down the line as is redirs = node.com_redirs assignments = node.com_assignments - return RWrap(inputs, - outputs, - com_name, - com_category, - com_options=options, + return RWrap(cmd_inv_with_io_vars, com_redirs=redirs, com_assignments=assignments, - wrapped_node_name=node.com_name) + wrapped_node_name=node.cmd_invocation_with_io_vars.cmd_name) diff --git a/compiler/ir.py b/compiler/ir.py index 0b8938c0f..0e387ed47 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -289,7 +289,7 @@ def make_map_node(node, new_inputs, new_outputs, parallelizer): ## ## At the moment it only works with one input and one output since wrap cannot redirect input in the command. def make_wrap_map_node(node, new_inputs, new_outputs): - assert(is_single_input(new_inputs)) + assert(len(new_inputs) == 1) assert(len(new_outputs) == 1) new_node = make_map_node(node, new_inputs, new_outputs) diff --git a/compiler/ir_utils.py b/compiler/ir_utils.py index 32a5f474f..ef42e875f 100644 --- a/compiler/ir_utils.py +++ b/compiler/ir_utils.py @@ -145,6 +145,7 @@ def format_expanded_arg_char(arg_char): ## These functions check tuple inputs (configuration and streaming ones) def is_single_input(inputs): + assert(False) assert(isinstance(inputs, tuple)) conf_inputs = inputs[0] streaming_inputs = inputs[1] diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index f8045d5e8..9ec83aa09 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -421,11 +421,96 @@ def parallelize_node(curr_id, graph, fileIdGen, fan_out, curr = graph.get_node(curr_id) new_nodes_for_workset = [] + # TODO: this whole fragment could be moved to the graph after picking a parallelizer option_parallelizer_rr = curr.get_option_implemented_round_robin_parallelizer() + # for now, we use the `r_split_flag` here again: + if r_split_flag and option_parallelizer_rr is not None: + parallelizer_rr = option_parallelizer_rr + streaming_inputs = curr.get_streaming_inputs() + assert(len(streaming_inputs) == 1) + streaming_input = streaming_inputs[0] + configuration_inputs = curr.get_configuration_inputs() + assert(len(configuration_inputs) == 0) + streaming_outputs = curr.get_output_list() + assert(len(streaming_outputs) == 1) + streaming_output = streaming_outputs[0] + original_cmd_invocation_with_io_vars = curr.cmd_invocation_with_io_vars + + graph.remove_node(curr_id) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph + + out_split_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) + splitter = r_split.make_r_split(streaming_input, out_split_ids, r_split_batch_size) + graph.set_edge_to(streaming_input, splitter.get_id()) + for out_split_id in out_split_ids: + graph.set_edge_from(out_split_id, splitter.get_id()) + graph.add_node(splitter) + + in_mapper_ids = out_split_ids + out_mapper_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) + zip_mapper_in_out_ids = zip(in_mapper_ids, out_mapper_ids) + + aggregator_spec = parallelizer_rr.get_aggregator_spec() + aggregator_kind = aggregator_spec.get_kind() + if aggregator_kind == AggregatorKindEnum.CONCATENATE: # is turned into an r_merge + all_mappers = [] + for (in_id, out_id) in zip_mapper_in_out_ids: + # BEGIN: these 4 lines could be refactored to be a function in graph such that + # creating end point of edges and the creation of edges is not decoupled + mapper_cmd_inv = parallelizer_rr.get_actual_mapper(original_cmd_invocation_with_io_vars, in_id, out_id) + mapper = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(mapper_cmd_inv) + # add r_wrap here: + mapper_r_wrapped = r_wrap.wrap_node(mapper, graph.edges) + graph.set_edge_to(in_id, mapper_r_wrapped.get_id()) + graph.set_edge_from(out_id, mapper_r_wrapped.get_id()) + # END + all_mappers.append(mapper_r_wrapped) + for new_node in all_mappers: + graph.add_node(new_node) - if option_parallelizer_rr is not None: - # TODO: this whole fragment could be moved to the graph after picking a parallelizer - # TODO: we only do consecutive chunks here but from a rr splitter + in_aggregator_ids = out_mapper_ids + out_aggregator_id = streaming_output + aggregator = r_merge.make_r_merge_node(in_aggregator_ids, out_aggregator_id) + for in_aggregator_id in in_aggregator_ids: + graph.set_edge_to(in_aggregator_id, aggregator.get_id()) + graph.set_edge_from(streaming_output, aggregator.get_id()) + all_aggregators = [aggregator] + ## Add the merge commands in the graph + for new_node in all_aggregators: + graph.add_node(new_node) + elif curr.is_commutative(): # we can apply RR and do r_unwrap before the aggregator + all_mappers = [] + for (in_id, out_id) in zip_mapper_in_out_ids: + # generate ephemeral edge for wrap to unwrap + [wrap_to_unwrap_id] = graph.generate_ephemeral_edges(fileIdGen, 1) + # BEGIN: these 4 lines could be refactored to be a function in graph such that + # creating end point of edges and the creation of edges is not decoupled + mapper_cmd_inv = parallelizer_rr.get_actual_mapper(original_cmd_invocation_with_io_vars, in_id, wrap_to_unwrap_id) + mapper = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(mapper_cmd_inv) + # add r_wrap here: + mapper_r_wrapped = r_wrap.wrap_node(mapper, graph.edges) + graph.set_edge_to(in_id, mapper_r_wrapped.get_id()) + graph.set_edge_from(wrap_to_unwrap_id, mapper_r_wrapped.get_id()) + # add unwrap as the command is commutative + unwrap = r_unwrap.make_unwrap_node([wrap_to_unwrap_id], out_id) + graph.set_edge_to(wrap_to_unwrap_id, unwrap.get_id()) + graph.set_edge_from(out_id, unwrap.get_id()) + # END + all_mappers.append(mapper_r_wrapped) + all_mappers.append(unwrap) + for new_node in all_mappers: + graph.add_node(new_node) + + in_aggregator_ids = out_mapper_ids + out_aggregator_id = streaming_output + if aggregator_kind == AggregatorKindEnum.CUSTOM_2_ARY: + # TODO: we simplify and assume that every mapper produces a single output for now: + map_in_aggregator_ids = [[id] for id in 
in_aggregator_ids] + graph.create_generic_aggregator_tree(curr, parallelizer_rr, map_in_aggregator_ids, out_aggregator_id, + fileIdGen) + else: + raise Exception("aggregator kind not yet implemented") + elif option_parallelizer_rr is not None: # do consecutive chunks + # TODO: we do consecutive chunks here but from a rr splitter parallelizer_rr = option_parallelizer_rr streaming_inputs = curr.get_streaming_inputs() assert(len(streaming_inputs) == 1) @@ -465,8 +550,6 @@ def parallelize_node(curr_id, graph, fileIdGen, fan_out, in_aggregator_ids = out_mapper_ids out_aggregator_id = streaming_output - ## TODO: This could potentially be an aggregator tree (at least in the old PaSh versions) - ## We need to extend the annotations/parallelizers to support this (e.g., for sort) aggregator_spec = parallelizer_rr.get_aggregator_spec() aggregator_kind = aggregator_spec.get_kind() if aggregator_kind == AggregatorKindEnum.CONCATENATE or aggregator_kind == AggregatorKindEnum.CUSTOM_N_ARY: @@ -486,7 +569,6 @@ def parallelize_node(curr_id, graph, fileIdGen, fan_out, else: raise Exception("aggregator kind not yet implemented") - return new_nodes_for_workset ## TODO: Instead of moving a cat after a node, we need to parallelize cat, From 77da7070d63a463e8e30754baa03a07afa625a71 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Tue, 28 Jun 2022 17:31:07 -0400 Subject: [PATCH 05/64] Fix unwrap and commutative interplay (#593) Signed-off-by: Felix Stutz --- compiler/pash_runtime.py | 78 ++++++++++++---------------------------- 1 file changed, 23 insertions(+), 55 deletions(-) diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index 9ec83aa09..f3daa3378 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -426,32 +426,32 @@ def parallelize_node(curr_id, graph, fileIdGen, fan_out, # for now, we use the `r_split_flag` here again: if r_split_flag and option_parallelizer_rr is not None: parallelizer_rr = option_parallelizer_rr - streaming_inputs = curr.get_streaming_inputs() - assert(len(streaming_inputs) == 1) - streaming_input = streaming_inputs[0] - configuration_inputs = curr.get_configuration_inputs() - assert(len(configuration_inputs) == 0) - streaming_outputs = curr.get_output_list() - assert(len(streaming_outputs) == 1) - streaming_output = streaming_outputs[0] - original_cmd_invocation_with_io_vars = curr.cmd_invocation_with_io_vars - - graph.remove_node(curr_id) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph - - out_split_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) - splitter = r_split.make_r_split(streaming_input, out_split_ids, r_split_batch_size) - graph.set_edge_to(streaming_input, splitter.get_id()) - for out_split_id in out_split_ids: - graph.set_edge_from(out_split_id, splitter.get_id()) - graph.add_node(splitter) - - in_mapper_ids = out_split_ids - out_mapper_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) - zip_mapper_in_out_ids = zip(in_mapper_ids, out_mapper_ids) - aggregator_spec = parallelizer_rr.get_aggregator_spec() aggregator_kind = aggregator_spec.get_kind() if aggregator_kind == AggregatorKindEnum.CONCATENATE: # is turned into an r_merge + streaming_inputs = curr.get_streaming_inputs() + assert(len(streaming_inputs) == 1) + streaming_input = streaming_inputs[0] + configuration_inputs = curr.get_configuration_inputs() + assert(len(configuration_inputs) == 0) + streaming_outputs = curr.get_output_list() + assert(len(streaming_outputs) == 1) + streaming_output = streaming_outputs[0] + original_cmd_invocation_with_io_vars = curr.cmd_invocation_with_io_vars + + graph.remove_node(curr_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph + + out_split_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) + splitter = r_split.make_r_split(streaming_input, out_split_ids, r_split_batch_size) + graph.set_edge_to(streaming_input, splitter.get_id()) + for out_split_id in out_split_ids: + graph.set_edge_from(out_split_id, splitter.get_id()) + graph.add_node(splitter) + + in_mapper_ids = out_split_ids + out_mapper_ids = graph.generate_ephemeral_edges(fileIdGen, fan_out) + zip_mapper_in_out_ids = zip(in_mapper_ids, out_mapper_ids) + all_mappers = [] for (in_id, out_id) in zip_mapper_in_out_ids: # BEGIN: these 4 lines could be refactored to be a function in graph such that @@ -477,38 +477,6 @@ def parallelize_node(curr_id, graph, fileIdGen, fan_out, ## Add the merge commands in the graph for new_node in all_aggregators: graph.add_node(new_node) - elif curr.is_commutative(): # we can apply RR and do r_unwrap before the aggregator - all_mappers = [] - for (in_id, out_id) in zip_mapper_in_out_ids: - # generate ephemeral edge for wrap to unwrap - [wrap_to_unwrap_id] = graph.generate_ephemeral_edges(fileIdGen, 1) - # BEGIN: these 4 lines could be refactored to be a function in graph such that - # creating end point of edges and the creation of edges is not decoupled - mapper_cmd_inv = parallelizer_rr.get_actual_mapper(original_cmd_invocation_with_io_vars, in_id, wrap_to_unwrap_id) - mapper = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(mapper_cmd_inv) - # add r_wrap here: - mapper_r_wrapped = r_wrap.wrap_node(mapper, graph.edges) - graph.set_edge_to(in_id, mapper_r_wrapped.get_id()) - graph.set_edge_from(wrap_to_unwrap_id, mapper_r_wrapped.get_id()) - # add unwrap as the command is commutative - unwrap = r_unwrap.make_unwrap_node([wrap_to_unwrap_id], out_id) - graph.set_edge_to(wrap_to_unwrap_id, unwrap.get_id()) - graph.set_edge_from(out_id, unwrap.get_id()) - # END - all_mappers.append(mapper_r_wrapped) - all_mappers.append(unwrap) - for new_node in all_mappers: - graph.add_node(new_node) - - in_aggregator_ids = out_mapper_ids - out_aggregator_id = streaming_output - if aggregator_kind == AggregatorKindEnum.CUSTOM_2_ARY: - # TODO: we simplify and assume that every mapper produces a single output for now: - map_in_aggregator_ids 
= [[id] for id in in_aggregator_ids] - graph.create_generic_aggregator_tree(curr, parallelizer_rr, map_in_aggregator_ids, out_aggregator_id, - fileIdGen) - else: - raise Exception("aggregator kind not yet implemented") elif option_parallelizer_rr is not None: # do consecutive chunks # TODO: we do consecutive chunks here but from a rr splitter parallelizer_rr = option_parallelizer_rr From a1c87a5b03c84df47d8e1058a78cdda89ed34e20 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Tue, 28 Jun 2022 17:58:21 -0400 Subject: [PATCH 06/64] Add support for dgsh_tee nodes (#594) Signed-off-by: Felix Stutz --- TODO.md | 1 + .../annotations_utils/util_cmd_invocations.py | 4 +- compiler/definitions/ir/dfg_node.py | 6 +- compiler/definitions/ir/nodes/dgsh_tee.py | 66 ++++++++++++++----- compiler/pash_runtime.py | 1 - 5 files changed, 56 insertions(+), 22 deletions(-) diff --git a/TODO.md b/TODO.md index 135a4cf03..b28ed019d 100644 --- a/TODO.md +++ b/TODO.md @@ -6,4 +6,5 @@ - working on all tests - Adding annotation library installation and removing ad-hoc import of the latter - clean up utils for annotations +- graphviz - Changing PaSh flags (making the default be priority r-split and then consecutive chunks), so remove the r_split flag and make defaults be the ones from the OSDI paper (I can do that TODO too) \ No newline at end of file diff --git a/compiler/annotations_utils/util_cmd_invocations.py b/compiler/annotations_utils/util_cmd_invocations.py index 19d2d6c0f..d624affe6 100644 --- a/compiler/annotations_utils/util_cmd_invocations.py +++ b/compiler/annotations_utils/util_cmd_invocations.py @@ -53,12 +53,12 @@ def to_node_cmd_inv_with_io_vars(cmd_inv, edges, redirs, assignments): node = make_command(cmd_asts, redirections=new_redirs, assignments=assignments) return node -def to_ast_flagoption(flagoption, _edges): +def to_ast_flagoption(flagoption, edges): if isinstance(flagoption, Flag): return [string_to_argument(flagoption.get_name())] elif isinstance(flagoption, OptionWithIO): # retype to IOVar opt_name_ast = string_to_argument(flagoption.get_name()) - opt_arg_ast = translate_io_var_if_applicable(flagoption.get_arg()) + opt_arg_ast = translate_io_var_if_applicable(flagoption.get_arg(), edges) return [opt_name_ast, opt_arg_ast] def to_ast_operand(operand, edges): diff --git a/compiler/definitions/ir/dfg_node.py b/compiler/definitions/ir/dfg_node.py index 3564e5928..45ea066d0 100644 --- a/compiler/definitions/ir/dfg_node.py +++ b/compiler/definitions/ir/dfg_node.py @@ -211,8 +211,10 @@ def _to_ast_aux_get_redirs(self): ## TODO: Improve this function to be separately implemented for different special nodes, ## such as cat, eager, split, etc... 
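For orientation: after the remodelling, the generic back-translation that the comment below describes presumably reduces to delegating to `to_node_cmd_inv_with_io_vars` from `util_cmd_invocations`, in the same shape as the dgsh_tee-specific `to_ast` added later in this commit. A hedged sketch (assumed shape only; the actual body in `dfg_node.py`, including its `drain_streams` handling, is not part of this hunk):

```python
# Hedged sketch of the generic DFGNode.to_ast after the remodelling
# (assumed shape; the real drain_streams handling is omitted here).
def to_ast(self, edges, drain_streams):
    if drain_streams:
        raise NotImplementedError()
    redirs = self._to_ast_aux_get_redirs()
    assignments = self.com_assignments
    return to_node_cmd_inv_with_io_vars(self.cmd_invocation_with_io_vars,
                                        edges, redirs, assignments)
```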
## I do not think this separation is reasonable anymore since we remodelled nodes in a way that the back-translation is trivial - ## One exception: r_wrap; currently, the wrapped command is translated at creation of the r_wrap already and - ## hence assumes that non-streaming inputs/outputs will not change + ## Two exceptions: + ## - r_wrap; currently, the wrapped command is translated at creation of the r_wrap already and + ## hence assumes that non-streaming inputs/outputs will not change + ## - dgsh_tee: it requires the operands to appear before the flags/options (not XBD standard compliant) def to_ast(self, edges, drain_streams): ## TODO: We might not want to implement this at all actually if (drain_streams): diff --git a/compiler/definitions/ir/nodes/dgsh_tee.py b/compiler/definitions/ir/nodes/dgsh_tee.py index 772d79e73..f449e4aa1 100644 --- a/compiler/definitions/ir/nodes/dgsh_tee.py +++ b/compiler/definitions/ir/nodes/dgsh_tee.py @@ -1,27 +1,59 @@ +from datatypes_new.AccessKind import AccessKind +from datatypes_new.BasicDatatypes import Flag, ArgStringType +from datatypes_new.BasicDatatypesWithIO import OptionWithIO +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars + +from annotations_utils.util_cmd_invocations import to_ast_flagoption, to_ast_operand from definitions.ir.dfg_node import * class DGSHTee(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, com_options = [], - com_redirs = [], com_assignments=[]): - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, + def __init__(self, + cmd_invocation_with_io_vars, + com_redirs=[], com_assignments=[] + ): + # TODO []: default + super().__init__(cmd_invocation_with_io_vars, + com_redirs=com_redirs, com_assignments=com_assignments) + # TODO: this is only needed since dgsh.sh does not comply with the XBD standard + def to_ast(self, edges, drain_streams): + if (drain_streams): + raise NotImplementedError() + else: + redirs = self._to_ast_aux_get_redirs() + assignments = self.com_assignments + node = to_node_cmd_inv_with_io_vars_for_dgsh_tee(self.cmd_invocation_with_io_vars, edges, redirs, assignments) + return node + def make_dgsh_tee_node(input_id, output_id): dgsh_tee_bin = os.path.join(config.PASH_TOP, config.config['runtime']['dgsh_tee_binary']) - com_name = Arg(string_to_argument(dgsh_tee_bin)) - com_category = "pure" + operand_list = [input_id, output_id] + access_map = {output_id: AccessKind.make_stream_output(), + input_id: AccessKind.make_stream_input()} + + flag_option_list = [Flag("-I"), + Flag("-f"), + OptionWithIO("-b", ArgStringType(Arg(string_to_argument(str(config.config['runtime']['dgsh_buffer_size'])))))] - ## TODO: add as command line arguments - com_options = [(2, Arg(string_to_argument("-I")))] # Eager functionality - com_options.append((3, Arg(string_to_argument("-f")))) # use file on disk when buffer reaches maximum - com_options.append((4, Arg(string_to_argument(f"-b {config.config['runtime']['dgsh_buffer_size']}")))) # set buffer size - # com_options.append(4, Arg(string_to_argument("−m batch_size"))) # set the + cmd_inv_with_io_vars = CommandInvocationWithIOVars( + cmd_name=dgsh_tee_bin, + flag_option_list=flag_option_list, + operand_list=operand_list, + implicit_use_of_streaming_input=None, + implicit_use_of_streaming_output=None, + access_map=access_map) + return DGSHTee(cmd_inv_with_io_vars) - return DGSHTee([input_id], - [output_id], - com_name, - com_category, - com_options=com_options) +def 
to_node_cmd_inv_with_io_vars_for_dgsh_tee(cmd_inv, edges, redirs, assignments): + ast_cmd_name = string_to_argument(cmd_inv.cmd_name) + ast_flagoptions = [] + for flagoption in cmd_inv.flag_option_list: + ast_flagoptions += to_ast_flagoption(flagoption, edges) + ast_operands = [to_ast_operand(operand, edges) for operand in cmd_inv.operand_list] + # This is where it differs ... in the order + cmd_asts = [ast_cmd_name] + ast_operands + ast_flagoptions + # we omit stuff for stdin and stdout as we know it does not exist + node = make_command(cmd_asts, redirections=redirs, assignments=assignments) + return node diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index f3daa3378..bf730919b 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -688,7 +688,6 @@ def add_eager(eager_input_id, graph, fileIdGen, intermediateFileIdGen, use_dgsh_ new_id = new_fid.get_ident() if use_dgsh_tee: - assert(False) ## TODO: seperate to better use dgsh-tee params and maybe deprecate eager eager_node = dgsh_tee.make_dgsh_tee_node(eager_input_id, new_id) else: From 034f41dc341d1c647854eeeff74abdce0fe1a4e1 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Wed, 29 Jun 2022 09:50:12 -0400 Subject: [PATCH 07/64] Update TODOs before merging to future Signed-off-by: Felix Stutz --- TODO.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TODO.md b/TODO.md index b28ed019d..9cbc2104a 100644 --- a/TODO.md +++ b/TODO.md @@ -1,10 +1,10 @@ ## TODOs before merging to `future` -- dgsh_tee +- separate checking and application of parallelization - cat-split fusion - r-unwrap-commutative fusion - working on all tests - Adding annotation library installation and removing ad-hoc import of the latter - clean up utils for annotations - graphviz -- Changing PaSh flags (making the default be priority r-split and then consecutive chunks), so remove the r_split flag and make defaults be the ones from the OSDI paper (I can do that TODO too) \ No newline at end of file +- Changing PaSh flags (making the default be priority r-split and then consecutive chunks), so remove the r_split flag and make defaults be the ones from the OSDI paper \ No newline at end of file From f6a3df14c6e87c678b819bb115df62bfc926ba13 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Wed, 29 Jun 2022 10:39:17 -0400 Subject: [PATCH 08/64] Modify dgsh-wrapper to not require operands before options but pass options for input and output directly (#595) Signed-off-by: Felix Stutz --- compiler/definitions/ir/dfg_node.py | 5 ++-- compiler/definitions/ir/nodes/dgsh_tee.py | 29 ++++------------------- runtime/dgsh_tee.sh | 12 ++++++---- 3 files changed, 14 insertions(+), 32 deletions(-) diff --git a/compiler/definitions/ir/dfg_node.py b/compiler/definitions/ir/dfg_node.py index 45ea066d0..f09c17303 100644 --- a/compiler/definitions/ir/dfg_node.py +++ b/compiler/definitions/ir/dfg_node.py @@ -211,10 +211,9 @@ def _to_ast_aux_get_redirs(self): ## TODO: Improve this function to be separately implemented for different special nodes, ## such as cat, eager, split, etc... 
## I do not think this separation is reasonable anymore since we remodelled nodes in a way that the back-translation is trivial - ## Two exceptions: + ## One exception: ## - r_wrap; currently, the wrapped command is translated at creation of the r_wrap already and - ## hence assumes that non-streaming inputs/outputs will not change - ## - dgsh_tee: it requires the operands to appear before the flags/options (not XBD standard compliant) + ## hence assumes that non-streaming inputs/outputs will not change; with a special to_ast, we could circumvent this def to_ast(self, edges, drain_streams): ## TODO: We might not want to implement this at all actually if (drain_streams): diff --git a/compiler/definitions/ir/nodes/dgsh_tee.py b/compiler/definitions/ir/nodes/dgsh_tee.py index f449e4aa1..c417b8f58 100644 --- a/compiler/definitions/ir/nodes/dgsh_tee.py +++ b/compiler/definitions/ir/nodes/dgsh_tee.py @@ -16,44 +16,23 @@ def __init__(self, com_redirs=com_redirs, com_assignments=com_assignments) - # TODO: this is only needed since dgsh.sh does not comply with the XBD standard - def to_ast(self, edges, drain_streams): - if (drain_streams): - raise NotImplementedError() - else: - redirs = self._to_ast_aux_get_redirs() - assignments = self.com_assignments - node = to_node_cmd_inv_with_io_vars_for_dgsh_tee(self.cmd_invocation_with_io_vars, edges, redirs, assignments) - return node - def make_dgsh_tee_node(input_id, output_id): dgsh_tee_bin = os.path.join(config.PASH_TOP, config.config['runtime']['dgsh_tee_binary']) - operand_list = [input_id, output_id] access_map = {output_id: AccessKind.make_stream_output(), input_id: AccessKind.make_stream_input()} - flag_option_list = [Flag("-I"), + flag_option_list = [OptionWithIO("-i", input_id), + OptionWithIO("-o", output_id), + Flag("-I"), Flag("-f"), OptionWithIO("-b", ArgStringType(Arg(string_to_argument(str(config.config['runtime']['dgsh_buffer_size'])))))] cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=dgsh_tee_bin, flag_option_list=flag_option_list, - operand_list=operand_list, + operand_list=[], implicit_use_of_streaming_input=None, implicit_use_of_streaming_output=None, access_map=access_map) return DGSHTee(cmd_inv_with_io_vars) - -def to_node_cmd_inv_with_io_vars_for_dgsh_tee(cmd_inv, edges, redirs, assignments): - ast_cmd_name = string_to_argument(cmd_inv.cmd_name) - ast_flagoptions = [] - for flagoption in cmd_inv.flag_option_list: - ast_flagoptions += to_ast_flagoption(flagoption, edges) - ast_operands = [to_ast_operand(operand, edges) for operand in cmd_inv.operand_list] - # This is where it differs ... 
in the order - cmd_asts = [ast_cmd_name] + ast_operands + ast_flagoptions - # we omit stuff for stdin and stdout as we know it does not exist - node = make_command(cmd_asts, redirections=redirs, assignments=assignments) - return node diff --git a/runtime/dgsh_tee.sh b/runtime/dgsh_tee.sh index 7fc992a7b..ce4ab4081 100755 --- a/runtime/dgsh_tee.sh +++ b/runtime/dgsh_tee.sh @@ -1,8 +1,9 @@ #!/usr/bin/env bash -input=${1?"ERROR: dgsh-tee: No input file given"} -output=${2?"ERROR: dgsh-tee: No output file given"} -args=("${@:3}") +# input and output properly provided in original args already now +# input=${1?"ERROR: dgsh-tee: No input file given"} +# output=${2?"ERROR: dgsh-tee: No output file given"} +args=("${@:1}") # Set a default DISH_TOP in this directory if it doesn't exist PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} @@ -18,4 +19,7 @@ PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} # $PASH_TOP/runtime/dgsh-tee -i "$input" -o "$output" $args & # dgsh_tee_pid=$! # wait $dgsh_tee_pid -"$PASH_TOP"/runtime/dgsh-tee -i "$input" -o "$output" "${args[@]}" +#"$PASH_TOP"/runtime/dgsh-tee -i "$input" -o "$output" "${args[@]}" + +# input and output properly provided in original args already now +"$PASH_TOP"/runtime/dgsh-tee "${args[@]}" From 558dc45cac61709c7adfea7df49af3cdb8be3fdb Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Wed, 29 Jun 2022 11:04:03 -0400 Subject: [PATCH 09/64] Remove dgsh-tee wrapper and call dgsh-tee directly (#596) Signed-off-by: Felix Stutz --- compiler/config.json | 2 +- runtime/dgsh_tee.sh | 25 ------------------------- 2 files changed, 1 insertion(+), 26 deletions(-) delete mode 100755 runtime/dgsh_tee.sh diff --git a/compiler/config.json b/compiler/config.json index 2968b663d..4bed7462b 100644 --- a/compiler/config.json +++ b/compiler/config.json @@ -6,7 +6,7 @@ "r_merge_binary": "runtime/r_merge", "r_wrap_binary": "runtime/r_wrap", "r_unwrap_binary": "runtime/r_unwrap", - "dgsh_tee_binary": "runtime/dgsh_tee.sh", + "dgsh_tee_binary": "runtime/dgsh-tee", "remote_read_binary": "runtime/dspash/remote_read.sh", "remote_write_binary": "runtime/dspash/remote_write.sh", "dfs_split_reader_binary": "runtime/dspash/dfs_split_reader.sh", diff --git a/runtime/dgsh_tee.sh b/runtime/dgsh_tee.sh deleted file mode 100755 index ce4ab4081..000000000 --- a/runtime/dgsh_tee.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -# input and output properly provided in original args already now -# input=${1?"ERROR: dgsh-tee: No input file given"} -# output=${2?"ERROR: dgsh-tee: No output file given"} -args=("${@:1}") - -# Set a default DISH_TOP in this directory if it doesn't exist -PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} - -# TODO: Doable check if this is still needed. Turned off for distributed exection. -# PR https://github.com/binpash/pash/pull/495 might've resolved it. -# cleanup() -# { -# kill -SIGTERM $dgsh_tee_pid > /dev/null 2>&1 -# } -# trap cleanup EXIT - -# $PASH_TOP/runtime/dgsh-tee -i "$input" -o "$output" $args & -# dgsh_tee_pid=$! 
-# wait $dgsh_tee_pid -#"$PASH_TOP"/runtime/dgsh-tee -i "$input" -o "$output" "${args[@]}" - -# input and output properly provided in original args already now -"$PASH_TOP"/runtime/dgsh-tee "${args[@]}" From a77d4eef2739a59010fbd58a90157ae122f7b84f Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Thu, 30 Jun 2022 08:56:54 -0400 Subject: [PATCH 10/64] Separate parallelization into choose and apply phases (#597) * Refactor parallelizing transformations in separate choose and apply phases Signed-off-by: Felix Stutz --- TODO.md | 3 +- compiler/definitions/ir/dfg_node.py | 33 ++++--- compiler/ir.py | 145 +++++++++++++++++++++++++++- compiler/pash_runtime.py | 54 ++++++++++- 4 files changed, 217 insertions(+), 18 deletions(-) diff --git a/TODO.md b/TODO.md index 9cbc2104a..35ffdfc11 100644 --- a/TODO.md +++ b/TODO.md @@ -7,4 +7,5 @@ - Adding annotation library installation and removing ad-hoc import of the latter - clean up utils for annotations - graphviz -- Changing PaSh flags (making the default be priority r-split and then consecutive chunks), so remove the r_split flag and make defaults be the ones from the OSDI paper \ No newline at end of file +- Changing PaSh flags (making the default be priority r-split and then consecutive chunks), so remove the r_split flag and make defaults be the ones from the OSDI paper +- Remove code which got obsolete due to the changes \ No newline at end of file diff --git a/compiler/definitions/ir/dfg_node.py b/compiler/definitions/ir/dfg_node.py index f09c17303..e3e56631c 100644 --- a/compiler/definitions/ir/dfg_node.py +++ b/compiler/definitions/ir/dfg_node.py @@ -305,24 +305,31 @@ def replace_edge_in_list(self, edge_ids, from_id, to_id): new_edge_ids.append(new_edge_id) return new_edge_ids - def set_used_parallelizer(self, parallelizer): - assert(False) - # TODO: instantiate in __init__ already in some way - self.used_parallelizer = parallelizer - - def get_used_parallelizer(self): - assert(False) - return self.used_parallelizer - def get_option_implemented_round_robin_parallelizer(self): for parallelizer in self.parallelizer_list: splitter = parallelizer.get_splitter() - mapper_spec = parallelizer.get_mapper_spec() - aggregator_spec = parallelizer.get_aggregator_spec() - if splitter.is_splitter_round_robin() and mapper_spec.is_implemented and aggregator_spec.is_implemented: + if splitter.is_splitter_round_robin() and parallelizer.are_all_parts_implemented(): + return parallelizer + return None + + def get_option_implemented_consecutive_chunks_parallelizer(self): + for parallelizer in self.parallelizer_list: + splitter = parallelizer.get_splitter() + if splitter.is_splitter_consec_chunks() and parallelizer.are_all_parts_implemented(): return parallelizer return None @staticmethod def make_simple_dfg_node_from_cmd_inv_with_io_vars(cmd_inv_with_io_vars): - return DFGNode(cmd_inv_with_io_vars) \ No newline at end of file + return DFGNode(cmd_inv_with_io_vars) + + def get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization(self): + streaming_inputs = self.get_streaming_inputs() + assert (len(streaming_inputs) == 1) + streaming_input = streaming_inputs[0] + configuration_inputs = self.get_configuration_inputs() + assert (len(configuration_inputs) == 0) + streaming_outputs = self.get_output_list() + assert (len(streaming_outputs) == 1) + streaming_output = streaming_outputs[0] + return streaming_input, streaming_output, configuration_inputs diff --git a/compiler/ir.py b/compiler/ir.py index 0e387ed47..11e305bf5 100644 --- 
a/compiler/ir.py +++ b/compiler/ir.py @@ -774,6 +774,147 @@ def add_edge(self, edge_fid): def empty(self): return (len(self.nodes) == 0) + def apply_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_batch_size): + splitter = parallelizer.get_splitter() + if splitter.is_splitter_round_robin(): + # TODO: for both functions, check which parameters are needed + self.apply_round_robin_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_batch_size) + elif splitter.is_splitter_consec_chunks(): + self.apply_consecutive_chunks_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_batch_size) + else: + raise Exception("Splitter not yet implemented") + + def apply_round_robin_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_batch_size): + # TODO: this control flow should move done to aggregators once we implement them; + # currently, this cannot be done since splitter etc. would be added... + aggregator_spec = parallelizer.get_aggregator_spec() + if aggregator_spec.is_aggregator_spec_adj_lines_merge(): + raise Exception("adj_lines_merge not yet implemented in PaSh") + elif aggregator_spec.is_aggregator_spec_adj_lines_seq(): + raise Exception("adj_lines_seq not yet implemented in PaSh") + elif aggregator_spec.is_aggregator_spec_adj_lines_func(): + raise Exception("adj_lines_func not yet implemented in PaSh") + # END of what to move + + node = self.get_node(node_id) + # get info from node, and delete it from graph + streaming_input, streaming_output, configuration_inputs = \ + node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() + original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars + self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph + + # splitter + round_robin_splitter_generator = lambda input_id, output_ids: r_split.make_r_split(input_id, output_ids, r_split_batch_size) + out_split_ids = self.introduce_splitter(round_robin_splitter_generator, fan_out, fileIdGen, streaming_input) + + # mappers + in_mapper_ids = out_split_ids + out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, + parallelizer) + + # aggregator(s) + self.introduce_aggregator_for_round_robin(out_mapper_ids, parallelizer, streaming_output) + + def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_batch_size): + node = self.get_node(node_id) + streaming_input, streaming_output, configuration_inputs = \ + node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() + original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars + self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph + + # splitter + consec_chunks_splitter_generator = lambda input_id, output_ids: pash_split.make_split_file(input_id, output_ids) + out_split_ids = self.introduce_splitter(consec_chunks_splitter_generator, fan_out, fileIdGen, streaming_input) + + # mappers + in_mapper_ids = out_split_ids + out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, + parallelizer) + + # aggregators + in_aggregator_ids = out_mapper_ids + out_aggregator_id = streaming_output + self.introduce_aggregators_for_consec_chunks(fileIdGen, in_aggregator_ids, + original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, + streaming_output) + + def introduce_splitter(self, splitter_generator, fan_out, fileIdGen, streaming_input): + out_split_ids = self.generate_ephemeral_edges(fileIdGen, fan_out) + splitter = splitter_generator(streaming_input, out_split_ids) + self.set_edge_to(streaming_input, splitter.get_id()) + for out_split_id in out_split_ids: + self.set_edge_from(out_split_id, splitter.get_id()) + self.add_node(splitter) + return out_split_ids + + def introduce_mappers(self, fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, parallelizer): + out_mapper_ids = self.generate_ephemeral_edges(fileIdGen, fan_out) + zip_mapper_in_out_ids = zip(in_mapper_ids, out_mapper_ids) + all_mappers = [] + for (in_id, out_id) in zip_mapper_in_out_ids: + # BEGIN: these 4 lines could be refactored to be a function in graph such that + # creating end point of edges and the creation of edges is not decoupled + mapper_cmd_inv = parallelizer.get_actual_mapper(original_cmd_invocation_with_io_vars, in_id, out_id) + mapper = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(mapper_cmd_inv) + self.set_edge_to(in_id, mapper.get_id()) + self.set_edge_from(out_id, mapper.get_id()) + # END + splitter = parallelizer.get_splitter() + if splitter.is_splitter_round_robin(): + mapper_r_wrapped = r_wrap.wrap_node(mapper, self.edges) + self.set_edge_to(in_id, mapper_r_wrapped.get_id()) + self.set_edge_from(out_id, mapper_r_wrapped.get_id()) + mapper = mapper_r_wrapped + all_mappers.append(mapper) + for new_node in all_mappers: + self.add_node(new_node) + return out_mapper_ids + + def introduce_aggregators_for_consec_chunks(self, fileIdGen, in_aggregator_ids, + original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, + streaming_output): + aggregator_spec = parallelizer.get_aggregator_spec() + if aggregator_spec.is_aggregator_spec_concatenate() or aggregator_spec.is_aggregator_spec_custom_n_ary(): + aggregator_cmd_inv = parallelizer.get_actual_aggregator(original_cmd_invocation_with_io_vars, + in_aggregator_ids, out_aggregator_id) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + for in_aggregator_id in in_aggregator_ids: + self.set_edge_to(in_aggregator_id, aggregator.get_id()) + self.set_edge_from(streaming_output, aggregator.get_id()) + all_aggregators = [aggregator] + ## Add the merge commands in the graph + for new_node in all_aggregators: + self.add_node(new_node) + elif aggregator_spec.is_aggregator_spec_custom_2_ary(): + # TODO: we simplify and assume that every mapper produces a single output for now + map_in_aggregator_ids = [[id] for id in in_aggregator_ids] + # TODO: turn node into cmd_invocation_with_io_vars since this is the only thing required in this function + self.create_generic_aggregator_tree(original_cmd_invocation_with_io_vars, 
parallelizer, map_in_aggregator_ids, out_aggregator_id, fileIdGen) + else: + raise Exception("aggregator kind not yet implemented") + + def introduce_aggregator_for_round_robin(self, out_mapper_ids, parallelizer, streaming_output): + aggregator_spec = parallelizer.get_aggregator_spec() + if aggregator_spec.is_aggregator_spec_concatenate(): + in_aggregator_ids = out_mapper_ids + out_aggregator_id = streaming_output + aggregator = r_merge.make_r_merge_node(in_aggregator_ids, out_aggregator_id) + for in_aggregator_id in in_aggregator_ids: + self.set_edge_to(in_aggregator_id, aggregator.get_id()) + self.set_edge_from(streaming_output, aggregator.get_id()) + all_aggregators = [aggregator] + ## Add the aggregator node(s) in the graph + for new_node in all_aggregators: + self.add_node(new_node) + else: + # TODO: this is where the other cases for aggregators need to be added + pass + ## This function parallelizes a merger followed by a parallelizable node ## @@ -1107,10 +1248,10 @@ def valid(self): # and not self.get_stdout() is None))) ## This is a function that creates a reduce tree for a given node - def create_generic_aggregator_tree(self, curr_node, parallelizer, input_ids_for_aggregators, out_aggregator_id, fileIdGen): + def create_generic_aggregator_tree(self, cmd_invocation_with_io_vars, parallelizer, input_ids_for_aggregators, out_aggregator_id, fileIdGen): def function_to_get_binary_aggregator(in_ids, out_ids): assert(len(out_ids) == 1) - aggregator_cmd_inv = parallelizer.get_actual_aggregator(curr_node.cmd_invocation_with_io_vars, in_ids, out_ids[0]) + aggregator_cmd_inv = parallelizer.get_actual_aggregator(cmd_invocation_with_io_vars, in_ids, out_ids[0]) aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) return aggregator ## The Aggregator node takes a sequence of input ids and an output id diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index bf730919b..698cff210 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -215,7 +215,7 @@ def optimize_irs(asts_and_irs, args, compiler_config): # log(ir_node) # with cProfile.Profile() as pr: - distributed_graph = naive_parallelize_stateless_nodes_bfs(ast_or_ir, compiler_config.width, + distributed_graph = choose_and_apply_parallelizing_transformations(ast_or_ir, compiler_config.width, runtime_config['batch_size'], args.no_cat_split_vanish, args.r_split, args.r_split_batch_size) @@ -252,6 +252,54 @@ def print_graph_statistics(graph): log("Cat nodes:", len(cat_nodes)) log("Eager nodes:", len(eager_nodes)) + +def choose_and_apply_parallelizing_transformations(graph, fan_out, batch_size, no_cat_split_vanish, + r_split_flag, r_split_batch_size): + parallelizer_map = choose_parallelizing_transformations(graph, r_split_flag) + apply_parallelizing_transformations(graph, parallelizer_map, fan_out, batch_size, no_cat_split_vanish, + r_split_flag, r_split_batch_size) + return graph + + +def choose_parallelizing_transformations(graph, r_split_flag): # shall return map + source_node_ids = graph.source_nodes() + parallelizer_map = {} + workset = source_node_ids + visited = set() + # We apply a modified BFS such that we ensure that we know which parallelizer was chosen for all previous nodes + # and assume that the decision for any subsequent node will exploit any potential synergy effects + while (len(workset) > 0): + curr_id = workset.pop(0) + assert(isinstance(curr_id, int)) + all_previous_nodes_visited = all(prev in visited for prev in graph.get_previous_nodes(curr_id)) + if not 
all_previous_nodes_visited: + workset.append(curr_id) + elif not curr_id in visited: + next_node_ids = graph.get_next_nodes(curr_id) + workset += next_node_ids + parallelizer_map[curr_id] = choose_parallelizing_transformation(curr_id, graph, r_split_flag) + visited.add(curr_id) + return parallelizer_map + + +def choose_parallelizing_transformation(curr_id, graph, r_split_flag): # shall return map entry + # TODO: here we can implement more sophisticated techniques to decide how to parallelize + curr = graph.get_node(curr_id) + if r_split_flag: + option_parallelizer = curr.get_option_implemented_round_robin_parallelizer() + else: + option_parallelizer = curr.get_option_implemented_consecutive_chunks_parallelizer() + return option_parallelizer + + +def apply_parallelizing_transformations(graph, parallelizer_map, fan_out, batch_size, no_cat_split_vanish, + r_split_flag, r_split_batch_size): + fileIdGen = graph.get_file_id_gen() + node_id_non_none_parallelizer_list = [(node_id, parallelizer) for (node_id, parallelizer) in parallelizer_map.items() + if parallelizer is not None] + for (node_id, parallelizer) in node_id_non_none_parallelizer_list: + graph.apply_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_batch_size) ## This is a simplistic planner, that pushes the available ## parallelization from the inputs in file stateless commands. The ## planner starts from the sources of the graph, and pushes @@ -261,6 +309,7 @@ def print_graph_statistics(graph): ## be scheduled depending on the available computational resources. def naive_parallelize_stateless_nodes_bfs(graph, fan_out, batch_size, no_cat_split_vanish, r_split_flag, r_split_batch_size): + assert(False) source_node_ids = graph.source_nodes() ## Generate a fileIdGen from a graph, that doesn't clash with the @@ -418,6 +467,7 @@ def split_hdfs_cat_input(hdfs_cat, next_node, graph, fileIdGen): ## This function takes a node (id) and parallelizes it def parallelize_node(curr_id, graph, fileIdGen, fan_out, batch_size, no_cat_split_vanish, r_split_flag, r_split_batch_size): + assert(False) curr = graph.get_node(curr_id) new_nodes_for_workset = [] @@ -533,7 +583,7 @@ def parallelize_node(curr_id, graph, fileIdGen, fan_out, elif aggregator_kind == AggregatorKindEnum.CUSTOM_2_ARY: # TODO: we simplify and assume that every mapper produces a single output for now: map_in_aggregator_ids = [[id] for id in in_aggregator_ids] - graph.create_generic_aggregator_tree(curr, parallelizer_rr, map_in_aggregator_ids, out_aggregator_id, fileIdGen) + graph.create_generic_aggregator_tree(original_cmd_invocation_with_io_vars, parallelizer_rr, map_in_aggregator_ids, out_aggregator_id, fileIdGen) else: raise Exception("aggregator kind not yet implemented") From f1221ffc21f887a679d2955b965322906f0b8e03 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Thu, 30 Jun 2022 10:45:19 -0400 Subject: [PATCH 11/64] Fuse cat and subsequent split (#599) * Fuse cat and subsequent split Signed-off-by: Felix Stutz --- TODO.md | 1 - compiler/ir.py | 40 +++++++++++++++++++++++++--------------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/TODO.md b/TODO.md index 35ffdfc11..5725529a9 100644 --- a/TODO.md +++ b/TODO.md @@ -1,6 +1,5 @@ ## TODOs before merging to `future` -- separate checking and application of parallelization - cat-split fusion - r-unwrap-commutative fusion - working on all tests diff --git a/compiler/ir.py b/compiler/ir.py index 11e305bf5..795893e0a 100644 --- a/compiler/ir.py +++ 
b/compiler/ir.py @@ -1,28 +1,18 @@ -# BEGIN ANNO import sys from config import get_path_annotation_repo sys.path.insert(1, get_path_annotation_repo()) -# for typing from datatypes_new.CommandInvocationInitial import CommandInvocationInitial from datatypes_new.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo from annotation_generation_new.datatypes.InputOutputInfo import InputOutputInfo from annotation_generation_new.datatypes.ParallelizabilityInfo import ParallelizabilityInfo from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars -# for use -# -- - from annotations_utils.util_parsing import parse_arg_list_to_command_invocation from annotations_utils.util_cmd_invocations import get_input_output_info_from_cmd_invocation_util, get_parallelizability_info_from_cmd_invocation_util from annotations_utils.util_mapper import get_mapper_as_dfg_node_from_node, get_map_output_files from annotations_utils.util_aggregator import get_aggregator_as_dfg_node_from_node from annotations_utils.util_file_descriptors import resource_from_file_descriptor -# END ANNO - -# BEGIN REMODEL - -# END REMODEL from definitions.ir.file_id import * from definitions.ir.nodes.cat import * @@ -816,23 +806,43 @@ def apply_round_robin_parallelization_to_node(self, node_id, parallelizer, fileI out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, parallelizer) - # aggregator(s) + # aggregator self.introduce_aggregator_for_round_robin(out_mapper_ids, parallelizer, streaming_output) def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, batch_size, no_cat_split_vanish, r_split_batch_size): + # check whether we can fuse with previous node's parallelization: + # we can do so if the previous node's parallelization is the same, and the aggregator is concatenation + # Assumption: it suffices to check that the previous node is an aggregator node of type concatenate + # as this is unique for consecutive chunk parallelization (for now, this is true) node = self.get_node(node_id) streaming_input, streaming_output, configuration_inputs = \ node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars + + prev_nodes = self.get_previous_nodes(node_id) + assert(len(prev_nodes) > 0) + # get info about first one but also ensure that it is the only one if we fuse + first_pred_id = prev_nodes[0] + first_pred_node = self.get_node(first_pred_id) + first_pred_cmd_inv = first_pred_node.cmd_invocation_with_io_vars + + # remove node to be parallelized self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph - # splitter - consec_chunks_splitter_generator = lambda input_id, output_ids: pash_split.make_split_file(input_id, output_ids) - out_split_ids = self.introduce_splitter(consec_chunks_splitter_generator, fan_out, fileIdGen, streaming_input) + if len(prev_nodes) == 1 and first_pred_cmd_inv.is_aggregator_concatenate(): + # can be fused + self.remove_node(first_pred_id) # also sets respective edge to's and from's to None + in_mapper_ids = first_pred_cmd_inv.operand_list + else: # cannot be fused so introduce splitter + # splitter + consec_chunks_splitter_generator = lambda input_id, output_ids: pash_split.make_split_file(input_id, + output_ids) + out_split_ids = self.introduce_splitter(consec_chunks_splitter_generator, fan_out, fileIdGen, + streaming_input) + in_mapper_ids = out_split_ids # mappers - in_mapper_ids = out_split_ids out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, parallelizer) From 334091f6c8b18a841c0d01ee7d5c66b9a35dc93f Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Thu, 30 Jun 2022 11:12:38 -0400 Subject: [PATCH 12/64] Fuse r_merge and subsequent r_split (#600) Signed-off-by: Felix Stutz --- compiler/ir.py | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/compiler/ir.py b/compiler/ir.py index 795893e0a..2bd6fccb1 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -795,14 +795,24 @@ def apply_round_robin_parallelization_to_node(self, node_id, parallelizer, fileI streaming_input, streaming_output, configuration_inputs = \ node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars + + prev_nodes = self.get_previous_nodes(node_id) + first_pred_node, first_pred_cmd_inv = self.get_first_previous_node_and_first_previous_cmd_invocation(prev_nodes) + + # remove node to be parallelized self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph - # splitter - round_robin_splitter_generator = lambda input_id, output_ids: r_split.make_r_split(input_id, output_ids, r_split_batch_size) - out_split_ids = self.introduce_splitter(round_robin_splitter_generator, fan_out, fileIdGen, streaming_input) + if len(prev_nodes) == 1 and isinstance(first_pred_node, r_merge.RMerge): + # can be fused + self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None + in_mapper_ids = first_pred_cmd_inv.operand_list + else: # cannot be fused so introduce splitter + # splitter + round_robin_splitter_generator = lambda input_id, output_ids: r_split.make_r_split(input_id, output_ids, r_split_batch_size) + out_split_ids = self.introduce_splitter(round_robin_splitter_generator, fan_out, fileIdGen, streaming_input) + in_mapper_ids = out_split_ids # mappers - in_mapper_ids = out_split_ids out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, parallelizer) @@ -821,18 +831,15 @@ def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars prev_nodes = self.get_previous_nodes(node_id) - assert(len(prev_nodes) > 0) - # get info about first one but also ensure that it is the only one if we fuse - first_pred_id = prev_nodes[0] - first_pred_node = self.get_node(first_pred_id) - first_pred_cmd_inv = first_pred_node.cmd_invocation_with_io_vars + first_pred_node, first_pred_cmd_inv = self.get_first_previous_node_and_first_previous_cmd_invocation(prev_nodes) # remove node to be parallelized self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph + # TODO: change to check on Node (first_pred_node) and not cmd_inv if len(prev_nodes) == 1 and first_pred_cmd_inv.is_aggregator_concatenate(): # can be fused - self.remove_node(first_pred_id) # also sets respective edge to's and from's to None + self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None in_mapper_ids = first_pred_cmd_inv.operand_list else: # cannot be fused so introduce splitter # splitter @@ -853,6 +860,14 @@ def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, streaming_output) + def get_first_previous_node_and_first_previous_cmd_invocation(self, prev_nodes): + assert (len(prev_nodes) > 0) + # get info about first one but also ensure that it is the only one if we fuse + first_pred_id = prev_nodes[0] + first_pred_node = self.get_node(first_pred_id) + first_pred_cmd_inv = first_pred_node.cmd_invocation_with_io_vars + return first_pred_node, first_pred_cmd_inv + def introduce_splitter(self, splitter_generator, fan_out, fileIdGen, streaming_input): out_split_ids = self.generate_ephemeral_edges(fileIdGen, fan_out) splitter = splitter_generator(streaming_input, out_split_ids) From 58fbd29a4a31cdba2f66319e0ffdc6e63d62b60a Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Thu, 30 Jun 2022 11:55:37 -0400 Subject: [PATCH 13/64] Fuse r_merge and subsequent commutative command (#601) Signed-off-by: Felix Stutz --- compiler/ir.py | 19 ++++++++++++++++++- compiler/pash_runtime.py | 18 ++++++++++++------ 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/compiler/ir.py b/compiler/ir.py index 2bd6fccb1..c2f36543e 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -836,11 
+836,17 @@ def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer # remove node to be parallelized self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph - # TODO: change to check on Node (first_pred_node) and not cmd_inv + # TODO: change first check to first_pred_node and not cmd_inv if len(prev_nodes) == 1 and first_pred_cmd_inv.is_aggregator_concatenate(): # can be fused self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None in_mapper_ids = first_pred_cmd_inv.operand_list + elif len(prev_nodes) == 1 and isinstance(first_pred_node, r_merge.RMerge) and node.is_commutative(): + self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None + + in_unwrap_ids = first_pred_cmd_inv.operand_list + out_unwrap_ids = self.introduce_unwraps(fileIdGen, in_unwrap_ids) + in_mapper_ids = out_unwrap_ids else: # cannot be fused so introduce splitter # splitter consec_chunks_splitter_generator = lambda input_id, output_ids: pash_split.make_split_file(input_id, @@ -900,6 +906,17 @@ def introduce_mappers(self, fan_out, fileIdGen, in_mapper_ids, original_cmd_invo self.add_node(new_node) return out_mapper_ids + def introduce_unwraps(self, fileIdGen, in_unwrap_ids): + unwrap_to_commutative_mappers_ids = self.generate_ephemeral_edges(fileIdGen, len(in_unwrap_ids)) + in_out_unwrap_ids = zip(in_unwrap_ids, unwrap_to_commutative_mappers_ids) + for in_unwrap, out_unwrap in in_out_unwrap_ids: + unwrap = r_unwrap.make_unwrap_node([in_unwrap], out_unwrap) + self.add_node(unwrap) + self.set_edge_to(in_unwrap, unwrap.get_id()) # from are still (wrapped) mappers + self.set_edge_from(out_unwrap, unwrap.get_id()) # to will be set to mappers of current node + in_mapper_ids = unwrap_to_commutative_mappers_ids + return in_mapper_ids + def introduce_aggregators_for_consec_chunks(self, fileIdGen, in_aggregator_ids, original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, streaming_output): diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index 698cff210..d5ec78bca 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -283,13 +283,19 @@ def choose_parallelizing_transformations(graph, r_split_flag): # shall return ma def choose_parallelizing_transformation(curr_id, graph, r_split_flag): # shall return map entry - # TODO: here we can implement more sophisticated techniques to decide how to parallelize + # here we can implement more sophisticated techniques to decide how to parallelize curr = graph.get_node(curr_id) - if r_split_flag: - option_parallelizer = curr.get_option_implemented_round_robin_parallelizer() - else: - option_parallelizer = curr.get_option_implemented_consecutive_chunks_parallelizer() - return option_parallelizer + # we ignore `r_split_flag` here as we want to exploit r_merge followed by commutative command + # which only works if the a parallelizer for the latter is chosen (sort does not have RR-parallelizer) + # we prioritize round robin over consecutive chunks: + return return_default_if_none_else_itself(curr.get_option_implemented_round_robin_parallelizer(), + curr.get_option_implemented_consecutive_chunks_parallelizer()) + # When `r_split_flag` should be used: + # if r_split_flag: + # option_parallelizer = curr.get_option_implemented_round_robin_parallelizer() + # else: + # option_parallelizer = curr.get_option_implemented_consecutive_chunks_parallelizer() + # return option_parallelizer 
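The `return_default_if_none_else_itself` helper used above lives in `compiler/util.py` and is not shown in this patch; from its name, this call site, and the prioritization comment, it presumably behaves as in the following sketch (an assumption, included only to make the round-robin-first fallback explicit):

```python
# Presumed behaviour of the helper from compiler/util.py (not shown in this patch):
# prefer the round-robin parallelizer and fall back to the consecutive-chunks one
# only when no round-robin parallelizer is implemented for the node.
def return_default_if_none_else_itself(maybe_value, default):
    return default if maybe_value is None else maybe_value
```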
def apply_parallelizing_transformations(graph, parallelizer_map, fan_out, batch_size, no_cat_split_vanish, From 4a0e5a4287d95af0d642a493c51c3587f0e5c48f Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Thu, 30 Jun 2022 11:56:08 -0400 Subject: [PATCH 14/64] Add TODO Signed-off-by: Felix Stutz --- TODO.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/TODO.md b/TODO.md index 5725529a9..e604dbdd9 100644 --- a/TODO.md +++ b/TODO.md @@ -1,7 +1,6 @@ ## TODOs before merging to `future` -- cat-split fusion -- r-unwrap-commutative fusion +- support for RR with unwrap for commutative commands - working on all tests - Adding annotation library installation and removing ad-hoc import of the latter - clean up utils for annotations From cd00918c106ab5c6cb309624e6ee8ca9f079a018 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Thu, 30 Jun 2022 14:35:37 -0400 Subject: [PATCH 15/64] Support round-robin parallelization for commutative commands (#602) Signed-off-by: Felix Stutz --- compiler/definitions/ir/dfg_node.py | 8 ++++ compiler/definitions/ir/nodes/r_split.py | 22 ++++------- compiler/ir.py | 48 +++++++++++++++++++++--- compiler/pash_runtime.py | 8 ++-- 4 files changed, 62 insertions(+), 24 deletions(-) diff --git a/compiler/definitions/ir/dfg_node.py b/compiler/definitions/ir/dfg_node.py index e3e56631c..587622680 100644 --- a/compiler/definitions/ir/dfg_node.py +++ b/compiler/definitions/ir/dfg_node.py @@ -312,6 +312,14 @@ def get_option_implemented_round_robin_parallelizer(self): return parallelizer return None + def get_option_implemented_round_robin_with_unwrap_parallelizer(self): + for parallelizer in self.parallelizer_list: + splitter = parallelizer.get_splitter() + if splitter.is_splitter_round_robin_with_unwrap_flag() and parallelizer.are_all_parts_implemented(): + return parallelizer + return None + + def get_option_implemented_consecutive_chunks_parallelizer(self): for parallelizer in self.parallelizer_list: splitter = parallelizer.get_splitter() diff --git a/compiler/definitions/ir/nodes/r_split.py b/compiler/definitions/ir/nodes/r_split.py index 68a889f2f..011df0559 100644 --- a/compiler/definitions/ir/nodes/r_split.py +++ b/compiler/definitions/ir/nodes/r_split.py @@ -1,7 +1,7 @@ import os from datatypes_new.AccessKind import AccessKind -from datatypes_new.BasicDatatypes import Operand +from datatypes_new.BasicDatatypes import Operand, Flag from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars import config @@ -24,21 +24,8 @@ def __init__(self, parallelizer_list=parallelizer_list, cmd_related_properties=cmd_related_properties) - ## TODO: Generalize this code (for this and SortGReduce) to be able to add an option to any command. def add_r_flag(self): - assert(False) - assert(len(self.com_options) <= 1) - - ## Add -r in r_split - new_opt = (0, Arg(string_to_argument("-r"))) - shifted_options = [(i+1, opt) for i, opt in self.com_options] - self.com_options = [new_opt] + shifted_options - - ## This is not a proper option check. It just works if the r_flag is added as a separate option. 
- def has_r_flag(self): - assert(False) - option_strings = [str(opt) for i, opt in self.com_options] - return ("-r" in option_strings) + self.cmd_invocation_with_io_vars.flag_option_list.append(Flag("-r")) def make_r_split(input_id, out_ids, r_split_batch_size): @@ -56,3 +43,8 @@ def make_r_split(input_id, out_ids, r_split_batch_size): implicit_use_of_streaming_output=None, access_map=access_map) return RSplit(cmd_inv_with_io_vars) + +def make_r_split_with_unwrap_flag(input_id, out_ids, r_split_batch_size): + standard_r_split = make_r_split(input_id, out_ids, r_split_batch_size) + standard_r_split.add_r_flag() + return standard_r_split diff --git a/compiler/ir.py b/compiler/ir.py index c2f36543e..e5cd5c423 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -771,6 +771,9 @@ def apply_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_ou # TODO: for both functions, check which parameters are needed self.apply_round_robin_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, batch_size, no_cat_split_vanish, r_split_batch_size) + elif splitter.is_splitter_round_robin_with_unwrap_flag(): + self.apply_round_robin_with_unwrap_flag_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_batch_size) elif splitter.is_splitter_consec_chunks(): self.apply_consecutive_chunks_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, batch_size, no_cat_split_vanish, r_split_batch_size) @@ -819,6 +822,45 @@ def apply_round_robin_parallelization_to_node(self, node_id, parallelizer, fileI # aggregator self.introduce_aggregator_for_round_robin(out_mapper_ids, parallelizer, streaming_output) + def apply_round_robin_with_unwrap_flag_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, + batch_size, no_cat_split_vanish, r_split_batch_size): + # round robin with unwrap flag is an inferred parallelizer which ensures that + # the command is commutative and has an aggregator for consecutive chunks; + # thus we can check whether we can re-open a previous "RR"-parallelization ending with `r_merge` + node = self.get_node(node_id) + streaming_input, streaming_output, configuration_inputs = \ + node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() + original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars + + prev_nodes = self.get_previous_nodes(node_id) + first_pred_node, first_pred_cmd_inv = self.get_first_previous_node_and_first_previous_cmd_invocation(prev_nodes) + + # remove node to be parallelized + self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph + + if len(prev_nodes) == 1 and isinstance(first_pred_node, r_merge.RMerge): + # and node.is_commutative(): implied by how this kind of splitter is inferred + self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None + + in_unwrap_ids = first_pred_cmd_inv.operand_list + out_unwrap_ids = self.introduce_unwraps(fileIdGen, in_unwrap_ids) + in_mapper_ids = out_unwrap_ids + else: + # splitter + round_robin_with_unwrap_flag_splitter_generator = lambda input_id, output_ids: r_split.make_r_split_with_unwrap_flag(input_id, output_ids, r_split_batch_size) + out_split_ids = self.introduce_splitter(round_robin_with_unwrap_flag_splitter_generator, fan_out, fileIdGen, streaming_input) + in_mapper_ids = out_split_ids + + # mappers + out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, + parallelizer) + + in_aggregator_ids = out_mapper_ids + out_aggregator_id = streaming_output + self.introduce_aggregators_for_consec_chunks(fileIdGen, in_aggregator_ids, + original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, + streaming_output) + def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, batch_size, no_cat_split_vanish, r_split_batch_size): # check whether we can fuse with previous node's parallelization: @@ -841,12 +883,6 @@ def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer # can be fused self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None in_mapper_ids = first_pred_cmd_inv.operand_list - elif len(prev_nodes) == 1 and isinstance(first_pred_node, r_merge.RMerge) and node.is_commutative(): - self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None - - in_unwrap_ids = first_pred_cmd_inv.operand_list - out_unwrap_ids = self.introduce_unwraps(fileIdGen, in_unwrap_ids) - in_mapper_ids = out_unwrap_ids else: # cannot be fused so introduce splitter # splitter consec_chunks_splitter_generator = lambda input_id, output_ids: pash_split.make_split_file(input_id, diff --git a/compiler/pash_runtime.py b/compiler/pash_runtime.py index d5ec78bca..27217a971 100644 --- a/compiler/pash_runtime.py +++ b/compiler/pash_runtime.py @@ -287,9 +287,11 @@ def choose_parallelizing_transformation(curr_id, graph, r_split_flag): # shall r curr = graph.get_node(curr_id) # we ignore `r_split_flag` here as we want to exploit r_merge followed by commutative command # which only works if the a parallelizer for the latter is chosen (sort does not have RR-parallelizer) - # we prioritize round robin over consecutive chunks: - return return_default_if_none_else_itself(curr.get_option_implemented_round_robin_parallelizer(), - curr.get_option_implemented_consecutive_chunks_parallelizer()) + # we prioritize round robin over round robin with unwrap over consecutive chunks: + list_all_parallelizers_in_priority = [curr.get_option_implemented_round_robin_parallelizer(), + curr.get_option_implemented_round_robin_with_unwrap_parallelizer(), + curr.get_option_implemented_consecutive_chunks_parallelizer()] + return next((item for item in list_all_parallelizers_in_priority if item is not None), None) # When `r_split_flag` should be used: # if r_split_flag: # option_parallelizer = curr.get_option_implemented_round_robin_parallelizer() From dfa03a760f9e0d16081b6c45d3519fd888b35ce7 Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Fri, 1 Jul 2022 15:59:43 -0400 
Subject: [PATCH 16/64] Install annotations lib using `pip` (#603) * Add a proper installation of the annotation lib Signed-off-by: Konstantinos Kallas * Remove unnecessary sys.path Signed-off-by: Konstantinos Kallas * Fix bug in setup Signed-off-by: Konstantinos Kallas * fix setup script Signed-off-by: Konstantinos Kallas --- TODO.md | 2 +- compiler/annotations_utils/util_aggregator.py | 4 ---- compiler/annotations_utils/util_cmd_invocations.py | 5 ----- compiler/annotations_utils/util_file_descriptors.py | 3 --- compiler/annotations_utils/util_mapper.py | 3 --- compiler/annotations_utils/util_parsing.py | 3 --- compiler/config.py | 9 --------- compiler/definitions/ir/dfg_node.py | 2 -- compiler/ir.py | 2 -- scripts/setup-pash.sh | 4 ++++ 10 files changed, 5 insertions(+), 32 deletions(-) diff --git a/TODO.md b/TODO.md index e604dbdd9..a6aa0a7d4 100644 --- a/TODO.md +++ b/TODO.md @@ -2,8 +2,8 @@ - support for RR with unwrap for commutative commands - working on all tests -- Adding annotation library installation and removing ad-hoc import of the latter - clean up utils for annotations - graphviz - Changing PaSh flags (making the default be priority r-split and then consecutive chunks), so remove the r_split flag and make defaults be the ones from the OSDI paper +- Fixing annotation library installation to a specific commit - Remove code which got obsolete due to the changes \ No newline at end of file diff --git a/compiler/annotations_utils/util_aggregator.py b/compiler/annotations_utils/util_aggregator.py index 3382730c6..f51a5ab42 100644 --- a/compiler/annotations_utils/util_aggregator.py +++ b/compiler/annotations_utils/util_aggregator.py @@ -1,9 +1,5 @@ # TODO: this file can properly be deleted -import sys -from config import get_path_annotation_repo -sys.path.insert(1, get_path_annotation_repo()) - from definitions.ir.dfg_node import DFGNode from definitions.ir.nodes.cat import Cat from annotations_utils.util_cmd_invocations import get_command_invocation_prefix_from_dfg_node diff --git a/compiler/annotations_utils/util_cmd_invocations.py b/compiler/annotations_utils/util_cmd_invocations.py index d624affe6..5d6e206ee 100644 --- a/compiler/annotations_utils/util_cmd_invocations.py +++ b/compiler/annotations_utils/util_cmd_invocations.py @@ -1,5 +1,3 @@ -import sys - from datatypes_new.BasicDatatypes import Flag, ArgStringType, Operand from datatypes_new.BasicDatatypesWithIO import OptionWithIO from datatypes_new.CommandInvocationInitial import CommandInvocationInitial @@ -10,11 +8,8 @@ get_parallelizability_info_from_cmd_invocation from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars -from config import get_path_annotation_repo from definitions.ir.arg import Arg -sys.path.insert(1, get_path_annotation_repo()) - # for typing from datatypes_new.CommandInvocationPrefix import CommandInvocationPrefix diff --git a/compiler/annotations_utils/util_file_descriptors.py b/compiler/annotations_utils/util_file_descriptors.py index 910efa632..fe68ed9fb 100644 --- a/compiler/annotations_utils/util_file_descriptors.py +++ b/compiler/annotations_utils/util_file_descriptors.py @@ -1,8 +1,5 @@ from util import log from definitions.ir.resource import FileResource, Resource, FileDescriptorResource -import sys -from config import get_path_annotation_repo -sys.path.insert(1, get_path_annotation_repo()) from datatypes_new.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo diff --git a/compiler/annotations_utils/util_mapper.py 
b/compiler/annotations_utils/util_mapper.py index 64657cf03..14bd965d1 100644 --- a/compiler/annotations_utils/util_mapper.py +++ b/compiler/annotations_utils/util_mapper.py @@ -1,9 +1,6 @@ # TODO: this file can properly be deleted # imports from annotation framework -import sys -from config import get_path_annotation_repo -sys.path.insert(1, get_path_annotation_repo()) # for typing # for use from annotation_generation_new.datatypes.parallelizability.Mapper import Mapper diff --git a/compiler/annotations_utils/util_parsing.py b/compiler/annotations_utils/util_parsing.py index 19a098403..516c43da7 100644 --- a/compiler/annotations_utils/util_parsing.py +++ b/compiler/annotations_utils/util_parsing.py @@ -1,10 +1,7 @@ -import sys from typing import Set, List, Any from definitions.ir.arg import Arg -from config import get_path_annotation_repo -sys.path.insert(1, get_path_annotation_repo()) from datatypes_new.CommandInvocationInitial import CommandInvocationInitial from datatypes_new.BasicDatatypes import Option, ArgStringType, Flag, Operand from parser_new.parser import parse, get_set_of_all_flags, get_dict_flag_to_primary_repr, get_set_of_all_options, \ diff --git a/compiler/config.py b/compiler/config.py index 71a9959fc..f5e7648b7 100644 --- a/compiler/config.py +++ b/compiler/config.py @@ -29,15 +29,6 @@ HDFS_PREFIX = "$HDFS_DATANODE_DIR/" -# move this to `config.json` if possible -PATH_ANNOTATION_REPO="/home/felix/git-repos/MIT/annotations" - -def get_path_annotation_repo(): - if PATH_ANNOTATION_REPO is None: - log("No path for annotation repository given! Specify it in compiler/config.py") - raise Exception("No path for annotation repository given! Specify it in compiler/config.py") - return PATH_ANNOTATION_REPO - config = {} annotations = [] pash_args = None diff --git a/compiler/definitions/ir/dfg_node.py b/compiler/definitions/ir/dfg_node.py index 587622680..fe1559194 100644 --- a/compiler/definitions/ir/dfg_node.py +++ b/compiler/definitions/ir/dfg_node.py @@ -6,8 +6,6 @@ from annotations_utils.util_cmd_invocations import to_node_cmd_inv_with_io_vars, construct_property_container_from_list_of_properties import sys -from config import get_path_annotation_repo -sys.path.insert(1, get_path_annotation_repo()) from util import return_empty_list_if_none_else_itself, return_default_if_none_else_itself diff --git a/compiler/ir.py b/compiler/ir.py index e5cd5c423..a3dc8a1ed 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -1,7 +1,5 @@ import sys -from config import get_path_annotation_repo -sys.path.insert(1, get_path_annotation_repo()) from datatypes_new.CommandInvocationInitial import CommandInvocationInitial from datatypes_new.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo from annotation_generation_new.datatypes.InputOutputInfo import InputOutputInfo diff --git a/scripts/setup-pash.sh b/scripts/setup-pash.sh index 719450796..78721c656 100755 --- a/scripts/setup-pash.sh +++ b/scripts/setup-pash.sh @@ -93,6 +93,10 @@ python3 -m pip install graphviz --root $PYTHON_PKG_DIR --ignore-installed #&> $L python3 -m pip install numpy --root $PYTHON_PKG_DIR --ignore-installed #&> $LOG_DIR/pip_install_numpy.log python3 -m pip install matplotlib --root $PYTHON_PKG_DIR --ignore-installed #&> $LOG_DIR/pip_install_matplotlib.log +## TODO: Fix a specific version somehow, maybe commit? 
+git clone https://github.com/binpash/annotations.git ./annotations_repo +python3 -m pip install ./annotations_repo --root $PYTHON_PKG_DIR --ignore-installed #&> $LOG_DIR/pip_install_annotations.log + # clean the python packages cd $PYTHON_PKG_DIR # can we find a better alternative to that From 624d8171a1a67002a3bb6a86b379ccff47ac379f Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Tue, 5 Jul 2022 06:17:00 -0700 Subject: [PATCH 17/64] Add a whitespace to trigger CI Signed-off-by: Konstantinos Kallas --- compiler/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler/config.py b/compiler/config.py index f5e7648b7..053320f98 100644 --- a/compiler/config.py +++ b/compiler/config.py @@ -29,6 +29,7 @@ HDFS_PREFIX = "$HDFS_DATANODE_DIR/" + config = {} annotations = [] pash_args = None From 3bd0cf63442246fed043458b59a2d4c9cfd1016d Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Tue, 12 Jul 2022 14:48:04 -0400 Subject: [PATCH 18/64] Refactored to remove __future__ from annotations library (#609) Signed-off-by: Felix Stutz --- compiler/definitions/ir/nodes/dgsh_tee.py | 6 +++--- compiler/definitions/ir/nodes/eager.py | 8 ++++---- compiler/definitions/ir/nodes/r_merge.py | 6 +++--- compiler/definitions/ir/nodes/r_split.py | 6 +++--- compiler/definitions/ir/nodes/r_unwrap.py | 4 ++-- compiler/definitions/ir/nodes/r_wrap.py | 4 ++-- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/compiler/definitions/ir/nodes/dgsh_tee.py b/compiler/definitions/ir/nodes/dgsh_tee.py index c417b8f58..cacdd94c9 100644 --- a/compiler/definitions/ir/nodes/dgsh_tee.py +++ b/compiler/definitions/ir/nodes/dgsh_tee.py @@ -1,4 +1,4 @@ -from datatypes_new.AccessKind import AccessKind +from datatypes_new.AccessKind import make_stream_output, make_stream_input from datatypes_new.BasicDatatypes import Flag, ArgStringType from datatypes_new.BasicDatatypesWithIO import OptionWithIO from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars @@ -19,8 +19,8 @@ def __init__(self, def make_dgsh_tee_node(input_id, output_id): dgsh_tee_bin = os.path.join(config.PASH_TOP, config.config['runtime']['dgsh_tee_binary']) - access_map = {output_id: AccessKind.make_stream_output(), - input_id: AccessKind.make_stream_input()} + access_map = {output_id: make_stream_output(), + input_id: make_stream_input()} flag_option_list = [OptionWithIO("-i", input_id), OptionWithIO("-o", output_id), diff --git a/compiler/definitions/ir/nodes/eager.py b/compiler/definitions/ir/nodes/eager.py index ac49a576e..ae931b486 100644 --- a/compiler/definitions/ir/nodes/eager.py +++ b/compiler/definitions/ir/nodes/eager.py @@ -1,4 +1,4 @@ -from datatypes_new.AccessKind import AccessKind +from datatypes_new.AccessKind import AccessKind, make_stream_output, make_stream_input, make_other_output from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars from definitions.ir.dfg_node import * @@ -18,9 +18,9 @@ def make_eager_node(input_id, output_id, intermediate_file_id, eager_exec_path): eager_name = eager_exec_path intermediate_file_id_id = intermediate_file_id.get_ident() operand_list = [input_id, output_id, intermediate_file_id_id] - access_map = {output_id: AccessKind.make_stream_output(), - input_id: AccessKind.make_stream_input(), - intermediate_file_id_id: AccessKind.make_other_output()} + access_map = {output_id: make_stream_output(), + input_id: make_stream_input(), + intermediate_file_id_id: make_other_output()} cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=eager_name, 
flag_option_list=[], diff --git a/compiler/definitions/ir/nodes/r_merge.py b/compiler/definitions/ir/nodes/r_merge.py index f587a94fc..453f0c01f 100644 --- a/compiler/definitions/ir/nodes/r_merge.py +++ b/compiler/definitions/ir/nodes/r_merge.py @@ -1,4 +1,4 @@ -from datatypes_new.AccessKind import AccessKind +from datatypes_new.AccessKind import make_stream_input, make_stream_output from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars from definitions.ir.dfg_node import * @@ -20,8 +20,8 @@ def __init__(self, def make_r_merge_node(inputs, output): r_merge_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_merge_binary']) # TODO: assume that the inputs and output is provided as operands - access_map = {input_id: AccessKind.make_stream_input() for input_id in inputs} - access_map[output] = AccessKind.make_stream_output() + access_map = {input_id: make_stream_input() for input_id in inputs} + access_map[output] = make_stream_output() cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=r_merge_bin, flag_option_list=[], diff --git a/compiler/definitions/ir/nodes/r_split.py b/compiler/definitions/ir/nodes/r_split.py index 011df0559..05900a1d9 100644 --- a/compiler/definitions/ir/nodes/r_split.py +++ b/compiler/definitions/ir/nodes/r_split.py @@ -1,6 +1,6 @@ import os -from datatypes_new.AccessKind import AccessKind +from datatypes_new.AccessKind import AccessKind, make_stream_input, make_stream_output from datatypes_new.BasicDatatypes import Operand, Flag from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars @@ -33,8 +33,8 @@ def make_r_split(input_id, out_ids, r_split_batch_size): operand_list = [input_id, Operand(Arg(string_to_argument(str(r_split_batch_size))))] operand_list.extend(out_ids) - access_map = {output_id: AccessKind.make_stream_output() for output_id in out_ids} - access_map[input_id] = AccessKind.make_stream_input() + access_map = {output_id: make_stream_output() for output_id in out_ids} + access_map[input_id] = make_stream_input() cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=r_split_bin, flag_option_list=[], diff --git a/compiler/definitions/ir/nodes/r_unwrap.py b/compiler/definitions/ir/nodes/r_unwrap.py index 38cb03dcc..0a2aec195 100644 --- a/compiler/definitions/ir/nodes/r_unwrap.py +++ b/compiler/definitions/ir/nodes/r_unwrap.py @@ -1,4 +1,4 @@ -from datatypes_new.AccessKind import AccessKind +from datatypes_new.AccessKind import make_stream_input, make_stream_output from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars from definitions.ir.dfg_node import * @@ -21,7 +21,7 @@ def __init__(self, def make_unwrap_node(inputs, output): assert(len(inputs) == 1) input_id = inputs[0] - access_map = {input_id: AccessKind.make_stream_input(), output: AccessKind.make_stream_output()} + access_map = {input_id: make_stream_input(), output: make_stream_output()} r_unwrap_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_unwrap_binary']) cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=r_unwrap_bin, diff --git a/compiler/definitions/ir/nodes/r_wrap.py b/compiler/definitions/ir/nodes/r_wrap.py index 8fd44f6ca..316e81f33 100644 --- a/compiler/definitions/ir/nodes/r_wrap.py +++ b/compiler/definitions/ir/nodes/r_wrap.py @@ -1,4 +1,4 @@ -from datatypes_new.AccessKind import AccessKind +from datatypes_new.AccessKind import make_stream_output, make_stream_input from datatypes_new.BasicDatatypes import ArgStringType from 
datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars @@ -45,7 +45,7 @@ def wrap_node(node: DFGNode, edges): ## TODO: changed this from <= to == 1 to simplify reasoning later for now assert(len(outputs) == 1) output_id = outputs[0] - access_map = {input_id: AccessKind.make_stream_input(), output_id: AccessKind.make_stream_output()} + access_map = {input_id: make_stream_input(), output_id: make_stream_output()} #create bash -c argument cmd_inv_with_io_vars: CommandInvocationWithIOVars = node.cmd_invocation_with_io_vars From 44017dc6af2c1cc8ea14b5dd010eaf3f4e66d61b Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Wed, 20 Jul 2022 14:24:42 -0400 Subject: [PATCH 19/64] Cover all but one test case from compiler/test_evaluation_scripts.sh (#612) * Fix bug in parser to switch from flag to operand mode when reading hyphen Signed-off-by: Felix Stutz * Adapt shortest_scripts.sh to work with parser Signed-off-by: Felix Stutz * Cover more test cases from script_microbenchmarks Signed-off-by: Felix Stutz * Parallelize spell-grep as done in `future`, i.e., not RR but CC for `set_diff` Signed-off-by: Felix Stutz * Clean up and clarifying comment in parser Signed-off-by: Felix Stutz * Simplify control flow in parallelization Signed-off-by: Felix Stutz --- TODO.md | 13 +++-- .../annotations_utils/util_cmd_invocations.py | 4 +- compiler/annotations_utils/util_parsing.py | 2 + compiler/definitions/ir/nodes/pash_split.py | 6 +-- compiler/ir.py | 48 ++++++++++++------- evaluation/tests/shortest_scripts.sh | 4 +- 6 files changed, 50 insertions(+), 27 deletions(-) diff --git a/TODO.md b/TODO.md index a6aa0a7d4..0ffd62f55 100644 --- a/TODO.md +++ b/TODO.md @@ -1,9 +1,16 @@ ## TODOs before merging to `future` -- support for RR with unwrap for commutative commands -- working on all tests +- fix tests from compiler/test_evaluation_scripts.sh: + + bigrams - clean up utils for annotations - graphviz - Changing PaSh flags (making the default be priority r-split and then consecutive chunks), so remove the r_split flag and make defaults be the ones from the OSDI paper - Fixing annotation library installation to a specific commit -- Remove code which got obsolete due to the changes \ No newline at end of file +- Remove code which got obsolete due to the changes +- Room for optimization: basically disable parallelization after a tr which squeezes all new lines since there are no sequences of data to parallelize anyway for the moment. + Long-term, we could allow parallelization but with a adj_line_merge aggregator. +- Changes to scripts: + + `shortest_scripts.sh`: here I only needed to modify the script slightly: + (1) option arguments for `cut` with whitespace as the parser cannot deal with them otherwise currently but we might want to change this in the future, + (2) `head -n 15` instead of `head -15` which might be a bit harder to support. I did not really see how the man-page supports this actually when skimming but I might have missed that. +- tr_test.sh: Outside the testing script, the outputs are the same but somehow it still shows different outputs. Checked this with Konstantinos and he will check the testing script later. 
diff --git a/compiler/annotations_utils/util_cmd_invocations.py b/compiler/annotations_utils/util_cmd_invocations.py index 5d6e206ee..1be87e28b 100644 --- a/compiler/annotations_utils/util_cmd_invocations.py +++ b/compiler/annotations_utils/util_cmd_invocations.py @@ -110,12 +110,12 @@ def to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wra whole_cmd.concatenate(Arg(string_to_argument("\'"))) return whole_cmd -def to_arg_flagoption(flagoption, _edges): +def to_arg_flagoption(flagoption, edges): if isinstance(flagoption, Flag): return [Arg(string_to_argument(flagoption.get_name()))] elif isinstance(flagoption, OptionWithIO): opt_name_arg = Arg(string_to_argument(flagoption.get_name())) - opt_arg_arg = translate_io_var_to_arg_if_applicable(flagoption.get_arg()) + opt_arg_arg = translate_io_var_to_arg_if_applicable(flagoption.get_arg(), edges) return [opt_name_arg, opt_arg_arg] def to_arg_operand(operand, edges): diff --git a/compiler/annotations_utils/util_parsing.py b/compiler/annotations_utils/util_parsing.py index 516c43da7..cef464f79 100644 --- a/compiler/annotations_utils/util_parsing.py +++ b/compiler/annotations_utils/util_parsing.py @@ -70,6 +70,8 @@ def parse_arg_list_to_command_invocation(command, flags_options_operands) -> Com option = Option(option_name_as_string, option_arg_as_arg) flag_option_list.append(option) i += 1 # since we consumed another term for the argument + elif potential_flag_or_option_name == "-": # switch to operand mode (interpreted as hyphen-stdin) + break elif are_all_individually_flags(potential_flag_or_option_name, set_of_all_flags): for split_el in list(potential_flag_or_option_name[1:]): flag: Flag = Flag(f'-{split_el}') diff --git a/compiler/definitions/ir/nodes/pash_split.py b/compiler/definitions/ir/nodes/pash_split.py index 9c28267d5..e21de4b1d 100644 --- a/compiler/definitions/ir/nodes/pash_split.py +++ b/compiler/definitions/ir/nodes/pash_split.py @@ -1,4 +1,4 @@ -from datatypes_new.AccessKind import AccessKind +from datatypes_new.AccessKind import make_stream_input, make_stream_output from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars from definitions.ir.file_id import * @@ -26,8 +26,8 @@ def make_split_file(input_id, out_ids): auto_split_bin = os.path.join(config.PASH_TOP, config.config['runtime']['auto_split_binary']) operand_list = [input_id] operand_list.extend(out_ids) - access_map = {output_id: AccessKind.make_stream_output() for output_id in out_ids} - access_map[input_id] = AccessKind.make_stream_input() + access_map = {output_id: make_stream_output() for output_id in out_ids} + access_map[input_id] = make_stream_input() cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=auto_split_bin, flag_option_list=[], diff --git a/compiler/ir.py b/compiler/ir.py index a3dc8a1ed..0816a663f 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -177,6 +177,10 @@ def compile_command_to_DFG(fileIdGen, command, options, redirections=[]): command_invocation: CommandInvocationInitial = parse_arg_list_to_command_invocation(command, options) io_info: InputOutputInfo = get_input_output_info_from_cmd_invocation_util(command_invocation) + if io_info is None: + raise Exception(f"InputOutputInformation for {format_arg_chars(command)} not provided so considered side-effectful.") + if io_info.has_other_outputs(): + raise Exception(f"Command {format_arg_chars(command)} has outputs other than streaming.") para_info: ParallelizabilityInfo = 
get_parallelizability_info_from_cmd_invocation_util(command_invocation) command_invocation_with_io = io_info.apply_input_output_info_to_command_invocation(command_invocation) parallelizer_list, round_robin_compatible_with_cat, is_commutative = para_info.unpack_info() @@ -766,7 +770,6 @@ def apply_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_ou batch_size, no_cat_split_vanish, r_split_batch_size): splitter = parallelizer.get_splitter() if splitter.is_splitter_round_robin(): - # TODO: for both functions, check which parameters are needed self.apply_round_robin_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, batch_size, no_cat_split_vanish, r_split_batch_size) elif splitter.is_splitter_round_robin_with_unwrap_flag(): @@ -797,14 +800,19 @@ def apply_round_robin_parallelization_to_node(self, node_id, parallelizer, fileI node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars + + can_be_fused_with_prev = False prev_nodes = self.get_previous_nodes(node_id) - first_pred_node, first_pred_cmd_inv = self.get_first_previous_node_and_first_previous_cmd_invocation(prev_nodes) + if len(prev_nodes) == 1: + first_pred_node, first_pred_cmd_inv = \ + self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) + if isinstance(first_pred_node, r_merge.RMerge): + can_be_fused_with_prev = True # remove node to be parallelized self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph - if len(prev_nodes) == 1 and isinstance(first_pred_node, r_merge.RMerge): - # can be fused + if can_be_fused_with_prev: self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None in_mapper_ids = first_pred_cmd_inv.operand_list else: # cannot be fused so introduce splitter @@ -830,16 +838,19 @@ def apply_round_robin_with_unwrap_flag_parallelization_to_node(self, node_id, pa node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars + can_be_fused_with_prev = False prev_nodes = self.get_previous_nodes(node_id) - first_pred_node, first_pred_cmd_inv = self.get_first_previous_node_and_first_previous_cmd_invocation(prev_nodes) + if len(prev_nodes) == 1: + first_pred_node, first_pred_cmd_inv = \ + self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) + if isinstance(first_pred_node, r_merge.RMerge): + can_be_fused_with_prev = True # remove node to be parallelized self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. 
to avoid disconnecting graph to avoid disconnecting graph - if len(prev_nodes) == 1 and isinstance(first_pred_node, r_merge.RMerge): - # and node.is_commutative(): implied by how this kind of splitter is inferred + if can_be_fused_with_prev: # and node.is_commutative(): implied by how this kind of splitter is inferred self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None - in_unwrap_ids = first_pred_cmd_inv.operand_list out_unwrap_ids = self.introduce_unwraps(fileIdGen, in_unwrap_ids) in_mapper_ids = out_unwrap_ids @@ -870,23 +881,24 @@ def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars + can_be_fused_with_prev = False prev_nodes = self.get_previous_nodes(node_id) - first_pred_node, first_pred_cmd_inv = self.get_first_previous_node_and_first_previous_cmd_invocation(prev_nodes) + if len(prev_nodes) == 1: + first_pred_node, first_pred_cmd_inv = \ + self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) + if first_pred_cmd_inv.is_aggregator_concatenate(): + can_be_fused_with_prev = True # remove node to be parallelized self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph - # TODO: change first check to first_pred_node and not cmd_inv - if len(prev_nodes) == 1 and first_pred_cmd_inv.is_aggregator_concatenate(): - # can be fused + if can_be_fused_with_prev: self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None in_mapper_ids = first_pred_cmd_inv.operand_list else: # cannot be fused so introduce splitter # splitter - consec_chunks_splitter_generator = lambda input_id, output_ids: pash_split.make_split_file(input_id, - output_ids) - out_split_ids = self.introduce_splitter(consec_chunks_splitter_generator, fan_out, fileIdGen, - streaming_input) + consec_chunks_splitter_generator = lambda input_id, output_ids: pash_split.make_split_file(input_id, output_ids) + out_split_ids = self.introduce_splitter(consec_chunks_splitter_generator, fan_out, fileIdGen, streaming_input) in_mapper_ids = out_split_ids # mappers @@ -900,9 +912,9 @@ def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, streaming_output) - def get_first_previous_node_and_first_previous_cmd_invocation(self, prev_nodes): - assert (len(prev_nodes) > 0) + def get_only_previous_node_and_only_previous_cmd_invocation(self, prev_nodes): # get info about first one but also ensure that it is the only one if we fuse + assert len(prev_nodes) == 1 first_pred_id = prev_nodes[0] first_pred_node = self.get_node(first_pred_id) first_pred_cmd_inv = first_pred_node.cmd_invocation_with_io_vars diff --git a/evaluation/tests/shortest_scripts.sh b/evaluation/tests/shortest_scripts.sh index 0d3913119..7321d775e 100644 --- a/evaluation/tests/shortest_scripts.sh +++ b/evaluation/tests/shortest_scripts.sh @@ -4,4 +4,6 @@ # +p.95 multiple sed # +p.XX crawler -cat $IN | xargs file | grep "shell script" | cut -d: -f1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -15 +# cut -d: -f1 -> cut -d : -f 1; as parser recognizes option arguments only if given with whitespace +# head -15 -> head -n 15; not documented in man page +cat $IN | xargs file | grep "shell script" | cut -d : -f 1 | xargs -L 1 wc 
-l | grep -v '^0$' | sort -n | head -n 15 From a75378cd9b4f2039d2b5155fe79f98d2ef3c3d06 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Mon, 25 Jul 2022 13:44:53 -0400 Subject: [PATCH 20/64] Add support for bigrams (#614) Signed-off-by: Felix Stutz --- compiler/definitions/ir/nodes/cat.py | 11 ++-- compiler/ir.py | 95 +++++++++++++++++++--------- 2 files changed, 70 insertions(+), 36 deletions(-) diff --git a/compiler/definitions/ir/nodes/cat.py b/compiler/definitions/ir/nodes/cat.py index a27b89f4f..28df3920e 100644 --- a/compiler/definitions/ir/nodes/cat.py +++ b/compiler/definitions/ir/nodes/cat.py @@ -1,4 +1,5 @@ -from definitions.ir.dfg_node import * +from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from definitions.ir.dfg_node import DFGNode class Cat(DFGNode): def __init__(self, inputs, outputs, com_name, com_category, @@ -32,9 +33,5 @@ def __init__(self, inputs, outputs, com_name, com_category, ) def make_cat_node(inputs, output): - com_name = Arg(string_to_argument("cat")) - com_category = "stateless" - return Cat(inputs, - [output], - com_name, - com_category) + cmd_inv_cat = CommandInvocationWithIOVars.make_cat_command_invocation_with_io_vars(inputs, output) + return DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(cmd_inv_cat) diff --git a/compiler/ir.py b/compiler/ir.py index 0816a663f..49bd57a91 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -4,6 +4,7 @@ from datatypes_new.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo from annotation_generation_new.datatypes.InputOutputInfo import InputOutputInfo from annotation_generation_new.datatypes.ParallelizabilityInfo import ParallelizabilityInfo +from annotation_generation_new.datatypes.CommandProperties import CommandProperties from datatypes_new.CommandInvocationWithIOVars import CommandInvocationWithIOVars from annotations_utils.util_parsing import parse_arg_list_to_command_invocation @@ -184,9 +185,9 @@ def compile_command_to_DFG(fileIdGen, command, options, para_info: ParallelizabilityInfo = get_parallelizability_info_from_cmd_invocation_util(command_invocation) command_invocation_with_io = io_info.apply_input_output_info_to_command_invocation(command_invocation) parallelizer_list, round_robin_compatible_with_cat, is_commutative = para_info.unpack_info() - property_list = [('round_robin_compatible_with_cat', round_robin_compatible_with_cat), - ('is_commutative', is_commutative)] - cmd_related_properties = construct_property_container_from_list_of_properties(property_list) + property_dict = [{'round_robin_compatible_with_cat': round_robin_compatible_with_cat, + 'is_commutative': is_commutative}] + cmd_related_properties = CommandProperties(property_dict) ## TODO: Make an empty IR and add edges and nodes incrementally (using the methods defined in IR). 
@@ -824,6 +825,7 @@ def apply_round_robin_parallelization_to_node(self, node_id, parallelizer, fileI # mappers out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, parallelizer) + out_mapper_ids = [out_ids[0] for out_ids in out_mapper_ids] # since we get list of list back for potential aux info # aggregator self.introduce_aggregator_for_round_robin(out_mapper_ids, parallelizer, streaming_output) @@ -913,6 +915,7 @@ def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer streaming_output) def get_only_previous_node_and_only_previous_cmd_invocation(self, prev_nodes): + assert (len(prev_nodes) > 0) # get info about first one but also ensure that it is the only one if we fuse assert len(prev_nodes) == 1 first_pred_id = prev_nodes[0] @@ -930,16 +933,26 @@ def introduce_splitter(self, splitter_generator, fan_out, fileIdGen, streaming_i return out_split_ids def introduce_mappers(self, fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, parallelizer): - out_mapper_ids = self.generate_ephemeral_edges(fileIdGen, fan_out) + # -> [[input, aux1, aux2], [...], [...], ...] + num_aux_mapper_to_aggregator = parallelizer.info_mapper_aggregator + out_mapper_ids = [] + for _ in range(0,fan_out): + out_mapper_ids.append(self.generate_ephemeral_edges(fileIdGen, num_aux_mapper_to_aggregator+1)) + # TODO: Fix that we use different ones here! + # list of output, aux_output_1, aux_output_2, ... zip_mapper_in_out_ids = zip(in_mapper_ids, out_mapper_ids) all_mappers = [] - for (in_id, out_id) in zip_mapper_in_out_ids: + for (in_id, out_ids) in zip_mapper_in_out_ids: # BEGIN: these 4 lines could be refactored to be a function in graph such that # creating end point of edges and the creation of edges is not decoupled - mapper_cmd_inv = parallelizer.get_actual_mapper(original_cmd_invocation_with_io_vars, in_id, out_id) + out_id = out_ids[0] + aux_out_ids = out_ids[1:] + mapper_cmd_inv = parallelizer.get_actual_mapper(original_cmd_invocation_with_io_vars, in_id, out_id, aux_out_ids) mapper = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(mapper_cmd_inv) self.set_edge_to(in_id, mapper.get_id()) self.set_edge_from(out_id, mapper.get_id()) + for aux_out_id in aux_out_ids: + self.set_edge_from(aux_out_id, mapper.get_id()) # END splitter = parallelizer.get_splitter() if splitter.is_splitter_round_robin(): @@ -966,25 +979,34 @@ def introduce_unwraps(self, fileIdGen, in_unwrap_ids): def introduce_aggregators_for_consec_chunks(self, fileIdGen, in_aggregator_ids, original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, streaming_output): - aggregator_spec = parallelizer.get_aggregator_spec() - if aggregator_spec.is_aggregator_spec_concatenate() or aggregator_spec.is_aggregator_spec_custom_n_ary(): - aggregator_cmd_inv = parallelizer.get_actual_aggregator(original_cmd_invocation_with_io_vars, - in_aggregator_ids, out_aggregator_id) - aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) - for in_aggregator_id in in_aggregator_ids: - self.set_edge_to(in_aggregator_id, aggregator.get_id()) - self.set_edge_from(streaming_output, aggregator.get_id()) - all_aggregators = [aggregator] - ## Add the merge commands in the graph - for new_node in all_aggregators: - self.add_node(new_node) - elif aggregator_spec.is_aggregator_spec_custom_2_ary(): - # TODO: we simplify and assume that every mapper produces a single output for now - map_in_aggregator_ids = [[id] for id in 
in_aggregator_ids] - # TODO: turn node into cmd_invocation_with_io_vars since this is the only thing required in this function - self.create_generic_aggregator_tree(original_cmd_invocation_with_io_vars, parallelizer, map_in_aggregator_ids, out_aggregator_id, fileIdGen) - else: - raise Exception("aggregator kind not yet implemented") + # in_aggregator_ids: [[input, aux1, aux2, ...], [...], [...], ...] + if parallelizer.info_mapper_aggregator == 0: + in_aggregator_ids = [in_ids[0] for in_ids in in_aggregator_ids] # since we get list of list back for potential aux info + aggregator_spec = parallelizer.get_aggregator_spec() + if aggregator_spec.is_aggregator_spec_concatenate() or aggregator_spec.is_aggregator_spec_custom_n_ary(): + aggregator_cmd_inv = parallelizer.get_actual_aggregator(original_cmd_invocation_with_io_vars, + in_aggregator_ids, out_aggregator_id) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + for in_aggregator_id in in_aggregator_ids: + self.set_edge_to(in_aggregator_id, aggregator.get_id()) + self.set_edge_from(streaming_output, aggregator.get_id()) + all_aggregators = [aggregator] + ## Add the merge commands in the graph + for new_node in all_aggregators: + self.add_node(new_node) + elif aggregator_spec.is_aggregator_spec_custom_2_ary(): + # TODO: we simplify and assume that every mapper produces a single output for now + map_in_aggregator_ids = [[id] for id in in_aggregator_ids] + # TODO: turn node into cmd_invocation_with_io_vars since this is the only thing required in this function + self.create_generic_aggregator_tree(original_cmd_invocation_with_io_vars, parallelizer, map_in_aggregator_ids, out_aggregator_id, fileIdGen) + else: + raise Exception("aggregator kind not yet implemented") + else: # we got auxiliary information + assert(parallelizer.core_aggregator_spec.is_aggregator_spec_custom_2_ary()) + map_in_aggregator_ids = in_aggregator_ids + self.create_generic_aggregator_tree(original_cmd_invocation_with_io_vars, parallelizer, + map_in_aggregator_ids, out_aggregator_id, fileIdGen) + def introduce_aggregator_for_round_robin(self, out_mapper_ids, parallelizer, streaming_output): aggregator_spec = parallelizer.get_aggregator_spec() @@ -1338,10 +1360,25 @@ def valid(self): ## This is a function that creates a reduce tree for a given node def create_generic_aggregator_tree(self, cmd_invocation_with_io_vars, parallelizer, input_ids_for_aggregators, out_aggregator_id, fileIdGen): def function_to_get_binary_aggregator(in_ids, out_ids): - assert(len(out_ids) == 1) - aggregator_cmd_inv = parallelizer.get_actual_aggregator(cmd_invocation_with_io_vars, in_ids, out_ids[0]) - aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) - return aggregator + if len(out_ids) == 1: + aggregator_cmd_inv = parallelizer.get_actual_aggregator(cmd_invocation_with_io_vars, in_ids, out_ids[0]) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + return aggregator + else: + # list has been flattened ... 
+ num_input_ids = len(in_ids) + assert(num_input_ids % 2 == 0) + fst_normal_input = in_ids[0] + fst_aux_inputs_from = in_ids[1:int(num_input_ids/2)] + snd_normal_input = in_ids[int(num_input_ids/2)] + snd_aux_inputs_from = in_ids[int(num_input_ids/2)+1:] + output_to = out_ids[0] + aux_outputs_to = out_ids[1:] + aggregator_cmd_inv = parallelizer.get_actual_2_ary_aggregator_with_aux( + fst_normal_input, fst_aux_inputs_from, snd_normal_input, snd_aux_inputs_from, + output_to, aux_outputs_to) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + return aggregator ## The Aggregator node takes a sequence of input ids and an output id all_aggregators, new_edges, final_output_id = self.create_reduce_tree(lambda in_ids, out_ids: function_to_get_binary_aggregator(in_ids, out_ids), input_ids_for_aggregators, fileIdGen) From 3f79626bbe45f3831fa40b8fab45b771f226d265 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Mon, 25 Jul 2022 16:46:38 -0400 Subject: [PATCH 21/64] Refactor to have defaults for AnnotationInfo (#615) Signed-off-by: Felix Stutz --- compiler/ir.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler/ir.py b/compiler/ir.py index 49bd57a91..c7c1afea4 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -183,6 +183,8 @@ def compile_command_to_DFG(fileIdGen, command, options, if io_info.has_other_outputs(): raise Exception(f"Command {format_arg_chars(command)} has outputs other than streaming.") para_info: ParallelizabilityInfo = get_parallelizability_info_from_cmd_invocation_util(command_invocation) + if para_info is None: + para_info = ParallelizabilityInfo() # defaults to no parallelizer's and all properties False command_invocation_with_io = io_info.apply_input_output_info_to_command_invocation(command_invocation) parallelizer_list, round_robin_compatible_with_cat, is_commutative = para_info.unpack_info() property_dict = [{'round_robin_compatible_with_cat': round_robin_compatible_with_cat, From b5bd70bcf2194964ebc3d35d8785c00d71a0e038 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Tue, 30 Aug 2022 05:33:33 -0400 Subject: [PATCH 22/64] Minor changes due to typing in annotations repository (#622) Signed-off-by: Felix Stutz --- compiler/ir.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/compiler/ir.py b/compiler/ir.py index c7c1afea4..0ca5d8453 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -1,7 +1,8 @@ import sys from datatypes_new.CommandInvocationInitial import CommandInvocationInitial -from datatypes_new.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo +from datatypes_new.BasicDatatypes import ArgStringType +from datatypes_new.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo, OptionWithIO from annotation_generation_new.datatypes.InputOutputInfo import InputOutputInfo from annotation_generation_new.datatypes.ParallelizabilityInfo import ParallelizabilityInfo from annotation_generation_new.datatypes.CommandProperties import CommandProperties @@ -139,6 +140,7 @@ def add_file_id_vars(command_invocation_with_io, fileIdGen): # make pass over everything and create file_id for everything # only for operands for now: dfg_edges = {} + new_flagoption_list = [] new_operand_list = [] access_map = dict() @@ -150,6 +152,15 @@ def add_var_for_descriptor(operand): access_map[fid_id] = operand.get_access() return fid_id + for i in range(len(command_invocation_with_io.flag_option_list)): + flagoption = command_invocation_with_io.flag_option_list[i] + if 
isinstance(flagoption, OptionWithIO) and not isinstance(flagoption.option_arg, ArgStringType): + fid_id = add_var_for_descriptor(flagoption.option_arg) + new_option = OptionWithIOVar(flagoption.name, fid_id) + new_flagoption_list.append(new_option) + else: # Flag + new_flagoption_list.append(flagoption) + for i in range(len(command_invocation_with_io.operand_list)): operand = command_invocation_with_io.operand_list[i] if isinstance(operand, FileNameWithIOInfo) or isinstance(operand, StdDescriptorWithIOInfo): @@ -166,11 +177,12 @@ def add_var_for_descriptor(operand): else: new_implicit_use_of_streaming_output = None - # this shall become copy-based - command_invocation_with_io_vars = CommandInvocationWithIOVars.get_from_without_vars(command_invocation_with_io, access_map) - command_invocation_with_io_vars.operand_list = new_operand_list - command_invocation_with_io_vars.implicit_use_of_streaming_input = new_implicit_use_of_streaming_input - command_invocation_with_io_vars.implicit_use_of_streaming_output = new_implicit_use_of_streaming_output + command_invocation_with_io_vars = CommandInvocationWithIOVars(cmd_name=command_invocation_with_io.cmd_name, + flag_option_list=new_flagoption_list, + operand_list=new_operand_list, + implicit_use_of_streaming_input=new_implicit_use_of_streaming_input, + implicit_use_of_streaming_output=new_implicit_use_of_streaming_output, + access_map=access_map) return command_invocation_with_io_vars, dfg_edges @@ -186,6 +198,8 @@ def compile_command_to_DFG(fileIdGen, command, options, if para_info is None: para_info = ParallelizabilityInfo() # defaults to no parallelizer's and all properties False command_invocation_with_io = io_info.apply_input_output_info_to_command_invocation(command_invocation) + if para_info is None: + para_info = ParallelizabilityInfo() # defaults to no parallelizer's and all properties False parallelizer_list, round_robin_compatible_with_cat, is_commutative = para_info.unpack_info() property_dict = [{'round_robin_compatible_with_cat': round_robin_compatible_with_cat, 'is_commutative': is_commutative}] From 1ee23bc83e554eac7a9be58e1fad1f0b3a4849a9 Mon Sep 17 00:00:00 2001 From: Konstantinos Kallas Date: Mon, 5 Sep 2022 12:24:23 -0400 Subject: [PATCH 23/64] revert setup merge issue Signed-off-by: Konstantinos Kallas --- scripts/setup-pash.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/setup-pash.sh b/scripts/setup-pash.sh index 2668b5188..8e04219ec 100755 --- a/scripts/setup-pash.sh +++ b/scripts/setup-pash.sh @@ -28,6 +28,10 @@ python3 -m pip install matplotlib --root $PYTHON_PKG_DIR --ignore-installed #&> # TODO 2022-08-01 if libdash wheel isn't available, we need autmake etc. python3 -m pip install libdash --root $PYTHON_PKG_DIR --ignore-installed #&> $LOG_DIR/pip_install_libdash.log +## TODO: Fix a specific version somehow, maybe commit? 
+git clone https://github.com/binpash/annotations.git ./annotations_repo +python3 -m pip install ./annotations_repo --root $PYTHON_PKG_DIR --ignore-installed #&> $LOG_DIR/pip_install_annotations.log + # clean the python packages cd $PYTHON_PKG_DIR # can we find a better alternative to that From 92364f6a2ba4adf57c216a9e2dfbbf338d704472 Mon Sep 17 00:00:00 2001 From: Felix Stutz Date: Thu, 8 Sep 2022 13:56:01 +0200 Subject: [PATCH 24/64] Remove old annotations and move them to annotations repository Signed-off-by: Felix Stutz --- annotations/README.md | 222 ------- annotations/alt_bigram_aux_reduce.json | 12 - annotations/alt_bigrams_aux.json | 17 - annotations/auto-split.json | 12 - annotations/awk.json | 12 - annotations/bc.json | 12 - annotations/bigram_aux_map.json | 12 - annotations/bigram_aux_reduce.json | 12 - annotations/bigrams_aux.json | 23 - annotations/c_stats/README.md | 15 - annotations/c_stats/builtins.txt | 108 ---- annotations/c_stats/coreutils.txt | 106 ---- annotations/c_stats/linux_command_list.txt | 667 --------------------- annotations/c_stats/output.txt | 667 --------------------- annotations/c_stats/plan9.txt | 180 ------ annotations/c_stats/pman.txt | 667 --------------------- annotations/c_stats/posix.txt | 160 ----- annotations/cat.json | 37 -- annotations/chmod.json | 12 - annotations/col.json | 12 - annotations/comm.json | 13 - annotations/convert.json | 13 - annotations/custom_aggregators/cat.py.json | 12 - annotations/custom_aggregators/concat.json | 12 - annotations/custom_sort.json | 29 - annotations/custom_tr.json | 12 - annotations/cut.json | 33 - annotations/date.json | 12 - annotations/dd.json | 12 - annotations/dfs_split_reader.json | 12 - annotations/dgsh_tee.json | 12 - annotations/diff.json | 12 - annotations/eager-no-task-par.json | 12 - annotations/eager.json | 12 - annotations/echo.json | 12 - annotations/export.json | 12 - annotations/extract_text.json | 13 - annotations/ffmpeg.json | 13 - annotations/find.json | 12 - annotations/fmt.json | 22 - annotations/grep.json | 47 -- annotations/groff.json | 12 - annotations/gunzip.json | 12 - annotations/gzip.json | 14 - annotations/hdfs.json | 23 - annotations/head.json | 13 - annotations/history.json | 12 - annotations/iconv.json | 12 - annotations/jobs.json | 12 - annotations/ls.json | 12 - annotations/mkfifo.json | 12 - annotations/mktemp.json | 12 - annotations/multiply.json | 12 - annotations/nc.json | 12 - annotations/nl.json | 13 - annotations/notes.md | 30 - annotations/p_stats/README.md | 39 -- annotations/p_stats/coreutils-summary.txt | 104 ---- annotations/p_stats/coreutils.txt | 112 ---- annotations/p_stats/get-summary.sh | 14 - annotations/p_stats/mr.md | 19 - annotations/p_stats/output.txt | 667 --------------------- annotations/p_stats/posix-summary.txt | 94 --- annotations/p_stats/posix_mandatory1.txt | 57 -- annotations/p_stats/posix_mandatory2.txt | 37 -- annotations/p_stats/statistics.sh | 27 - annotations/package_build_aux.json | 12 - annotations/pandoc.json | 12 - annotations/paste.json | 13 - annotations/pr.json | 36 -- annotations/process_bio_s_line.json | 13 - annotations/ps.json | 12 - annotations/pwd.json | 12 - annotations/r_merge.json | 13 - annotations/r_split.json | 12 - annotations/r_unwrap.json | 12 - annotations/r_wrap.json | 12 - annotations/read.json | 12 - annotations/readelf.json | 12 - annotations/remote_read.json | 12 - annotations/remote_write.json | 12 - annotations/resize.json | 13 - annotations/rev.json | 12 - annotations/rm.json | 12 - 
annotations/run_tests.json | 12 - annotations/sed.json | 13 - annotations/seq.json | 12 - annotations/set.json | 12 - annotations/set_diff.json | 16 - annotations/sha256sum.json | 12 - annotations/shuf.json | 12 - annotations/sort.json | 30 - annotations/split.json | 12 - annotations/stem-words.json | 13 - annotations/tac.json | 14 - annotations/tail.json | 13 - annotations/tee.json | 12 - annotations/test_one.json | 16 - annotations/test_two.json | 17 - annotations/test_uniq_1.json | 17 - annotations/test_uniq_2.json | 18 - annotations/tr.json | 37 -- annotations/trigrams_aux.json | 13 - annotations/uniq.json | 14 - annotations/wc.json | 13 - annotations/xargs.json | 26 - annotations/xxd.json | 12 - 107 files changed, 5297 deletions(-) delete mode 100644 annotations/README.md delete mode 100644 annotations/alt_bigram_aux_reduce.json delete mode 100644 annotations/alt_bigrams_aux.json delete mode 100644 annotations/auto-split.json delete mode 100644 annotations/awk.json delete mode 100644 annotations/bc.json delete mode 100644 annotations/bigram_aux_map.json delete mode 100644 annotations/bigram_aux_reduce.json delete mode 100644 annotations/bigrams_aux.json delete mode 100644 annotations/c_stats/README.md delete mode 100644 annotations/c_stats/builtins.txt delete mode 100644 annotations/c_stats/coreutils.txt delete mode 100644 annotations/c_stats/linux_command_list.txt delete mode 100644 annotations/c_stats/output.txt delete mode 100644 annotations/c_stats/plan9.txt delete mode 100644 annotations/c_stats/pman.txt delete mode 100644 annotations/c_stats/posix.txt delete mode 100644 annotations/cat.json delete mode 100644 annotations/chmod.json delete mode 100644 annotations/col.json delete mode 100644 annotations/comm.json delete mode 100644 annotations/convert.json delete mode 100644 annotations/custom_aggregators/cat.py.json delete mode 100644 annotations/custom_aggregators/concat.json delete mode 100644 annotations/custom_sort.json delete mode 100644 annotations/custom_tr.json delete mode 100644 annotations/cut.json delete mode 100644 annotations/date.json delete mode 100644 annotations/dd.json delete mode 100644 annotations/dfs_split_reader.json delete mode 100644 annotations/dgsh_tee.json delete mode 100644 annotations/diff.json delete mode 100644 annotations/eager-no-task-par.json delete mode 100644 annotations/eager.json delete mode 100644 annotations/echo.json delete mode 100644 annotations/export.json delete mode 100644 annotations/extract_text.json delete mode 100644 annotations/ffmpeg.json delete mode 100644 annotations/find.json delete mode 100644 annotations/fmt.json delete mode 100644 annotations/grep.json delete mode 100644 annotations/groff.json delete mode 100644 annotations/gunzip.json delete mode 100644 annotations/gzip.json delete mode 100644 annotations/hdfs.json delete mode 100644 annotations/head.json delete mode 100644 annotations/history.json delete mode 100644 annotations/iconv.json delete mode 100644 annotations/jobs.json delete mode 100644 annotations/ls.json delete mode 100644 annotations/mkfifo.json delete mode 100644 annotations/mktemp.json delete mode 100644 annotations/multiply.json delete mode 100644 annotations/nc.json delete mode 100644 annotations/nl.json delete mode 100644 annotations/notes.md delete mode 100644 annotations/p_stats/README.md delete mode 100644 annotations/p_stats/coreutils-summary.txt delete mode 100644 annotations/p_stats/coreutils.txt delete mode 100755 annotations/p_stats/get-summary.sh delete mode 100644 annotations/p_stats/mr.md 
 delete mode 100644 annotations/p_stats/output.txt
 delete mode 100644 annotations/p_stats/posix-summary.txt
 delete mode 100644 annotations/p_stats/posix_mandatory1.txt
 delete mode 100644 annotations/p_stats/posix_mandatory2.txt
 delete mode 100755 annotations/p_stats/statistics.sh
 delete mode 100644 annotations/package_build_aux.json
 delete mode 100644 annotations/pandoc.json
 delete mode 100644 annotations/paste.json
 delete mode 100644 annotations/pr.json
 delete mode 100644 annotations/process_bio_s_line.json
 delete mode 100644 annotations/ps.json
 delete mode 100644 annotations/pwd.json
 delete mode 100644 annotations/r_merge.json
 delete mode 100644 annotations/r_split.json
 delete mode 100644 annotations/r_unwrap.json
 delete mode 100644 annotations/r_wrap.json
 delete mode 100644 annotations/read.json
 delete mode 100644 annotations/readelf.json
 delete mode 100644 annotations/remote_read.json
 delete mode 100644 annotations/remote_write.json
 delete mode 100644 annotations/resize.json
 delete mode 100644 annotations/rev.json
 delete mode 100644 annotations/rm.json
 delete mode 100644 annotations/run_tests.json
 delete mode 100644 annotations/sed.json
 delete mode 100644 annotations/seq.json
 delete mode 100644 annotations/set.json
 delete mode 100644 annotations/set_diff.json
 delete mode 100644 annotations/sha256sum.json
 delete mode 100644 annotations/shuf.json
 delete mode 100644 annotations/sort.json
 delete mode 100644 annotations/split.json
 delete mode 100644 annotations/stem-words.json
 delete mode 100644 annotations/tac.json
 delete mode 100644 annotations/tail.json
 delete mode 100644 annotations/tee.json
 delete mode 100644 annotations/test_one.json
 delete mode 100644 annotations/test_two.json
 delete mode 100644 annotations/test_uniq_1.json
 delete mode 100644 annotations/test_uniq_2.json
 delete mode 100644 annotations/tr.json
 delete mode 100644 annotations/trigrams_aux.json
 delete mode 100644 annotations/uniq.json
 delete mode 100644 annotations/wc.json
 delete mode 100644 annotations/xargs.json
 delete mode 100644 annotations/xxd.json

diff --git a/annotations/README.md b/annotations/README.md
deleted file mode 100644
index be3578524..000000000
--- a/annotations/README.md
+++ /dev/null
@@ -1,222 +0,0 @@
-# Parallelizability Study & Annotation Language
-Quick Jump: [Parallelizability](#main-parallelizability-classes) | [study](#parallelizability-study-of-commands-in-gnu--posix) | [example 1](#a-simple-example-chmod) | [example 2](#another-example-cut) | [howto](#how-to-annotate-a-command) | [issues](#Issues)
-
-PaSh includes
-  (i) a parallelizability study of commands in POSIX and GNU Coreutils, and
-  (ii) an annotation language for describing the parallelizability properties of individual commands.
-The parallelizability study informed the design of the annotation language, which was in turn used to capture the key parallelizability characteristics of many of these commands.
-
-> _N.b.: We welcome contributions to the study and annotations for common commands._
-
-## Main Parallelizability Classes
-
-PaSh introduces four major parallelizability classes:
-
-* _Stateless Commands:_
-The first class, `stateless`, contains commands that operate on individual line elements of their input, without maintaining state across invocations.
-These are commands that can be expressed as a purely functional `map` or `filter` -- _e.g.,_ `grep` filters out individual lines and `basename` removes a path prefix from a string.
-They may produce multiple elements -- _e.g.,_ `tr` may insert `NL` tokens -- but always return empty output for empty input.
-Workloads that use only stateless commands are trivial to parallelize:
-  they do not require any synchronization to maintain correctness, nor caution about where to split inputs.
-
-* _Parallelizable Pure Commands:_
-The second class, `parallelizable_pure`, contains commands that respect functional purity -- _i.e.,_ same outputs for same inputs -- but maintain internal state across their entire pass.
-The details of this state and its propagation during element processing affect their parallelizability characteristics.
-Some commands are easy to parallelize because they maintain trivial state and are commutative -- _e.g.,_ `wc` simply maintains a counter.
-Other commands, such as `sort`, maintain more complex invariants that have to be taken into account when merging partial results.
-
-* _Non-parallelizable Pure Commands:_
-The third class, `pure`, contains commands that, while purely functional, cannot be parallelized within a single data stream.
-This is because their internal state depends on prior state in non-trivial ways over the same pass.
-For example, hashing commands such as `sha1sum` maintain complex state that has to be updated sequentially.
-If parallelized on a single input, each stage would need to wait on the results of all previous stages, forgoing any parallelism benefits.
-
-* _Side-effectful Commands:_
-The last class, `side-effectful`, contains commands that have side-effects across the system -- for example, updating environment variables, interacting with the filesystem, and accessing the network.
-Such commands are not parallelizable without finer-grained concurrency control mechanisms that can detect side-effects across the system.
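To build intuition for the first two classes, the following minimal hand-written sketch (plain shell, not PaSh's actual runtime machinery; file names and the two-way split are illustrative only) shows how a stateless command can be parallelized by splitting its input, and why a `parallelizable_pure` command additionally needs an aggregation step:

```sh
# Split the input into two line-aligned chunks (GNU split).
split -n l/2 input.txt chunk_            # produces chunk_aa and chunk_ab

# Stateless (e.g., grep): run on each chunk independently and concatenate
# the partial outputs in input order -- equivalent to the sequential run.
grep 'pattern' chunk_aa > out_aa &
grep 'pattern' chunk_ab > out_ab &
wait
cat out_aa out_ab > out.txt

# Parallelizable pure (e.g., wc -l): partial results must be combined by an
# aggregator -- here, summing the per-chunk line counts.
{ wc -l < chunk_aa & wc -l < chunk_ab & wait; } | paste -sd+ - | bc
```

In PaSh itself the split degree and the aggregators are not hard-coded like this; they are derived from the annotations described below.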
-
-## Parallelizability Study of Commands in GNU & POSIX
-
-The parallelizability study of commands in GNU and POSIX comprises two parts: a coarse-grained parallelizability study, and a set of annotations for commands.
-
-The main results of the parallelizability study are summarized in the [PaSh EuroSys'21 paper (Sec. 3.1 and Tab. 1)](https://arxiv.org/pdf/2007.09436.pdf).
-To see the results of the parallelizability study, run [./p_stats](./p_stats).
-
-Annotations for about 60 popular commands are stored in this directory encoded as JSON files (about 14 lines per annotation on average, for a total of 846 lines of annotations).
-Annotations can be thought of as defining a bidirectional correspondence between a command and a node in the dataflow graph---the abstraction used by the PaSh compiler.
-Since command behaviors (and correspondence) can change based on their arguments, annotations contain a sequence of predicates.
-Each predicate is accompanied by information that instantiates the correspondence between a command and a dataflow node.
-
-## A Simple Example: `chmod`
-
-As a first example, below we present the annotations for `chmod`.
-
-```json
-{
-  "command": "chmod",
-  "cases": [
-    {
-      "predicate": "default",
-      "class": "side-effectful"
-    }
-  ]
-}
-```
-
-The annotation for `chmod` is very simple, since it only needs to establish that `chmod` is side-effectful and therefore cannot be translated to a dataflow node.
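Annotations for commands in the other classes follow the same structure. As a rough sketch -- the actual `wc.json` in this directory may differ in its details (for instance, it may also name an aggregator for merging partial counts) -- an annotation marking `wc` as `parallelizable_pure` in the default case could look roughly like this:

```json
{
  "command": "wc",
  "cases": [
    {
      "predicate": "default",
      "class": "parallelizable_pure",
      "inputs": [ "args[:]" ],
      "outputs": [ "stdout" ]
    }
  ],
  "options": [ "stdin-hyphen", "empty-args-stdin" ]
}
```

The more interesting annotations are those for commands whose class depends on their flags, as in the next example.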
-
-## Another Example: `cut`
-
-As another example, below we present the annotations for `cut`.
-
-```json
-{
-  "command": "cut",
-  "cases": [
-    {
-      "predicate": {
-        "operator": "or",
-        "operands": [
-          {
-            "operator": "val_opt_eq",
-            "operands": [
-              "-d",
-              "\n"
-            ]
-          },
-          {
-            "operator": "exists",
-            "operands": [
-              "-z"
-            ]
-          }
-        ]
-      },
-      "class": "pure",
-      "inputs": [
-        "args[:]"
-      ],
-      "outputs": [
-        "stdout"
-      ]
-    },
-    {
-      "predicate": "default",
-      "class": "stateless",
-      "inputs": [
-        "args[:]"
-      ],
-      "outputs": [
-        "stdout"
-      ]
-    }
-  ],
-  "options": [
-    "stdin-hyphen",
-    "empty-args-stdin"
-  ],
-  "short-long": [
-    {
-      "short": "-d",
-      "long": "--delimiter"
-    },
-    {
-      "short": "-z",
-      "long": "--zero-terminated"
-    }
-  ]
-}
-```
-
-The annotation for `cut` has two cases, each of which consists of a predicate on its arguments, and then an assignment of its parallelizability class, inputs, and outputs.
-The first predicate indicates that `cut` is "pure" -- _i.e._, not parallelizable but representable as a dataflow node -- if the value accompanying the `-d` option is `\n` or if it was used with the `-z` flag.
-In both of these cases, newlines do not represent data item boundaries, but are rather used internally by the command, making it unsafe to parallelize by splitting on line boundaries.
-In all other cases (see the "default" case) the command is stateless.
-Inputs are always assigned to the non-option arguments and the output is always stdout.
-The option "stdin-hyphen" indicates that a non-option argument that is just a dash `-` represents stdin, and the option "empty-args-stdin" indicates that if the non-option arguments are empty, then the command reads from stdin.
-The list identified by "short-long" contains a correspondence of short and long argument names for this command.
-
-## How to Annotate a Command
-
-The first step to annotating a command is to identify its default class: `stateless`, `parallelizable_pure`, `pure`, or `side-effectful`. How does the command behave without any inputs?
-The next step is to identify the set of inputs and their order.
-
-This process then has to be repeated for every set of arguments, which have to be expressed as first-order-logic predicates (see the examples above).
-This can be (and currently is) achieved in an incremental fashion: a few flags at a time.
-
-For more details, here is an early version of the annotation language:
-
-```