improve oneliners eval and setup scripts

binpash · Jun 7, 2022 · 9d82499 · 9d82499
1 parent 6f0f5f3
commit 9d82499
Show file tree

Hide file tree

Showing 11 changed files with 44 additions and 71 deletions.
diff --git a/evaluation/distr_benchmarks/analytics-mts/input/setup.sh b/evaluation/distr_benchmarks/analytics-mts/input/setup.sh
@@ -13,31 +13,21 @@ if [[ "$1" == "-c" ]]; then
     exit
 fi
 
-setup_dataset() {
-  hdfs dfs -mkdir /analytics-mts
-  if [ ! -f ./in.csv ] && [ "$1" != "--small" ];; then
-    # yesterday=$(date --date='1 days ago' +'%y-%m-%d')
-    # curl https://www.balab.aueb.gr/~dds/oasa-$yesterday.bz2 |
-    curl -sf 'https://www.balab.aueb.gr/~dds/oasa-2021-01-08.bz2' | bzip2 -d > in.csv
-    if [ $? -ne 0 ]; then
-      echo "oasa-2021-01-08.bz2 / bzip2 not available, contact the pash authors"
-      exit 1
-    fi
-    hdfs dfs -put in.csv  /analytics-mts/in.csv
-  elif [ ! -f ./in_small.csv ] && [ "$1" = "--small" ]; then
-    if [ ! -f ./in_small.csv ]; then                                                       
-      echo "Generating small-size inputs"                                                  
-      # FIXME PR: Do we need all of them?                                                  
-      curl -sf 'http://pac-n4.csail.mit.edu:81/pash_data/small/in_small.csv' > in_small.csv
-    fi
-    hdfs dfs -put in_small.csv  /analytics-mts/in_small.csv                                                                                     
+hdfs dfs -mkdir /analytics-mts
+if [ ! -f ./in.csv ] && [ "$1" != "--small" ]; then
+  # yesterday=$(date --date='1 days ago' +'%y-%m-%d')
+  # curl https://www.balab.aueb.gr/~dds/oasa-$yesterday.bz2 |
+  curl -sf 'https://www.balab.aueb.gr/~dds/oasa-2021-01-08.bz2' | bzip2 -d > in.csv
+  if [ $? -ne 0 ]; then
+    echo "oasa-2021-01-08.bz2 / bzip2 not available, contact the pash authors"
+    exit 1
   fi
-}
-
-source_var() {
-  if [[ "$1" == "--small" ]]; then
-    export IN="analytics-mts/in_small.csv"
-  else
-    export IN="analytics-mts/in.csv"
-  fi    
-}
+  hdfs dfs -put in.csv  /analytics-mts/in.csv
+elif [ ! -f ./in_small.csv ] && [ "$1" = "--small" ]; then
+  if [ ! -f ./in_small.csv ]; then                                                       
+    echo "Generating small-size inputs"                                                  
+    # FIXME PR: Do we need all of them?                                                  
+    curl -sf 'http://pac-n4.csail.mit.edu:81/pash_data/small/in_small.csv' > in_small.csv
+  fi
+  hdfs dfs -put in_small.csv  /analytics-mts/in_small.csv                                                                                     
+fi
diff --git a/evaluation/distr_benchmarks/oneliners/diff.sh b/evaluation/distr_benchmarks/oneliners/diff.sh
@@ -3,7 +3,7 @@
 # Taken from https://crashingdaily.wordpress.com/2008/03/06/diff-two-stdout-streams/
 # shuf() { awk 'BEGIN {srand(); OFMT="%.17f"} {print rand(), $0}' "$@" | sort -k1,1n | cut -d ' ' -f2-; }
 
-IN=${IN:-/1G.txt}
+IN=${IN:-/oneliners/1G.txt}
 
 mkfifo s1 s2
 

diff --git a/evaluation/distr_benchmarks/oneliners/input/setup.sh b/evaluation/distr_benchmarks/oneliners/input/setup.sh
@@ -13,6 +13,8 @@ if [[ "$1" == "-c" ]]; then
     exit
 fi
 
+hdfs dfs -mkdir /oneliners
+
 if [ ! -f ./1M.txt ]; then
     curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
     if [ $? -ne 0 ]; then
@@ -67,30 +69,16 @@ if [ ! -f ./all_cmdsx100.txt ]; then
         done
 fi
 
-
-if [ "$#" -eq 1 ] && [ "$1" = "--full" ]; then
-    echo "Generating full-size inputs"
-
-
-    if [ ! -f ./3G.txt ]; then
-        touch 3G.txt
-        for (( i = 0; i < 3; i++ )); do
-            cat 1G.txt >> 3G.txt
-        done
-    fi
-    input_files+=("3G.txt")
-
-    if [ ! -f ./10G.txt ]; then
-        touch 10G.txt
-        for (( i = 0; i < 10; i++ )); do
-            cat 1G.txt >> 10G.txt
-        done
-    fi
-    input_files+=("10G.txt")
+if [ ! -f ./3G.txt ]; then
+    touch 3G.txt
+    for (( i = 0; i < 3; i++ )); do
+        cat 1G.txt >> 3G.txt
+    done
 fi
+input_files+=("3G.txt")
 
 # Add files with different replication factors
 for file in "${input_files[@]}"; do
-    hdfs dfs -Ddfs.replication=1  -put $file /rep1_$file
-    hdfs dfs -Ddfs.replication=3  -put $file /rep3_$file
+    hdfs dfs -put $file /oneliners/$file
+    rm -f $file
 done
diff --git a/evaluation/distr_benchmarks/oneliners/nfa-regex.sh b/evaluation/distr_benchmarks/oneliners/nfa-regex.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 # Match complex regular-expression over input
 
-IN=${IN:-/1G.txt}
+IN=${IN:-/oneliners/1G.txt}
 
 hdfs dfs -cat $IN | tr A-Z a-z | grep '\(.\).*\1\(.\).*\2\(.\).*\3\(.\).*\4'
diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh
@@ -19,9 +19,8 @@ scripts_inputs=(
 
 oneliners_bash() {
     outputs_dir="outputs"
-    rep=${1:-rep3}
-    seq_times_file=$rep"_seq.res"
-    seq_outputs_suffix=$rep"_seq.out"
+    seq_times_file="seq.res"
+    seq_outputs_suffix="seq.out"
 
     mkdir -p "$outputs_dir"
 
@@ -36,7 +35,7 @@ oneliners_bash() {
     script="${script_input_parsed[0]}"
     input="${script_input_parsed[1]}"
 
-    export IN=/$rep\_$input
+    export IN="/oneliners/$input"
     export dict=
 
     printf -v pad %30s
@@ -52,8 +51,7 @@ oneliners_bash() {
 oneliners_pash(){
   flags=${1:-$PASH_FLAGS}
   prefix=${2:-par}
-  rep=${3:-rep3}
-  prefix=$prefix\_$rep
+  prefix=$prefix
 
   times_file="$prefix.res"
   outputs_suffix="$prefix.out"
@@ -66,7 +64,7 @@ oneliners_pash(){
 
   touch "$times_file"
   cat $times_file >> $times_file.d
-  echo executing one-liners with $prefix pash with data $rep $(date) | tee "$times_file"
+  echo executing one-liners with $prefix pash with data $(date) | tee "$times_file"
   echo '' >> "$times_file"
 
   for script_input in ${scripts_inputs[@]}
@@ -75,7 +73,7 @@ oneliners_pash(){
     script="${script_input_parsed[0]}"
     input="${script_input_parsed[1]}"
 
-    export IN=/$rep\_$input
+    export IN="/oneliners/$input"
     export dict=
 
     printf -v pad %30s
@@ -92,11 +90,8 @@ oneliners_pash(){
   done
 }
 
-# oneliners_bash "rep1"
-oneliners_bash "rep3"
+oneliners_bash
 
-# oneliners_pash "$PASH_FLAGS" "par" "rep1"
-oneliners_pash "$PASH_FLAGS" "par" "rep3"
+oneliners_pash "$PASH_FLAGS" "par"
 
-# oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep1"
-oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep3"
+oneliners_pash "$PASH_FLAGS --distributed_exec" "distr"
diff --git a/evaluation/distr_benchmarks/oneliners/set-diff.sh b/evaluation/distr_benchmarks/oneliners/set-diff.sh
@@ -2,7 +2,7 @@
 # Show the set-difference between two streams (i.e., elements in the first that are not in the second).
 # https://stackoverflow.com/questions/2509533/bash-linux-set-difference-between-two-text-files
 
-IN=${IN:-/1G.txt}
+IN=${IN:-/oneliners/1G.txt}
 
 mkfifo s1 s2
 

diff --git a/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh b/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh
@@ -6,6 +6,6 @@
 
 # FIX: Input here should be a set of commands, more precisely, the ones on this specific machine.
 
-IN=${IN:-/all_cmdsx100.txt}
+IN=${IN:-/oneliners/all_cmdsx100.txt}
 
 hdfs dfs -cat $IN | xargs file | grep "shell script" | cut -d: -f1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -15
diff --git a/evaluation/distr_benchmarks/oneliners/sort-sort.sh b/evaluation/distr_benchmarks/oneliners/sort-sort.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 # Sort the input twice (ascending, then in reverse)
 
-IN=${IN:-/1G.txt}
+IN=${IN:-/oneliners/1G.txt}
 
 hdfs dfs -cat $IN | tr A-Z a-z | sort | sort -r
diff --git a/evaluation/distr_benchmarks/oneliners/sort.sh b/evaluation/distr_benchmarks/oneliners/sort.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # Sort input
 
-IN=${IN:-/1G.txt}
+IN=${IN:-/oneliners/1G.txt}
 
 hdfs dfs -cat $IN | sort
 
diff --git a/evaluation/distr_benchmarks/oneliners/spell.sh b/evaluation/distr_benchmarks/oneliners/spell.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # Calculate misspelled words in an input
 # https://dl.acm.org/doi/10.1145/3532.315102
-IN=${IN:-/1G.txt}
+IN=${IN:-/oneliners/1G.txt}
 dict=${dict:-$PASH_TOP/evaluation/distr_benchmarks/oneliners/input/dict.txt}
 
 hdfs dfs -cat $IN |

diff --git a/evaluation/distr_benchmarks/oneliners/top-n.sh b/evaluation/distr_benchmarks/oneliners/top-n.sh
@@ -2,7 +2,7 @@
 # Top-N (1000) terms
 # from https://dl.acm.org/doi/10.1145/5948.315654
 
-IN=${IN:-/1G.txt}
+IN=${IN:-/oneliners/1G.txt}
 
 hdfs dfs -cat $IN | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | tr A-Z a-z | sort | uniq -c | sort -rn | sed 100q