Skip to content

Commit

Permalink
improve oneliners eval and setup scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
tammam1998 committed Jun 7, 2022
1 parent 6f0f5f3 commit 9d82499
Show file tree
Hide file tree
Showing 11 changed files with 44 additions and 71 deletions.
44 changes: 17 additions & 27 deletions evaluation/distr_benchmarks/analytics-mts/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,31 +13,21 @@ if [[ "$1" == "-c" ]]; then
exit
fi

setup_dataset() {
hdfs dfs -mkdir /analytics-mts
if [ ! -f ./in.csv ] && [ "$1" != "--small" ]; then
# yesterday=$(date --date='1 days ago' +'%y-%m-%d')
# curl https://www.balab.aueb.gr/~dds/oasa-$yesterday.bz2 |
curl -sf 'https://www.balab.aueb.gr/~dds/oasa-2021-01-08.bz2' | bzip2 -d > in.csv
if [ $? -ne 0 ]; then
echo "oasa-2021-01-08.bz2 / bzip2 not available, contact the pash authors"
exit 1
fi
hdfs dfs -put in.csv /analytics-mts/in.csv
elif [ ! -f ./in_small.csv ] && [ "$1" = "--small" ]; then
if [ ! -f ./in_small.csv ]; then
echo "Generating small-size inputs"
# FIXME PR: Do we need all of them?
curl -sf 'http://pac-n4.csail.mit.edu:81/pash_data/small/in_small.csv' > in_small.csv
fi
hdfs dfs -put in_small.csv /analytics-mts/in_small.csv
hdfs dfs -mkdir /analytics-mts
if [ ! -f ./in.csv ] && [ "$1" != "--small" ]; then
# yesterday=$(date --date='1 days ago' +'%y-%m-%d')
# curl https://www.balab.aueb.gr/~dds/oasa-$yesterday.bz2 |
curl -sf 'https://www.balab.aueb.gr/~dds/oasa-2021-01-08.bz2' | bzip2 -d > in.csv
if [ $? -ne 0 ]; then
echo "oasa-2021-01-08.bz2 / bzip2 not available, contact the pash authors"
exit 1
fi
}

# Export IN as the relative path of the analytics-mts input CSV.
# Arguments: $1 - "--small" selects in_small.csv; any other value
#                 (or no argument) selects in.csv.
source_var() {
  case "$1" in
    --small) export IN="analytics-mts/in_small.csv" ;;
    *) export IN="analytics-mts/in.csv" ;;
  esac
}
hdfs dfs -put in.csv /analytics-mts/in.csv
elif [ ! -f ./in_small.csv ] && [ "$1" = "--small" ]; then
if [ ! -f ./in_small.csv ]; then
echo "Generating small-size inputs"
# FIXME PR: Do we need all of them?
curl -sf 'http://pac-n4.csail.mit.edu:81/pash_data/small/in_small.csv' > in_small.csv
fi
hdfs dfs -put in_small.csv /analytics-mts/in_small.csv
fi
2 changes: 1 addition & 1 deletion evaluation/distr_benchmarks/oneliners/diff.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Taken from https://crashingdaily.wordpress.com/2008/03/06/diff-two-stdout-streams/
# shuf() { awk 'BEGIN {srand(); OFMT="%.17f"} {print rand(), $0}' "$@" | sort -k1,1n | cut -d ' ' -f2-; }

IN=${IN:-/1G.txt}
IN=${IN:-/oneliners/1G.txt}

mkfifo s1 s2

Expand Down
32 changes: 10 additions & 22 deletions evaluation/distr_benchmarks/oneliners/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ if [[ "$1" == "-c" ]]; then
exit
fi

hdfs dfs -mkdir /oneliners

if [ ! -f ./1M.txt ]; then
curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
if [ $? -ne 0 ]; then
Expand Down Expand Up @@ -67,30 +69,16 @@ if [ ! -f ./all_cmdsx100.txt ]; then
done
fi


if [ "$#" -eq 1 ] && [ "$1" = "--full" ]; then
echo "Generating full-size inputs"


if [ ! -f ./3G.txt ]; then
touch 3G.txt
for (( i = 0; i < 3; i++ )); do
cat 1G.txt >> 3G.txt
done
fi
input_files+=("3G.txt")

if [ ! -f ./10G.txt ]; then
touch 10G.txt
for (( i = 0; i < 10; i++ )); do
cat 1G.txt >> 10G.txt
done
fi
input_files+=("10G.txt")
if [ ! -f ./3G.txt ]; then
touch 3G.txt
for (( i = 0; i < 3; i++ )); do
cat 1G.txt >> 3G.txt
done
fi
input_files+=("3G.txt")

# Add files with different replication factors
for file in "${input_files[@]}"; do
hdfs dfs -Ddfs.replication=1 -put $file /rep1_$file
hdfs dfs -Ddfs.replication=3 -put $file /rep3_$file
hdfs dfs -put $file /oneliners/$file
rm -f $file
done
2 changes: 1 addition & 1 deletion evaluation/distr_benchmarks/oneliners/nfa-regex.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash
# Match complex regular-expression over input

IN=${IN:-/1G.txt}
IN=${IN:-/oneliners/1G.txt}

hdfs dfs -cat $IN | tr A-Z a-z | grep '\(.\).*\1\(.\).*\2\(.\).*\3\(.\).*\4'
23 changes: 9 additions & 14 deletions evaluation/distr_benchmarks/oneliners/run.distr.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,8 @@ scripts_inputs=(

oneliners_bash() {
outputs_dir="outputs"
rep=${1:-rep3}
seq_times_file=$rep"_seq.res"
seq_outputs_suffix=$rep"_seq.out"
seq_times_file="seq.res"
seq_outputs_suffix="seq.out"

mkdir -p "$outputs_dir"

Expand All @@ -36,7 +35,7 @@ oneliners_bash() {
script="${script_input_parsed[0]}"
input="${script_input_parsed[1]}"

export IN=/$rep\_$input
export IN="/oneliners/$input"
export dict=

printf -v pad %30s
Expand All @@ -52,8 +51,7 @@ oneliners_bash() {
oneliners_pash(){
flags=${1:-$PASH_FLAGS}
prefix=${2:-par}
rep=${3:-rep3}
prefix=$prefix\_$rep
prefix=$prefix

times_file="$prefix.res"
outputs_suffix="$prefix.out"
Expand All @@ -66,7 +64,7 @@ oneliners_pash(){

touch "$times_file"
cat $times_file >> $times_file.d
echo executing one-liners with $prefix pash with data $rep $(date) | tee "$times_file"
echo executing one-liners with $prefix pash with data $(date) | tee "$times_file"
echo '' >> "$times_file"

for script_input in ${scripts_inputs[@]}
Expand All @@ -75,7 +73,7 @@ oneliners_pash(){
script="${script_input_parsed[0]}"
input="${script_input_parsed[1]}"

export IN=/$rep\_$input
export IN="/oneliners/$input"
export dict=

printf -v pad %30s
Expand All @@ -92,11 +90,8 @@ oneliners_pash(){
done
}

# oneliners_bash "rep1"
oneliners_bash "rep3"
oneliners_bash

# oneliners_pash "$PASH_FLAGS" "par" "rep1"
oneliners_pash "$PASH_FLAGS" "par" "rep3"
oneliners_pash "$PASH_FLAGS" "par"

# oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep1"
oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep3"
oneliners_pash "$PASH_FLAGS --distributed_exec" "distr"
2 changes: 1 addition & 1 deletion evaluation/distr_benchmarks/oneliners/set-diff.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Show the set-difference between two streams (i.e., elements in the first that are not in the second).
# https://stackoverflow.com/questions/2509533/bash-linux-set-difference-between-two-text-files

IN=${IN:-/1G.txt}
IN=${IN:-/oneliners/1G.txt}

mkfifo s1 s2

Expand Down
2 changes: 1 addition & 1 deletion evaluation/distr_benchmarks/oneliners/shortest-scripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@

# FIX: Input here should be a set of commands, more precisely, the ones on this specific machine.

IN=${IN:-/all_cmdsx100.txt}
IN=${IN:-/oneliners/all_cmdsx100.txt}

hdfs dfs -cat $IN | xargs file | grep "shell script" | cut -d: -f1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -15
2 changes: 1 addition & 1 deletion evaluation/distr_benchmarks/oneliners/sort-sort.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash
# Calculate sort twice

IN=${IN:-/1G.txt}
IN=${IN:-/oneliners/1G.txt}

hdfs dfs -cat $IN | tr A-Z a-z | sort | sort -r
2 changes: 1 addition & 1 deletion evaluation/distr_benchmarks/oneliners/sort.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash
# Sort input

IN=${IN:-/1G.txt}
IN=${IN:-/oneliners/1G.txt}

hdfs dfs -cat $IN | sort

2 changes: 1 addition & 1 deletion evaluation/distr_benchmarks/oneliners/spell.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash
# Calculate misspelled words in an input
# https://dl.acm.org/doi/10.1145/3532.315102
IN=${IN:-/1G.txt}
IN=${IN:-/oneliners/1G.txt}
dict=${dict:-$PASH_TOP/evaluation/distr_benchmarks/oneliners/input/dict.txt}

hdfs dfs -cat $IN |
Expand Down
2 changes: 1 addition & 1 deletion evaluation/distr_benchmarks/oneliners/top-n.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Top-N (1000) terms
# from https://dl.acm.org/doi/10.1145/5948.315654

IN=${IN:-/1G.txt}
IN=${IN:-/oneliners/1G.txt}

hdfs dfs -cat $IN | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | tr A-Z a-z | sort | uniq -c | sort -rn | sed 100q

0 comments on commit 9d82499

Please sign in to comment.