Skip to content

Commit

Permalink
Upload cleaned-up code.
Browse files Browse the repository at this point in the history
  • Loading branch information
Marko Pranjic committed Apr 17, 2024
1 parent 1f7476d commit 4dfd702
Show file tree
Hide file tree
Showing 8 changed files with 511 additions and 0 deletions.
6 changes: 6 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Main image: Python 3.10 plus the packages listed in requirements.txt.
FROM python:3.10

# requirements.txt is bind-mounted for this step only (it is not copied into
# the image); /root/.cache is mounted as a BuildKit cache for pip downloads.
RUN --mount=type=bind,source=requirements.txt,target=/tmp/requirements.txt \
    --mount=type=cache,target=/root/.cache \
    pip install --disable-pip-version-check --root-user-action=ignore -r /tmp/requirements.txt

5 changes: 5 additions & 0 deletions Dockerfile.eval
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Evaluation image: Python 3.10 with only morphoeval and pandas installed.
FROM python:3.10

# /root/.cache is mounted as a BuildKit cache for pip downloads.
RUN --mount=type=cache,target=/root/.cache \
    pip install --disable-pip-version-check --root-user-action=ignore morphoeval pandas

115 changes: 115 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Every recipe runs as ONE bash script (.ONESHELL) in strict mode: stop at the
# first failing command (-e), on unset shell variables (-u) and on a failure
# anywhere inside a pipeline (pipefail).
SHELL := bash
.SHELLFLAGS := -eu -o pipefail -c
.ONESHELL:

# Delete a target whose recipe failed so a half-written file never looks up to date.
.DELETE_ON_ERROR:

# Warn on undefined make variables and switch off make's built-in implicit rules.
MAKEFLAGS += --warn-undefined-variables --no-builtin-rules

# Recipe lines in this file start with '>' instead of a TAB.
.RECIPEPREFIX = >

# Local settings (defines DEVICE).
include config.mk


# Command used to start a container.
DOCKER := docker run

# Image tag: the name of the directory that holds the makefile, computed with
# make built-ins instead of forking `basename`.
# NOTE(review): config.mk is included before this line, so
# $(lastword $(MAKEFILE_LIST)) refers to config.mk here; the result is the same
# as long as config.mk sits next to this makefile.
DOCKER_IMG := $(notdir $(patsubst %/,%,$(dir $(realpath $(lastword $(MAKEFILE_LIST))))))

# Arguments shared by every `docker run`:
#   * caches are redirected below /out so they live in the mounted out/ directory;
#   * data/, src/ and out/ are mounted as /data, /app and /out.
# `$$(realpath ...)` is left for the shell to evaluate when a recipe runs.
# The --env values are deliberately NOT quoted: the eval rule stores these
# arguments in a shell variable and expands it unquoted, which would turn any
# embedded quote characters into part of the value.
DOCKER_ARGS := --rm \
  --env TRANSFORMERS_CACHE=/out/.cache \
  --env PYTHONUNBUFFERED=1 \
  --env PYTHONDONTWRITEBYTECODE=1 \
  --env SENTENCE_TRANSFORMERS_HOME=/out/.cache/sent-trans \
  --env HF_DATASETS_CACHE=/out/.cache/datasets \
  -v $$(realpath data):/data \
  -v $$(realpath src):/app \
  -v $$(realpath out):/out

# Expose the GPUs to the container when config.mk selects cuda
# ($(strip ...) tolerates stray whitespace around the value).
ifeq ($(strip $(DEVICE)),cuda)
DOCKER_ARGS += --gpus all
endif


# Metric name handed to `morphoeval --metric` by the eval rule.
METRIC := bpr

# Default goal: the first rule in the file is what gets run by a bare `make`.
all: out/.build.sentinel out/.eval.sentinel out/.run.sentinel

# Short aliases for the sentinel files that record each finished stage.
build:        out/.build.sentinel
run:          out/.run.sentinel
prepare-test: out/.test.sentinel
prepare-gold: out/.gold.sentinel
eval:         out/.eval.sentinel

# Remove the whole out/ directory (generated files and all sentinels).
clean:
> rm -rf out

.PHONY: all build run prepare-test prepare-gold eval clean


# Run split_data.py inside the main image (presumably splits the gold-standard
# training sets; confirm against the script).
# The image has to exist before `docker run` can start it, hence the
# order-only prerequisite on the build sentinel: it guarantees the image is
# built first, but rebuilding the image does not by itself force a re-split.
# NOTE(review): the three goldstd_trainset.* prerequisites are looked up
# relative to the directory make runs in, while the container only sees
# data/, src/ and out/ -- confirm these paths.
out/.split.sentinel: goldstd_trainset.segmentation.eng goldstd_trainset.segmentation.tur goldstd_trainset.segmentation.fin | out/.build.sentinel
> mkdir -p out/llm-segm
> $(DOCKER) $(DOCKER_ARGS) $(DOCKER_IMG) python /app/split_data.py
> touch $@


# Write the model inputs out/<lang>.test by stripping the annotations from the
# test sets in data/.
out/.test.sentinel: out/.split.sentinel
# CSV test sets: drop everything from the first comma onwards.
> for lang in eng fin tur; do
>   sed -E 's/([^,]+),.+/\1/' "data/$${lang}_test.segmentation.csv" > "out/$${lang}.test"
> done
# CoNLL test sets: drop everything from the first ' | ' onwards.
> for lang in swati zulu xhosa; do
>   sed -E 's/ \| .+//' "data/$${lang}.clean.test.conll" > "out/$${lang}.test"
> done
# ndebele: same as the other CoNLL sets, then map '-' to ' ' and ',' to '!'.
> sed -E -e 's/ \| .+//' -e 'y/-,/ !/' data/ndebele.clean.test.conll > out/ndebele.test
> touch $@


# Write the reference files out/<lang>.gold in the layout the evaluation reads:
# the first field, a TAB, then the remaining field(s).
out/.gold.sentinel: out/.split.sentinel
# CSV test sets: '@' becomes ' ', the first ',' becomes a TAB and every
# remaining ',' becomes ', '.
> for lang in eng fin tur; do
>   sed -e 'y/@/ /' -e 's/,/\t/' -e 's/,/, /g' "data/$${lang}_test.segmentation.csv" > "out/$${lang}.gold"
> done
# CoNLL test sets: keep the first two '|'-separated fields, join them with a
# TAB and replace '-' with ' ' (space) to conform to the evaluation format.
> for lang in swati zulu xhosa; do
>   cut -sd'|' -f1-2 "data/$${lang}.clean.test.conll" | sed -e 's/ | /\t/' -e 'y/-/ /' > "out/$${lang}.gold"
> done
# ndebele: additionally replace ',' with '!', because the evaluation cannot
# work with ','.
> cut -sd'|' -f1-2 data/ndebele.clean.test.conll | sed -e 's/ | /\t/' -e 'y/-,/ !/' > out/ndebele.gold
> touch $@


# Build both images: <dir-name> from Dockerfile and <dir-name>-eval from
# Dockerfile.eval.
# Every file the two builds consume is a prerequisite, so that editing
# Dockerfile.eval or requirements.txt (bind-mounted by Dockerfile during
# `pip install`) triggers a rebuild as well.
out/.build.sentinel: Dockerfile Dockerfile.eval requirements.txt
> mkdir -p $(@D)
> DOCKER_BUILDKIT=1 docker build . --tag=$(DOCKER_IMG)
> DOCKER_BUILDKIT=1 docker build -f Dockerfile.eval --tag=$(DOCKER_IMG)-eval .
> touch $@


# Run main.py inside the main image once the images have been built.
# NOTE(review): if main.py reads the out/*.test files written by
# out/.test.sentinel, that sentinel should also be a prerequisite -- confirm.
out/.run.sentinel: out/.build.sentinel
> mkdir -p $(@D)/llm-segm
> $(DOCKER) $(DOCKER_ARGS) $(DOCKER_IMG) python /app/main.py
> touch $@


# Score the predictions of every language inside the -eval image:
#   * morphoeval with $(METRIC)  -> out/llm-segm/<lang>-result.txt
#   * calc_accuracy.py           -> out/llm-segm/<lang>-result-acc.txt
# The recipe reads out/<lang>.gold, so the gold sentinel is a prerequisite in
# addition to the run sentinel.
out/.eval.sentinel: out/.run.sentinel out/.gold.sentinel
# A shell function (instead of a command kept in a string variable) lets the
# shell parse the docker arguments normally, so quoting inside them survives.
> evaluate() { $(DOCKER) $(DOCKER_ARGS) $(DOCKER_IMG)-eval "$$@"; }
> for lang in eng fin tur zulu swati xhosa ndebele; do
>   gold="/out/$${lang}.gold"
>   pred="/out/llm-segm/$${lang}.pred"
>   evaluate morphoeval --metric $(METRIC) "$${gold}" "$${pred}" "/out/llm-segm/$${lang}-result.txt"
>   evaluate python /app/calc_accuracy.py "$${gold}" "$${pred}" "/out/llm-segm/$${lang}-result-acc.txt"
> done
> touch $@
1 change: 1 addition & 0 deletions config.mk
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Compute device. The Makefile adds `--gpus all` to the docker arguments when
# this is `cuda`.
DEVICE := cuda
12 changes: 12 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Python dependencies installed into the main image by the Dockerfile.
# -f (--find-links) gives pip an extra location to look for packages.
# NOTE(review): presumably meant to select the CUDA 11.3 build of torch;
# confirm that pip really resolves torch==1.12.1 from this URL.
-f https://download.pytorch.org/whl/cu113

tqdm
numpy
torch==1.12.1
transformers
accelerate
sentencepiece
evaluate
pandas
protobuf
scikit-learn
51 changes: 51 additions & 0 deletions src/calc_accuracy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import argparse
import json

import pandas as pd


def _calc_accuracy(gold, pred):
acc = 0
assert len(gold) == len(
pred
), f"Error! Files contain {len(gold)} gold entries and {len(pred)} predictions."
total = len(gold)
for g, p in zip(gold, pred):
g_in, g_out = g.split("\t")
g_list = g_out.split(",")
p_in, p_out = p.split("\t")
p_list = p_out.split(",")
p_list = [p.strip().lower() for p in p_list]
g_list = [g.strip().lower() for g in g_list]
assert p_in == g_in, f"Error, {p_in} should be the same as {g_in}"
if set(p_list).intersection(g_list):
acc += 1
return acc / total


if __name__ == "__main__":
    # Command-line entry point: read the gold and prediction files, compute the
    # "accuracy-any" score and write a small JSON report to the output file
    # (standard output when no output argument is given).
    cli = argparse.ArgumentParser()
    for arg_name, arg_help in (
        ("goldfile", "gold standard analysis file"),
        ("predfile", "predicted analysis file"),
    ):
        cli.add_argument(arg_name, type=argparse.FileType("r"), help=arg_help)
    cli.add_argument(
        "output",
        type=argparse.FileType("w"),
        nargs="?",
        default="-",
        help="output file",
    )
    opts = cli.parse_args()

    # One entry per line; surrounding whitespace of the whole file is dropped
    # first so that a trailing newline does not produce an empty entry.
    gold_lines = pd.Series(opts.goldfile.read().strip().split("\n"))
    pred_lines = pd.Series(opts.predfile.read().strip().split("\n"))

    report = {
        "metric": "accuracy-any",
        "files": {"reference": opts.goldfile.name, "predictions": opts.predfile.name},
        "score": round(_calc_accuracy(gold_lines, pred_lines), 4),
    }

    json.dump(report, opts.output, indent=2)
Loading

0 comments on commit 4dfd702

Please sign in to comment.