#76 updated AREkit, updated DeepPavlov default library version
nicolay-r committed Aug 16, 2023
1 parent c488325 commit 889c561
Showing 13 changed files with 45 additions and 51 deletions.
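At a glance, most hunks below track the AREkit 0.23 to 0.24 rename of the `arekit.common.news` namespace to `arekit.common.docs`. A minimal import sketch of the new surface, assembled from the paths that appear in this diff (illustrative only, not a complete program):

```python
# New AREkit 0.24 paths used by this commit (old 0.23 paths in comments).
from arekit.common.docs.base import Document                  # was arekit.common.news.base.News
from arekit.common.docs.sentence import BaseDocumentSentence  # was arekit.common.news.sentence.BaseNewsSentence
from arekit.common.docs.entity import DocumentEntity          # was arekit.common.news.entity.DocumentEntity
from arekit.common.docs.entities_grouping import EntitiesGroupingPipelineItem
from arekit.common.docs.objects_parser import SentenceObjectsParserPipelineItem
from arekit.common.data.doc_provider import DocumentProvider  # was arekit.common.experiment.api.ops_doc.DocumentOperations
from arekit.common.folding.nofold import NoFolding            # now constructed with no arguments
```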
README.md (6 changes: 3 additions & 3 deletions)
@@ -1,6 +1,6 @@
-# ARElight 0.23.2
+# ARElight 0.24.0

-![](https://img.shields.io/badge/Python-3.6.9-brightgreen.svg)
+![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
![](https://img.shields.io/badge/AREkit-0.23.1-orange.svg)

### :point_right: [DEMO](https://github.com/nicolay-r/ARElight/tree/v0.22.0#installation) :point_left:
@@ -21,7 +21,7 @@ we adopt [DeepPavlov](https://github.com/deeppavlovteam/DeepPavlov) (BertOntoNo

1. Main library installation
```bash
-pip install git+https://github.com/nicolay-r/arelight@v0.23.1
+pip install git+https://github.com/nicolay-r/arelight@v0.24.0
```

2. (Optional) BRAT: [Download](https://github.com/nlplab/brat/releases/tag/v1.3_Crunchy_Frog)
arelight/brat_backend.py (4 changes: 2 additions & 2 deletions)
@@ -1,3 +1,4 @@
+from arekit.common.docs.entity import DocumentEntity
from arekit.contrib.networks.input.const import FrameVariantIndices
from arekit.contrib.networks.input.rows_parser import ParsedSampleRow
from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
@@ -11,7 +12,6 @@
from arekit.common.data.storages.base import BaseRowsStorage
from arekit.common.entities.base import Entity
from arekit.common.frames.variants.base import FrameVariant
-from arekit.common.news.entity import DocumentEntity
from arekit.contrib.utils.processing.text.tokens import Tokens


@@ -114,7 +114,7 @@ def __iter_sample_labels(samples, label_to_rel):
assert(isinstance(samples, BaseRowsStorage))

for row_ind, row in samples:
-str_label = str(row[const.LABEL]) if const.LABEL in row else None
+str_label = str(row[const.LABEL_UINT]) if const.LABEL_UINT in row else None
label = label_to_rel[str_label] if str_label in label_to_rel else None
yield row_ind, label

arelight/doc_ops.py (4 changes: 2 additions & 2 deletions)
@@ -1,7 +1,7 @@
-from arekit.common.experiment.api.ops_doc import DocumentOperations
+from arekit.common.data.doc_provider import DocumentProvider


-class InMemoryDocOperations(DocumentOperations):
+class InMemoryDocOperations(DocumentProvider):

def __init__(self, docs=None):
assert(isinstance(docs, list) or docs is None)
arelight/pipelines/items/entities_default.py (2 changes: 1 addition & 1 deletion)
@@ -13,7 +13,7 @@ def __process_word(word):

# If this is a special word which is related to the [entity] mention.
if word[0] == "[" and word[-1] == "]":
-entity = Entity(value=word[1:-1], e_type=None)
+entity = Entity(value=word[1:-1], e_type="UNDEFINED")
return entity

return word
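For illustration, the bracketed-token convention handled by this item can be sketched as a standalone function (a hypothetical rewrite for clarity, using the same `Entity` constructor as the hunk above):

```python
from arekit.common.entities.base import Entity

def process_word(word):
    # Hypothetical standalone version of __process_word above: tokens wrapped
    # in square brackets, e.g. "[USA]", mark entity mentions; after this commit
    # they receive the explicit type "UNDEFINED" instead of None.
    if word[0] == "[" and word[-1] == "]":
        return Entity(value=word[1:-1], e_type="UNDEFINED")
    return word
```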
arelight/pipelines/items/entities_ner_dp.py (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
from arekit.common.bound import Bound
+from arekit.common.docs.objects_parser import SentenceObjectsParserPipelineItem
from arekit.common.entities.base import Entity
-from arekit.common.news.objects_parser import SentenceObjectsParserPipelineItem
from arekit.common.text.partitioning.terms import TermsPartitioning

from arelight.ner.deep_pavlov import DeepPavlovNER
arelight/pipelines/items/train_bert.py (6 changes: 3 additions & 3 deletions)
@@ -47,19 +47,19 @@ def apply_core(self, input_data, pipeline_ctx):
def __iter_batches(s, batch_size):
assert(isinstance(s, BaseRowsStorage))

data = {"text_a": [], "text_b": [], "label": []}
data = {"text_a": [], "text_b": [], "label_uint": []}

# NOTE: it is important to iter shuffled data!
for row_ind, row in s.iter_shuffled():
data["text_a"].append(row['text_a'])
data["text_b"].append(row['text_b'])
data["label"].append(row[const.LABEL])
data["label_uint"].append(row[const.LABEL_UINT])

for i in range(0, len(data["text_a"]), batch_size):

texts_a = data["text_a"][i:i + batch_size]
texts_b = data["text_b"][i:i + batch_size]
labels = data["label"][i:i + batch_size]
labels = data["label_uint"][i:i + batch_size]

batch_features = self.__proc(texts_a=texts_a, texts_b=texts_b)

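As a quick sanity check of the slicing in `__iter_batches`: with N rows the loop above yields ceil(N / batch_size) batches, the last of which may be partial. A self-contained illustration of the same stride pattern:

```python
# Toy mirror of the batching loop above: 5 rows with batch_size=2
# produce index windows [0:2], [2:4], [4:6]; the final batch is shorter.
rows = list(range(5))
batch_size = 2
batches = [rows[i:i + batch_size] for i in range(0, len(rows), batch_size)]
assert batches == [[0, 1], [2, 3], [4]]
```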
arelight/pipelines/items/utils.py (8 changes: 4 additions & 4 deletions)
@@ -1,5 +1,5 @@
-from arekit.common.news.base import News
-from arekit.common.news.sentence import BaseNewsSentence
+from arekit.common.docs.base import Document
+from arekit.common.docs.sentence import BaseDocumentSentence


def input_to_docs(input_data, sentence_parser):
@@ -15,8 +15,8 @@ def input_to_docs(input_data, sentence_parser):
for doc_id, contents in enumerate(input_data):
# setup input data.
sentences = sentence_parser(contents)
-sentences = list(map(lambda text: BaseNewsSentence(text), sentences))
+sentences = list(map(lambda text: BaseDocumentSentence(text), sentences))
# Documents.
-docs.append(News(doc_id=doc_id, sentences=sentences))
+docs.append(Document(doc_id=doc_id, sentences=sentences))

return docs
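A minimal usage sketch of the migrated helper above, assuming a deliberately naive sentence splitter (callers in this repo pass a real `sentence_parser`):

```python
# Hypothetical call; input_to_docs and its imports as defined in this file.
docs = input_to_docs(
    input_data=["First sentence. Second sentence."],
    sentence_parser=lambda text: text.split(". "))  # naive splitter, for illustration
# docs[0] is an arekit.common.docs.base.Document holding BaseDocumentSentence items.
```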
arelight/run/infer.py (8 changes: 3 additions & 5 deletions)
@@ -1,9 +1,9 @@
import argparse
from os.path import join, dirname, basename

+from arekit.common.docs.entities_grouping import EntitiesGroupingPipelineItem
from arekit.common.experiment.data_type import DataType
from arekit.common.folding.nofold import NoFolding
-from arekit.common.news.entities_grouping import EntitiesGroupingPipelineItem
from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
from arekit.common.text.parser import BaseTextParser
from arekit.contrib.utils.pipelines.items.text.terms_splitter import TermsSplitterParser
@@ -87,13 +87,11 @@
BratHtmlEmbeddingPipelineItem(brat_url="http://localhost:8001/")
)

-no_folding = NoFolding(doc_ids=list(range(len(actual_content))),
-                       supported_data_type=DataType.Test)
-
pipeline.run(None, {
"template_filepath": join(const.DATA_DIR, "brat_template.html"),
"predict_fp": "{}.tsv.gz".format(backend_template) if backend_template is not None else None,
"brat_vis_fp": "{}.html".format(backend_template) if backend_template is not None else None,
"data_type_pipelines": {DataType.Test: data_pipeline},
"data_folding": no_folding
"data_folding": NoFolding(),
"doc_ids": {DataType.Test: list(range(len(actual_content)))},
})
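The launch contract changed in the same way here as in `serialize.py` and the test below: `NoFolding` no longer carries document ids; they are passed per data type through a separate `doc_ids` parameter. A minimal sketch, assuming `n_docs` input texts:

```python
from arekit.common.experiment.data_type import DataType
from arekit.common.folding.nofold import NoFolding

n_docs = 3  # hypothetical number of input documents
run_params = {
    "data_folding": NoFolding(),                      # 0.24: no constructor arguments
    "doc_ids": {DataType.Test: list(range(n_docs))},  # ids are now passed explicitly
}
# pipeline.run(None, {**run_params, ...}) as in the call above.
```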
arelight/run/serialize.py (10 changes: 4 additions & 6 deletions)
@@ -1,13 +1,13 @@
import argparse
from os.path import join, dirname, basename

+from arekit.common.docs.entities_grouping import EntitiesGroupingPipelineItem
from arekit.common.experiment.data_type import DataType
from arekit.common.folding.nofold import NoFolding
from arekit.common.labels.base import NoLabel
from arekit.common.labels.provider.constant import ConstantLabelProvider
from arekit.common.labels.scaler.single import SingleLabelScaler
from arekit.common.labels.str_fmt import StringLabelsFormatter
-from arekit.common.news.entities_grouping import EntitiesGroupingPipelineItem
from arekit.common.opinions.annot.algo.pair_based import PairBasedOpinionAnnotationAlgorithm
from arekit.common.opinions.annot.base import BaseOpinionAnnotator
from arekit.common.pipeline.base import BasePipeline
@@ -102,14 +102,12 @@
samples_io=SamplesIO(target_dir=dirname(backend_template),
prefix=basename(backend_template),
writer=NativeCsvWriter(delimiter=',')),
-save_labels_func=lambda data_type: data_type != DataType.Test,
-balance_func=lambda data_type: data_type == DataType.Train)
+save_labels_func=lambda data_type: data_type != DataType.Test)
])

-no_folding = NoFolding(doc_ids=list(range(len(texts_from_files))), supported_data_type=DataType.Test)
-
pipeline.run(input_data=None,
params_dict={
"data_folding": no_folding,
"data_folding": NoFolding(),
"doc_ids": {DataType.Test: list(range(len(texts_from_files)))},
"data_type_pipelines": {DataType.Test: test_pipeline}
})
dependencies.txt (12 changes: 6 additions & 6 deletions)
@@ -1,6 +1,6 @@
-deeppavlov==0.11.0
-# DeepPavlov bert-dp dependencies:
-bert_dp @ git+https://github.com/deepmipt/bert.git@feat/multi_gpu
-# Install arekit
-arekit @ git+https://github.com/nicolay-r/AREkit@0.23.1-rc
-nltk
+deeppavlov==1.2.0
+transformers==4.31.0
+torch==2.0.1
+pytorch-crf==0.7.2
+arekit @ git+https://github.com/nicolay-r/AREkit@0.24.0-rc
+nltk==3.4.5
setup.py (2 changes: 1 addition & 1 deletion)
@@ -14,7 +14,7 @@ def get_requirements(filenames):

setup(
name='arelight',
-version='0.23.2',
+version='0.24.0',
description='About Mass-media text processing application for your '
'Relation Extraction task, powered by AREkit.',
url='https://github.com/nicolay-r/ARElight',
test/test_bert_serialization.py (30 changes: 14 additions & 16 deletions)
@@ -1,15 +1,15 @@
import unittest
import ru_sent_tokenize
+from arekit.common.docs.base import Document
+from arekit.common.docs.entities_grouping import EntitiesGroupingPipelineItem
+from arekit.common.docs.sentence import BaseDocumentSentence
from ru_sent_tokenize import ru_sent_tokenize
from os.path import dirname, join, realpath

from arekit.common.experiment.data_type import DataType
from arekit.common.folding.nofold import NoFolding
from arekit.common.labels.base import NoLabel
from arekit.common.labels.scaler.single import SingleLabelScaler
-from arekit.common.news.base import News
-from arekit.common.news.entities_grouping import EntitiesGroupingPipelineItem
-from arekit.common.news.sentence import BaseNewsSentence
from arekit.common.pipeline.base import BasePipeline
from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
from arekit.common.text.parser import BaseTextParser
@@ -25,6 +25,7 @@

from arelight.doc_ops import InMemoryDocOperations
from arelight.pipelines.data.annot_pairs_nolabel import create_neutral_annotation_pipeline
+from arelight.pipelines.items.entities_default import TextEntitiesParser
from arelight.pipelines.items.entities_ner_dp import DeepPavlovNERPipelineItem
from arelight.samplers.bert import create_bert_sample_provider
from arelight.samplers.types import BertSampleProviderTypes
@@ -54,8 +55,8 @@ def input_to_docs(texts):
docs = []
for doc_id, contents in enumerate(texts):
sentences = ru_sent_tokenize(contents)
-sentences = list(map(lambda text: BaseNewsSentence(text), sentences))
-doc = News(doc_id=doc_id, sentences=sentences)
+sentences = list(map(lambda text: BaseDocumentSentence(text), sentences))
+doc = Document(doc_id=doc_id, sentences=sentences)
docs.append(doc)
return docs

@@ -70,10 +71,10 @@ def test(self):
# Declare input texts.
texts = [
# Text 1.
"""24 марта президент США Джо Байден провел переговоры с
лидерами стран Евросоюза в Брюсселе, вызвав внимание рынка и предположения о
том, что Америке удалось уговорить ЕС совместно бойкотировать российские нефть
и газ. Европейский Союз крайне зависим от России в плане поставок нефти и
"""24 марта президент [США] [Джо Байден] провел переговоры с
лидерами стран [Евросоюза] в [Брюсселе], вызвав внимание рынка и предположения о
том, что [Америке] удалось уговорить [ЕС] совместно бойкотировать российские нефть
и газ. [Европейский Союз] крайне зависим от [России] в плане поставок нефти и
газа."""
]

@@ -88,7 +89,7 @@ def test(self):
# Declare text parser.
text_parser = BaseTextParser(pipeline=[
TermsSplitterParser(),
-DeepPavlovNERPipelineItem(lambda s_obj: s_obj.ObjectType in ["ORG", "PERSON", "LOC", "GPE"]),
+TextEntitiesParser(),
EntitiesGroupingPipelineItem(lambda value:
SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(
synonyms=synonyms, value=value))
@@ -97,9 +98,6 @@
# Single label scaler.
single_label_scaler = SingleLabelScaler(NoLabel())

-# Declare folding and experiment context.
-no_folding = NoFolding(doc_ids=list(range(len(texts))), supported_data_type=DataType.Test)
-
# Composing labels formatter and experiment preparation.
doc_ops = InMemoryDocOperations(docs=BertTestSerialization.input_to_docs(texts))

@@ -113,8 +111,7 @@
rows_provider=rows_provider,
storage=RowCacheStorage(),
samples_io=SamplesIO(target_dir=self.TEST_DATA_DIR, writer=NativeCsvWriter(delimiter=',')),
-save_labels_func=lambda data_type: data_type != DataType.Test,
-balance_func=lambda data_type: data_type == DataType.Train)
+save_labels_func=lambda data_type: data_type != DataType.Test)
])

synonyms = StemmerBasedSynonymCollection(iter_group_values_lists=[],
@@ -131,7 +128,8 @@

pipeline.run(input_data=None,
params_dict={
"data_folding": no_folding,
"data_folding": NoFolding(),
"doc_ids": {DataType.Test: list(range(len(texts)))},
"data_type_pipelines": {DataType.Test: test_pipeline}
})

update_arekit.sh (2 changes: 1 addition & 1 deletion)
@@ -1,2 +1,2 @@
pip3 uninstall arekit
-pip3 install git+https://github.com/nicolay-r/AREkit@0.23.1-rc --no-deps
+pip3 install git+https://github.com/nicolay-r/AREkit@0.24.0-rc --no-deps
